In [1]:
import pandas as pd
from sklearn.linear_model import LogisticRegression as lm
from sklearn.model_selection import train_test_split
from sklearn import tree
from matplotlib import pyplot as plt    
from sklearn.metrics import mean_squared_error, r2_score, accuracy_score

Cancer prediction data set from Kaggle:
URL: https://www.kaggle.com/uciml/breast-cancer-wisconsin-data/home

In [2]:
# Location of the downloaded Kaggle CSV on the local machine.
csv_path = 'C:/Exercise/breast_cancer/data.csv'
cancer = pd.read_csv(csv_path)
# Preview the first four records, then show (rows, columns) as the cell output.
print(cancer.head(n=4))
cancer.shape

         id diagnosis  radius_mean  texture_mean  perimeter_mean  area_mean  \
0    842302         M        17.99         10.38          122.80     1001.0   
1    842517         M        20.57         17.77          132.90     1326.0   
2  84300903         M        19.69         21.25          130.00     1203.0   
3  84348301         M        11.42         20.38           77.58      386.1   

   smoothness_mean  compactness_mean  concavity_mean  concave points_mean  \
0          0.11840           0.27760          0.3001              0.14710   
1          0.08474           0.07864          0.0869              0.07017   
2          0.10960           0.15990          0.1974              0.12790   
3          0.14250           0.28390          0.2414              0.10520   

      ...       texture_worst  perimeter_worst  area_worst  smoothness_worst  \
0     ...               17.33           184.60      2019.0            0.1622   
1     ...               23.41           158.80      1956.0

(569, 33)

Drop the unwanted columns (`id`, `Unnamed: 32`) and convert the categorical `diagnosis` column into binary format (1 = M, 0 = B).

In [None]:
# Remove the columns that are not used as features. errors='ignore' keeps
# the cell re-runnable: `cancer` is reassigned here, so on a second run the
# columns are already gone and a plain drop would raise KeyError.
cancer = cancer.drop(columns=['id', 'Unnamed: 32'], errors='ignore')

#convert diagnosis into binary(1=M, 0=B)
diagnosis_codes = {'M': 1, 'B': 0}
# Only encode while the column still holds the original labels, so that
# re-running the cell does not map the already-encoded 0/1 values to NaN.
if not pd.api.types.is_numeric_dtype(cancer['diagnosis']):
    # map + explicit astype yields a true integer column without relying on
    # replace()'s implicit object->int downcasting; astype also fails loudly
    # if a label other than 'M'/'B' is present (it would have become NaN).
    cancer = cancer.assign(
        diagnosis=cancer['diagnosis'].map(diagnosis_codes).astype('int64')
    )

print(cancer.head(1))

In [7]:
%matplotlib inline
plt.figure()
cancer.diff().hist(alpha=0.5,figsize=(14, 14))
plt.legend()
plt.show()

<matplotlib.figure.Figure at 0x144acbce2b0>

Split the data set into training and test sets with a 70/30 ratio.

In [47]:
# Hold out 30% of the rows for testing; the fixed random_state makes the
# shuffle, and therefore the split, repeatable.
split_frames = train_test_split(cancer, test_size=0.3, random_state=14)
train, test = split_frames
print(type(train))

<class 'pandas.core.frame.DataFrame'>


Function for building a model using all features:
It fits the model, predicts on the test data, and calculates the mean squared error, the variance score (R²), and the accuracy score.

In [52]:
def building_model_all(model,train,test):
    """Fit ``model`` on all feature columns and print its test-set metrics.

    Parameters
    ----------
    model : object
        Estimator exposing ``fit(X, y)`` and ``predict(X)``.
    train, test : pandas.DataFrame
        Frames containing a ``diagnosis`` target column; every other column
        is used as a feature.

    Returns
    -------
    None
        The mean squared error, variance score (R^2) and accuracy of the
        predictions on ``test`` are printed.
    """
    target = 'diagnosis'
    # Pass the target as a 1-D Series: a one-column DataFrame makes
    # scikit-learn emit a warning from column_or_1d(y, warn=True) on fit.
    y_train = train[target]
    y_test = test[target]
    # Remove the target by name instead of by position, so it cannot end up
    # among the features if it is not the first column.
    x_train = train.drop(columns=[target])
    x_test = test.drop(columns=[target])

    model.fit(x_train, y_train)
    y_prd = model.predict(x_test)
    #print("coefficient: "+ str(model.coef_))
    print("\nmean Squared error : \n")
    print(mean_squared_error(y_test, y_prd))
    print('\nVariance score :\n')
    print(r2_score(y_test, y_prd))
    print("\naccuracy score: \n")
    print(accuracy_score(y_test, y_prd))



Function for building a model using selected features:
It fits the model, predicts on the test data, and calculates the mean squared error, the variance score (R²), and the accuracy score.

In [50]:
def building_model_selective(model,train,test,names=None):
    """Fit ``model`` on a subset of feature columns and print its test-set metrics.

    Parameters
    ----------
    model : object
        Estimator exposing ``fit(X, y)`` and ``predict(X)``.
    train, test : pandas.DataFrame
        Frames containing a ``diagnosis`` target column and the selected
        feature columns.
    names : sequence of str, optional
        Feature columns to train on. Defaults to ``radius_mean``,
        ``perimeter_mean``, ``area_mean`` and ``compactness_mean``.

    Returns
    -------
    None
        The mean squared error, variance score (R^2) and accuracy of the
        predictions on ``test`` are printed.
    """
    if names is None:
        names = ['radius_mean','perimeter_mean', 'area_mean', 'compactness_mean']
    feature_cols = list(names)

    # Pass the target as a 1-D Series: a one-column DataFrame makes
    # scikit-learn emit a warning from column_or_1d(y, warn=True) on fit.
    y_train = train['diagnosis']
    y_test = test['diagnosis']
    x_train = train[feature_cols]
    x_test = test[feature_cols]

    model.fit(x_train, y_train)
    y_prd = model.predict(x_test)
    #print("\ncoefficient: "+ str(model.coef_))
    print("\nmean Squared error : \n")
    print(mean_squared_error(y_test, y_prd))
    print('\nVariance score :\n')
    print(r2_score(y_test, y_prd))
    print("\naccuracy score: \n")
    print(accuracy_score(y_test, y_prd))

Build a Logistic Regression model, first using all features and then using the selected features.

In [45]:
# Logistic regression with default settings, evaluated first on all
# features and then on the selected subset; each builder refits the model.
model = lm()
for build in (building_model_all, building_model_selective):
    build(model, train, test)

coefficient: [[-2.07399668e+00  5.66123599e-03  7.91022194e-02 -1.16873026e-03
   1.46915759e-01  3.98101950e-01  6.10957793e-01  3.30724547e-01
   1.60054813e-01  2.63930866e-02  6.48284137e-02 -7.12604311e-01
  -1.40020173e-01  7.58595480e-02  2.03734205e-02  3.23485792e-02
   7.88794485e-02  4.61684771e-02  3.96996344e-02  2.57858166e-03
  -1.31233328e+00  2.34335133e-01  1.24445693e-01  2.73564190e-02
   2.66041942e-01  9.61537751e-01  1.37169206e+00  6.24215225e-01
   4.10801141e-01  9.31487457e-02]]

mean Squared error : 

0.03508771929824561

Variance score :

0.8492063492063492

accuracy score: 

0.9649122807017544
coefficient: [[-4.39993226  0.54954519  0.01842406  1.01795449]]

mean Squared error : 

0.0935672514619883

Variance score :

0.5978835978835979

accuracy score: 

0.9064327485380117


  y = column_or_1d(y, warn=True)


Build a Decision Tree model, first using all features and then using the selected features.

In [None]:
#decision tree
# Seed the classifier so the printed scores are reproducible between runs;
# 14 matches the seed used for the train/test split.
model_tree = tree.DecisionTreeClassifier(random_state=14)
building_model_all(model_tree,train,test)
building_model_selective(model_tree,train,test)
