## PREDICTING WHETHER A MAMMOGRAM MASS IS BENIGN OR MALIGNANT

In [1]:
import pandas as pd
masses_data=pd.read_csv("mammographic_masses.data.txt")
masses_data.head(5)

Unnamed: 0,5,67,3,5.1,3.1,1
0,4,43,1,1,?,1
1,5,58,4,5,3,1
2,4,28,1,1,3,0
3,5,74,1,5,?,1
4,4,65,1,?,3,0


In [2]:
# convert missing data(?) into Nan and give column names
masses_data=pd.read_csv("mammographic_masses.data.txt",na_values=['?'],names=['BI-RADS','age','shape','margin','density','severity'])
masses_data.head(5)

Unnamed: 0,BI-RADS,age,shape,margin,density,severity
0,5.0,67.0,3.0,5.0,3.0,1
1,4.0,43.0,1.0,1.0,,1
2,5.0,58.0,4.0,5.0,3.0,1
3,4.0,28.0,1.0,1.0,3.0,0
4,5.0,74.0,1.0,5.0,,1


In [3]:
masses_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 961 entries, 0 to 960
Data columns (total 6 columns):
BI-RADS     959 non-null float64
age         956 non-null float64
shape       930 non-null float64
margin      913 non-null float64
density     885 non-null float64
severity    961 non-null int64
dtypes: float64(5), int64(1)
memory usage: 45.1 KB


In [4]:
masses_data.describe()

Unnamed: 0,BI-RADS,age,shape,margin,density,severity
count,959.0,956.0,930.0,913.0,885.0,961.0
mean,4.348279,55.487448,2.721505,2.796276,2.910734,0.463059
std,1.783031,14.480131,1.242792,1.566546,0.380444,0.498893
min,0.0,18.0,1.0,1.0,1.0,0.0
25%,4.0,45.0,2.0,1.0,3.0,0.0
50%,4.0,57.0,3.0,3.0,3.0,0.0
75%,5.0,66.0,4.0,4.0,3.0,1.0
max,55.0,96.0,4.0,5.0,4.0,1.0


In [5]:
masses_data.loc[(masses_data['age'].isnull()) | (masses_data['shape'].isnull()) |
               (masses_data['margin'].isnull()) | (masses_data['density'].isnull())]

Unnamed: 0,BI-RADS,age,shape,margin,density,severity
1,4.0,43.0,1.0,1.0,,1
4,5.0,74.0,1.0,5.0,,1
5,4.0,65.0,1.0,,3.0,0
6,4.0,70.0,,,3.0,0
7,5.0,42.0,1.0,,3.0,0
9,5.0,60.0,,5.0,1.0,1
12,4.0,64.0,1.0,,3.0,0
19,4.0,40.0,1.0,,,0
20,,66.0,,,1.0,1
22,4.0,43.0,1.0,,,0


In [6]:
# missing data seems andomly distributed, drop rows have miss data
masses_data.dropna(inplace=True)
masses_data.describe()

Unnamed: 0,BI-RADS,age,shape,margin,density,severity
count,830.0,830.0,830.0,830.0,830.0,830.0
mean,4.393976,55.781928,2.781928,2.813253,2.915663,0.485542
std,1.888371,14.671782,1.242361,1.567175,0.350936,0.500092
min,0.0,18.0,1.0,1.0,1.0,0.0
25%,4.0,46.0,2.0,1.0,3.0,0.0
50%,4.0,57.0,3.0,3.0,3.0,0.0
75%,5.0,66.0,4.0,4.0,3.0,1.0
max,55.0,96.0,4.0,5.0,4.0,1.0


In [8]:
#convert pandas dataframes into numpy arrays
all_features=masses_data[['age','shape','margin','density']].values
all_classes=masses_data['severity'].values
feature_names=['age','shape','margin','density']
all_features

array([[ 67.,   3.,   5.,   3.],
       [ 58.,   4.,   5.,   3.],
       [ 28.,   1.,   1.,   3.],
       ..., 
       [ 64.,   4.,   5.,   3.],
       [ 66.,   4.,   5.,   3.],
       [ 62.,   3.,   3.,   3.]])

In [11]:
# normalize data to put in model
from sklearn.preprocessing import StandardScaler
scaler=StandardScaler()
all_features_scaled=scaler.fit_transform(all_features)
all_features_scaled

array([[ 0.7650629 ,  0.17563638,  1.39618483,  0.24046607],
       [ 0.15127063,  0.98104077,  1.39618483,  0.24046607],
       [-1.89470363, -1.43517241, -1.157718  ,  0.24046607],
       ..., 
       [ 0.56046548,  0.98104077,  1.39618483,  0.24046607],
       [ 0.69686376,  0.98104077,  1.39618483,  0.24046607],
       [ 0.42406719,  0.17563638,  0.11923341,  0.24046607]])

In [12]:
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier

In [17]:
#Using Decision tree classifier to fit model
np.random.seed(1234)
clf=DecisionTreeClassifier(random_state=1)
(training_inputs,testing_inputs,training_classes,testing_classes)=train_test_split(all_features_scaled,all_classes,test_size=0.30,random_state=1)
clf.fit(training_inputs,training_classes)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=1,
            splitter='best')

In [18]:
from IPython.display import Image,HTML

In [19]:
clf.score(testing_inputs,testing_classes)

0.73493975903614461

In [20]:
#use k fold cross validation score to increase accuracy
from sklearn.model_selection import cross_val_score
clf=DecisionTreeClassifier(random_state=1)
cv_scores=cross_val_score(clf,all_features_scaled,all_classes,cv=10)
cv_scores.mean()

0.73735569455522443

In [22]:
# Now using Random forest classifier to improve accuracy
from sklearn.ensemble import RandomForestClassifier
clf=RandomForestClassifier(n_estimators=10,random_state=1)
cv_scores=cross_val_score(clf,all_features_scaled,all_classes,cv=10)
cv_scores.mean()

0.75404964806963037

In [46]:
# now trying with SVM
from sklearn.svm import SVC
svc=SVC(C=1.0,kernel='linear')

In [47]:
cv_scores=cross_val_score(svc,all_features_scaled,all_classes,cv=10)
cv_scores.mean()

0.79408889915060943

In [43]:
# now try using kernel as sigmoid
svc=SVC(C=1.0,kernel='sigmoid')
cv_scores=cross_val_score(svc,all_features_scaled,all_classes,cv=10)
cv_scores.mean()

0.79165022459174672

In [48]:
# now try using kernel as rbf
svc=SVC(C=1.0,kernel='rbf')
cv_scores=cross_val_score(svc,all_features_scaled,all_classes,cv=10)
cv_scores.mean()

0.79522130333179397

In [None]:
# above shows clearly that using kernel as rbf gives better accuracy

In [38]:
# now try with KNN
from sklearn.neighbors import KNeighborsClassifier
clf=KNeighborsClassifier(n_neighbors=10)

In [39]:
cv_scores=cross_val_score(clf,all_features_scaled,all_classes,cv=10)
cv_scores.mean()

0.78547954885745075

In [40]:
#now trying with different values of k
for n in range(1,50):
    clf=KNeighborsClassifier(n_neighbors=n)
    cv_scores=cross_val_score(clf,all_features_scaled,all_classes,cv=10)
    print(n,cv_scores.mean())

1 0.723912374236
2 0.688983809804
3 0.75410806991
4 0.730081300813
5 0.773546450611
6 0.762616318934
7 0.794059513315
8 0.774708240628
9 0.788020024348
10 0.785479548857
11 0.79153338091
12 0.779425716805
13 0.781908470117
14 0.791503995074
15 0.787874844325
16 0.779441109385
17 0.781807368848
18 0.775681121699
19 0.780514741894
20 0.782866658271
21 0.785392790675
22 0.78173425409
23 0.780558820648
24 0.780587506822
25 0.787817122147
26 0.786626995788
27 0.785436519598
28 0.790227110533
29 0.786597959783
30 0.787831465234
31 0.791417236892
32 0.787831465234
33 0.786597609952
34 0.786611953039
35 0.786626296125
36 0.785435819935
37 0.786684368135
38 0.78665533213
39 0.787889187412
40 0.785479199026
41 0.785464506108
42 0.781850048277
43 0.78306921064
44 0.783054867554
45 0.783054867554
46 0.785464855939
47 0.786684368135
48 0.789065320516
49 0.790299525629


In [42]:
#now trying Naive Bayes
from sklearn.naive_bayes import MultinomialNB,GaussianNB
from sklearn.preprocessing import MinMaxScaler
scaler=MinMaxScaler()
all_features_scaled=scaler.fit_transform(all_features)
clf=GaussianNB()
cv_scores=cross_val_score(clf,all_features_scaled,all_classes,cv=10)
cv_scores.mean()

0.7867865188978912

In [None]:
#using neural networks through keras library
from keras.layers import  Dense
from keras.models import Sequential
from kernel.wrappers.scikit_learn import KerasClassifier
def create_model():
    model=Sequential()
    #4-feature i/p going into 6-neuron layer
    model.add(Dense(6,input_dim=4,kernel_initializer='normal',activation='relu'))
    #o/p layer with binary classification (beningn or malignant)
    model.add(Dense(1,kernel_initializer='normal',activation='sigmoid'))
    #compile model
    model.compile(loss='binary_crossentropy',optimizer='rmsprop',metrics=['accuracy'])
    return model
estimator=KerasClassifier(build_fn=create_model,nb_epoch=100)
cv_scores=cross_val_score(estimator,all_features_scaled,all_classes,cv=10)
cv_scores.mean()