# Breast Cancer Campaign Part 2

### Import Libraries

In [1]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.model_selection import cross_val_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix

### Import Data and Explore

In [2]:
bc = pd.read_csv('data_refined.csv')

In [3]:
bc.head()

Unnamed: 0,diagnosis,radius_mean,texture_mean,perimeter_mean,area_mean,smoothness_mean,compactness_mean,concavity_mean,concave points_mean,symmetry_mean,...,radius_worst,texture_worst,perimeter_worst,area_worst,smoothness_worst,compactness_worst,concavity_worst,concave points_worst,symmetry_worst,fractal_dimension_worst
0,1,1.097064,-2.073335,1.269934,0.984375,1.568466,3.283515,2.652874,2.532475,2.217515,...,1.88669,-1.359293,2.303601,2.001237,1.307686,2.616665,2.109526,2.296076,2.750622,1.937015
1,1,1.829821,-0.353632,1.685955,1.908708,-0.826962,-0.487072,-0.023846,0.548144,0.001392,...,1.805927,-0.369203,1.535126,1.890489,-0.375612,-0.430444,-0.146749,1.087084,-0.24389,0.28119
2,1,1.579888,0.456187,1.566503,1.558884,0.94221,1.052926,1.363478,2.037231,0.939685,...,1.51187,-0.023974,1.347475,1.456285,0.527407,1.082932,0.854974,1.955,1.152255,0.201391
3,1,-0.768909,0.253732,-0.592687,-0.764464,3.283553,3.402909,1.915897,1.451707,2.867383,...,-0.281464,0.133984,-0.249939,-0.550021,3.394275,3.893397,1.989588,2.175786,6.046041,4.93501
4,1,1.750297,-1.151816,1.776573,1.826229,0.280372,0.53934,1.371011,1.428493,-0.00956,...,1.298575,-1.46677,1.338539,1.220724,0.220556,-0.313395,0.613179,0.729259,-0.868353,-0.3971


In [4]:
bc.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 569 entries, 0 to 568
Data columns (total 31 columns):
diagnosis                  569 non-null int64
radius_mean                569 non-null float64
texture_mean               569 non-null float64
perimeter_mean             569 non-null float64
area_mean                  569 non-null float64
smoothness_mean            569 non-null float64
compactness_mean           569 non-null float64
concavity_mean             569 non-null float64
concave points_mean        569 non-null float64
symmetry_mean              569 non-null float64
fractal_dimension_mean     569 non-null float64
radius_se                  569 non-null float64
texture_se                 569 non-null float64
perimeter_se               569 non-null float64
area_se                    569 non-null float64
smoothness_se              569 non-null float64
compactness_se             569 non-null float64
concavity_se               569 non-null float64
concave points_se          569 no

### Feature Selection

In [5]:
bc.corr()

Unnamed: 0,diagnosis,radius_mean,texture_mean,perimeter_mean,area_mean,smoothness_mean,compactness_mean,concavity_mean,concave points_mean,symmetry_mean,...,radius_worst,texture_worst,perimeter_worst,area_worst,smoothness_worst,compactness_worst,concavity_worst,concave points_worst,symmetry_worst,fractal_dimension_worst
diagnosis,1.0,0.730029,0.415185,0.742636,0.708984,0.35856,0.596534,0.69636,0.776614,0.330499,...,0.776454,0.456903,0.782914,0.733825,0.421465,0.590998,0.65961,0.793566,0.416294,0.323872
radius_mean,0.730029,1.0,0.323782,0.997855,0.987357,0.170581,0.506124,0.676764,0.822529,0.147741,...,0.969539,0.297008,0.965137,0.941082,0.119616,0.413463,0.526911,0.744214,0.163953,0.007066
texture_mean,0.415185,0.323782,1.0,0.329533,0.321086,-0.023389,0.236702,0.302418,0.293464,0.071401,...,0.352573,0.912045,0.35804,0.343546,0.077503,0.27783,0.301025,0.295316,0.105008,0.119205
perimeter_mean,0.742636,0.997855,0.329533,1.0,0.986507,0.207278,0.556936,0.716136,0.850977,0.183027,...,0.969476,0.303038,0.970387,0.94155,0.150549,0.455774,0.563879,0.771241,0.189115,0.051019
area_mean,0.708984,0.987357,0.321086,0.986507,1.0,0.177028,0.498502,0.685983,0.823269,0.151293,...,0.962746,0.287489,0.95912,0.959213,0.123523,0.39041,0.512606,0.722017,0.14357,0.003738
smoothness_mean,0.35856,0.170581,-0.023389,0.207278,0.177028,1.0,0.659123,0.521984,0.553695,0.557775,...,0.21312,0.036072,0.238853,0.206718,0.805324,0.472468,0.434926,0.503053,0.394309,0.499316
compactness_mean,0.596534,0.506124,0.236702,0.556936,0.498502,0.659123,1.0,0.883121,0.831135,0.602641,...,0.535315,0.248133,0.59021,0.509604,0.565541,0.865809,0.816275,0.815573,0.510223,0.687382
concavity_mean,0.69636,0.676764,0.302418,0.716136,0.685983,0.521984,0.883121,1.0,0.921391,0.500667,...,0.688236,0.299879,0.729565,0.675987,0.448822,0.754968,0.884103,0.861323,0.409464,0.51493
concave points_mean,0.776614,0.822529,0.293464,0.850977,0.823269,0.553695,0.831135,0.921391,1.0,0.462497,...,0.830318,0.292752,0.855923,0.80963,0.452753,0.667454,0.752399,0.910155,0.375744,0.368661
symmetry_mean,0.330499,0.147741,0.071401,0.183027,0.151293,0.557775,0.602641,0.500667,0.462497,1.0,...,0.185728,0.090651,0.219169,0.177193,0.426675,0.4732,0.433721,0.430297,0.699826,0.438413


In [6]:
a = bc[bc.columns[0:]].corr()['diagnosis'][:]

result = a[abs(a)>0.5]
result

diagnosis               1.000000
radius_mean             0.730029
perimeter_mean          0.742636
area_mean               0.708984
compactness_mean        0.596534
concavity_mean          0.696360
concave points_mean     0.776614
radius_se               0.567134
perimeter_se            0.556141
area_se                 0.548236
radius_worst            0.776454
perimeter_worst         0.782914
area_worst              0.733825
compactness_worst       0.590998
concavity_worst         0.659610
concave points_worst    0.793566
Name: diagnosis, dtype: float64

In [7]:
bc2 = bc[['radius_mean','perimeter_mean','area_mean','compactness_mean','concavity_mean','concave points_mean',
          'radius_se','perimeter_se','area_se','radius_worst','perimeter_worst','area_worst','compactness_worst',
          'concavity_worst','concave points_worst']]

## Full Dataset

### Splitting the Data

In [8]:
X = bc.drop(['diagnosis'],axis=1)
y = bc['diagnosis']

In [9]:
# Split the data, 80% training, 10% test, 10% validation
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.2,random_state=0)
X_test, X_val, y_test, y_val = train_test_split(X_train,y_train,test_size=0.5,random_state=1)

### KNN Classifier

In [10]:
# Find the optimal value of K for KNN
k_range = range(1,15)
k_scores = []

for k in k_range:
    knn = KNeighborsClassifier(n_neighbors=k)
    scores = cross_val_score(knn,X,y,cv=10,scoring='accuracy')
    k_scores.append(scores.mean())

print(k_scores)
print('best k:',(np.asarray(k_scores,dtype=float)).argmax()-1)

[0.950745397977703, 0.9560722928009678, 0.9647523982369716, 0.9647837265577737, 0.9665694408434881, 0.9648453029124534, 0.9683551551292023, 0.9649382075879354, 0.9649079595540575, 0.9649392878748596, 0.9684480598046841, 0.9614294356581107, 0.966693673839772, 0.9596750496931984]
best k: 9


In [11]:
# Using k=9
model = KNeighborsClassifier(n_neighbors=9).fit(X_train,y_train)
score = model.score(X_test,y_test)
print('score ',score)
y_pred=model.predict(X_test)
print('accuracy_score: '+ str(accuracy_score(y_test,y_pred)))

y_pred = model.predict(X_test)
conf = confusion_matrix(y_test,y_pred)
print(conf)

score  0.986784140969163
accuracy_score: 0.986784140969163
[[141   1]
 [  2  83]]


### Random Forest Classifier

In [12]:
model = RandomForestClassifier(n_estimators=50,criterion='entropy',random_state=0).fit(X_train,y_train)
score = model.score(X_test,y_test)
print('accuracy score: ',score)
y_pred = model.predict(X_test)
conf = confusion_matrix(y_test,y_pred)
print(conf)

accuracy score:  1.0
[[142   0]
 [  0  85]]


### Support Vector Classifier

In [13]:
model = SVC(C=1.0,kernel='linear',gamma='auto',random_state=0).fit(X_train,y_train)
score = model.score(X_test,y_test)
print('accuracy score: ',score)
y_pred = model.predict(X_test)
conf = confusion_matrix(y_test,y_pred)
print(conf)

accuracy score:  0.9955947136563876
[[142   0]
 [  1  84]]


## Reduced Dataset

### Splitting the Data

In [14]:
X = bc2
y = bc['diagnosis']

In [15]:
# Split the data, 80% training, 10% test, 10% validation
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.2,random_state=0)
X_test, X_val, y_test, y_val = train_test_split(X_train,y_train,test_size=0.5,random_state=1)

### KNN Classifier

In [16]:
# Find the optimal value of K for KNN
k_range = range(1,15)
k_scores = []

for k in k_range:
    knn = KNeighborsClassifier(n_neighbors=k)
    scores = cross_val_score(knn,X,y,cv=10,scoring='accuracy')
    k_scores.append(scores.mean())

print(k_scores)
print('best k:',(np.asarray(k_scores,dtype=float)).argmax()-1)

[0.9210709964566588, 0.9228567107423731, 0.9262725779967159, 0.9404340592861464, 0.9351698211044852, 0.9387099213551118, 0.9404027309653443, 0.9369252873563217, 0.9422197735718607, 0.9422197735718605, 0.9368626307147178, 0.9333225304640912, 0.933352778497969, 0.9298753348889465]
best k: 7


In [17]:
# Using k=7
model = KNeighborsClassifier(n_neighbors=7).fit(X_train,y_train)
score = model.score(X_test,y_test)
print('score ',score)
y_pred=model.predict(X_test)
print('accuracy_score: '+ str(accuracy_score(y_test,y_pred)))

y_pred = model.predict(X_test)
conf = confusion_matrix(y_test,y_pred)
print(conf)

score  0.960352422907489
accuracy_score: 0.960352422907489
[[139   3]
 [  6  79]]


### Random Forest Classifier

In [18]:
model = RandomForestClassifier(n_estimators=50,criterion='entropy',random_state=0).fit(X_train,y_train)
score = model.score(X_test,y_test)
print('accuracy score: ',score)
y_pred = model.predict(X_test)
conf = confusion_matrix(y_test,y_pred)
print(conf)

accuracy score:  1.0
[[142   0]
 [  0  85]]


### Support Vector Classifier

In [19]:
model = SVC(C=1.0,kernel='linear',gamma='auto',random_state=0).fit(X_train,y_train)
score = model.score(X_test,y_test)
print('accuracy score: ',score)
y_pred = model.predict(X_test)
conf = confusion_matrix(y_test,y_pred)
print(conf)

accuracy score:  0.973568281938326
[[140   2]
 [  4  81]]


In [20]:
### Full dataset had better accuracy scores