# Breast Cancer Diagnosis
----

In [181]:
import pandas as pd
import numpy as np
import scipy
import seaborn as sns
import matplotlib.pyplot as plt
import statsmodels.api as sm
from sklearn import ensemble
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from scipy import stats
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
stats.chisqprob = lambda chisq, df: stats.chi2.sf(chisq, df)
%matplotlib inline

In [199]:
df = pd.read_csv('./data/wdbc.csv')

In [200]:
print(df.shape)
df.head()

(569, 33)


Unnamed: 0,id,diagnosis,radius_mean,texture_mean,perimeter_mean,area_mean,smoothness_mean,compactness_mean,concavity_mean,concave points_mean,...,texture_worst,perimeter_worst,area_worst,smoothness_worst,compactness_worst,concavity_worst,concave points_worst,symmetry_worst,fractal_dimension_worst,Unnamed: 32
0,842302,M,17.99,10.38,122.8,1001.0,0.1184,0.2776,0.3001,0.1471,...,17.33,184.6,2019.0,0.1622,0.6656,0.7119,0.2654,0.4601,0.1189,
1,842517,M,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,...,23.41,158.8,1956.0,0.1238,0.1866,0.2416,0.186,0.275,0.08902,
2,84300903,M,19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,0.1279,...,25.53,152.5,1709.0,0.1444,0.4245,0.4504,0.243,0.3613,0.08758,
3,84348301,M,11.42,20.38,77.58,386.1,0.1425,0.2839,0.2414,0.1052,...,26.5,98.87,567.7,0.2098,0.8663,0.6869,0.2575,0.6638,0.173,
4,84358402,M,20.29,14.34,135.1,1297.0,0.1003,0.1328,0.198,0.1043,...,16.67,152.2,1575.0,0.1374,0.205,0.4,0.1625,0.2364,0.07678,


In [201]:
# DROP Unnamed: 32 AND id column
df = df.loc[:, ~df.columns.isin(['Unnamed: 32', 'id'])]

In [202]:
df.head()

Unnamed: 0,diagnosis,radius_mean,texture_mean,perimeter_mean,area_mean,smoothness_mean,compactness_mean,concavity_mean,concave points_mean,symmetry_mean,...,radius_worst,texture_worst,perimeter_worst,area_worst,smoothness_worst,compactness_worst,concavity_worst,concave points_worst,symmetry_worst,fractal_dimension_worst
0,M,17.99,10.38,122.8,1001.0,0.1184,0.2776,0.3001,0.1471,0.2419,...,25.38,17.33,184.6,2019.0,0.1622,0.6656,0.7119,0.2654,0.4601,0.1189
1,M,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,0.1812,...,24.99,23.41,158.8,1956.0,0.1238,0.1866,0.2416,0.186,0.275,0.08902
2,M,19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,0.1279,0.2069,...,23.57,25.53,152.5,1709.0,0.1444,0.4245,0.4504,0.243,0.3613,0.08758
3,M,11.42,20.38,77.58,386.1,0.1425,0.2839,0.2414,0.1052,0.2597,...,14.91,26.5,98.87,567.7,0.2098,0.8663,0.6869,0.2575,0.6638,0.173
4,M,20.29,14.34,135.1,1297.0,0.1003,0.1328,0.198,0.1043,0.1809,...,22.54,16.67,152.2,1575.0,0.1374,0.205,0.4,0.1625,0.2364,0.07678


In [203]:
df['diagnosis'].value_counts()

B    357
M    212
Name: diagnosis, dtype: int64

In [204]:
# Create diagnosisi catigorical variable
df['diagnosisCat'] = np.where(df['diagnosis'] == 'M', 1, 0)

In [205]:
X = df.loc[:, ~df.columns.isin(['diagnosis', 'diagnosisCat'])]
Y = df['diagnosisCat']

### Feature Selection

In [206]:
X_kb = SelectKBest(chi2, k=15)
X_kb.fit_transform(X, Y)

array([[1.799e+01, 1.038e+01, 1.228e+02, ..., 6.656e-01, 7.119e-01,
        2.654e-01],
       [2.057e+01, 1.777e+01, 1.329e+02, ..., 1.866e-01, 2.416e-01,
        1.860e-01],
       [1.969e+01, 2.125e+01, 1.300e+02, ..., 4.245e-01, 4.504e-01,
        2.430e-01],
       ...,
       [1.660e+01, 2.808e+01, 1.083e+02, ..., 3.094e-01, 3.403e-01,
        1.418e-01],
       [2.060e+01, 2.933e+01, 1.401e+02, ..., 8.681e-01, 9.387e-01,
        2.650e-01],
       [7.760e+00, 2.454e+01, 4.792e+01, ..., 6.444e-02, 0.000e+00,
        0.000e+00]])

In [207]:
idxs_selected  = X_kb.get_support(indices=True)
X_new = X.columns[idxs_selected]
print(X_new)

Index(['radius_mean', 'texture_mean', 'perimeter_mean', 'area_mean',
       'concavity_mean', 'radius_se', 'perimeter_se', 'area_se',
       'radius_worst', 'texture_worst', 'perimeter_worst', 'area_worst',
       'compactness_worst', 'concavity_worst', 'concave points_worst'],
      dtype='object')


In [208]:
X = X[X_new]
X.shape

(569, 15)

### Display correlation Matrix to features that need to be dropped

In [209]:
correlation_matrix = X.corr()
display(correlation_matrix)

Unnamed: 0,radius_mean,texture_mean,perimeter_mean,area_mean,concavity_mean,radius_se,perimeter_se,area_se,radius_worst,texture_worst,perimeter_worst,area_worst,compactness_worst,concavity_worst,concave points_worst
radius_mean,1.0,0.323782,0.997855,0.987357,0.676764,0.67909,0.674172,0.735864,0.969539,0.297008,0.965137,0.941082,0.413463,0.526911,0.744214
texture_mean,0.323782,1.0,0.329533,0.321086,0.302418,0.275869,0.281673,0.259845,0.352573,0.912045,0.35804,0.343546,0.27783,0.301025,0.295316
perimeter_mean,0.997855,0.329533,1.0,0.986507,0.716136,0.691765,0.693135,0.744983,0.969476,0.303038,0.970387,0.94155,0.455774,0.563879,0.771241
area_mean,0.987357,0.321086,0.986507,1.0,0.685983,0.732562,0.726628,0.800086,0.962746,0.287489,0.95912,0.959213,0.39041,0.512606,0.722017
concavity_mean,0.676764,0.302418,0.716136,0.685983,1.0,0.631925,0.660391,0.617427,0.688236,0.299879,0.729565,0.675987,0.754968,0.884103,0.861323
radius_se,0.67909,0.275869,0.691765,0.732562,0.631925,1.0,0.972794,0.95183,0.715065,0.194799,0.719684,0.751548,0.287103,0.380585,0.531062
perimeter_se,0.674172,0.281673,0.693135,0.726628,0.660391,0.972794,1.0,0.937655,0.697201,0.200371,0.721031,0.730713,0.341919,0.418899,0.554897
area_se,0.735864,0.259845,0.744983,0.800086,0.617427,0.95183,0.937655,1.0,0.757373,0.196497,0.761213,0.811408,0.283257,0.3851,0.538166
radius_worst,0.969539,0.352573,0.969476,0.962746,0.688236,0.715065,0.697201,0.757373,1.0,0.359921,0.993708,0.984015,0.47582,0.573975,0.787424
texture_worst,0.297008,0.912045,0.303038,0.287489,0.299879,0.194799,0.200371,0.196497,0.359921,1.0,0.365098,0.345842,0.360832,0.368366,0.359755


In [210]:
# Correlated features to be dropped
upper = correlation_matrix.where(np.triu(np.ones(correlation_matrix.shape), k=1).astype(np.bool))
to_drop = [column for column in upper.columns if any(upper[column] > 0.90)]
print(to_drop)

['perimeter_mean', 'area_mean', 'perimeter_se', 'area_se', 'radius_worst', 'texture_worst', 'perimeter_worst', 'area_worst']


In [211]:
cols = list(X.columns)

for col in to_drop: 
    cols.remove(col)

len(cols)

X = X[cols]

In [212]:
X.columns

Index(['radius_mean', 'texture_mean', 'concavity_mean', 'radius_se',
       'compactness_worst', 'concavity_worst', 'concave points_worst'],
      dtype='object')

## Random Forests

In [216]:
rfc = ensemble.RandomForestClassifier(n_estimators=100)

In [217]:
cvs = cross_val_score(rfc, X, Y, cv=10)

In [218]:
cvs

array([0.94827586, 0.89655172, 0.96491228, 0.94736842, 0.96491228,
       0.92982456, 0.9122807 , 0.98214286, 0.98214286, 0.98214286])

In [219]:
np.average(cvs)

0.9510554403249504

## Logistic Regression

In [220]:
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.25, random_state=42, stratify=Y)

In [221]:
X_train['intercept'] = 1
X_test['intercept'] = 1

# Declare and fit the model.
logit = sm.Logit(y_train, X_train)
result = logit.fit()

# Lots of information about the model and its coefficients, but the
# accuracy rate for predictions is missing.
print(result.summary())

Optimization terminated successfully.
         Current function value: 0.099964
         Iterations 10
                           Logit Regression Results                           
Dep. Variable:           diagnosisCat   No. Observations:                  426
Model:                          Logit   Df Residuals:                      418
Method:                           MLE   Df Model:                            7
Date:                Wed, 22 Aug 2018   Pseudo R-squ.:                  0.8487
Time:                        10:56:41   Log-Likelihood:                -42.585
converged:                       True   LL-Null:                       -281.44
                                        LLR p-value:                 4.951e-99
                           coef    std err          z      P>|z|      [0.025      0.975]
----------------------------------------------------------------------------------------
radius_mean              0.7985      0.198      4.028      0.000       0.410       1.18

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


In [224]:
# Calculate accuracy. First, get probability that each row will be admitted.
pred_statsmod = result.predict(X_test)

# Code admission as 1 if probability is greater than .5.
pred_y_statsmod = np.where(pred_statsmod < .5, 0, 1)

# Accuracy table.
table = pd.crosstab(y_test, pred_y_statsmod)

print('\nAccuracy by M status')
print(table)
print('\nPercentage accuracy')
print((table.iloc[0,0] + table.iloc[1,1]) / (table.sum().sum()))


Accuracy by M status
col_0          0   1
diagnosisCat        
0             89   1
1              3  50

Percentage accuracy
0.972027972027972


In [226]:
y_test.value_counts()

0    90
1    53
Name: diagnosisCat, dtype: int64

The Logistic Regression model seems to me slightly more accurate.

## SKlearn Logistic Regression

In [271]:
lr = LogisticRegression(C=1e5)

# Fit the model.
fit = lr.fit(X_train, y_train)

# Display.
print('Coefficients')
print(fit.coef_)
print(fit.intercept_)
pred_y_sklearn = lr.predict(X_test)

p_sklearn = np.where(pred_y_sklearn < .5, 0, 1)

print('\n Accuracy by admission status')
print(pd.crosstab(y_test, p_sklearn))

print('\n Percentage accuracy')
print(lr.score(X_test, y_test))

Coefficients
[[  0.79690068   0.43699271 -27.01714364   9.3270303   -9.77857558
   10.18294514  84.30435592 -16.23689663]]
[-16.23689663]

 Accuracy by admission status
col_0          0   1
diagnosisCat        
0             89   1
1              3  50

 Percentage accuracy
0.972027972027972
