In [None]:
import pandas as pd
import numpy as np
import math

import sklearn
from sklearn import linear_model
from sklearn import preprocessing
from sklearn import neighbors
from sklearn import ensemble


from sklearn.svm import SVC
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_predict

from sklearn.decomposition import PCA 
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import confusion_matrix




from scipy import stats

import time

from matplotlib import pyplot as plt
import seaborn as sns
%matplotlib inline
sns.set_style('white')

import plotly
import plotly.graph_objs as go

plotly.offline.init_notebook_mode(connected=True)
plotly.offline.init_notebook_mode() 


# Cancer diagnosis prediction
Source http://archive.ics.uci.edu/ml/datasets/Breast+Cancer+Wisconsin+%28Original%29 <br>
Challenge: Classification <br>
Using a set of breast cancer data, create a model to predict breast cancer. Also, which traits are most indicative of whether or not an individual will be diagnosed?

# Summary

The data contains more benign than malignant cases. Only a small number of values are missing (NA), so the affected rows can be dropped. There are a few outliers, which I decided to keep. When exploring the features against the outcome variable 'CLASS', it becomes obvious that the values for benign findings have, most of the time, a much smaller variance.<br>
I decided to generate two combination features: one that sums all cell features (UniCellSize, UniCellShape, MarginalAdhesion and SingleEpithelialCellSize) into CellFeatureSUM, and one that sums all nucleus findings (BareNuclei, BlandChromatin, NormalNucleoli and Mitoses) into NucleusFeatureSum. <br>
Random forest, KNN, logistic regression and support vector machines perform equally well on the dataset, with scores of over 0.95. To figure out which features are most valuable for these decisions, I analyse the scores when leaving out one feature at a time. The most important features for those four models overlap strongly. Finally, I try to improve the model performance by using only the top 3 features across all models, but surprisingly this does not improve the scores.


# Load Data

In [None]:
# Column labels for the table, in the order the columns appear in the file.
columnNames = [
    'id', 'Clump Thickness', 'UniCellSize', 'UniCellShape', 'MarginalAdhesion',
    'SingleEpithelialCellSize', 'BareNuclei', 'BlandChromatin', 'NormalNucleoli',
    'Mitoses', 'Class',
]

# NOTE(review): read_csv treats the first line of the file as a header by default;
# if this CSV has no header row, its first record is consumed as column names
# and then overwritten below -- confirm against the file.
raw = pd.read_csv('breast-cancer-wisconsin.data.csv')
raw.columns = columnNames
raw.head()

# Inspect Data

In [None]:
# Descriptive statistics of the loaded table.
raw.describe()

In [None]:
# Inspect the dtype pandas inferred for each column.
raw.dtypes

In [None]:
# There should only be numeric data here, so convert the column to numeric.
# errors='coerce' turns every entry that cannot be parsed as a number into NaN.
raw['BareNuclei']= pd.to_numeric(raw['BareNuclei'], errors='coerce')
# Re-check the dtypes after the conversion.
raw.dtypes

In [None]:
# Count the rows whose target 'Class' is missing.
class_is_missing = raw['Class'].isnull()
print('Sum of missing datapoints for Class', class_is_missing.sum())

In [None]:
# For every column, print its name followed by its number of missing values.
for column_name in raw.columns:
    n_missing = raw[column_name].isnull().sum()
    print(column_name)
    print(n_missing)

In [None]:
# Print the table shape before and after dropping every row that contains a NaN.
print(raw.shape)
nona=raw.dropna()
print(nona.shape)

Removed 16 rows with NA in BareNuclei.

In [None]:
# Recode the target: the source labels are 2 = benign and 4 = malignant;
# the model target CLASS uses 0 = benign and 1 = malignant
# (every label other than 2 is mapped to 1).
# assign() builds a new frame instead of writing a column into the result of
# dropna(), which avoids pandas' SettingWithCopyWarning; the cell stays safe to
# re-run because CLASS is simply recomputed from 'Class'.
nona = nona.assign(CLASS=np.where(nona['Class'] == 2, 0, 1))

# The original label and the record id are not model inputs.
data = nona.drop(['Class', 'id'], axis=1)
data.head()

# Explore Features 
## Explore variance

In [None]:
# Signature reminder: LOO_Scores(Name, input_df, y, model).
# Scores a default random forest classifier via LOO_Scores and keeps the returned list.
# NOTE(review): LOO_Scores, inputdata and target are defined in cells that appear
# further down in this file -- confirm the intended execution order.
rand_forest_class = ensemble.RandomForestClassifier()
RFCScores=LOO_Scores('Random Forest Classifier',inputdata, target, rand_forest_class)

## Most important features KNN

In [None]:
# Distance-weighted KNN with 5 neighbours, scored via LOO_Scores.
# NOTE(review): LOO_Scores, inputdata and target are defined in cells that appear
# further down in this file -- confirm the intended execution order.
knn_w = neighbors.KNeighborsClassifier(n_neighbors=5, weights='distance')
KNNcores=LOO_Scores('K-nearest Neighbor',inputdata, target, knn_w)

## Most important features Logit

In [None]:
# Logistic regression with a very large C (i.e. very weak regularisation), scored via LOO_Scores.
# NOTE(review): LOO_Scores, inputdata and target are defined in cells that appear
# further down in this file -- confirm the intended execution order.
logreg = linear_model.LogisticRegression(C=1e9)
logScores=LOO_Scores('Logistic Regression',inputdata, target, logreg)

## Most important features SVM

In [None]:
# Linear-kernel support vector classifier, scored via LOO_Scores.
# NOTE(review): LOO_Scores, inputdata and target are defined in cells that appear
# further down in this file -- confirm the intended execution order.
svm = SVC(kernel = 'linear') 
svmScores=LOO_Scores('Support vector machine',inputdata, target, svm)

## Compare Feature importance between models

In [None]:
# Combine the leave-one-feature-out scores of the four models into one table:
# one row per left-out feature (same order as inputdata.columns), one column per model.
Scores = {'RFC': RFCScores,'KNN':KNNcores,'LogReg':logScores,'SVM':svmScores}
# The row count follows the feature list instead of a hard-coded 11, so the cell
# keeps working when features are added to or removed from inputdata.
Scoreresults = pd.DataFrame(data=Scores, index=range(len(inputdata.columns)))

# Scale every model's scores to z-scores so they can be averaged across models.
Scores_sc = StandardScaler().fit_transform(Scoreresults.dropna())
# Keep the model names as column labels (the scaler returns a bare array).
Scores_sc_df = pd.DataFrame(data=Scores_sc, columns=Scoreresults.columns)
Scores_sc_df['mean_Imp'] = Scores_sc_df.mean(axis=1)
# Label each row with the feature that was left out, then sort by the mean scaled score.
Scores_sc_df.index = inputdata.columns
Scores_sc_df = Scores_sc_df.sort_values('mean_Imp')
Scores_sc_df.head()

In [None]:
# Turn the mean scaled scores into a single-row frame (one column per left-out
# feature), which is the wide layout the bar plot consumes.
meanImp = Scores_sc_df['mean_Imp']
meanImpt = meanImp.to_frame().T
meanImpt.columns = Scores_sc_df.index
meanImpt

In [None]:
# Horizontal bar chart of the mean scaled score per left-out feature.
fig, ax = plt.subplots(figsize=(15, 8))
im = sns.barplot(data=meanImpt, orient='h', ax=ax)
im.set(
    title='Scaled Scores after Feature Loss',
    ylabel='Feature left out',
    xlabel='Scaled Scores',
)
plt.show()

# Rerun models with 3 most important features

In [None]:
# Generate skinny (long-format) data: CLASS is kept as the identifier column and
# every other column is unpivoted into 'variable' / 'value' pairs.
dfs=pd.melt(data, id_vars=['CLASS'])
dfs.head()

In [None]:
# One box plot per feature, three panels per row.
# `height` is the name seaborn uses since 0.9 for what used to be `size`;
# the old keyword is no longer accepted by current seaborn releases.
g = sns.FacetGrid(dfs, col="variable", sharey=True, sharex=False, col_wrap=3, height=5, aspect=.5)
g = g.map(sns.boxplot, "value")
plt.show()

## Explore relation to target

# Generate new features

In [None]:
# Columns that enter each of the two combined features.
cell_columns = ['UniCellSize', 'UniCellShape', 'MarginalAdhesion', 'SingleEpithelialCellSize']
nucleus_columns = ['BareNuclei', 'BlandChromatin', 'NormalNucleoli', 'Mitoses']

# Row-wise sums; skipna=False keeps a NaN in any operand a NaN in the result,
# exactly as chained `+` does.
data['CellFeatureSUM'] = data[cell_columns].sum(axis=1, skipna=False)
data['NucleusFeatureSum'] = data[nucleus_columns].sum(axis=1, skipna=False)

# Long-format copy (CLASS as identifier) that now includes the two sums.
dfs2 = pd.melt(data, id_vars=['CLASS'])
dfs2.head()

In [None]:
# Compare each feature's distribution between the two classes (split violins).
fig, ax = plt.subplots(figsize=(10, 5))
ax = sns.violinplot(
    data=dfs2, x="variable", y="value", hue="CLASS",
    split=True, palette="muted", ax=ax,
)
# Tilt the feature names so they do not overlap.
plt.setp(ax.get_xticklabels(), rotation=45)
plt.show()

# Classification
## Functions

In [None]:
def runRFC_class(input_df, target, no_folds):
    """Cross-validate a default random forest classifier and print a short report.

    Parameters
    ----------
    input_df : feature table handed to ``cross_val_score`` as X.
    target : labels handed to ``cross_val_score`` as y.
    no_folds : forwarded to ``cross_val_score`` as ``cv``.

    Returns
    -------
    The per-fold score array produced by ``cross_val_score``.
    """
    t0 = time.time()
    fold_scores = cross_val_score(
        ensemble.RandomForestClassifier(), input_df, target, cv=no_folds
    )
    elapsed = time.time() - t0

    print('Time taken: {} seconds.'.format('%.3f' % elapsed))
    print('Average accuracy RFC: {}'.format('%.3f' % fold_scores.mean()))
    # ddof=1: sample standard deviation across the folds
    print('Standard deviation of accuracy: {}'.format('%.3f' % fold_scores.std(ddof=1)))
    return fold_scores

In [None]:
def runKNN_class(input_df, target,numNeigh, no_folds):
    """Cross-validate a distance-weighted k-nearest-neighbours classifier.

    Parameters
    ----------
    input_df : feature table handed to ``cross_val_score`` as X.
    target : labels handed to ``cross_val_score`` as y.
    numNeigh : number of neighbours (``n_neighbors``).
    no_folds : forwarded to ``cross_val_score`` as ``cv``.

    Returns
    -------
    The per-fold score array produced by ``cross_val_score``.
    """
    t0 = time.time()
    classifier = neighbors.KNeighborsClassifier(weights='distance', n_neighbors=numNeigh)
    fold_scores = cross_val_score(classifier, input_df, target, cv=no_folds)
    elapsed = time.time() - t0

    print('Time taken: {} seconds.'.format('%.3f' % elapsed))
    print('Average accuracy KNN with weights: {}'.format('%.3f' % fold_scores.mean()))
    # ddof=1: sample standard deviation across the folds
    print('Standard deviation of accuracy: {}'.format('%.3f' % fold_scores.std(ddof=1)))
    return fold_scores

In [None]:
# Logistic regression
# Background on reading feature importance from the coefficients:
# https://stackoverflow.com/questions/34052115/how-to-find-the-importance-of-the-features-for-a-logistic-regression-model
def runLogit(input_df,target,no_folds):
    """Fit and cross-validate a (practically unregularised) logistic regression.

    The model is first fitted on the complete data only to print its
    coefficients; the returned scores come from ``cross_val_score``.

    Parameters
    ----------
    input_df : feature table (X).
    target : labels (y).
    no_folds : forwarded to ``cross_val_score`` as ``cv``.

    Returns
    -------
    The per-fold score array produced by ``cross_val_score``.
    """
    t0 = time.time()

    # A very large C (1e9) leaves next to no L2 penalty on the coefficients.
    classifier = linear_model.LogisticRegression(C=1e9)
    classifier.fit(input_df, target)
    print('Coefficients Log Regression:',classifier.coef_)

    fold_scores = cross_val_score(classifier, input_df, target, cv=no_folds)
    elapsed = time.time() - t0

    print('Time taken: {} seconds.'.format('%.3f' % elapsed))
    print('Average accuracy: {}'.format('%.3f' % fold_scores.mean()))
    # ddof=1: sample standard deviation across the folds
    print('Standard deviation of accuracy: {}'.format('%.3f' % fold_scores.std(ddof=1)))
    return fold_scores

In [None]:
def runLogit_Ridge(input_df,target,lambd,no_folds):
    """Fit and cross-validate a ridge (L2-penalised linear) regression.

    ``linear_model.Ridge`` is a regressor, so the values returned by
    ``cross_val_score`` with its default scoring are R^2 values, not
    classification accuracies; the printed labels say so explicitly.

    Parameters
    ----------
    input_df : feature table (X).
    target : response (y).
    lambd : penalty strength, passed to ``Ridge`` as ``alpha``.
    no_folds : forwarded to ``cross_val_score`` as ``cv``.

    Returns
    -------
    The per-fold score array produced by ``cross_val_score``.
    """
    start_time=time.time()

    ridge = linear_model.Ridge(alpha=lambd)  # alpha is the L2 penalty strength
    # Fit on the complete data only to show the coefficients.
    ridge.fit(input_df, target)
    print('Coefficients Ridge Regression:',ridge.coef_)

    cvs = cross_val_score(ridge, input_df, target, cv=no_folds)

    print('Time taken: {} seconds.'.format('%.3f' % (time.time() - start_time)))
    print('Average R^2: {}'.format('%.3f' % cvs.mean()))
    # ddof=1: sample standard deviation across the folds
    print('Standard deviation of R^2: {}'.format('%.3f' % np.std(cvs, ddof=1)))
    return(cvs)

In [None]:
def runLogit_Lasso(input_df,target,lambd,no_folds):
    """Fit and cross-validate a lasso (L1-penalised linear) regression.

    ``linear_model.Lasso`` is a regressor, so the values returned by
    ``cross_val_score`` with its default scoring are R^2 values, not
    classification accuracies; the printed labels say so explicitly.

    Parameters
    ----------
    input_df : feature table (X).
    target : response (y).
    lambd : penalty strength, passed to ``Lasso`` as ``alpha``.
    no_folds : forwarded to ``cross_val_score`` as ``cv``.

    Returns
    -------
    The per-fold score array produced by ``cross_val_score``.
    """
    start_time=time.time()

    lasso = linear_model.Lasso(alpha=lambd)  # alpha is the L1 penalty strength
    # Fit on the complete data only to show the coefficients.
    lasso.fit(input_df, target)
    print('Coefficients Lasso Regression:',lasso.coef_)

    cvs = cross_val_score(lasso, input_df, target, cv=no_folds)

    print('Time taken: {} seconds.'.format('%.3f' % (time.time() - start_time)))
    print('Average R^2: {}'.format('%.3f' % cvs.mean()))
    # ddof=1: sample standard deviation across the folds
    print('Standard deviation of R^2: {}'.format('%.3f' % np.std(cvs, ddof=1)))
    return(cvs)

In [None]:
# Support vector machine used as a classifier.

def runSVM(input_df,target,no_folds):
    """Cross-validate a linear-kernel support vector classifier.

    Parameters
    ----------
    input_df : feature table handed to ``cross_val_score`` as X.
    target : labels handed to ``cross_val_score`` as y.
    no_folds : forwarded to ``cross_val_score`` as ``cv``.

    Returns
    -------
    The per-fold score array produced by ``cross_val_score``.
    """
    t0 = time.time()
    fold_scores = cross_val_score(SVC(kernel='linear'), input_df, target, cv=no_folds)
    elapsed = time.time() - t0

    print('Time taken: {} seconds.'.format('%.3f' % elapsed))
    print('Average accuracy: {}'.format('%.3f' % fold_scores.mean()))
    # ddof=1: sample standard deviation across the folds
    print('Standard deviation of accuracy: {}'.format('%.3f' % fold_scores.std(ddof=1)))
    return fold_scores

In [None]:
# Function to run all classifier models at once and plot scores
def predictCLASS(input_df, target,numNeigh,lowestalpha, no_folds):
    """Run every model of this notebook on the same data and box-plot the fold scores.

    Models: random forest, KNN, logistic regression, linear SVM, and ridge and
    lasso regression at four penalty strengths (lowestalpha x 1, 10, 100, 1000).
    Each runner prints its own report.

    Parameters
    ----------
    input_df : feature table (X).
    target : labels (y).
    numNeigh : number of neighbours for KNN.
    lowestalpha : smallest penalty strength tried for ridge and lasso.
    no_folds : number of cross-validation folds, used for every model
        (an int; it also sizes the index of the score table).

    Returns
    -------
    None. The box plot is drawn on a new figure.
    """
    print('Random Forest:')
    RFC = runRFC_class(input_df, target, no_folds)
    print()
    print('K-Nearest Neighbors:')
    KNN = runKNN_class(input_df, target,numNeigh, no_folds)
    print()
    print('Logistic Regression')
    Logit=runLogit(input_df,target,no_folds)
    print()
    print('Suport Vector Maschine')
    SVM=runSVM(input_df,target,no_folds)

    Scores = {'RFC': RFC,'KNN':KNN, 'Logit':Logit,'SVM':SVM}

    # (column-label suffix, multiplier applied to lowestalpha)
    alpha_steps = [('L', 1), ('10xL', 10), ('100xL', 100), ('1000xL', 1000)]
    # Every penalised run is stored under its own family's key and uses the
    # caller's fold count, so each score column has exactly no_folds entries.
    for family, runner in (('Ridge', runLogit_Ridge), ('Lasso', runLogit_Lasso)):
        print()
        print('{} Regression'.format(family))
        for suffix, factor in alpha_steps:
            key = '{}_{}'.format(family, suffix)
            Scores[key] = runner(input_df, target, lowestalpha * factor, no_folds)

    Scoreresults = pd.DataFrame(data=Scores, index=(range(no_folds)))

    # Make a boxplot for comparison
    fig, ax = plt.subplots()
    fig.set_size_inches(5,5)
    im = sns.boxplot(data=Scoreresults, ax=ax)
    plt.setp(ax.get_xticklabels(), rotation=45)
    im.set_title('Scores of Classification Models')
    im.set_ylabel('Scores')
    im.set_xlabel('Models tried')

## Prepare data for modeling

In [None]:
# Standardise every column to z-scores for modelling and wrap the resulting
# array back into a DataFrame with the original column names.
# NOTE(review): `data` still contains CLASS at this point, so the target column
# is standardised together with the features.
print(data.shape)
data_sc = pd.DataFrame(
    StandardScaler().fit_transform(data.dropna()),
    columns=data.columns,
).reset_index(drop=True)
print(data_sc.shape)
data_sc.head()

In [None]:
# Define features and target for the classifiers.
# The labels are taken from the unscaled table: data_sc['CLASS'] holds z-scores,
# and truncating those to integers yields 0/1 only by accident of the class
# balance. data.dropna() has the same rows, in the same order, as the frame that
# was scaled, so resetting its index aligns it with data_sc.
target = data.dropna()['CLASS'].reset_index(drop=True).astype('int64')
# The features stay as float z-scores; casting them to int64 would truncate
# every value towards zero and discard most of their resolution.
inputdata = data_sc.drop('CLASS', axis=1)

## Model data

In [None]:
# Signature reminder: predictCLASS(input_df, target, numNeigh, lowestalpha, no_folds).
# Here: all features, numNeigh=5, lowestalpha=0.1, no_folds=10.
predictCLASS(inputdata, target, 5, 0.1, 10)

# Extract most important features

The best performing models are RFC, KNN, Logit and SVM, all with accuracies above 0.9. Let's see which features carry the highest importance for all of those models.

In [None]:
def LOO_Scores(Name,input_df, y,model):
    """Score `model` with each feature left out in turn and plot the results.

    For every column of `input_df` the model is cross-validated on the table
    without that column; the mean cross-validation score is printed and
    collected. The scores are then drawn as a horizontal bar chart, sorted
    ascending.

    Parameters
    ----------
    Name : text printed above the plot.
    input_df : feature table; one run per column.
    y : labels used for every cross-validation run.
    model : estimator handed to ``cross_val_score``.

    Returns
    -------
    list of mean scores, in the column order of `input_df`.
    """
    print('Feature       Accuracy')
    FeatImp=[]
    for C in input_df.columns:
        X = input_df.drop(C, axis=1)
        # Score against the labels passed in as `y`, not a notebook-level global.
        mean_score = cross_val_score(model, X, y).mean()
        FeatImp.append(mean_score)
        print(C,mean_score)

    # Convert FeatImp into a plottable single-row frame (one column per feature)
    FeatImpdft=pd.DataFrame(FeatImp).transpose()
    FeatImpdft.columns=input_df.columns

    # Sort columns by their mean values
    Impdf=FeatImpdft.reindex(FeatImpdft.mean().sort_values().index, axis=1)

    # Make a barplot for the importances
    print(Name)
    fig, ax = plt.subplots()
    fig.set_size_inches(15,8)
    im = sns.barplot(data=Impdf, ax=ax, orient='h')
    im.set_title('Score, when feature is left out')
    im.set_ylabel('Feature left out')
    im.set_xlabel('Scores')
    # Start the axis just below the worst score so small differences stay visible.
    ax.set_xlim(min(FeatImp)-0.05,1)
    plt.show()

    return(FeatImp)

## Most important features RFC

In [None]:
# Compare each feature's distribution between the two classes (split violins).
fig, ax = plt.subplots(figsize=(10, 5))
ax = sns.violinplot(
    data=dfs, x="variable", y="value", hue="CLASS",
    split=True, palette="muted", ax=ax,
)
# Tilt the feature names so they do not overlap.
plt.setp(ax.get_xticklabels(), rotation=45)
plt.show()

## Explore relation between features

In [None]:
# To compare the correlation between features, we need to add some jitter as the data is categorical
def rand_jitter(arr, scale=.01):
    """Return `arr` plus Gaussian noise proportional to its value range.

    Parameters
    ----------
    arr : one-dimensional numeric array or Series.
    scale : noise standard deviation as a fraction of the range
        (max - min) of `arr`; defaults to 1 %.

    Returns
    -------
    `arr` with independent normal noise added to every element. NaN
    entries stay NaN but no longer affect the noise level.
    """
    # nanmax/nanmin ignore NaNs; the builtin max()/min() give order-dependent
    # results when NaNs are present and iterate element by element in Python.
    stdev = scale * (np.nanmax(arr) - np.nanmin(arr))
    return arr + np.random.randn(len(arr)) * stdev

# Jittered copy of every column of `data`, built column by column in the
# original column order.
jitterdata = pd.DataFrame({column: rand_jitter(data[column]) for column in data.columns})
print(jitterdata.head())

In [None]:
# Print the shape before and after dropna() to see whether the jittered table contains NaNs.
print(jitterdata.shape)
print(jitterdata.dropna().shape)
# Pairwise scatter/histogram grid, left disabled on purpose:
# Don't run this it takes forever
#g = sns.PairGrid(jitterdata.dropna())
#g.map_diag(plt.hist)
#g.map_offdiag(plt.scatter)
#g.add_legend()

In [None]:
# Define target and the three selected features for the reduced models.
# The labels are taken from the unscaled table: data_sc['CLASS'] holds z-scores,
# and truncating those to integers yields 0/1 only by accident of the class
# balance. data.dropna() has the same rows, in the same order, as the frame that
# was scaled, so resetting its index aligns it with data_sc.
targetbf = data.dropna()['CLASS'].reset_index(drop=True).astype('int64')
# The features stay as float z-scores; casting them to int64 would truncate
# every value towards zero and discard most of their resolution.
inputdatabf = data_sc[['BareNuclei', 'NormalNucleoli', 'Clump Thickness']]

In [None]:
# Signature reminder: predictCLASS(input_df, target, numNeigh, lowestalpha, no_folds).
# Here: the three selected features, numNeigh=5, lowestalpha=0.1, no_folds=10.
predictCLASS(inputdatabf, targetbf, 5, 0.1, 10)

# Determine sensitivity and specificity 