In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import statsmodels.api as sm
import matplotlib.pyplot as plt
from sklearn import svm
from sklearn.model_selection import train_test_split
from sklearn import metrics
pd.options.display.max_columns = 999

  from pandas.core import datetools


In [None]:
# Encode the diagnosis labels as numeric category codes and store them as float64.
# Category codes follow sorted label order, so with labels 'B' and 'M' this gives B -> 0, M -> 1.
df['diagnosis'] = (
    df['diagnosis']
    .astype('category')
    .cat.codes
    .astype('float64')
)


In [None]:
# target vector: the encoded diagnosis column we are predicting
y = df['diagnosis']
# feature matrix: every column except the target and 'Unnamed: 32'
# (the original notebook describes 'Unnamed: 32' as a NaN column)
X = df.drop(labels=['diagnosis', 'Unnamed: 32'], axis='columns')

In [None]:
# fraction of the dataset held out for testing (0.33 = roughly a third of the rows);
# passed as `test_size` to train_test_split in later cells
test_size = 0.33

In [None]:
# Split the full feature matrix into train/test parts.
# `test_size` of the rows are held out; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=test_size, random_state=42
)

In [None]:
# make predictions for test data and evaluate
# NOTE(review): `test` is not defined in any visible cell. It is presumably a
# fitted estimator (e.g. `test = clf.fit(X_train, y_train)`); confirm that the
# model-fitting cell exists and runs before this one.
y_pred = test.predict(X_test)
# rounding maps any fractional outputs to the nearest integer label
predictions = [round(value) for value in y_pred]
# only the `metrics` module is imported, so accuracy_score must be reached through it
accuracy = metrics.accuracy_score(y_test, predictions)
print("Accuracy: %.2f%%" % (accuracy * 100.0))

In [None]:
# Data description with a swarm plot: every standardized feature, coloured by diagnosis.
sns.set(style='whitegrid', palette='muted')
# z-score each feature column of X so all features share one value axis
x_n = (X - X.mean()) / X.std()
# put the target next to the standardized features, then reshape to long form:
# one row per (sample, feature) pair with columns diagnosis / features / value
data = pd.concat([y, x_n], axis=1)
data = pd.melt(data, id_vars='diagnosis', var_name='features', value_name='value')
plt.figure(figsize=(30, 10))
sns.swarmplot(x='features', y='value', hue='diagnosis', data=data)
plt.xticks(rotation=90)
plt.show()

In [None]:
# Check pairwise feature correlation for multicollinearity.
corr = X.corr()
fig, ax = plt.subplots(figsize=(18, 18))
# annotate each cell with the correlation rounded to one decimal
sns.heatmap(corr, annot=True, fmt='.1f', ax=ax)
plt.show()

In [None]:
# Select features: remove the columns listed below from the full feature matrix X.
# NOTE(review): presumably these were chosen from the correlation heatmap as
# redundant (highly correlated) features; confirm.
drop_list = [
    'perimeter_mean', 'radius_mean', 'compactness_mean', 'concave points_mean',
    'radius_se', 'perimeter_se', 'compactness_se', 'concave points_se',
    'radius_worst', 'perimeter_worst', 'compactness_worst', 'concave points_worst',
    'area_worst', 'texture_worst',
]
x_1 = X.drop(drop_list, axis=1)

In [None]:
# Recursive feature elimination with cross-validation, using a random forest.
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import RFECV

# RFECV needs a training split that exists when this cell runs; no earlier cell
# defines a lowercase `x_train`. Split the reduced feature set x_1 here under
# dedicated names so the X_train / y_train of other cells are not overwritten.
# NOTE(review): x_1 is assumed to be the intended input because the next cell
# derives x_2 from x_1; confirm.
x_rfe_train, x_rfe_test, y_rfe_train, y_rfe_test = train_test_split(
    x_1, y, test_size=test_size, random_state=42
)

# seeded so the selected feature set (and later fits of clf_2) are reproducible
clf_2 = RandomForestClassifier(n_estimators=100, random_state=42)
# drop one feature per step, scoring each subset by 5-fold cross-validated accuracy
rfecv = RFECV(estimator=clf_2, step=1, cv=5, scoring='accuracy')
rfecv = rfecv.fit(x_rfe_train, y_rfe_train)

print('optimal number of features: ', rfecv.n_features_)
print('Best features: ', x_rfe_train.columns[rfecv.support_])

In [None]:
# Remove three more features from x_1 and re-split.
# NOTE(review): presumably these are the features RFECV did not select; confirm
# against the RFECV output.
drop_list_2 = ['symmetry_mean', 'texture_se', 'smoothness_se']
x_2 = x_1.drop(drop_list_2, axis=1)
# Fixed seed: without it every run produces a different split, so the reported
# accuracy is not reproducible. The same seed is used by the other splits in this notebook.
x_train, x_test, y_train, y_test = train_test_split(
    x_2, y, test_size=test_size, random_state=42
)

In [None]:

# Fit the random forest on the current training split and score it on the held-out rows.
test_2 = clf_2.fit(x_train, y_train)
y_pred_2 = test_2.predict(x_test)
# round every predicted value to the nearest integer label
predictions_2 = list(map(round, y_pred_2))
accuracy = metrics.accuracy_score(y_test, predictions_2)
print("Accuracy: %.2f%%" % (accuracy * 100.0))

In [None]:
# Feature importance of the fitted random forest, most important first.
# Label each importance with its column name instead of a bare position;
# x_train holds the columns clf_2 was last fitted on (the fitting cells set both together).
featimp = pd.Series(
    clf_2.feature_importances_, index=x_train.columns
).sort_values(ascending=False)
print(featimp)

In [None]:
# Use only the four features the original notebook labels 3, 1, 5, 10
# (presumably the top entries of the importance ranking; confirm).
x_3 = df[['area_mean', 'concavity_mean', 'area_se', 'concavity_worst']]
# reuse the shared `test_size` constant instead of repeating the literal 0.33
x_train, x_test, y_train, y_test = train_test_split(
    x_3, y, test_size=test_size, random_state=42
)
# fit() refits clf_2 in place and returns it, so test_3 refers to the same estimator
test_3 = clf_2.fit(x_train, y_train)
y_pred_3 = test_3.predict(x_test)
predictions_3 = [round(value) for value in y_pred_3]
accuracy = metrics.accuracy_score(y_test, predictions_3)
print("Accuracy: %.2f%%" % (accuracy * 100.0))
# recall of the positive class (label 1) is reported as sensitivity
recall = metrics.recall_score(y_test, predictions_3)
print("Sensitivity: %.2f%%" % (recall * 100.0))

In [None]:
# K-nearest-neighbours classifier (k = 5) on the current train/test split.

from sklearn.neighbors import KNeighborsClassifier
knn = KNeighborsClassifier(n_neighbors=5)
knn.fit(x_train, y_train)
y_pred4 = knn.predict(x_test)
# round every predicted value to the nearest integer label
predictions4 = list(map(round, y_pred4))
accuracy = metrics.accuracy_score(y_test, predictions4)
print("Accuracy: %.2f%%" % (accuracy * 100.0))
# recall of the positive class (label 1) is reported as sensitivity
recall = metrics.recall_score(y_test, predictions4)
print("Sensitivity: %.2f%%" % (recall * 100.0))

In [None]:
# ROC curve and AUC for the Random Forest.
# Score with the predicted probability of class 1 rather than the hard 0/1
# predictions: thresholded labels collapse the ROC curve to a single operating point.
rf_scores = test_3.predict_proba(x_test)[:, 1]
fpr, tpr, threshold = metrics.roc_curve(y_test, rf_scores)
roc_auc = metrics.auc(fpr, tpr)

plt.title('Area Under the Curve Random Forest')
plt.plot(fpr, tpr, 'b', label = 'AUC = %0.2f' % roc_auc)
plt.legend(loc = 'lower right')
# dashed diagonal = chance-level classifier
plt.plot([0, 1], [0, 1],'r--')
plt.xlim([0, 1])
plt.ylim([0, 1])
plt.ylabel('True Positive Rate')
plt.xlabel('False Positive Rate')
plt.show()

In [None]:
# ROC curve and AUC for KNN.
# Score with the predicted probability of class 1 rather than the hard 0/1
# predictions: thresholded labels collapse the ROC curve to a single operating point.
knn_scores = knn.predict_proba(x_test)[:, 1]
fpr_2, tpr_2, threshold = metrics.roc_curve(y_test, knn_scores)
roc_auc_2 = metrics.auc(fpr_2, tpr_2)

plt.title('Area Under the Curve KNN')
plt.plot(fpr_2, tpr_2, 'b', label = 'AUC = %0.2f' % roc_auc_2)
plt.legend(loc = 'lower right')
# dashed diagonal = chance-level classifier
plt.plot([0, 1], [0, 1],'r--')
plt.xlim([0, 1])
plt.ylim([0, 1])
plt.ylabel('True Positive Rate')
plt.xlabel('False Positive Rate')
plt.show()

In [None]:
# Linear-kernel SVM trained on the full feature set.
# NOTE(review): X_train / X_test come from the first split, while y_train / y_test
# were last reassigned by a later split. The rows only stay aligned because those
# splits use the same data size, test fraction and seed; confirm before changing any of them.
clf_1 = svm.SVC(kernel = 'linear')
test_1 = clf_1.fit(X_train, y_train)

# predict with the model fitted above (no `clf` exists in the visible cells)
y_pred1 = clf_1.predict(X_test)
predictions1 = [round(value) for value in y_pred1]
accuracy = metrics.accuracy_score(y_test, predictions1)
print("Accuracy: %.2f%%" % (accuracy * 100.0))
# recall of the positive class (label 1) is reported as sensitivity
recall = metrics.recall_score(y_test, predictions1)
print("Sensitivity: %.2f%%" % (recall * 100.0))

In [None]:
# ROC curve and AUC for the SVM.
# Score with the signed distance to the separating hyperplane rather than the
# hard 0/1 predictions: thresholded labels collapse the ROC curve to a single
# operating point. (SVC exposes decision_function without probability=True.)
svm_scores = clf_1.decision_function(X_test)
fpr_3, tpr_3, threshold = metrics.roc_curve(y_test, svm_scores)
roc_auc_3 = metrics.auc(fpr_3, tpr_3)

# title previously said "KNN" (copy-paste from the KNN cell)
plt.title('Area Under the Curve SVM')
plt.plot(fpr_3, tpr_3, 'b', label = 'AUC = %0.2f' % roc_auc_3)
plt.legend(loc = 'lower right')
# dashed diagonal = chance-level classifier
plt.plot([0, 1], [0, 1],'r--')
plt.xlim([0, 1])
plt.ylim([0, 1])
plt.ylabel('True Positive Rate')
plt.xlabel('False Positive Rate')
plt.show()