In [70]:
# Load all relevant Datasets

In [71]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [72]:
df = pd.read_csv('Breast_cancer_data.csv') 

In [73]:
# Preliminary Data Exploration

In [74]:
df.head()

Unnamed: 0,mean_radius,mean_texture,mean_perimeter,mean_area,mean_smoothness,diagnosis
0,17.99,10.38,122.8,1001.0,0.1184,0
1,20.57,17.77,132.9,1326.0,0.08474,0
2,19.69,21.25,130.0,1203.0,0.1096,0
3,11.42,20.38,77.58,386.1,0.1425,0
4,20.29,14.34,135.1,1297.0,0.1003,0


In [75]:
df.isnull().sum() 

mean_radius        0
mean_texture       0
mean_perimeter     0
mean_area          0
mean_smoothness    0
diagnosis          0
dtype: int64

In [76]:
df.dtypes 

mean_radius        float64
mean_texture       float64
mean_perimeter     float64
mean_area          float64
mean_smoothness    float64
diagnosis            int64
dtype: object

In [77]:
df.corr() 

Unnamed: 0,mean_radius,mean_texture,mean_perimeter,mean_area,mean_smoothness,diagnosis
mean_radius,1.0,0.323782,0.997855,0.987357,0.170581,-0.730029
mean_texture,0.323782,1.0,0.329533,0.321086,-0.023389,-0.415185
mean_perimeter,0.997855,0.329533,1.0,0.986507,0.207278,-0.742636
mean_area,0.987357,0.321086,0.986507,1.0,0.177028,-0.708984
mean_smoothness,0.170581,-0.023389,0.207278,0.177028,1.0,-0.35856
diagnosis,-0.730029,-0.415185,-0.742636,-0.708984,-0.35856,1.0


In [78]:
# Split the dataset for training and testing

In [79]:
from sklearn.model_selection import train_test_split

In [80]:
X = np.asanyarray(df[['mean_radius','mean_perimeter','mean_area']])
y= np.asanyarray(df[['diagnosis']])

In [81]:
X_train, X_test, y_train, y_test = train_test_split( X, y, test_size=0.2, random_state=4 ) 

In [82]:
# Creating and running models

In [83]:
# 1) Logisitic Regression

In [84]:
from sklearn.linear_model import LogisticRegression

In [85]:
LR = LogisticRegression(C=0.01, solver='liblinear')

In [86]:
LR.fit(X_train, y_train)

  y = column_or_1d(y, warn=True)


LogisticRegression(C=0.01, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='liblinear', tol=0.0001, verbose=0,
                   warm_start=False)

In [87]:
y_hat = LR.predict(X_test)

In [88]:
y_hat_prob = LR.predict_proba(X_test)
y_hat_prob

array([[4.21109231e-01, 5.78890769e-01],
       [7.01321664e-02, 9.29867834e-01],
       [9.99772152e-01, 2.27848314e-04],
       [9.83190115e-01, 1.68098852e-02],
       [9.27561925e-01, 7.24380750e-02],
       [9.64946045e-01, 3.50539552e-02],
       [5.74735303e-02, 9.42526470e-01],
       [3.88785962e-02, 9.61121404e-01],
       [4.16729487e-01, 5.83270513e-01],
       [1.57761139e-02, 9.84223886e-01],
       [9.99200861e-01, 7.99139460e-04],
       [1.98091827e-01, 8.01908173e-01],
       [9.72062297e-02, 9.02793770e-01],
       [4.22394757e-02, 9.57760524e-01],
       [2.97451169e-01, 7.02548831e-01],
       [3.11009918e-02, 9.68899008e-01],
       [5.88663310e-02, 9.41133669e-01],
       [1.15430193e-01, 8.84569807e-01],
       [3.09763206e-02, 9.69023679e-01],
       [9.03911544e-02, 9.09608846e-01],
       [6.10463806e-02, 9.38953619e-01],
       [1.77513546e-01, 8.22486454e-01],
       [8.37537476e-01, 1.62462524e-01],
       [2.34751021e-01, 7.65248979e-01],
       [4.416553

In [89]:
from sklearn.metrics import jaccard_similarity_score
from sklearn.metrics import accuracy_score

In [90]:
jaccard_similarity_score(y_test,y_hat)



0.8596491228070176

In [91]:
# 2) K-Nearest Nieghbor

In [92]:
from sklearn.neighbors import KNeighborsClassifier

In [93]:
for k in range(1,10):
    neigh = KNeighborsClassifier( n_neighbors = k).fit(X_train, y_train)
    y_hat_knn = neigh.predict(X_test)
    print(jaccard_similarity_score(y_test, y_hat_knn))

0.8508771929824561
0.7631578947368421
0.8245614035087719
0.7894736842105263
0.8508771929824561
0.8157894736842105
0.8333333333333334
0.8245614035087719
0.8508771929824561


  
  
  
  
  
  
  
  
  


In [94]:
# 3) Support Vector Machines

In [95]:
from sklearn import svm

In [96]:
clf = svm.SVC(kernel='rbf')

In [97]:
clf.fit(X_train, y_train)

  y = column_or_1d(y, warn=True)


SVC(C=1.0, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='scale', kernel='rbf',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False)

In [98]:
y_hat_svm = clf.predict(X_test)

In [99]:
y_hat_svm[:5]

array([1, 1, 0, 0, 0])

In [100]:
jaccard_similarity_score(y_test, y_hat_svm)



0.8771929824561403

In [101]:
# 4) Decision Trees

In [102]:
from sklearn.tree import DecisionTreeClassifier

In [103]:
Tree = DecisionTreeClassifier(criterion='entropy', max_depth=4)

In [104]:
Tree.fit(X_train,y_train)

DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='entropy',
                       max_depth=4, max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort='deprecated',
                       random_state=None, splitter='best')

In [105]:
y_hat_tree = Tree.predict(X_test)
y_hat_tree[:5]

array([1, 1, 0, 0, 0])

In [106]:
jaccard_similarity_score(y_test,y_hat_tree)



0.8596491228070176

In [107]:
# Out of all our models, SVM works the best with a 87.719% accuracy.