## Breast  Cancer Classifier

### Prepares the Data
* Reads the data
* Deletes the Index Column
* Creates columns for the unlabeled data
* Renamed the first column to TYPE

In [116]:
import pandas as pd

In [117]:
data = pd.read_csv('wdbc_data.csv', index_col=0).reset_index()
data.columns = [str(x) for x in range(0,32)]
dt = data.drop('0', axis=1)
dt = dt.rename(columns = {'1':'TYPE'})
dt.head()

Unnamed: 0,TYPE,2,3,4,5,6,7,8,9,10,...,22,23,24,25,26,27,28,29,30,31
0,M,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,0.1812,...,24.99,23.41,158.8,1956.0,0.1238,0.1866,0.2416,0.186,0.275,0.08902
1,M,19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,0.1279,0.2069,...,23.57,25.53,152.5,1709.0,0.1444,0.4245,0.4504,0.243,0.3613,0.08758
2,M,11.42,20.38,77.58,386.1,0.1425,0.2839,0.2414,0.1052,0.2597,...,14.91,26.5,98.87,567.7,0.2098,0.8663,0.6869,0.2575,0.6638,0.173
3,M,20.29,14.34,135.1,1297.0,0.1003,0.1328,0.198,0.1043,0.1809,...,22.54,16.67,152.2,1575.0,0.1374,0.205,0.4,0.1625,0.2364,0.07678
4,M,12.45,15.7,82.57,477.1,0.1278,0.17,0.1578,0.08089,0.2087,...,15.47,23.75,103.4,741.6,0.1791,0.5249,0.5355,0.1741,0.3985,0.1244


### Standardize the Variables

Because the KNN classifier predicts the class of a given test observation by identifying the observations that are nearest to it, the scale of the variables matters. Any variables that are on a large scale will have a much larger effect on the distance between the observations, and hence on the KNN classifier, than variables that are on a small scale.

In [118]:
from sklearn.preprocessing import StandardScaler

In [119]:
scaler = StandardScaler()

In [120]:
scaler.fit(dt.drop('TYPE', axis=1))

StandardScaler(copy=True, with_mean=True, with_std=True)

In [121]:
scaled_features = scaler.transform(dt.drop('TYPE', axis=1))

In [122]:
df_feat = pd.DataFrame(scaled_features,columns=dt.columns[1:])
df_feat.head()

Unnamed: 0,2,3,4,5,6,7,8,9,10,11,...,22,23,24,25,26,27,28,29,30,31
0,1.832084,-0.358327,1.689106,1.910392,-0.825266,-0.485498,-0.019278,0.555261,0.005315,-0.867817,...,1.813349,-0.371875,1.545063,1.899054,-0.373544,-0.428051,-0.143472,1.095262,-0.240444,0.285294
1,1.582106,0.45386,1.56959,1.560576,0.946192,1.06796,1.375494,2.051509,0.946867,-0.395465,...,1.518626,-0.026387,1.356695,1.463694,0.530042,1.093192,0.861314,1.966468,1.163858,0.205301
2,-0.767117,0.250813,-0.590772,-0.762711,3.290559,3.438483,1.930879,1.46317,2.881262,4.932714,...,-0.278773,0.13169,-0.246823,-0.547953,3.398711,3.918265,1.999398,2.18809,6.086236,4.950457
3,1.752545,-1.158844,1.779774,1.827915,0.283499,0.549886,1.383068,1.439844,-0.005676,-0.560504,...,1.304847,-1.470265,1.347725,1.227507,0.222998,-0.310393,0.618779,0.736081,-0.868555,-0.394649
4,-0.474529,-0.841438,-0.385121,-0.503904,2.243076,1.261043,0.875648,0.833102,1.012812,1.900845,...,-0.162544,-0.316467,-0.111378,-0.241438,2.052104,1.735196,1.270831,0.913379,1.769188,2.250684


### Splits the data in Train Pack, and Test Pack

In [123]:
from sklearn.model_selection import train_test_split

In [124]:
X_train, X_test, y_train, y_test = train_test_split(scaled_features, dt['TYPE'], test_size=0.3, train_size=0.7)

### Train the KNN

In [125]:
from sklearn.neighbors import KNeighborsClassifier

In [126]:
knn = KNeighborsClassifier(n_neighbors=1)

In [127]:
knn.fit(X_train, y_train)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=1, p=2,
           weights='uniform')

In [128]:
pred = knn.predict(X_test)

### Prediction Evaluation

In [129]:
from sklearn.metrics import confusion_matrix, classification_report

In [132]:
print(confusion_matrix(y_test, pred))

[[101   4]
 [  2  64]]


In [133]:
print(classification_report(y_test, pred))

             precision    recall  f1-score   support

          B       0.98      0.96      0.97       105
          M       0.94      0.97      0.96        66

avg / total       0.97      0.96      0.97       171

