### Preliminary Operations

In [1]:
# importing basic libraries

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# ignoring warning

import warnings
warnings.filterwarnings("ignore")

In [3]:
# importing our training data

training_data=pd.read_csv('/Users/adityabanerjee/Documents/Cancer /Datasets/Cleaned/cleaned_training_data.csv')

In [4]:
training_data.head()

Unnamed: 0.1,Unnamed: 0,radius_mean,texture_mean,smoothness_mean,compactness_mean,concavity_mean,concave points_mean,symmetry_mean,fractal_dimension_mean,radius_se,...,compactness_se,concavity_se,concave points_se,symmetry_se,fractal_dimension_se,radius_worst,smoothness_worst,symmetry_worst,fractal_dimension_worst,Diagnosis
0,0,-0.000978,-1.488563,-0.213777,-0.975139,-0.911513,-0.567478,-1.608084,-0.826902,-0.54339,...,-0.808855,-0.826971,-0.837374,-0.759152,-0.493934,-0.150443,-0.177312,-1.279028,-0.726861,0
1,1,0.506535,0.991255,-0.867296,-0.075592,0.150732,0.139026,0.179306,-1.336731,0.019055,...,0.3476,0.597654,1.330367,1.1794,-0.299065,0.12846,-0.827231,-0.124155,-1.149868,1
2,2,0.255696,1.271271,-0.157638,-1.008533,-0.829242,-0.522902,-0.884078,-1.102942,3.137893,...,-0.830894,-0.43059,1.208061,-1.536803,-0.761416,-0.265411,-1.660427,-2.150476,-1.565968,1
3,3,-0.391821,0.659717,-0.39659,-0.888313,-0.528622,-0.575265,-0.808661,-0.312849,-0.621871,...,-0.768257,-0.267114,-0.463446,-0.473015,-0.437252,-0.371862,0.420613,-0.091577,-0.203947,1
4,4,1.136551,0.231853,0.290756,0.472074,0.379739,0.782963,1.193668,-0.295948,1.371308,...,0.100534,0.179741,0.769393,1.07535,0.069187,1.126973,-0.055994,0.595808,-0.438833,1


In [5]:
# splitting our training data as independent and dependent features

X=training_data.drop(columns=['Diagnosis'], axis=1)         # independent variables
y=training_data['Diagnosis']                                # target variable

In [6]:
# splitting our data into training and testing sets

from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=33)

### Basic KNeighbors Classifier

In [7]:
# creating a basic knn classifier model

from sklearn.neighbors import KNeighborsClassifier
knn_basic=KNeighborsClassifier()

In [8]:
# fitting our data to the model

knn_basic.fit(X_train, y_train)

In [9]:
# evaluating our model

y_pred_basic=knn_basic.predict(X_test)
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
cr=classification_report(y_test, y_pred_basic)
cm=confusion_matrix(y_test, y_pred_basic)
acc_score=accuracy_score(y_test, y_pred_basic)

print("KNN (Basic):\n")
print(f"Accuracy Score:{acc_score}\n\nConfusion Matrix:\n{cm}\n\nClassification Report:\n{cr}")

KNN (Basic):

Accuracy Score:0.6833333333333333

Confusion Matrix:
[[63  6]
 [32 19]]

Classification Report:
              precision    recall  f1-score   support

           0       0.66      0.91      0.77        69
           1       0.76      0.37      0.50        51

    accuracy                           0.68       120
   macro avg       0.71      0.64      0.63       120
weighted avg       0.70      0.68      0.65       120



### Hyper-Parameter Tuning our KneighborsClassifier Model

In [10]:
# hyperparamter tuning our knn model

params={
    'n_neighbors':[1,2,3,4,5,10,15,20,25],
    'weights':['uniform','distance'],
    'algorithm':[ 'auto', 'ball_tree', 'kd_tree', 'brute'],
    'leaf_size':[10,15,20,25,30,50,60,90,100,200],
    'p':[1,2]
}

from sklearn.model_selection import GridSearchCV, StratifiedKFold
cv=StratifiedKFold(n_splits=5)
grid=GridSearchCV(estimator=knn_basic, param_grid=params, scoring='accuracy', n_jobs=-1, cv=cv)
grid.fit(X_train, y_train)

In [11]:
grid.best_params_

{'algorithm': 'auto',
 'leaf_size': 10,
 'n_neighbors': 1,
 'p': 1,
 'weights': 'uniform'}

In [12]:
grid.best_score_

np.float64(0.8522727272727273)

### Creating and Exporting the Best Model

In [13]:
# creating best model (knn)

knn_bestmodel=grid.best_estimator_
knn_bestmodel.fit(X_train, y_train)

In [14]:
# evaluating best model (knn)

y_pred_best=knn_bestmodel.predict(X_test)
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
cr=classification_report(y_test, y_pred_best)
cm=confusion_matrix(y_test, y_pred_best)
acc_score=accuracy_score(y_test, y_pred_best)

print("KNN(Best Estimator):\n")
print(f"Accuracy Score:{acc_score}\n\nConfusion Matrix:\n{cm}\n\nClassification Report:\n{cr}")

KNN(Best Estimator):

Accuracy Score:0.85

Confusion Matrix:
[[61  8]
 [10 41]]

Classification Report:
              precision    recall  f1-score   support

           0       0.86      0.88      0.87        69
           1       0.84      0.80      0.82        51

    accuracy                           0.85       120
   macro avg       0.85      0.84      0.85       120
weighted avg       0.85      0.85      0.85       120



In [15]:
# exporting our model

import pickle 
with open("/Users/adityabanerjee/Documents/Cancer /Exported Models/knn.pkl", "wb") as f:
    pickle.dump(obj=knn_bestmodel, file=f)