In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Flatten the directory walk into a single stream of full file paths, then list them.
input_paths = (
    os.path.join(folder, name)
    for folder, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt
from matplotlib.colors import ListedColormap

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.neighbors import KNeighborsClassifier, NeighborhoodComponentsAnalysis, LocalOutlierFactor
from sklearn.decomposition import PCA

In [None]:
import warnings
warnings.filterwarnings("ignore")

In [None]:
# Load the CSV, discard the two non-feature columns and rename the label column.
data = (
    pd.read_csv("/kaggle/input/breast-cancer-wisconsin-data/data.csv")
    .drop(columns = ["id", "Unnamed: 32"])
    .rename(columns = {"diagnosis": "target"})
)

# Encode the label: "M" (after stripping whitespace) -> 1, anything else -> 0.
data["target"] = data["target"].map(lambda label: 1 if label.strip() == "M" else 0)

# Class balance of the encoded label.
data["target"].value_counts()

In [None]:
# (rows, columns) of the frame after the "id" / "Unnamed: 32" columns were dropped.
data.shape

In [None]:
# Number of missing values in each column.
data.isna().sum()

***EDA***

*Correlation*

In [None]:
# Pairwise correlation of every column (the 0/1-encoded target included).
corr_matrix = data.corr()

# clustermap builds its own multi-axes figure and returns a ClusterGrid.
# plt.title() would attach the title to whichever axes happens to be current
# rather than heading the whole plot, so the title is set on the figure itself.
cluster_grid = sns.clustermap(corr_matrix, annot = True, fmt = ".2f")
cluster_grid.ax_heatmap.figure.suptitle("Corralation Between Features", y = 1.02)
plt.show()

We want to use features that have a low correlation with each other, because each of them then contributes different information to the model.
Ex: radius_mean and area_mean are highly correlated, so these two features have nearly the same effect on the model.

In [None]:
# Keep only the columns whose absolute correlation with the target exceeds the
# threshold. "target" itself always qualifies (self-correlation is 1), which is
# what lets later cells use it as the hue column.
th = 0.75
selection = np.abs(corr_matrix["target"]) > th
corr_features = corr_matrix.columns[selection].tolist()

# Title goes on the ClusterGrid's figure: plt.title() would land on whichever
# axes is current instead of heading the plot. The threshold shown in the title
# is taken from `th` so the two cannot drift apart.
cluster_grid = sns.clustermap(data[corr_features].corr(), annot = True, fmt = ".2f")
cluster_grid.ax_heatmap.figure.suptitle(
    "Corralation Between Features w Corr Treshold {}".format(th), y = 1.02
)
plt.show()

In [None]:
# Reshape to long format: one row per (sample, feature) pair, keeping the label.
data_melted = data.melt(id_vars = "target", var_name = "features", value_name = "value")

# Per-feature box plots split by class; tick labels are rotated so the
# feature names stay readable.
plt.figure()
sns.boxplot(data = data_melted, x = "features", y = "value", hue = "target")
plt.xticks(rotation = 90)
plt.show()

In [None]:
# Pairwise scatter plots (KDE on the diagonal) of the target-correlated
# features, coloured by class.
pairplot_options = dict(diag_kind = "kde", markers = "+", hue = "target")
sns.pairplot(data[corr_features], **pairplot_options)
plt.show()

*Outlier Detection*

In [None]:
# Separate the feature matrix from the label vector.
X = data.drop(columns = "target")
y = data["target"]

# Feature names, kept as a plain Python list.
columns = list(X.columns)

In [None]:
# Fit a Local Outlier Factor model (default settings) on the unscaled features.
lof = LocalOutlierFactor()
# Per-row labels from the fit; not read again before `y_pred` is reassigned by the KNN cell.
y_pred = lof.fit_predict(X)
# Per-row scores exposed by the fitted model (per the scikit-learn docs, more
# negative means more abnormal). Must be read after fit_predict has run.
lof_scores = lof.negative_outlier_factor_

In [None]:
# Sanity check: number of scores returned by the fitted LOF model.
len(lof_scores)

In [None]:
# The 20 lowest scores (np.sort is ascending), inspected to choose an outlier cut-off.
np.sort(lof_scores)[:20]

There is only one outlier.

In [None]:
# Wrap the LOF scores in a one-column frame so they can be filtered by row position.
outlier_scores = pd.DataFrame({"scores": lof_scores})

In [None]:
# The threshold is the second-lowest score, so the strict "<" comparison below
# flags only rows scoring under it.
sorted_scores = np.sort(lof_scores)
th = sorted_scores[1]

# Boolean mask of flagged rows, then their index labels as a plain list.
filtre = outlier_scores["scores"].lt(th)
outlier_index = outlier_scores.loc[filtre].index.tolist()

In [None]:
# Remove the flagged rows. X and y are rebuilt from `data` instead of from their
# own previous values, so re-running this cell yields the same result; the
# self-referential form failed on a second run (labels already dropped from X,
# and y already converted to an array with no .drop).
X = data.drop(columns = ["target"]).drop(index = outlier_index)
y = data["target"].drop(index = outlier_index).values

***Modelling***

In [None]:
# Hold out 30% of the rows for testing; the fixed seed keeps the split reproducible.
split_parts = train_test_split(X, y, test_size = 0.3, random_state = 42)
X_train, X_test, y_train, y_test = split_parts

*Standardization*

In [None]:
# Fit the scaler on the training rows only, then apply that same scaling to
# both splits so the test rows never influence the fitted statistics.
scaler = StandardScaler().fit(X_train)
X_train, X_test = scaler.transform(X_train), scaler.transform(X_test)

*KNN*

In [None]:
# Baseline: a 2-nearest-neighbour classifier evaluated on the held-out split.
knn = KNeighborsClassifier(n_neighbors = 2).fit(X_train, y_train)
y_pred = knn.predict(X_test)

# `score` comes from the classifier's own scoring of the test split;
# `accuracy` and `matrix` are computed from the predictions above.
score = knn.score(X_test, y_test)
accuracy = accuracy_score(y_test, y_pred)
matrix = confusion_matrix(y_test, y_pred)

In [None]:
# Report the baseline metrics, one labelled line per value.
baseline_report = (
    ("Score: ", score),
    ("Accuracy: ", accuracy),
    ("Confusion Matrix: ", matrix),
)
for label, value in baseline_report:
    print(label, value)

*Finding the Best Parameters*

In [None]:
def get_best_paramaters(X_train, X_test, y_train, y_test, max_k = 30, cv = 10):
    """Grid-search KNN hyper-parameters and report train/test performance.

    Searches `n_neighbors` in 1..max_k combined with both weighting schemes
    ("uniform", "distance") using cross-validated accuracy on the training
    split, then evaluates the best model on both splits and prints the
    accuracies and confusion matrices.

    Parameters
    ----------
    X_train, X_test : feature matrices for the training and test splits.
    y_train, y_test : matching label vectors.
    max_k : largest `n_neighbors` value to try (default 30).
    cv : number of cross-validation folds (default 10).

    Returns
    -------
    The fitted GridSearchCV object.
    """
    print()
    param_grid = dict(n_neighbors = list(range(1, max_k + 1)),
                      weights = ["uniform", "distance"])

    grid = GridSearchCV(KNeighborsClassifier(), param_grid, cv = cv, scoring = "accuracy")
    grid.fit(X_train, y_train)

    print("Best training score: {} with parameters: {}".format(grid.best_score_, grid.best_params_))
    print()

    # GridSearchCV refits the winning configuration on the whole training split
    # by default, so that estimator is reused instead of fitting a second,
    # identical KNN model by hand.
    knn = grid.best_estimator_

    y_pred_test = knn.predict(X_test)
    y_pred_train = knn.predict(X_train)

    matrix_test = confusion_matrix(y_test, y_pred_test)
    matrix_train = confusion_matrix(y_train, y_pred_train)

    accuracy_test = accuracy_score(y_test, y_pred_test)
    accuracy_train = accuracy_score(y_train, y_pred_train)
    print("Test score: {}, Train score: {}".format(accuracy_test, accuracy_train))
    print()
    print("Matrix test: ", matrix_test)
    print("Matrix train: ", matrix_train)

    return grid

In [None]:
# Run the hyper-parameter search on the standardized split and keep the fitted
# search object for later inspection.
grid = get_best_paramaters(X_train = X_train, X_test = X_test,
                           y_train = y_train, y_test = y_test)