## Libraries

In [162]:
# To Exploratory Data Analysis
import pandas as pd
import numpy as np
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go

# To KNN Classification
# + Pandas and Numpy already imported
import matplotlib.pyplot as plt
import scipy as scp
import warnings
from sklearn.model_selection import train_test_split 
from sklearn.preprocessing import StandardScaler  
from sklearn.neighbors import KNeighborsClassifier  
from sklearn.metrics import classification_report, confusion_matrix

## Dataset

In [163]:
df = pd.read_csv('data.csv')

In [164]:
df.head()

Unnamed: 0,id,diagnosis,radius_mean,texture_mean,perimeter_mean,area_mean,smoothness_mean,compactness_mean,concavity_mean,concave points_mean,...,texture_worst,perimeter_worst,area_worst,smoothness_worst,compactness_worst,concavity_worst,concave points_worst,symmetry_worst,fractal_dimension_worst,Unnamed: 32
0,842302,M,17.99,10.38,122.8,1001.0,0.1184,0.2776,0.3001,0.1471,...,17.33,184.6,2019.0,0.1622,0.6656,0.7119,0.2654,0.4601,0.1189,
1,842517,M,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,...,23.41,158.8,1956.0,0.1238,0.1866,0.2416,0.186,0.275,0.08902,
2,84300903,M,19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,0.1279,...,25.53,152.5,1709.0,0.1444,0.4245,0.4504,0.243,0.3613,0.08758,
3,84348301,M,11.42,20.38,77.58,386.1,0.1425,0.2839,0.2414,0.1052,...,26.5,98.87,567.7,0.2098,0.8663,0.6869,0.2575,0.6638,0.173,
4,84358402,M,20.29,14.34,135.1,1297.0,0.1003,0.1328,0.198,0.1043,...,16.67,152.2,1575.0,0.1374,0.205,0.4,0.1625,0.2364,0.07678,


## Dataset Initial Investigation

In [165]:
# Checking dataframe information
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 569 entries, 0 to 568
Data columns (total 33 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   id                       569 non-null    int64  
 1   diagnosis                569 non-null    object 
 2   radius_mean              569 non-null    float64
 3   texture_mean             569 non-null    float64
 4   perimeter_mean           569 non-null    float64
 5   area_mean                569 non-null    float64
 6   smoothness_mean          569 non-null    float64
 7   compactness_mean         569 non-null    float64
 8   concavity_mean           569 non-null    float64
 9   concave points_mean      569 non-null    float64
 10  symmetry_mean            569 non-null    float64
 11  fractal_dimension_mean   569 non-null    float64
 12  radius_se                569 non-null    float64
 13  texture_se               569 non-null    float64
 14  perimeter_se             5

In [166]:
# Dropping colummn Unnamed 32 because it has only null values
df.drop(columns='Unnamed: 32', inplace=True)
# Dropping column id because it will not contribute to next analysis 
df.drop(columns='id', inplace=True)

In [167]:
# Checking correlations between the features
df.corr()

Unnamed: 0,radius_mean,texture_mean,perimeter_mean,area_mean,smoothness_mean,compactness_mean,concavity_mean,concave points_mean,symmetry_mean,fractal_dimension_mean,...,radius_worst,texture_worst,perimeter_worst,area_worst,smoothness_worst,compactness_worst,concavity_worst,concave points_worst,symmetry_worst,fractal_dimension_worst
radius_mean,1.0,0.323782,0.997855,0.987357,0.170581,0.506124,0.676764,0.822529,0.147741,-0.311631,...,0.969539,0.297008,0.965137,0.941082,0.119616,0.413463,0.526911,0.744214,0.163953,0.007066
texture_mean,0.323782,1.0,0.329533,0.321086,-0.023389,0.236702,0.302418,0.293464,0.071401,-0.076437,...,0.352573,0.912045,0.35804,0.343546,0.077503,0.27783,0.301025,0.295316,0.105008,0.119205
perimeter_mean,0.997855,0.329533,1.0,0.986507,0.207278,0.556936,0.716136,0.850977,0.183027,-0.261477,...,0.969476,0.303038,0.970387,0.94155,0.150549,0.455774,0.563879,0.771241,0.189115,0.051019
area_mean,0.987357,0.321086,0.986507,1.0,0.177028,0.498502,0.685983,0.823269,0.151293,-0.28311,...,0.962746,0.287489,0.95912,0.959213,0.123523,0.39041,0.512606,0.722017,0.14357,0.003738
smoothness_mean,0.170581,-0.023389,0.207278,0.177028,1.0,0.659123,0.521984,0.553695,0.557775,0.584792,...,0.21312,0.036072,0.238853,0.206718,0.805324,0.472468,0.434926,0.503053,0.394309,0.499316
compactness_mean,0.506124,0.236702,0.556936,0.498502,0.659123,1.0,0.883121,0.831135,0.602641,0.565369,...,0.535315,0.248133,0.59021,0.509604,0.565541,0.865809,0.816275,0.815573,0.510223,0.687382
concavity_mean,0.676764,0.302418,0.716136,0.685983,0.521984,0.883121,1.0,0.921391,0.500667,0.336783,...,0.688236,0.299879,0.729565,0.675987,0.448822,0.754968,0.884103,0.861323,0.409464,0.51493
concave points_mean,0.822529,0.293464,0.850977,0.823269,0.553695,0.831135,0.921391,1.0,0.462497,0.166917,...,0.830318,0.292752,0.855923,0.80963,0.452753,0.667454,0.752399,0.910155,0.375744,0.368661
symmetry_mean,0.147741,0.071401,0.183027,0.151293,0.557775,0.602641,0.500667,0.462497,1.0,0.479921,...,0.185728,0.090651,0.219169,0.177193,0.426675,0.4732,0.433721,0.430297,0.699826,0.438413
fractal_dimension_mean,-0.311631,-0.076437,-0.261477,-0.28311,0.584792,0.565369,0.336783,0.166917,0.479921,1.0,...,-0.253691,-0.051269,-0.205151,-0.231854,0.504942,0.458798,0.346234,0.175325,0.334019,0.767297


In [168]:
df.describe()

Unnamed: 0,radius_mean,texture_mean,perimeter_mean,area_mean,smoothness_mean,compactness_mean,concavity_mean,concave points_mean,symmetry_mean,fractal_dimension_mean,...,radius_worst,texture_worst,perimeter_worst,area_worst,smoothness_worst,compactness_worst,concavity_worst,concave points_worst,symmetry_worst,fractal_dimension_worst
count,569.0,569.0,569.0,569.0,569.0,569.0,569.0,569.0,569.0,569.0,...,569.0,569.0,569.0,569.0,569.0,569.0,569.0,569.0,569.0,569.0
mean,14.127292,19.289649,91.969033,654.889104,0.09636,0.104341,0.088799,0.048919,0.181162,0.062798,...,16.26919,25.677223,107.261213,880.583128,0.132369,0.254265,0.272188,0.114606,0.290076,0.083946
std,3.524049,4.301036,24.298981,351.914129,0.014064,0.052813,0.07972,0.038803,0.027414,0.00706,...,4.833242,6.146258,33.602542,569.356993,0.022832,0.157336,0.208624,0.065732,0.061867,0.018061
min,6.981,9.71,43.79,143.5,0.05263,0.01938,0.0,0.0,0.106,0.04996,...,7.93,12.02,50.41,185.2,0.07117,0.02729,0.0,0.0,0.1565,0.05504
25%,11.7,16.17,75.17,420.3,0.08637,0.06492,0.02956,0.02031,0.1619,0.0577,...,13.01,21.08,84.11,515.3,0.1166,0.1472,0.1145,0.06493,0.2504,0.07146
50%,13.37,18.84,86.24,551.1,0.09587,0.09263,0.06154,0.0335,0.1792,0.06154,...,14.97,25.41,97.66,686.5,0.1313,0.2119,0.2267,0.09993,0.2822,0.08004
75%,15.78,21.8,104.1,782.7,0.1053,0.1304,0.1307,0.074,0.1957,0.06612,...,18.79,29.72,125.4,1084.0,0.146,0.3391,0.3829,0.1614,0.3179,0.09208
max,28.11,39.28,188.5,2501.0,0.1634,0.3454,0.4268,0.2012,0.304,0.09744,...,36.04,49.54,251.2,4254.0,0.2226,1.058,1.252,0.291,0.6638,0.2075


In [169]:
# Selecting only mean columns
mean_df = df.iloc[:, 0:11]
mean_df.head()

Unnamed: 0,diagnosis,radius_mean,texture_mean,perimeter_mean,area_mean,smoothness_mean,compactness_mean,concavity_mean,concave points_mean,symmetry_mean,fractal_dimension_mean
0,M,17.99,10.38,122.8,1001.0,0.1184,0.2776,0.3001,0.1471,0.2419,0.07871
1,M,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,0.1812,0.05667
2,M,19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,0.1279,0.2069,0.05999
3,M,11.42,20.38,77.58,386.1,0.1425,0.2839,0.2414,0.1052,0.2597,0.09744
4,M,20.29,14.34,135.1,1297.0,0.1003,0.1328,0.198,0.1043,0.1809,0.05883


In [170]:
# Transforming categorical variables into values
mean_df['diagnosis'].replace({'M':1, 'B':0}, inplace = True)

In [171]:
def lower_triangular_matrix(df:pd.DataFrame):
    '''
    This function replaces the values above main diagonal
    to NaN, discarding duplicated correlations
    '''
    mask = np.zeros_like(df, dtype = bool)
    mask[np.triu_indices_from(mask)] = True
    correlations = df.mask(mask)
    return correlations

In [172]:
# Creating a dataframe with correlation values
df_corr_mean = mean_df.corr()

# Showing heatmap
fig = px.imshow(lower_triangular_matrix(df_corr_mean), text_auto=True)
fig.show()

In [173]:
# Correlation between sorted mean features correlations
correlation = df_corr_mean.sort_values(by='diagnosis', ascending=False)

In [252]:
# Bar plot with correlation values 
fig = px.bar(x=correlation.index[1:], y=correlation['diagnosis'][1:], color_discrete_sequence=['#AB63FA'])
fig.update_layout(title_text=f'Correlation Between Features and Diagnosis', title_x=0.5) 
fig.update_yaxes(title_text='Correlation')
fig.update_xaxes(title_text='Features')
fig.show()

In [239]:
# To exhibition 
local_df = mean_df.copy()
local_df.replace({0:'Benign', 1:'Malignant'}, inplace=True)

In [254]:
# Checking Concave Points Mean
fig = px.histogram(data_frame=local_df, x='concave points_mean', color='diagnosis', color_discrete_sequence=['#EF553B', '#636EFA'])
fig.update_layout(barmode='overlay', legend_title_text = "Diagnosis", title_text='Concave Points Mean By Diagnosis', title_x=0.5)
fig.update_traces(opacity=0.80)
fig.update_xaxes(title_text='Concave Points Mean')
fig.update_yaxes(title_text='Count')

In [255]:
# Checking Concave Points Mean
fig = px.histogram(data_frame=local_df, x='radius_mean', color='diagnosis', color_discrete_sequence=['#EF553B', '#636EFA'])
fig.update_layout(barmode='overlay', legend_title_text = "Diagnosis", title_text='Radius Mean By Diagnosis', title_x=0.5)
fig.update_traces(opacity=0.80)
fig.update_xaxes(title_text='Radius Mean')
fig.update_yaxes(title_text='Count')
fig.show()

In [276]:
# Diagnosis value counts
diagnosis_value_counts = mean_df['diagnosis'].value_counts().to_frame()
diagnosis_value_counts.rename(index={0:'Benign',1:'Malignant'}, inplace=True)

# Donut Chart
fig = go.Figure(data=[go.Pie(labels=diagnosis_value_counts.index, values=diagnosis_value_counts['diagnosis'], hole=.3)])
fig.update_layout(legend_title_text = "Diagnosis", title_text='Occurencies by Diagnosis', title_x=0.5)
fig.show()

## K-Nearest Neighbourhood

In [176]:
warnings.filterwarnings("ignore")

In [177]:
# Independent variables 
x = mean_df.drop('diagnosis', axis=1)
# Dependent variables
y = mean_df['diagnosis']

In [182]:
# Subdividing training and testing data
x_train,x_test,y_train,y_test = train_test_split(x,y, test_size=0.2)

In [183]:
# Reducing values scale
scaler = StandardScaler()  
scaler.fit(x_train)

x_train = scaler.transform(x_train)  
x_test = scaler.transform(x_test)

In [270]:
# Generating confusion matrix
def confusion_matrix_generator(y_test, y_pred):
    matrix = confusion_matrix(y_test, y_pred)
    dict_matrix = {'Positive':{'Positive':matrix[0][0], 'Negative':matrix[0][1]},\
        'Negative':{'Positive': matrix[1][0], 'Negative':matrix[1][1]}}

    df_matrix = pd.DataFrame(dict_matrix)
    return df_matrix

In [211]:
# Metrics
def report(K_value, y_test, y_pred):   
    report_list = classification_report(y_test, y_pred).split('\n')
    rows_list = []

    for i in range(len(report_list)):
        rows_list.append(report_list[i].split())

    dict_to_frame = {}
    diagnosis = ['Benign', 'Malignant']
    for i in range (len(diagnosis)):
        dict_to_frame[diagnosis[i]] = {'Precision':float(rows_list[i+2][1]), 'Recall':float(rows_list[i+2][2]),\
        'F1-Score':float(rows_list[i+2][3]), 'Support':float(rows_list[i+2][4])}

    report_df = pd.DataFrame(dict_to_frame).T
    report_df['K-Value'] = [K_value, K_value]
    return report_df


In [256]:
# Testing different k values 
all_df = []
for i in range(1, 10):  
    knn = KNeighborsClassifier(n_neighbors=i)
    knn.fit(x_train, y_train)
    y_pred = knn.predict(x_test)
    df_report = report(K_value=i, y_test=y_test, y_pred=y_pred)
    all_df.append(df_report)

    output = pd.concat(all_df).reset_index()
    output.rename(columns={'index':'diagnosis'}, inplace=True)


In [257]:
# Output 
output

Unnamed: 0,diagnosis,Precision,Recall,F1-Score,Support,K-Value
0,Benign,0.95,0.96,0.95,74.0,1
1,Malignant,0.92,0.9,0.91,40.0,1
2,Benign,0.91,1.0,0.95,74.0,2
3,Malignant,1.0,0.82,0.9,40.0,2
4,Benign,0.96,0.96,0.96,74.0,3
5,Malignant,0.93,0.93,0.93,40.0,3
6,Benign,0.96,0.97,0.97,74.0,4
7,Malignant,0.95,0.93,0.94,40.0,4
8,Benign,0.96,0.97,0.97,74.0,5
9,Malignant,0.95,0.93,0.94,40.0,5


In [260]:
'''
False Negatives between malignant diagnosis are most concern, 
that's why I focused on Recall metric to find best K-Value
'''
output[output.loc[:,'diagnosis']=='Malignant'].sort_values(by='Recall', ascending=False)

Unnamed: 0,diagnosis,Precision,Recall,F1-Score,Support,K-Value
13,Malignant,0.95,0.97,0.96,40.0,7
17,Malignant,0.95,0.97,0.96,40.0,9
15,Malignant,0.95,0.95,0.95,40.0,8
5,Malignant,0.93,0.93,0.93,40.0,3
7,Malignant,0.95,0.93,0.94,40.0,4
9,Malignant,0.95,0.93,0.94,40.0,5
11,Malignant,0.95,0.93,0.94,40.0,6
1,Malignant,0.92,0.9,0.91,40.0,1
3,Malignant,1.0,0.82,0.9,40.0,2


In [263]:
knn = KNeighborsClassifier(n_neighbors=7)
knn.fit(x_train, y_train)
y_pred = knn.predict(x_test)
report(K_value=7, y_test=y_test, y_pred=y_pred)

Unnamed: 0,Precision,Recall,F1-Score,Support,K-Value
Benign,0.99,0.97,0.98,74.0,7
Malignant,0.95,0.97,0.96,40.0,7


In [271]:
confusion_matrix_generator(y_test=y_test, y_pred=y_pred)

Unnamed: 0,Positive,Negative
Positive,72,1
Negative,2,39


In [273]:
template = {'Positive': ['True Positive', 'False Positive'], 'Negative':['False Negative', 'True Negative']}
pd.DataFrame(template)

Unnamed: 0,Positive,Negative
0,True Positive,False Negative
1,False Positive,True Negative


In [275]:
fig = px.line(output, x='K-Value', y='Recall', color='diagnosis', markers=True)
fig.update_layout(legend_title_text = "Diagnosis", title_text='Recall by K-Value', title_x=0.5)
fig.show()

In [278]:
fig = px.line(output, x='K-Value', y='Precision', color='diagnosis', markers=True)
fig.update_layout(legend_title_text = "Diagnosis", title_text='Precision by K-Value', title_x=0.5)
fig.show()