## Libraries

In [120]:
import pandas as pd
import plotly.express as px

from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score
from scipy import stats

from functions import confusion_matrix_generator

## Generating Results Dataframe

In [121]:
# Read the per-execution result files results_exec_1.csv .. results_exec_10.csv
# and stack them into a single frame. reset_index() keeps each file's original
# row labels as an 'index' column.
all_df = [
    pd.read_csv(f'results_exec_{number_of_execution}.csv')
    for number_of_execution in range(1, 11)
]

output = pd.concat(all_df).reset_index()

In [122]:
# Because the result files were saved externally, some unnecessary columns were
# inserted; list the columns to see which ones need to be removed.
output.columns

Index(['index', 'Unnamed: 0', 'diagnosis', 'Error', 'Accuracy', 'Precision',
       'Recall', 'F1-Score', 'Support', 'K-Value'],
      dtype='object')

In [123]:
# Remove the bookkeeping columns: 'index' is added by reset_index() when the
# frames are concatenated, and 'Unnamed: 0' arrives with the saved CSV files.
# Rebinding with errors='ignore' (instead of inplace=True) keeps this cell safe
# to re-run: a second execution is a no-op rather than a KeyError.
output = output.drop(columns=['Unnamed: 0', 'index'], errors='ignore')

In [124]:
# Preview the first rows of the combined results frame.
output.head()

Unnamed: 0,diagnosis,Error,Accuracy,Precision,Recall,F1-Score,Support,K-Value
0,Benign,0.061404,0.938596,0.983871,0.910448,0.945736,67.0,1
1,Malignant,0.061404,0.938596,0.884615,0.978723,0.929293,47.0,1
2,Benign,0.017544,0.982456,0.985075,0.985075,0.985075,67.0,2
3,Malignant,0.017544,0.982456,0.978723,0.978723,0.978723,47.0,2
4,Benign,0.04386,0.95614,0.984375,0.940299,0.961832,67.0,3


## Results Analysis 

In [125]:
# Average the Error over all rows that share a K-Value and draw it as a line.
error_df = output.groupby('K-Value')[['Error']].mean()
fig = px.line(
    error_df,
    x=error_df.index,
    y='Error',
    color_discrete_sequence=['black'],
)
fig.update_layout(title_text='Error by K-Value', title_x=0.5)
fig.update_xaxes(title_text='K-Value')
fig.show()

In [126]:
# Mean F1-Score per K-Value (no sorting involved), drawn as a line.
f1_score_df = output.groupby('K-Value')[['F1-Score']].mean()
fig = px.line(
    f1_score_df,
    x=f1_score_df.index,
    y='F1-Score',
    color_discrete_sequence=['black'],
)
fig.update_layout(title_text='F1-Score by K-Value', title_x=0.5)
fig.update_xaxes(title_text='K-Value')
fig.show()

In [127]:
# Mean Recall per K-Value (no sorting involved), drawn as a line.
recall = output.groupby('K-Value')[['Recall']].mean()
fig = px.line(
    recall,
    x=recall.index,
    y='Recall',
    color_discrete_sequence=['black'],
)
fig.update_layout(title_text='Recall by K-Value', title_x=0.5)
fig.update_xaxes(title_text='K-Value')
fig.show()

In [128]:
# Mean Precision per K-Value (no sorting involved), drawn as a line.
precision = output.groupby('K-Value')[['Precision']].mean()
fig = px.line(
    precision,
    x=precision.index,
    y='Precision',
    color_discrete_sequence=['black'],
)
fig.update_layout(title_text='Precision by K-Value', title_x=0.5)
fig.update_xaxes(title_text='K-Value')
fig.show()

In [129]:
# Spread of the Recall values recorded for each K-Value.
fig = px.box(
    output,
    x='K-Value',
    y='Recall',
    color_discrete_sequence=['black'],
)
fig.update_layout(title={'text': 'Recall by K-Value (10 Executions)', 'x': 0.5})
fig.show()

In [130]:
# Mean Precision, Recall and F1-Score per K-Value, ranked by mean Precision
# from highest to lowest.
(
    output
    .groupby('K-Value')[['Precision', 'Recall', 'F1-Score']]
    .mean()
    .sort_values(by='Precision', ascending=False)
)

Unnamed: 0_level_0,Precision,Recall,F1-Score
K-Value,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2,0.948471,0.921586,0.930838
5,0.94134,0.932588,0.936063
4,0.940874,0.922984,0.929532
8,0.940521,0.926303,0.931755
6,0.94018,0.926501,0.931833
9,0.939753,0.930135,0.933848
7,0.938778,0.929009,0.933004
3,0.937634,0.927159,0.931143
1,0.925755,0.921555,0.923188


In [141]:
# Spread of the Precision values recorded for each K-Value.
fig = px.box(
    output,
    x='K-Value',
    y='Precision',
    color_discrete_sequence=['black'],
)
fig.update_layout(title={'text': 'Precision by K-Value (10 Executions)', 'x': 0.5})
fig.show()

In [132]:
# Mean Precision, Recall and F1-Score per K-Value, ranked by mean F1-Score
# from highest to lowest.
(
    output
    .groupby('K-Value')[['Precision', 'Recall', 'F1-Score']]
    .mean()
    .sort_values(by='F1-Score', ascending=False)
)

Unnamed: 0_level_0,Precision,Recall,F1-Score
K-Value,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
5,0.94134,0.932588,0.936063
9,0.939753,0.930135,0.933848
7,0.938778,0.929009,0.933004
6,0.94018,0.926501,0.931833
8,0.940521,0.926303,0.931755
3,0.937634,0.927159,0.931143
2,0.948471,0.921586,0.930838
4,0.940874,0.922984,0.929532
1,0.925755,0.921555,0.923188


In [142]:
# Spread of the F1-Score values recorded for each K-Value.
fig = px.box(
    output,
    x='K-Value',
    y='F1-Score',
    color_discrete_sequence=['black'],
)
fig.update_layout(title={'text': 'F1-Score by K-Value (10 Executions)', 'x': 0.5})
fig.show()

In [143]:
# F1-Score values of every result row with K-Value 5 and with K-Value 9,
# extracted as plain arrays for the statistical comparison.
k_value_5 = output.loc[output['K-Value'] == 5, 'F1-Score'].to_numpy()
k_value_9 = output.loc[output['K-Value'] == 9, 'F1-Score'].to_numpy()

In [144]:
# Kruskal-Wallis H-test on the F1-Score samples for K-Value 5 versus K-Value 9.
stats.kruskal(k_value_5, k_value_9)

KruskalResult(statistic=0.23716216216217392, pvalue=0.6262633060629751)

In [147]:
# F1-Score of every K-Value = 5 result row, one line per diagnosis class.
# reset_index() renumbers the filtered rows from 0 and keeps the previous row
# labels in an 'index' column.
k_5 = output[output.loc[:, 'K-Value'] == 5].reset_index()
# Plot the frame built on the line above; the name `k_7` is never defined in
# this notebook, so using it fails with NameError on a fresh kernel.
fig = px.line(k_5, x=k_5.index, y='F1-Score', color='diagnosis')
# K-Value is constant in this frame, so the x axis is the row position, not K.
fig.update_layout(title_text='F1-Score per Result Row (K-Value = 5)', title_x=0.5)
fig.update_xaxes(title_text='Row index')
fig.show()

In [148]:
# Capitalise the diagnosis column name, then write the K-Value = 5 rows to disk.
k_5 = k_5.rename(columns={'diagnosis': 'Diagnosis'})
k_5.to_csv('results_k_5.csv')

In [149]:
# Spread of the F1-Score for the K-Value = 5 rows, split by diagnosis class.
fig = px.box(
    k_5,
    x='K-Value',
    y='F1-Score',
    color='Diagnosis',
)
fig.update_layout(title={'text': 'F1-Score by Diagnosis (10 Executions)', 'x': 0.5})
fig.show()

In [150]:
# Mean F1-Score, Recall, Precision and Error over all K-Value = 5 rows.
(
    k_5
    .groupby('K-Value')[['F1-Score', 'Recall', 'Precision', 'Error']]
    .mean()
)

Unnamed: 0_level_0,F1-Score,Recall,Precision,Error
K-Value,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
5,0.936063,0.932588,0.94134,0.060526
