## Libraries

In [38]:
import pandas as pd
import plotly.express as px

from functions import confusion_matrix_generator

## Generating Results Dataframe

In [15]:
# Read the result file of each of the 10 executions (results_exec_1.csv ... results_exec_10.csv)
all_df = [
    pd.read_csv(f'results_exec_{number_of_execution}.csv')
    for number_of_execution in range(1, 11)
]

# Stack the per-execution frames; reset_index() moves the old row labels into an 'index' column
output = pd.concat(all_df).reset_index()

In [16]:
# Because the result files were saved externally, some unnecessary columns
# ended up in the frame; list the column names to see which ones they are.
output.columns

Index(['index', 'Unnamed: 0', 'diagnosis', 'Precision', 'Recall', 'F1-Score',
       'Support', 'K-Value'],
      dtype='object')

In [17]:
# 'Unnamed: 0' and 'index' are leftover bookkeeping columns, not metrics.
# Rebinding the name (instead of inplace=True) together with errors='ignore'
# keeps this cell idempotent: running it a second time no longer raises
# KeyError once the columns are already gone.
output = output.drop(columns=['Unnamed: 0', 'index'], errors='ignore')

In [18]:
# Preview the first five rows of the results table
output.head(n=5)

Unnamed: 0,diagnosis,Precision,Recall,F1-Score,Support,K-Value
0,Benign,0.86,0.85,0.85,71.0,1
1,Malignant,0.75,0.77,0.76,43.0,1
2,Benign,0.84,0.93,0.88,71.0,2
3,Malignant,0.86,0.7,0.77,43.0,2
4,Benign,0.88,0.92,0.9,71.0,3


## Results Analysis 

In [23]:
# Standard deviation of each metric per K-Value, lowest recall spread first
(
    output
    .groupby('K-Value')[['Precision', 'Recall', 'F1-Score']]
    .std()
    .sort_values(by='Recall')
)

Unnamed: 0_level_0,Precision,Recall,F1-Score
K-Value,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,0.060739,0.076398,0.058891
9,0.056127,0.082262,0.054483
7,0.056798,0.083193,0.050617
3,0.056743,0.091445,0.061067
5,0.054044,0.092532,0.056798
8,0.054799,0.095394,0.054856
6,0.049513,0.103741,0.058319
4,0.055939,0.111572,0.068894
2,0.059496,0.136455,0.0757


In [25]:
# Mean of each metric per K-Value, highest mean recall first
(
    output
    .groupby('K-Value')[['Precision', 'Recall', 'F1-Score']]
    .mean()
    .sort_values(by='Recall', ascending=False)
)

Unnamed: 0_level_0,Precision,Recall,F1-Score
K-Value,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
9,0.8885,0.8775,0.88
5,0.8895,0.876,0.8795
7,0.8855,0.875,0.876
3,0.8875,0.874,0.8785
6,0.889,0.866,0.873
8,0.8865,0.865,0.8725
4,0.8885,0.862,0.871
1,0.8595,0.8545,0.8555
2,0.8835,0.841,0.854


In [52]:
# Box plot of the recall values for each K-Value, with a horizontally centred title
fig = px.box(data_frame=output, x='K-Value', y='Recall')
fig.update_layout(
    title={'text': 'Recall by K-Value (10 Executions)', 'x': 0.5},
)
fig.show()

In [27]:
# Mean of each metric per K-Value, highest mean precision first
(
    output
    .groupby('K-Value')[['Precision', 'Recall', 'F1-Score']]
    .mean()
    .sort_values(by='Precision', ascending=False)
)

Unnamed: 0_level_0,Precision,Recall,F1-Score
K-Value,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
5,0.8895,0.876,0.8795
6,0.889,0.866,0.873
9,0.8885,0.8775,0.88
4,0.8885,0.862,0.871
3,0.8875,0.874,0.8785
8,0.8865,0.865,0.8725
7,0.8855,0.875,0.876
2,0.8835,0.841,0.854
1,0.8595,0.8545,0.8555


In [53]:
# Box plot of the precision values for each K-Value, with a horizontally centred title
fig = px.box(data_frame=output, x='K-Value', y='Precision')
fig.update_layout(
    title={'text': 'Precision by K-Value (10 Executions)', 'x': 0.5},
)
fig.show()

In [28]:
# Mean of each metric per K-Value, highest mean F1-score first
(
    output
    .groupby('K-Value')[['Precision', 'Recall', 'F1-Score']]
    .mean()
    .sort_values(by='F1-Score', ascending=False)
)

Unnamed: 0_level_0,Precision,Recall,F1-Score
K-Value,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
9,0.8885,0.8775,0.88
5,0.8895,0.876,0.8795
3,0.8875,0.874,0.8785
7,0.8855,0.875,0.876
6,0.889,0.866,0.873
8,0.8865,0.865,0.8725
4,0.8885,0.862,0.871
1,0.8595,0.8545,0.8555
2,0.8835,0.841,0.854


In [55]:
# Box plot of the F1-score values for each K-Value, with a horizontally centred title
fig = px.box(data_frame=output, x='K-Value', y='F1-Score')
fig.update_layout(
    title={'text': 'F1-Score by K-Value (10 Executions)', 'x': 0.5},
)
fig.show()

In [46]:
# K = 9 was chosen because it showed the second-lowest standard deviation of
# recall, the highest mean recall (the most important metric in this case),
# the third-highest mean precision and the highest mean F1-score.
best_k_value = 9

# Filter on the named constant instead of repeating the literal 9, so that
# changing best_k_value is enough to re-target the rest of the analysis.
# reset_index() keeps the previous row labels in an 'index' column.
best_k_df = output[output.loc[:, 'K-Value'] == best_k_value].reset_index()

In [45]:
# Rows of the chosen K whose recall fell below 0.8
best_k_df.loc[best_k_df['Recall'] < 0.8]

Unnamed: 0,index,diagnosis,Precision,Recall,F1-Score,Support,K-Value
5,53,Malignant,0.94,0.71,0.81,41.0,9
11,107,Malignant,0.8,0.76,0.78,37.0,9
17,161,Malignant,0.86,0.7,0.77,43.0,9
