In [1]:
from PlantReactivityAnalysis.config import FEATURES_LETTERS_DIR, FIGURES_DIR
from PlantReactivityAnalysis.visualization.visualize import export_df_to_image_formatted

In [2]:
import pandas as pd
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)
pd.set_option('display.max_colwidth', None)

In [3]:
file_path = r"experiment_results.csv"
df = pd.read_csv(file_path)
df.shape

(1863, 11)

In [4]:
df['Hop Length'].value_counts()

Hop Length
1.00    828
0.50    414
2.00    414
0.10    138
0.05     69
Name: count, dtype: int64

In [5]:
df['Window Size'].value_counts()

Window Size
1.0    828
2.0    828
0.1    138
0.2     69
Name: count, dtype: int64

# RQ2 All datasets

In [6]:
df_rq_2 = df[df['RQ'] == 2]
best_accuracy_per_rq_dataset = df_rq_2.loc[df_rq_2.groupby(['RQ'])['accuracy'].idxmax()]
df_rq_2.reset_index(drop=True, inplace=True)
best_accuracy_per_rq_dataset.head(30)

Unnamed: 0,model_name,parameters,f1,accuracy,precision,recall,confusion_matrix,RQ,Window Size,Hop Length,Correlation Treshold
1336,gradientboosting,"{'learning_rate': 0.05, 'max_depth': 5, 'n_estimators': 200, 'random_state': 42}",0.453625,0.458564,0.453921,0.458564,[[ 95 68 88]\n [ 49 142 42]\n [ 72 73 95]],2,1.0,1.0,0.9


In [7]:
max_accuracy_index = df_rq_2['accuracy'].idxmax()
max_accuracy_row_dict = df_rq_2.iloc[max_accuracy_index].to_dict()
print(max_accuracy_row_dict)

{'model_name': 'gradientboosting', 'parameters': "{'learning_rate': 0.05, 'max_depth': 5, 'n_estimators': 200, 'random_state': 42}", 'f1': 0.453625332256142, 'accuracy': 0.4585635359116022, 'precision': 0.4539205975541948, 'recall': 0.4585635359116022, 'confusion_matrix': '[[ 95  68  88]\n [ 49 142  42]\n [ 72  73  95]]', 'RQ': 2, 'Window Size': 1.0, 'Hop Length': 1.0, 'Correlation Treshold': 0.9}


In [8]:
filtered_df = df[df['RQ'] == 2]
idx = filtered_df.groupby('model_name')['accuracy'].idxmax()
best_combinations = filtered_df.loc[idx]
columns= ['model_name', 'Window Size', 'Hop Length', 'Correlation Treshold', 'parameters', 'f1', 'accuracy', 'precision', 'recall']
best_combinations= best_combinations[columns]
column_names= ['Model', 'Window Size', 'Hop Length', 'Correlation Treshold', 'Model Parameters', 'F1', 'Accuracy', 'Precision', 'Recall']
best_combinations.columns= column_names
best_combinations.head(10)

Unnamed: 0,Model,Window Size,Hop Length,Correlation Treshold,Model Parameters,F1,Accuracy,Precision,Recall
1349,adaboost,1.0,1.0,0.9,"{'learning_rate': 0.1, 'n_estimators': 100, 'random_state': 42}",0.438793,0.439227,0.441472,0.439227
1200,extratrees,1.0,0.5,0.9,"{'max_depth': None, 'min_samples_split': 2, 'n_estimators': 200, 'random_state': 42}",0.434937,0.439227,0.436259,0.439227
1347,gaussiannb,1.0,1.0,0.9,{'var_smoothing': 1e-10},0.418604,0.418508,0.419201,0.418508
1336,gradientboosting,1.0,1.0,0.9,"{'learning_rate': 0.05, 'max_depth': 5, 'n_estimators': 200, 'random_state': 42}",0.453625,0.458564,0.453921,0.458564
120,kneighbors,1.0,0.5,0.7,"{'n_neighbors': 15, 'weights': 'uniform'}",0.373377,0.375691,0.375027,0.375691
1230,lgbm,1.0,0.5,0.9,"{'learning_rate': 0.05, 'n_estimators': 100, 'num_leaves': 31, 'random_state': 42}",0.432006,0.435083,0.430682,0.435083
800,logisticregression,1.0,1.0,0.8,"{'C': 0.1, 'solver': 'liblinear', 'random_state': 42}",0.298383,0.374309,0.249567,0.374309
1177,randomforest,1.0,0.5,0.9,"{'max_depth': None, 'min_samples_split': 5, 'n_estimators': 200, 'random_state': 42}",0.444395,0.447514,0.443453,0.447514
1376,xgb,1.0,1.0,0.9,"{'learning_rate': 0.05, 'max_depth': 3, 'n_estimators': 100, 'random_state': 42}",0.453441,0.455801,0.452967,0.455801


In [9]:
col_widths = [0.05, 0.05, 0.05, 0.05, 0.2, 0.05, 0.05, 0.05, 0.05] 
#export_df_to_image_formatted(best_combinations, FIGURES_DIR/'models_performance_rq2.png', figsize=(10,20), col_widths=col_widths,  font_size=100)

# RQ1 All datasets

In [10]:
df_rq_1 = df[df['RQ'] == 1]
best_accuracy_per_rq_dataset = df_rq_1.loc[df_rq_1.groupby(['RQ'])['accuracy'].idxmax()]
df_rq_1.reset_index(drop=True, inplace=True)
best_accuracy_per_rq_dataset.head(30)

Unnamed: 0,model_name,parameters,f1,accuracy,precision,recall,confusion_matrix,RQ,Window Size,Hop Length,Correlation Treshold
1436,lgbm,"{'learning_rate': 0.1, 'n_estimators': 200, 'num_leaves': 64, 'random_state': 42}",0.748755,0.749421,0.752096,0.749421,[[603 261]\n [172 692]],1,2.0,1.0,0.9


In [11]:
max_accuracy_index = df_rq_1['accuracy'].idxmax()
max_accuracy_row_dict = df_rq_1.iloc[max_accuracy_index].to_dict()
print(max_accuracy_row_dict)

{'model_name': 'lgbm', 'parameters': "{'learning_rate': 0.1, 'n_estimators': 200, 'num_leaves': 64, 'random_state': 42}", 'f1': 0.7487548114328004, 'accuracy': 0.7494212962962963, 'precision': 0.7520962664590597, 'recall': 0.7494212962962963, 'confusion_matrix': '[[603 261]\n [172 692]]', 'RQ': 1, 'Window Size': 2.0, 'Hop Length': 1.0, 'Correlation Treshold': 0.9}


In [12]:
filtered_df = df[df['RQ'] == 1]
idx = filtered_df.groupby('model_name')['accuracy'].idxmax()
best_combinations = filtered_df.loc[idx]
columns= ['model_name', 'Window Size', 'Hop Length', 'Correlation Treshold', 'parameters', 'f1', 'accuracy', 'precision', 'recall']
best_combinations= best_combinations[columns]
column_names= ['Model', 'Window Size', 'Hop Length', 'Correlation Treshold', 'Model Parameters', 'F1', 'Accuracy', 'Precision', 'Recall']
best_combinations.columns= column_names
best_combinations.head(10)

Unnamed: 0,Model,Window Size,Hop Length,Correlation Treshold,Model Parameters,F1,Accuracy,Precision,Recall
1418,adaboost,2.0,1.0,0.9,"{'learning_rate': 0.1, 'n_estimators': 100, 'random_state': 42}",0.713397,0.715278,0.721083,0.715278
167,extratrees,1.0,1.0,0.7,"{'max_depth': None, 'min_samples_split': 5, 'n_estimators': 200, 'random_state': 42}",0.725075,0.725694,0.727746,0.725694
174,gaussiannb,1.0,1.0,0.7,{'var_smoothing': 1e-10},0.453119,0.526042,0.555808,0.526042
1405,gradientboosting,2.0,1.0,0.9,"{'learning_rate': 0.05, 'max_depth': 5, 'n_estimators': 200, 'random_state': 42}",0.738875,0.739583,0.742211,0.739583
47,kneighbors,1.0,0.5,0.7,"{'n_neighbors': 5, 'weights': 'uniform'}",0.517854,0.51794,0.517953,0.51794
1436,lgbm,2.0,1.0,0.9,"{'learning_rate': 0.1, 'n_estimators': 200, 'num_leaves': 64, 'random_state': 42}",0.748755,0.749421,0.752096,0.749421
41,logisticregression,1.0,0.5,0.7,"{'C': 0.1, 'solver': 'liblinear', 'random_state': 42}",0.333333,0.5,0.25,0.5
690,randomforest,1.0,1.0,0.8,"{'max_depth': None, 'min_samples_split': 2, 'n_estimators': 100, 'random_state': 42}",0.733731,0.734375,0.736666,0.734375
1448,xgb,2.0,1.0,0.9,"{'learning_rate': 0.05, 'max_depth': 6, 'n_estimators': 200, 'random_state': 42}",0.738145,0.739005,0.742184,0.739005


# RQ5 All datasets

In [13]:
df_rq_5 = df[df['RQ'] == 5]
best_accuracy_per_rq_dataset = df_rq_5.loc[df_rq_5.groupby(['RQ'])['accuracy'].idxmax()]
df_rq_5.reset_index(drop=True, inplace=True)
best_accuracy_per_rq_dataset.head(30)

Unnamed: 0,model_name,parameters,f1,accuracy,precision,recall,confusion_matrix,RQ,Window Size,Hop Length,Correlation Treshold
1858,xgb,"{'learning_rate': 0.1, 'max_depth': 6, 'n_estimators': 200, 'random_state': 42}",0.505096,0.508403,0.510755,0.508403,[[5079 3182 1415]\n [2464 5467 1322]\n [2513 2297 3098]],5,0.2,0.1,0.8


In [14]:
max_accuracy_index = df_rq_5['accuracy'].idxmax()
max_accuracy_row_dict = df_rq_5.iloc[max_accuracy_index].to_dict()
print(max_accuracy_row_dict)

{'model_name': 'xgb', 'parameters': "{'learning_rate': 0.1, 'max_depth': 6, 'n_estimators': 200, 'random_state': 42}", 'f1': 0.5050963121922322, 'accuracy': 0.5084025785296419, 'precision': 0.5107548169942433, 'recall': 0.5084025785296419, 'confusion_matrix': '[[5079 3182 1415]\n [2464 5467 1322]\n [2513 2297 3098]]', 'RQ': 5, 'Window Size': 0.2, 'Hop Length': 0.1, 'Correlation Treshold': 0.8}


In [15]:
filtered_df = df[df['RQ'] == 5]
idx = filtered_df.groupby('model_name')['accuracy'].idxmax()
best_combinations = filtered_df.loc[idx]
columns= ['model_name', 'Window Size', 'Hop Length', 'Correlation Treshold', 'parameters', 'f1', 'accuracy', 'precision', 'recall']
best_combinations= best_combinations[columns]
column_names= ['Model', 'Window Size', 'Hop Length', 'Correlation Treshold', 'Model Parameters', 'F1', 'Accuracy', 'Precision', 'Recall']
best_combinations.columns= column_names
best_combinations.head(10)

Unnamed: 0,Model,Window Size,Hop Length,Correlation Treshold,Model Parameters,F1,Accuracy,Precision,Recall
1834,adaboost,0.2,0.1,0.8,"{'learning_rate': 1.0, 'n_estimators': 100, 'random_state': 42}",0.413649,0.418005,0.418555,0.418005
1823,extratrees,0.2,0.1,0.8,"{'max_depth': None, 'min_samples_split': 5, 'n_estimators': 200, 'random_state': 42}",0.422491,0.429966,0.43545,0.429966
1830,gaussiannb,0.2,0.1,0.8,{'var_smoothing': 1e-10},0.269474,0.364571,0.382107,0.364571
1815,gradientboosting,0.2,0.1,0.8,"{'learning_rate': 0.1, 'max_depth': 5, 'n_estimators': 200, 'random_state': 42}",0.494918,0.498118,0.500476,0.498118
1841,kneighbors,0.2,0.1,0.8,"{'n_neighbors': 5, 'weights': 'uniform'}",0.338227,0.351492,0.344342,0.351492
1850,lgbm,0.2,0.1,0.8,"{'learning_rate': 0.1, 'n_estimators': 200, 'num_leaves': 64, 'random_state': 42}",0.484999,0.486828,0.488105,0.486828
1835,logisticregression,0.2,0.1,0.8,"{'C': 0.1, 'solver': 'liblinear', 'random_state': 42}",0.299158,0.368558,0.343257,0.368558
1808,randomforest,0.2,0.1,0.8,"{'max_depth': 20, 'min_samples_split': 2, 'n_estimators': 300, 'random_state': 42}",0.44738,0.453441,0.460016,0.453441
1858,xgb,0.2,0.1,0.8,"{'learning_rate': 0.1, 'max_depth': 6, 'n_estimators': 200, 'random_state': 42}",0.505096,0.508403,0.510755,0.508403


In [16]:
filtered_df = df[(df['Dataset'] == 4) & (df['RQ'] == 1)]
idx = filtered_df.groupby('model_name')['accuracy'].idxmax()
best_combinations = filtered_df.loc[idx]
columns= ['model_name', 'Window Size', 'Hop Length', 'Correlation Treshold', 'parameters', 'f1', 'accuracy', 'precision', 'recall']
best_combinations= best_combinations[columns]
column_names= ['Model', 'Window Size', 'Hop Length', 'Correlation Treshold', 'Model Parameters', 'F1', 'Accuracy', 'Precision', 'Recall']
best_combinations.columns= column_names
best_combinations.head(10)

KeyError: 'Dataset'