In [1]:
from PlantReactivityAnalysis.config import FEATURES_LETTERS_DIR, FIGURES_DIR
from PlantReactivityAnalysis.visualization.visualize import export_df_to_image_formatted

In [2]:
import pandas as pd
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)
pd.set_option('display.max_colwidth', None)

In [3]:
file_path = r"results.csv"
df = pd.read_csv(file_path)
df.shape

(2815, 13)

In [4]:
df['Normalization'].value_counts()

Normalization
norm    1413
raw     1264
Name: count, dtype: int64

In [5]:
df['Hop Length'].value_counts()

Hop Length
1.0    1373
0.5     810
2.0     632
Name: count, dtype: int64

In [6]:
df['Window Size'].value_counts()

Window Size
1.0    1541
2.0    1274
Name: count, dtype: int64

# RQ2 All datasets

In [7]:
df_rq_2 = df[df['RQ'] == 2]
best_accuracy_per_rq_dataset = df_rq_2.loc[df_rq_2.groupby(['Dataset', 'RQ'])['accuracy'].idxmax()]
df_rq_2.reset_index(drop=True, inplace=True)
best_accuracy_per_rq_dataset.head(30)

Unnamed: 0,model_name,parameters,f1,accuracy,precision,recall,confusion_matrix,Dataset,RQ,Normalization,Window Size,Hop Length,Correlation Treshold
70,svm,"{'C': 0.1, 'gamma': 'scale', 'kernel': 'linear', 'random_state': 42}",0.426641,0.42649,0.42723,0.42649,[[107 63 87]\n [ 55 126 75]\n [ 79 74 89]],1.0,2,norm,1.0,1.0,1.0
787,xgb,"{'learning_rate': 0.05, 'max_depth': 3, 'n_estimators': 100, 'random_state': 42}",0.42924,0.433113,0.435503,0.433113,[[ 80 67 110]\n [ 41 141 74]\n [ 62 74 106]],2.0,2,norm,2.0,1.0,0.8
159,svm,"{'C': 0.1, 'gamma': 'scale', 'kernel': 'linear', 'random_state': 42}",0.432056,0.436464,0.43217,0.436464,[[ 96 79 76]\n [ 45 134 54]\n [ 85 69 86]],3.0,2,norm,1.0,0.5,0.8
236,xgb,"{'learning_rate': 0.05, 'max_depth': 6, 'n_estimators': 100, 'random_state': 42}",0.441719,0.44337,0.441712,0.44337,[[103 73 75]\n [ 57 123 53]\n [ 79 66 95]],4.0,2,norm,1.0,0.5,0.8
560,logisticregression,"{'C': 0.1, 'solver': 'liblinear', 'random_state': 42}",0.436146,0.446483,0.447272,0.446483,[[ 94 75 44]\n [ 43 130 35]\n [ 91 74 68]],5.0,2,norm,1.0,1.0,0.8
566,randomforest,"{'max_depth': None, 'min_samples_split': 2, 'n_estimators': 300, 'random_state': 42}",0.44355,0.444954,0.447031,0.444954,[[ 92 54 67]\n [ 60 113 35]\n [ 89 58 86]],6.0,2,norm,1.0,1.0,0.8
325,lgbm,"{'learning_rate': 0.1, 'n_estimators': 100, 'num_leaves': 31, 'random_state': 42}",0.416478,0.417219,0.416961,0.417219,[[87 59 74]\n [57 88 41]\n [61 60 77]],7.0,2,norm,1.0,0.5,0.8
671,extratrees,"{'max_depth': None, 'min_samples_split': 5, 'n_estimators': 100, 'random_state': 42}",0.437923,0.440397,0.439672,0.440397,[[ 92 70 58]\n [ 47 100 39]\n [ 70 54 74]],8.0,2,norm,1.0,1.0,0.8


## Best results

In [8]:
max_accuracy_index = df_rq_2['accuracy'].idxmax()
max_accuracy_row_dict = df_rq_2.iloc[max_accuracy_index].to_dict()
print(max_accuracy_row_dict)

{'model_name': 'logisticregression', 'parameters': "{'C': 0.1, 'solver': 'liblinear', 'random_state': 42}", 'f1': 0.436145906267162, 'accuracy': 0.4464831804281345, 'precision': 0.4472716592092792, 'recall': 0.4464831804281345, 'confusion_matrix': '[[ 94  75  44]\n [ 43 130  35]\n [ 91  74  68]]', 'Dataset': 5.0, 'RQ': 2, 'Normalization': 'norm', 'Window Size': 1.0, 'Hop Length': 1.0, 'Correlation Treshold': 0.8}


In [9]:
def get_highest_accuracy_row(df, dataset_value, rq_value):
    # Filter the DataFrame based on dataset and RQ values
    filtered_df = df[(df['Dataset'] == dataset_value) & (df['RQ'] == rq_value)]
    
    # Find the row with the highest Accuracy
    highest_accuracy_row = filtered_df.loc[filtered_df['accuracy'].idxmax()]
    
    # Return the row as a dictionary
    return highest_accuracy_row.to_dict()

dataset_value = 4
rq_value = 2

# Get the highest accuracy row for Dataset=4 and RQ=2
result = get_highest_accuracy_row(df, dataset_value, rq_value)
print(result)

{'model_name': 'xgb', 'parameters': "{'learning_rate': 0.05, 'max_depth': 6, 'n_estimators': 100, 'random_state': 42}", 'f1': 0.4417189355301867, 'accuracy': 0.4433701657458563, 'precision': 0.4417115977441003, 'recall': 0.4433701657458563, 'confusion_matrix': '[[103  73  75]\n [ 57 123  53]\n [ 79  66  95]]', 'Dataset': 4.0, 'RQ': 2, 'Normalization': 'norm', 'Window Size': 1.0, 'Hop Length': 0.5, 'Correlation Treshold': 0.8}


# RQ1 All datasets

In [10]:
df_rq_1 = df[df['RQ'] == 1]
best_accuracy_per_rq_dataset = df_rq_1.loc[df_rq_1.groupby(['Dataset', 'RQ'])['accuracy'].idxmax()]
df_rq_1.reset_index(drop=True, inplace=True)
best_accuracy_per_rq_dataset.head(30)

Unnamed: 0,model_name,parameters,f1,accuracy,precision,recall,confusion_matrix,Dataset,RQ,Normalization,Window Size,Hop Length,Correlation Treshold
38,lgbm,"{'learning_rate': 0.1, 'n_estimators': 100, 'num_leaves': 31, 'random_state': 42}",0.780322,0.781532,0.785664,0.781532,[[618 251]\n [137 770]],1.0,1,norm,1.0,0.5,1.0
2608,randomforest,"{'max_depth': None, 'min_samples_split': 2, 'n_estimators': 100, 'random_state': 42}",0.733731,0.734375,0.736666,0.734375,[[592 272]\n [187 677]],4.0,1,norm,1.0,1.0,0.8


## Best results

In [11]:
max_accuracy_index = df_rq_1['accuracy'].idxmax()
max_accuracy_row_dict = df_rq_1.iloc[max_accuracy_index].to_dict()
print(max_accuracy_row_dict)

{'model_name': 'lgbm', 'parameters': "{'learning_rate': 0.1, 'n_estimators': 100, 'num_leaves': 31, 'random_state': 42}", 'f1': 0.7803224818445166, 'accuracy': 0.7815315315315315, 'precision': 0.7856640611007515, 'recall': 0.7815315315315315, 'confusion_matrix': '[[618 251]\n [137 770]]', 'Dataset': 1.0, 'RQ': 1, 'Normalization': 'norm', 'Window Size': 1.0, 'Hop Length': 0.5, 'Correlation Treshold': 1.0}


In [12]:
df_rq_1 = df[df['RQ'] == 2]
best_accuracy_per_rq_dataset = df_rq_1.loc[df_rq_1.groupby(['Dataset', 'RQ'])['accuracy'].idxmax()]
best_accuracy_per_rq_dataset.head(30)

Unnamed: 0,model_name,parameters,f1,accuracy,precision,recall,confusion_matrix,Dataset,RQ,Normalization,Window Size,Hop Length,Correlation Treshold
70,svm,"{'C': 0.1, 'gamma': 'scale', 'kernel': 'linear', 'random_state': 42}",0.426641,0.42649,0.42723,0.42649,[[107 63 87]\n [ 55 126 75]\n [ 79 74 89]],1.0,2,norm,1.0,1.0,1.0
787,xgb,"{'learning_rate': 0.05, 'max_depth': 3, 'n_estimators': 100, 'random_state': 42}",0.42924,0.433113,0.435503,0.433113,[[ 80 67 110]\n [ 41 141 74]\n [ 62 74 106]],2.0,2,norm,2.0,1.0,0.8
159,svm,"{'C': 0.1, 'gamma': 'scale', 'kernel': 'linear', 'random_state': 42}",0.432056,0.436464,0.43217,0.436464,[[ 96 79 76]\n [ 45 134 54]\n [ 85 69 86]],3.0,2,norm,1.0,0.5,0.8
236,xgb,"{'learning_rate': 0.05, 'max_depth': 6, 'n_estimators': 100, 'random_state': 42}",0.441719,0.44337,0.441712,0.44337,[[103 73 75]\n [ 57 123 53]\n [ 79 66 95]],4.0,2,norm,1.0,0.5,0.8
560,logisticregression,"{'C': 0.1, 'solver': 'liblinear', 'random_state': 42}",0.436146,0.446483,0.447272,0.446483,[[ 94 75 44]\n [ 43 130 35]\n [ 91 74 68]],5.0,2,norm,1.0,1.0,0.8
566,randomforest,"{'max_depth': None, 'min_samples_split': 2, 'n_estimators': 300, 'random_state': 42}",0.44355,0.444954,0.447031,0.444954,[[ 92 54 67]\n [ 60 113 35]\n [ 89 58 86]],6.0,2,norm,1.0,1.0,0.8
325,lgbm,"{'learning_rate': 0.1, 'n_estimators': 100, 'num_leaves': 31, 'random_state': 42}",0.416478,0.417219,0.416961,0.417219,[[87 59 74]\n [57 88 41]\n [61 60 77]],7.0,2,norm,1.0,0.5,0.8
671,extratrees,"{'max_depth': None, 'min_samples_split': 5, 'n_estimators': 100, 'random_state': 42}",0.437923,0.440397,0.439672,0.440397,[[ 92 70 58]\n [ 47 100 39]\n [ 70 54 74]],8.0,2,norm,1.0,1.0,0.8


In [13]:
filtered_df = df[(df['Dataset'] == 4) & (df['RQ'] == 2)]
idx = filtered_df.groupby('model_name')['accuracy'].idxmax()
best_combinations = filtered_df.loc[idx]
columns= ['model_name', 'Window Size', 'Hop Length', 'Correlation Treshold', 'parameters', 'f1', 'accuracy', 'precision', 'recall']
best_combinations= best_combinations[columns]
column_names= ['Model', 'Window Size', 'Hop Length', 'Correlation Treshold', 'Model Parameters', 'F1', 'Accuracy', 'Precision', 'Recall']
best_combinations.columns= column_names
best_combinations.head(10)

Unnamed: 0,Model,Window Size,Hop Length,Correlation Treshold,Model Parameters,F1,Accuracy,Precision,Recall
839,adaboost,2.0,1.0,0.8,"{'learning_rate': 0.1, 'n_estimators': 100, 'random_state': 42}",0.417333,0.424033,0.422853,0.424033
833,extratrees,2.0,1.0,0.8,"{'max_depth': 20, 'min_samples_split': 5, 'n_estimators': 100, 'random_state': 42}",0.411931,0.412983,0.411221,0.412983
1153,gaussiannb,2.0,2.0,0.8,{'var_smoothing': 1e-10},0.38242,0.39779,0.398885,0.39779
819,gradientboosting,2.0,1.0,0.8,"{'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 100, 'random_state': 42}",0.427366,0.429558,0.426377,0.429558
220,kneighbors,1.0,0.5,0.8,"{'n_neighbors': 15, 'weights': 'uniform'}",0.373377,0.375691,0.375027,0.375691
226,lgbm,1.0,0.5,0.8,"{'learning_rate': 0.05, 'n_estimators': 100, 'num_leaves': 31, 'random_state': 42}",0.420507,0.424033,0.420113,0.424033
2422,logisticregression,2.0,2.0,0.8,"{'C': 0.1, 'solver': 'liblinear', 'random_state': 42}",0.300103,0.379834,0.252903,0.379834
492,randomforest,1.0,1.0,0.8,"{'max_depth': 10, 'min_samples_split': 2, 'n_estimators': 200, 'random_state': 42}",0.424722,0.426796,0.423636,0.426796
236,xgb,1.0,0.5,0.8,"{'learning_rate': 0.05, 'max_depth': 6, 'n_estimators': 100, 'random_state': 42}",0.441719,0.44337,0.441712,0.44337


In [14]:
col_widths = [0.05, 0.05, 0.05, 0.05, 0.2, 0.05, 0.05, 0.05, 0.05] 
export_df_to_image_formatted(best_combinations, FIGURES_DIR/'models_performance_rq2.png', figsize=(10,20), col_widths=col_widths,  font_size=100)

DataFrame exported as image to C:\Users\alvar\Documents\GitHub\Plant-Reactivity-Analysis\reports\figures\models_performance_rq2.png


In [15]:
filtered_df = df[(df['Dataset'] == 4) & (df['RQ'] == 1)]
idx = filtered_df.groupby('model_name')['accuracy'].idxmax()
best_combinations = filtered_df.loc[idx]
columns= ['model_name', 'Window Size', 'Hop Length', 'Correlation Treshold', 'parameters', 'f1', 'accuracy', 'precision', 'recall']
best_combinations= best_combinations[columns]
column_names= ['Model', 'Window Size', 'Hop Length', 'Correlation Treshold', 'Model Parameters', 'F1', 'Accuracy', 'Precision', 'Recall']
best_combinations.columns= column_names
best_combinations.head(10)

Unnamed: 0,Model,Window Size,Hop Length,Correlation Treshold,Model Parameters,F1,Accuracy,Precision,Recall
2646,adaboost,1.0,1.0,0.8,"{'learning_rate': 0.1, 'n_estimators': 100, 'random_state': 42}",0.70895,0.710648,0.715682,0.710648
2634,extratrees,1.0,1.0,0.8,"{'max_depth': None, 'min_samples_split': 2, 'n_estimators': 100, 'random_state': 42}",0.720705,0.721065,0.722209,0.721065
2642,gaussiannb,1.0,1.0,0.8,{'var_smoothing': 1e-09},0.438982,0.522569,0.555861,0.522569
2633,gradientboosting,1.0,1.0,0.8,"{'learning_rate': 0.05, 'max_depth': 5, 'n_estimators': 200, 'random_state': 42}",0.72222,0.722801,0.724679,0.722801
2655,kneighbors,1.0,1.0,0.8,"{'n_neighbors': 5, 'weights': 'uniform'}",0.517854,0.51794,0.517953,0.51794
2668,lgbm,1.0,1.0,0.8,"{'learning_rate': 0.05, 'n_estimators': 200, 'num_leaves': 64, 'random_state': 42}",0.729386,0.729745,0.730973,0.729745
2649,logisticregression,1.0,1.0,0.8,"{'C': 0.1, 'solver': 'liblinear', 'random_state': 42}",0.333333,0.5,0.25,0.5
2608,randomforest,1.0,1.0,0.8,"{'max_depth': None, 'min_samples_split': 2, 'n_estimators': 100, 'random_state': 42}",0.733731,0.734375,0.736666,0.734375
2676,xgb,1.0,1.0,0.8,"{'learning_rate': 0.05, 'max_depth': 6, 'n_estimators': 200, 'random_state': 42}",0.732035,0.732639,0.734753,0.732639
