# Notebook for random forest hyperparameter search

In [1]:
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score
from sklearn.tree import DecisionTreeClassifier

# Load the model inputs (X) and the matching targets (y) from the prepared CSVs.
features_csv = "datasets/feature_updated_dataset_X.csv"
targets_csv = "datasets/feature_updated_dataset_y.csv"

X = pd.read_csv(features_csv)
y = pd.read_csv(targets_csv)

In [25]:
# Hyper Parameters Search
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import make_scorer, accuracy_score, precision_score, recall_score, f1_score

# Base estimator for the search. A fixed random_state makes the forest's
# randomness (bootstrap samples, feature sub-sampling) repeatable, so that
# score differences between candidates reflect the hyperparameters rather
# than seed noise, and re-running the notebook reproduces the same results.
rf_classifier = RandomForestClassifier(random_state=42)

# Candidate values per hyperparameter. The grid search evaluates every
# combination of these lists.
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [5, 10, 20],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
}

# Metrics computed for every candidate. The keys become the suffixes of the
# `mean_test_<key>` / `rank_test_<key>` columns in `cv_results_`.
# precision / recall / f1 are averaged over classes with average='weighted'.
scoring = {
    'accuracy': make_scorer(accuracy_score),
    'precision': make_scorer(precision_score, average='weighted'),
    'recall': make_scorer(recall_score, average='weighted'),
    'f1': make_scorer(f1_score, average='weighted')
}

In [None]:

# `y` is a DataFrame (read via read_csv); scikit-learn expects a 1-D target
# array, so flatten it. Presumably `y` holds a single target column -- verify.
y_ravel = y.to_numpy().ravel()

# Exhaustive 5-fold cross-validated search over `param_grid`.
# - refit=False: several metrics are scored at once, so no single "best"
#   estimator is refit here; candidates are compared later via `cv_results_`.
# - n_jobs=-1: the individual (candidate, fold) fits are independent, so run
#   them on all available cores instead of serially.
grid_search = GridSearchCV(
    estimator=rf_classifier,
    param_grid=param_grid,
    scoring=scoring,
    cv=5,
    refit=False,
    n_jobs=-1,
)
grid_search.fit(X, y_ravel)

In [None]:
import numpy as np

cv_results = grid_search.cv_results_

# Persist the full results table first, so the output of the expensive search
# is on disk even if the summary below raises.
results_df = pd.DataFrame(cv_results)
results_df.to_csv('datasets/grid_search_results_4_parameters.csv')

# For each metric, report the candidate with the highest mean CV score.
for scorer in scoring:
    mean_scores = np.asarray(cv_results['mean_test_' + scorer], dtype=float)

    if np.isnan(mean_scores).all():
        # No candidate produced a score for this metric; nothing to rank.
        print(f"Best {scorer} score: nan")
        continue

    # np.argmax treats NaN as the maximum, so a candidate whose fit/scoring
    # failed (recorded as NaN) would be reported as "best". nanargmax ignores
    # those entries.
    best_index = np.nanargmax(mean_scores)
    best_score = mean_scores[best_index]
    best_params = cv_results['params'][best_index]
    print(f"Best {scorer} score: {best_score}")
    print(f"Best {scorer} params: {best_params}")

In [41]:
# Reload the saved grid-search results so this summary can be produced
# without re-running the search.
read_df = pd.read_csv('datasets/grid_search_results_4_parameters.csv')

metrics = ['accuracy', 'precision', 'recall', 'f1']

# Combined ranking: sum of the per-metric ranks (rank 1 is the best candidate
# for a metric, so a lower total is better). skipna=False keeps the semantics
# of plain column addition.
rank_columns = [f'rank_test_{metric}' for metric in metrics]
read_df['rank_test_overall'] = read_df[rank_columns].sum(axis=1, skipna=False)
score_strs = metrics + ['overall']

# For each ranking criterion, show the top candidate and how it fares on
# every metric.
for scorer in score_strs:
    # Deliberately not named `sorted`: in a notebook that would shadow the
    # builtin for every cell executed afterwards.
    ranked = read_df.sort_values(by=f'rank_test_{scorer}', ascending=True)
    top = ranked.head(1)

    print(f'Best {scorer} params: {top["params"].values[0]}')
    for metric in metrics:
        rank, mean = top[[f'rank_test_{metric}', f'mean_test_{metric}']].values[0]
        print(f'Rank test {metric}: {rank:.0f}, mean test {metric}: {mean}')
    rank_test_overall = top[['rank_test_overall']].values[0][0]
    print(f'Rank test overall: {rank_test_overall:.0f}')

    print('\n')

Best accuracy params: {'max_depth': 20, 'min_samples_leaf': 4, 'min_samples_split': 10, 'n_estimators': 50}
Rank test accuracy: 1, mean test accuracy: 0.9172791762842902
Rank test precision: 22, mean test precision: 0.9357108413921456
Rank test recall: 1, mean test recall: 0.9172791762842902
Rank test f1: 4, mean test f1: 0.9205063647122816
Rank test overall: 28


Best precision params: {'max_depth': 20, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 200}
Rank test accuracy: 7, mean test accuracy: 0.915786518215192
Rank test precision: 1, mean test precision: 0.9372386950191766
Rank test recall: 7, mean test recall: 0.915786518215192
Rank test f1: 6, mean test f1: 0.9201414244119188
Rank test overall: 21


Best recall params: {'max_depth': 20, 'min_samples_leaf': 4, 'min_samples_split': 10, 'n_estimators': 50}
Rank test accuracy: 1, mean test accuracy: 0.9172791762842902
Rank test precision: 22, mean test precision: 0.9357108413921456
Rank test recall: 1, mean test reca