In [1]:
import pandas as pd
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier
from sklearn.svm import SVC
from xgboost import XGBClassifier
from imblearn.combine import SMOTEENN
from sklearn.model_selection import train_test_split

import warnings
warnings.filterwarnings('ignore')

In [2]:
data = pd.read_csv('final_data_V2.csv')  
X = data.drop(columns=['target'])
y = data['target']

In [3]:
smote_enn = SMOTEENN(random_state=42, sampling_strategy='minority')
X_resampled, y_resampled = smote_enn.fit_resample(X, y)

In [4]:
X_sample, _, y_sample, _ = train_test_split(X_resampled, y_resampled, test_size=0.5, random_state=42)

In [5]:
X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, test_size=0.20, random_state=42)

### Random Forest

In [7]:
model = RandomForestClassifier(random_state=42)

In [8]:
param_grid_rf = {
    'max_depth': [10, 20],
    'min_samples_split': [1, 4],
    'min_samples_leaf': [2, 5],
    'n_estimators': [100, 200, 500]
}

In [9]:
grid_search = GridSearchCV(model, param_grid_rf, cv=5, scoring='recall', n_jobs=-1, verbose=3)
grid_search.fit(X_train, y_train)

Fitting 5 folds for each of 36 candidates, totalling 180 fits
[CV 1/5] END max_depth=8, min_samples_leaf=2, min_samples_split=2, n_estimators=200;, score=0.959 total time= 2.2min
[CV 1/5] END max_depth=8, min_samples_leaf=2, min_samples_split=4, n_estimators=200;, score=0.959 total time= 2.2min
[CV 2/5] END max_depth=8, min_samples_leaf=2, min_samples_split=4, n_estimators=300;, score=0.959 total time= 3.4min
[CV 4/5] END max_depth=8, min_samples_leaf=2, min_samples_split=6, n_estimators=300;, score=0.961 total time= 3.3min
[CV 1/5] END max_depth=8, min_samples_leaf=4, min_samples_split=2, n_estimators=300;, score=0.960 total time= 3.3min
[CV 5/5] END max_depth=8, min_samples_leaf=4, min_samples_split=4, n_estimators=200;, score=0.962 total time= 2.2min
[CV 3/5] END max_depth=8, min_samples_leaf=4, min_samples_split=6, n_estimators=200;, score=0.960 total time= 2.2min
[CV 1/5] END max_depth=10, min_samples_leaf=2, min_samples_split=2, n_estimators=200;, score=0.959 total time= 2.7min
[

In [10]:
print("Best Parameters: ", grid_search.best_params_)
print("Best Recall-Score: ", grid_search.best_score_)

Best Parameters:  {'max_depth': 8, 'min_samples_leaf': 2, 'min_samples_split': 6, 'n_estimators': 300}
Best Recall-Score:  0.960388879604119


In [11]:
results_rf = pd.DataFrame(grid_search.cv_results_)

In [12]:
results_rf.to_csv('result_rf_2.csv')

### Decision Tree

In [15]:
model = DecisionTreeClassifier(random_state=42)

In [16]:
param_grid_dt = {
    'max_depth': [15, 20, 25],
    'max_leaf_nodes': [20, 50],
    'min_impurity_decrease': [0, 0.01]
    'min_samples_split': [2, 5],
    'min_samples_leaf': [1, 2, 4],
    'criterion': ['gini', 'entropy'],
    'class_weight': ['balanced', 'None']
}

In [17]:
grid_search = GridSearchCV(model, param_grid_dt, cv=5, scoring='recall', n_jobs=-1, verbose=3)
grid_search.fit(X_train, y_train)

Fitting 5 folds for each of 24 candidates, totalling 120 fits


In [18]:
print("Best Parameters: ", grid_search.best_params_)
print("Best Recall-Score: ", grid_search.best_score_)

Best Parameters:  {'criterion': 'entropy', 'max_depth': 6, 'min_samples_leaf': 2, 'min_samples_split': 4}
Best Recall-Score:  0.9310752978284338


In [20]:
results_dt = pd.DataFrame(grid_search.cv_results_)

In [21]:
results_dt.to_csv('result_dt_2.csv')

In [22]:
# results_dt = pd.DataFrame(grid_search.cv_results_)
# Best Parameters:  {'criterion': 'gini', 'max_depth': 20, 'min_samples_leaf': 1, 'min_samples_split': 2}
# Best F1-Score:  0.8732871369448603

### XGBoost

In [24]:
model = XGBClassifier(random_state=42)

In [25]:
param_grid_xgb = {
    'colsample_bytree': [0.7, 0.9]
    'max_depth': [3, 6, 10, 20],
    'learning_rate': [0.1, 0.2],
    'n_estimators': [200, 300, 500],
    'subsample': [1, 2]
    'reg_alpha': [1],
    'reg_lambda': [1.5]
}

In [26]:
grid_search = GridSearchCV(model, param_grid_xgb, cv=5, scoring='recall', n_jobs=-1, verbose=3)
grid_search.fit(X_train, y_train)

Fitting 5 folds for each of 48 candidates, totalling 240 fits


In [27]:
print("Best Parameters: ", grid_search.best_params_)
print("Best Recall-Score: ", grid_search.best_score_)

Best Parameters:  {'learning_rate': 0.01, 'max_depth': 8, 'n_estimators': 200, 'reg_alpha': 5, 'reg_lambda': 5}
Best Recall-Score:  0.9515008121159572


In [28]:
results_xgb = pd.DataFrame(grid_search.cv_results_)

In [29]:
results_xgb.to_csv('results_xgb_2.csv')

In [30]:
# resus_xgb = pd.DataFrame(grid_search.cv_results_) 
# Best Parameters:  {'learning_rate': 0.1, 'n_estimators': 100}
# Best F1-Score:  0.865998389543446

### Ada Boost

In [42]:
model = AdaBoostClassifier(random_state=42)

In [44]:
param_grid_ada = {
    'n_estimators': [100, 200, 300],
    'learning_rate': [0.15, 0.2]
}

In [46]:
grid_search = GridSearchCV(model, param_grid_ada, cv=5, scoring='recall', n_jobs=-1, verbose=3)
grid_search.fit(X_train, y_train)

Fitting 5 folds for each of 9 candidates, totalling 45 fits
[CV 5/5] END learning_rate=0.1, n_estimators=100;, score=0.944 total time= 2.5min
[CV 1/5] END learning_rate=0.1, n_estimators=300;, score=0.944 total time=10.9min
[CV 5/5] END learning_rate=0.5, n_estimators=100;, score=0.938 total time= 3.7min
[CV 3/5] END learning_rate=0.5, n_estimators=300;, score=0.937 total time= 6.9min
[CV 4/5] END learning_rate=1.0, n_estimators=200;, score=0.934 total time= 4.0min
[CV 2/5] END learning_rate=0.1, n_estimators=200;, score=0.941 total time= 6.1min
[CV 5/5] END learning_rate=0.1, n_estimators=300;, score=0.944 total time=11.0min
[CV 2/5] END learning_rate=0.5, n_estimators=300;, score=0.939 total time= 7.0min
[CV 3/5] END learning_rate=1.0, n_estimators=200;, score=0.933 total time= 4.0min
[CV 2/5] END learning_rate=0.1, n_estimators=100;, score=0.940 total time= 2.5min
[CV 3/5] END learning_rate=0.1, n_estimators=300;, score=0.941 total time=11.0min
[CV 4/5] END learning_rate=0.5, n_esti

In [47]:
print("Best Parameters: ", grid_search.best_params_)
print("Best Recall-Score: ", grid_search.best_score_)

Best Parameters:  {'learning_rate': 0.1, 'n_estimators': 300}
Best Recall-Score:  0.9432518554277618


In [48]:
results_ada = pd.DataFrame(grid_search.cv_results_)

In [49]:
results_ada.to_csv('results_ada_2.csv')

In [50]:
# results_ada = pd.DataFrame(grid_search.cv_results_)
# Best Parameters:  {'learning_rate': 0.1, 'n_estimators': 100}
# Best F1-Score:  0.865998389543446

### Multilayer Perceptron

In [60]:
model = MLPClassifier(max_iter=3000, early_stopping=True, random_state=42)

In [61]:
param_grid = {
    'activation': ['relu', 'tanh'],
    'batch_size': ['auto'],
    'early_stopping': [TRUE],
    'hidden_layer_sizes': [(100, 100, 100), (150, 150, 150)], # number of neurons in hidden layers
    'alpha': [0.001, 0.002],
    'learning_rate': ['adaptive', 'constant'],
    'solver': ['adam', 'sgd']
}

In [62]:
# X_sample, _, y_sample, _ = train_test_split(X_resampled, y_resampled, test_size=0.9, random_state=42)

In [63]:
# X_train, X_test, y_train, y_test = train_test_split(X_sample, y_sample, test_size=0.20, random_state=42)

In [64]:
grid_search = GridSearchCV(model, param_grid, cv=5, scoring='recall', n_jobs=-1, verbose=3)
grid_search.fit(X_train, y_train)

Fitting 5 folds for each of 12 candidates, totalling 60 fits


In [65]:
print("Best Parameters: ", grid_search.best_params_)
print("Best Recall-Score: ", grid_search.best_score_)

Best Parameters:  {'alpha': 0.001, 'hidden_layer_sizes': (150, 150, 150), 'learning_rate_init': 0.001}
Best Recall-Score:  0.9171751995868531


In [66]:
results_mlp1 = pd.DataFrame(grid_search.cv_results_)

In [67]:
results_mlp1.to_csv('results_mlp_2.csv')

In [68]:
# results_mlp = pd.DataFrame(grid_search.cv_results_)
# Best Parameters:  {'activation': 'tanh', 'alpha': 0.001, 'hidden_layer_sizes': (100, 100, 100),
# 'learning_rate': 'constant', 'solver': 'adam'}
# Best F1-Score:  0.8410854355720228

### SVM

In [70]:
model = SVC(max_iter=10000, random_state=42)

In [78]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [84]:
param_grid_svm = {
    'C': [1, 10],
    'kernel': ['linear', 'rbf', 'poly'],
    'gamma': ['scale', 'auto', 1],
    'degree': [2, 3]  # Relevant for poly kernels
}

In [86]:
grid_search = GridSearchCV(model, param_grid_svm, cv=5, scoring='recall', n_jobs=-1, verbose=3)
grid_search.fit(X_train_scaled, y_train)

Fitting 5 folds for each of 72 candidates, totalling 360 fits




[CV 3/5] END C=0.1, degree=2, gamma=scale, kernel=rbf;, score=0.938 total time= 1.2min
[CV 4/5] END C=0.1, degree=2, gamma=scale, kernel=poly;, score=0.976 total time= 1.1min
[CV 4/5] END C=0.1, degree=2, gamma=auto, kernel=rbf;, score=0.935 total time= 1.1min
[CV 2/5] END C=0.1, degree=2, gamma=0.1, kernel=linear;, score=0.929 total time=  41.9s
[CV 3/5] END C=0.1, degree=2, gamma=0.1, kernel=rbf;, score=0.994 total time= 1.2min
[CV 4/5] END C=0.1, degree=2, gamma=1, kernel=linear;, score=0.922 total time=  43.1s
[CV 3/5] END C=0.1, degree=2, gamma=1, kernel=poly;, score=0.841 total time=  37.2s
[CV 3/5] END C=0.1, degree=3, gamma=scale, kernel=linear;, score=0.933 total time=  43.4s
[CV 2/5] END C=0.1, degree=3, gamma=scale, kernel=poly;, score=0.974 total time=  53.3s
[CV 1/5] END C=0.1, degree=3, gamma=auto, kernel=linear;, score=0.937 total time=  40.3s
[CV 4/5] END C=0.1, degree=3, gamma=auto, kernel=rbf;, score=0.935 total time= 1.1min
[CV 5/5] END C=0.1, degree=3, gamma=0.1, ke

In [87]:
print("Best Parameters: ", grid_search.best_params_)
print("Best Recall-Score: ", grid_search.best_score_)

Best Parameters:  {'C': 0.1, 'degree': 2, 'gamma': 1, 'kernel': 'rbf'}
Best Recall-Score:  1.0


In [88]:
results_svm = pd.DataFrame(grid_search.cv_results_)

In [89]:
results_svm.to_csv('results_svm_2.csv')

In [None]:
# results_svm = pd.DataFrame(grid_search.cv_results_)
# Best Parameters:  {'C': 1, 'gamma': 'scale', 'kernel': 'rbf'}
# Best F1-Score:  0.8647564420390846