In [13]:
import pandas as pd
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.ensemble import RandomForestClassifier, StackingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

In [14]:
data = pd.read_csv('clean_data.csv')

In [15]:
X = data.drop('isFraud', axis=1)
y = data['isFraud']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.05, random_state=42)

In [16]:
# hyperparameters
param_grid = {
    'n_estimators': [100, 200],
    'max_features': ['sqrt', 'log2', None],
    'max_depth': [None, 10],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'bootstrap': [True, False],
    'class_weight': ['balanced', None]
}

rf_model = RandomForestClassifier(min_samples_leaf=3,
                                  bootstrap=False,
                                  min_samples_split=5,
                                  class_weight='balanced',
                                  random_state=42)

In [17]:
random_search = RandomizedSearchCV(estimator=rf_model,
                                    param_distributions=param_grid,
                                    n_iter=5,  # Adjust based on your computational budget
                                    cv=3,
                                    verbose=2,
                                    random_state=42,
                                    n_jobs=-1,
                                    scoring='f1')

In [18]:
random_search.fit(X_train, y_train)

# Get the best estimator
best_rf_model = random_search.best_estimator_

# Predictions using the best found parameters
rf_predictions = best_rf_model.predict(X_test)

Fitting 3 folds for each of 10 candidates, totalling 30 fits
[CV] END bootstrap=True, class_weight=None, max_depth=10, max_features=sqrt, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time= 2.2min
[CV] END bootstrap=True, class_weight=None, max_depth=10, max_features=sqrt, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time= 2.2min
[CV] END bootstrap=False, class_weight=None, max_depth=10, max_features=log2, min_samples_leaf=2, min_samples_split=10, n_estimators=100; total time= 2.8min
[CV] END bootstrap=False, class_weight=None, max_depth=10, max_features=log2, min_samples_leaf=2, min_samples_split=10, n_estimators=100; total time= 2.8min
[CV] END bootstrap=False, class_weight=None, max_depth=10, max_features=log2, min_samples_leaf=2, min_samples_split=10, n_estimators=100; total time= 2.9min
[CV] END bootstrap=False, class_weight=None, max_depth=10, max_features=auto, min_samples_leaf=4, min_samples_split=5, n_estimators=100; total time=   0.1s
[C

6 fits failed out of a total of 30.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
6 fits failed with the following error:
Traceback (most recent call last):
  File "/Users/dom1k/anaconda3/lib/python3.11/site-packages/sklearn/model_selection/_validation.py", line 732, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/Users/dom1k/anaconda3/lib/python3.11/site-packages/sklearn/base.py", line 1144, in wrapper
    estimator._validate_params()
  File "/Users/dom1k/anaconda3/lib/python3.11/site-packages/sklearn/base.py", line 637, in _validate_params
    validate_parameter_constraints(
  File "/Users/dom1k/anaconda3/lib/python3.11/site-packages/sklearn/utils/_param_validation.py", line 95, in validate_parameter_cons

In [19]:
# Evaluation
print("\nRandom Forest Classifier Results after Hyperparameter Tuning:")
print(classification_report(y_test, rf_predictions, zero_division=0))
print("Accuracy:", accuracy_score(y_test, rf_predictions))


Random Forest Classifier Results after Hyperparameter Tuning:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     46947
           1       0.94      0.76      0.84        21

    accuracy                           1.00     46968
   macro avg       0.97      0.88      0.92     46968
weighted avg       1.00      1.00      1.00     46968

Accuracy: 0.9998722534491569
