In [75]:
#Librairies
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import contextlib

import sys
sys.path.append('/home/onyxia/work/Macroeconometrics')
from src.preprocessing import apply_transformation

from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import classification_report, accuracy_score
from sklearn.preprocessing import StandardScaler

In [76]:
#Main data
fred_md = pd.read_csv("/home/onyxia/work/Macroeconometrics/data/fred_md_2024_12.csv")
#Metadata
fred_info = pd.read_csv("/home/onyxia/work/Macroeconometrics/data/FRED_MD_updated_appendix.csv", encoding="latin1")
#Recession variable
us_rec = pd.read_csv("/home/onyxia/work/Macroeconometrics/data/USREC.csv")

In [77]:
#Indexing the dataset with dates
fred_md_short = (
    fred_md.iloc[1:]
    .assign(sasdate=pd.to_datetime(fred_md.iloc[1:].sasdate, format="%m/%d/%Y"))
    .set_index("sasdate")
)

#Transformation of series based on metadata
for _, row in fred_info.iterrows():
    series_name = row['fred']
    transformation_code = row['tcode']

    with contextlib.suppress(Exception):
        fred_md_short[series_name] = apply_transformation(fred_md_short[series_name], transformation_code)

#Filtering data by date
start_date = "1960"
end_date = "2024"
fred_md_short = fred_md_short[
    (fred_md_short.index >= start_date) & (fred_md_short.index <= end_date)
].dropna(axis=1)

#Addition of the variable of interest (American recession)
us_rec = us_rec.assign(
    observation_date=pd.to_datetime(us_rec.observation_date)
).set_index("observation_date")
us_rec = us_rec.loc[fred_md_short.index,:]

In [124]:
#Function Random Forest
def run_random_forest(X, y, param_grid=None, test_size=0.2, random_state=667, cv_folds=5):
    """
    Perform Random Forest classification with hyperparameter tuning using GridSearchCV.

    Parameters:
    X : Covariates (features) for training the model.
    y : Target variable for classification.
    param_grid : Dictionary of hyperparameters to tune.
    test_size : Fraction of data to use for testing (default is 0.2).
    random_state : Random seed for reproducibility (default is 42).
    cv_folds : Number of folds for cross-validation (default is 5).
    
    Returns:
    dict: Contains 'best_params', 'classification_report', 'accuracy', and 'feature_importance'.
    """
    # Standardize the data
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X)
    
    # Split the data into train and test sets (time-series split without shuffling)
    X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=test_size, random_state=random_state, stratify = y)
    
    # Instantiate Random Forest model
    rf = RandomForestClassifier(random_state=random_state)
    
    # Hyperparameter tuning using GridSearchCV
    if param_grid is None:
        param_grid = {
        'n_estimators': [1000],  # Nombre d'arbres dans la forêt
        'max_depth': [None, 10, 30],   # Profondeur maximale des arbres
        'min_samples_split': [2, 5, 10],   # Nombre minimal d'échantillons pour diviser un noeud
        'min_samples_leaf': [1, 4],     # Nombre minimal d'échantillons par feuille
        }
    
    grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=cv_folds, n_jobs=-1, verbose=2)
    grid_search.fit(X_train, y_train)
    
    #Best model
    best_rf = grid_search.best_estimator_

    #Predictions on the test set
    y_pred = best_rf.predict(X_test)

    #Evaluation
    classification_rep = classification_report(y_test, y_pred)
    accuracy = accuracy_score(y_test, y_pred)

    #Feature importance
    importances = best_rf.feature_importances_
    feature_importance_df = pd.DataFrame({
        'Feature': X.columns if isinstance(X, pd.DataFrame) else [f'PC{i+1}' for i in range(X.shape[1])],
        'Importance': importances
    }).sort_values(by='Importance', ascending=False)
    
    # Return results
    return {
        'best_params': grid_search.best_params_,
        'classification_report': classification_rep,
        'accuracy': accuracy,
        'feature_importance': feature_importance_df
    }

# RF naive

In [126]:
# Nous allons créer des variables à t-1 pour chaque série temporelle
fred_md_lagged = fred_md_short.shift(1)

# Fusionner les données laggées avec l'indicateur de récession (us_rec) à l'index
data = pd.concat([fred_md_lagged, us_rec], axis=1)

# Définir les variables X (features) et y (target)
X = data.drop(columns=['USREC'])  # Tout sauf 'us_rec' sera utilisé comme caractéristiques
y = data['USREC']  # La variable cible est 'us_rec'

# Run Random Forest with the custom function
results = run_random_forest(X, y)

# Display the results 
print("Best Hyperparameters :", results['best_params'])
print("\nClassification Report:")
print(results['classification_report'])
print("\nAccuracy:", results['accuracy'])
print("\nFeature Importance:")
print(results['feature_importance'])

Fitting 5 folds for each of 18 candidates, totalling 90 fits
Best Hyperparameters : {'max_depth': None, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 1000}

Classification Report:
              precision    recall  f1-score   support

           0       0.95      0.98      0.96       135
           1       0.80      0.63      0.71        19

    accuracy                           0.94       154
   macro avg       0.87      0.80      0.83       154
weighted avg       0.93      0.94      0.93       154


Accuracy: 0.935064935064935

Feature Importance:
           Feature  Importance
31          PAYEMS    0.057510
32          USGOOD    0.057091
36         DMANEMP    0.044140
35          MANEMP    0.041427
82        TB3SMFFM    0.029373
..             ...         ...
21         CLF16OV    0.001831
107  CUSR0000SA0L2    0.001803
89         EXSZUSx    0.001770
45         AWOTMAN    0.001588
97       OILPRICEx    0.001414

[119 rows x 2 columns]


Problèmes :
- Récessions rares dans le dataset, donc le modèle a tout intérêt à prédire 0 s'il n'a pas assez d'informations sur l'état de l'économie
- Trop de variables (besoin de PCA)
- Besoin d'informations sur les variables en t-2...

# RF with principal components

In [127]:
#data
# Make sure the file paths are correct where you saved the PCA datasets
pca_60_df = pd.read_csv('/home/onyxia/work/Macroeconometrics/data/PCA/pca_60.csv', index_col='sasdate')  # For 60% variance explained
pca_80_df = pd.read_csv('/home/onyxia/work/Macroeconometrics/data/PCA/pca_80.csv', index_col='sasdate')  # For 80% variance explained
pca_90_df = pd.read_csv('/home/onyxia/work/Macroeconometrics/data/PCA/pca_90.csv', index_col='sasdate')  # For 90% variance explained
pca_60_df.index = pd.to_datetime(pca_60_df.index)
pca_80_df.index = pd.to_datetime(pca_80_df.index)
pca_90_df.index = pd.to_datetime(pca_90_df.index)

pca_60_df = pca_60_df.shift(1)
pca_80_df = pca_80_df.shift(1)
pca_90_df = pca_90_df.shift(1)

pca_60_df = pd.concat([pca_60_df, us_rec], axis=1, join='inner').dropna()
pca_80_df = pd.concat([pca_80_df, us_rec], axis=1, join='inner').dropna()
pca_90_df = pd.concat([pca_90_df, us_rec], axis=1, join='inner').dropna()

# Target variable is 'USREC', the recession indicator
X_60 = pca_60_df.drop(columns=['USREC'])
y_60 = pca_60_df['USREC']

X_80 = pca_80_df.drop(columns=['USREC'])
y_80 = pca_80_df['USREC']

X_90 = pca_90_df.drop(columns=['USREC'])
y_90 = pca_90_df['USREC']

In [128]:
# Run Random Forest with the custom function
results_60 = run_random_forest(X_60, y_60)

# Display the results for 60% explained variance
print("Best Hyperparameters for 60% Variance Explained:", results_60['best_params'])
print("\nClassification Report for 60% Variance Explained:")
print(results_60['classification_report'])
print("\nAccuracy for 60% Variance Explained:", results_60['accuracy'])
print("\nFeature Importance for 60% Variance Explained:")
print(results_60['feature_importance'])

Fitting 5 folds for each of 18 candidates, totalling 90 fits
Best Hyperparameters for 60% Variance Explained: {'max_depth': None, 'min_samples_leaf': 4, 'min_samples_split': 2, 'n_estimators': 1000}

Classification Report for 60% Variance Explained:
              precision    recall  f1-score   support

           0       0.94      0.99      0.97       135
           1       0.92      0.58      0.71        19

    accuracy                           0.94       154
   macro avg       0.93      0.79      0.84       154
weighted avg       0.94      0.94      0.94       154


Accuracy for 60% Variance Explained: 0.9415584415584416

Feature Importance for 60% Variance Explained:
   Feature  Importance
0      PC1    0.378559
4      PC5    0.115435
3      PC4    0.109692
5      PC6    0.077943
6      PC7    0.063657
10    PC11    0.053354
2      PC3    0.040001
1      PC2    0.038192
7      PC8    0.034314
11    PC12    0.032062
9     PC10    0.028751
8      PC9    0.028042


In [129]:
# Run Random Forest with the custom function
results_80 = run_random_forest(X_80, y_80)

# Display the results for 80% explained variance
print("Best Hyperparameters for 80% Variance Explained:", results_80['best_params'])
print("\nClassification Report for 80% Variance Explained:")
print(results_80['classification_report'])
print("\nAccuracy for 80% Variance Explained:", results_80['accuracy'])
print("\nFeature Importance for 80% Variance Explained:")
print(results_80['feature_importance'])

Fitting 5 folds for each of 18 candidates, totalling 90 fits
Best Hyperparameters for 80% Variance Explained: {'max_depth': None, 'min_samples_leaf': 4, 'min_samples_split': 2, 'n_estimators': 1000}

Classification Report for 80% Variance Explained:
              precision    recall  f1-score   support

           0       0.93      0.99      0.96       135
           1       0.90      0.47      0.62        19

    accuracy                           0.93       154
   macro avg       0.92      0.73      0.79       154
weighted avg       0.93      0.93      0.92       154


Accuracy for 80% Variance Explained: 0.9285714285714286

Feature Importance for 80% Variance Explained:
   Feature  Importance
0      PC1    0.306997
3      PC4    0.091000
4      PC5    0.086481
5      PC6    0.057627
6      PC7    0.046589
10    PC11    0.033723
1      PC2    0.025870
2      PC3    0.025371
12    PC13    0.022099
18    PC19    0.021832
7      PC8    0.021311
11    PC12    0.019797
13    PC14    0.019

In [130]:
# Run Random Forest with the custom function
results_90 = run_random_forest(X_90, y_90)

# Display the results for 90% explained variance
print("Best Hyperparameters for 90% Variance Explained:", results_90['best_params'])
print("\nClassification Report for 90% Variance Explained:")
print(results_90['classification_report'])
print("\nAccuracy for 90% Variance Explained:", results_90['accuracy'])
print("\nFeature Importance for 90% Variance Explained:")
print(results_90['feature_importance'])

Fitting 5 folds for each of 18 candidates, totalling 90 fits
Best Hyperparameters for 90% Variance Explained: {'max_depth': None, 'min_samples_leaf': 4, 'min_samples_split': 2, 'n_estimators': 1000}

Classification Report for 90% Variance Explained:
              precision    recall  f1-score   support

           0       0.93      0.99      0.96       135
           1       0.90      0.47      0.62        19

    accuracy                           0.93       154
   macro avg       0.92      0.73      0.79       154
weighted avg       0.93      0.93      0.92       154


Accuracy for 90% Variance Explained: 0.9285714285714286

Feature Importance for 90% Variance Explained:
   Feature  Importance
0      PC1    0.257344
3      PC4    0.082402
4      PC5    0.079797
5      PC6    0.050609
6      PC7    0.038523
10    PC11    0.027078
2      PC3    0.019900
43    PC44    0.019210
1      PC2    0.018905
12    PC13    0.018436
18    PC19    0.018011
11    PC12    0.017631
7      PC8    0.016

# RF avec sparse factors

In [131]:
factors = pd.read_csv('/home/onyxia/work/Macroeconometrics/data/estimated_factor.csv', index_col='sasdate') 
factors.index = pd.to_datetime(factors.index)
factors = factors.shift(1)


factors = pd.concat([factors, us_rec], axis=1, join='inner').dropna()

# Target variable is 'USREC', the recession indicator
X_factors = factors.drop(columns=['USREC'])
y_factors = factors['USREC']

# Run Random Forest with the custom function
results_factors = run_random_forest(X_factors, y_factors)

# Display the results 
print("Best Hyperparameters:", results_factors['best_params'])
print("\nClassification Report:")
print(results_factors['classification_report'])
print("\nAccuracy :", results_factors['accuracy'])
print("\nFeature Importance :")
print(results_factors['feature_importance'])

Fitting 5 folds for each of 18 candidates, totalling 90 fits
Best Hyperparameters: {'max_depth': None, 'min_samples_leaf': 4, 'min_samples_split': 2, 'n_estimators': 1000}

Classification Report:
              precision    recall  f1-score   support

           0       0.95      0.98      0.96       135
           1       0.80      0.63      0.71        19

    accuracy                           0.94       154
   macro avg       0.87      0.80      0.83       154
weighted avg       0.93      0.94      0.93       154


Accuracy : 0.935064935064935

Feature Importance :
  Feature  Importance
0       0    0.446972
7       7    0.097172
3       3    0.069335
6       6    0.067485
4       4    0.064086
1       1    0.061481
9       9    0.059798
5       5    0.047857
2       2    0.043119
8       8    0.042695


In [132]:
#on combine les sparses factors à toutes les covariates (fred_md_short)
factors = pd.read_csv('/home/onyxia/work/Macroeconometrics/data/estimated_factor.csv', index_col='sasdate') 
factors.index = pd.to_datetime(factors.index)

data = pd.merge(factors, fred_md_short, left_index=True, right_index=True, how='inner')
data = data.shift(1)

data = pd.concat([data, us_rec], axis=1, join='inner').dropna()

# Target variable is 'USREC', the recession indicator
X = data.drop(columns=['USREC'])
y = data['USREC']

# Run Random Forest with the custom function
results = run_random_forest(X, y)

# Display the results 
print("Best Hyperparameters:", results['best_params'])
print("\nClassification Report:")
print(results['classification_report'])
print("\nAccuracy :", results['accuracy'])
print("\nFeature Importance :")
print(results['feature_importance'])

Fitting 5 folds for each of 18 candidates, totalling 90 fits
Best Hyperparameters: {'max_depth': None, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 1000}

Classification Report:
              precision    recall  f1-score   support

           0       0.94      0.99      0.96       135
           1       0.85      0.58      0.69        19

    accuracy                           0.94       154
   macro avg       0.89      0.78      0.83       154
weighted avg       0.93      0.94      0.93       154


Accuracy : 0.935064935064935

Feature Importance :
         Feature  Importance
0              0    0.062357
41        PAYEMS    0.045057
46       DMANEMP    0.043257
42        USGOOD    0.040794
45        MANEMP    0.039101
..           ...         ...
72          M2SL    0.001794
119        PCEPI    0.001794
55       AWOTMAN    0.001702
126  DTCOLNVHFNM    0.001514
113  CUSR0000SAC    0.001382

[129 rows x 2 columns]


# RF avec factors et covariates en dupliquant les périodes de récession

In [116]:
#Fonction de duplication des lignes ayant une récession
def duplicate_usrec(data, k):
    # Filtrer les lignes où USREC == 1
    usrec_1 = data[data['USREC'] == 1]
    usrec_0 = data[data['USREC'] == 0]
    
    # Dupliquer les lignes k fois
    duplicated_rows = pd.concat([usrec_1] * k, ignore_index=True)
    
    # Ajouter les lignes dupliquées au DataFrame d'origine
    data_with_duplicates = pd.concat([usrec_0, duplicated_rows], ignore_index=True)

    # Mélanger aléatoirement les lignes
    data_with_duplicates = data_with_duplicates.sample(frac=1, random_state=667).reset_index(drop=True)
    
    return data_with_duplicates

In [121]:
#on combine les sparses factors à toutes les covariates (fred_md_short)
factors = pd.read_csv('/home/onyxia/work/Macroeconometrics/data/estimated_factor.csv', index_col='sasdate') 
factors.index = pd.to_datetime(factors.index)

data = pd.merge(factors, fred_md_short, left_index=True, right_index=True, how='inner')
data = data.shift(1)

data = pd.concat([data, us_rec], axis=1, join='inner').dropna()

for k in range(1, 20, 3):

    X = data.drop(columns=['USREC'])  # Variables explicatives
    y = data['USREC']                  # Variable cible

    # Standardize the data
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X)

    # Séparer les données en ensemble d'entraînement et de test
    X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=667, stratify=y)

    # Convertir X_train en DataFrame
    X_train = pd.DataFrame(X_train, columns=X.columns)

    # Combiner X_train et y_train pour appliquer la duplication
    train_data = pd.concat([X_train, y_train], axis=1)

    # Dupliquer les lignes avec USREC = 1
    train_data_with_duplicates = duplicate_usrec(train_data, k)

    # Séparer à nouveau les variables explicatives et la cible
    X_train_final = train_data_with_duplicates.drop(columns=['USREC'])
    y_train_final = train_data_with_duplicates['USREC']

    # Instantiate Random Forest model
    rf = RandomForestClassifier(random_state=667)
    
    # Hyperparameter tuning using GridSearchCV
    param_grid = {
        'n_estimators': [1000],  # Nombre d'arbres dans la forêt
        'max_depth': [None, 10, 30],   # Profondeur maximale des arbres
        'min_samples_split': [2, 5, 10],   # Nombre minimal d'échantillons pour diviser un noeud
        'min_samples_leaf': [1, 4],     # Nombre minimal d'échantillons par feuille
        }
    
    grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=5, n_jobs=-1, verbose=2)
    grid_search.fit(X_train_final, y_train_final)
    
    #Best model
    best_rf = grid_search.best_estimator_

    #Predictions on the test set
    y_pred = best_rf.predict(X_test)

    #Evaluation
    classification_rep = classification_report(y_test, y_pred)
    accuracy = accuracy_score(y_test, y_pred)

    #Feature importance
    importances = best_rf.feature_importances_
    feature_importance_df = pd.DataFrame({
        'Feature': X.columns if isinstance(X, pd.DataFrame) else [f'PC{i+1}' for i in range(X.shape[1])],
        'Importance': importances
    }).sort_values(by='Importance', ascending=False)
    
    # Return results
    results = {
        'best_params': grid_search.best_params_,
        'classification_report': classification_rep,
        'accuracy': accuracy,
        'feature_importance': feature_importance_df
    }

    # Display the results
    print("Nombre de duplications :", k)
    print("Best Hyperparameters:", results['best_params'])
    print("\nClassification Report:")
    print(results['classification_report'])
    print("\nAccuracy :", results['accuracy'])
    print("\nFeature Importance :")
    print(results['feature_importance'])

Fitting 5 folds for each of 18 candidates, totalling 90 fits
[CV] END max_depth=None, min_samples_leaf=4, min_samples_split=2, n_estimators=1000; total time=   1.8s
[CV] END max_depth=10, min_samples_leaf=1, min_samples_split=5, n_estimators=1000; total time=   1.6s
[CV] END max_depth=10, min_samples_leaf=4, min_samples_split=10, n_estimators=1000; total time=   1.7s
[CV] END max_depth=30, min_samples_leaf=4, min_samples_split=2, n_estimators=1000; total time=   1.9s
[CV] END max_depth=None, min_samples_leaf=1, min_samples_split=5, n_estimators=1000; total time=   1.9s
[CV] END max_depth=10, min_samples_leaf=1, min_samples_split=5, n_estimators=1000; total time=   1.7s
[CV] END max_depth=10, min_samples_leaf=4, min_samples_split=10, n_estimators=1000; total time=   1.7s
[CV] END max_depth=30, min_samples_leaf=4, min_samples_split=2, n_estimators=1000; total time=   1.7s
[CV] END max_depth=None, min_samples_leaf=1, min_samples_split=5, n_estimators=1000; total time=   1.7s
[CV] END max_

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Nombre de duplications : 1
Best Hyperparameters: {'max_depth': None, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 1000}

Classification Report:
              precision    recall  f1-score   support

           0       0.88      1.00      0.93       135
           1       0.00      0.00      0.00        19

    accuracy                           0.88       154
   macro avg       0.44      0.50      0.47       154
weighted avg       0.77      0.88      0.82       154


Accuracy : 0.8766233766233766

Feature Importance :
           Feature  Importance
0                0         0.0
1                1         0.0
2                2         0.0
3                3         0.0
4                4         0.0
..             ...         ...
124  CES2000000008         0.0
125  CES3000000008         0.0
126    DTCOLNVHFNM         0.0
127       DTCTHFNM         0.0
128         INVEST         0.0

[129 rows x 2 columns]
Fitting 5 folds for each of 18 candidates, totalling 90 fits


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Nombre de duplications : 4
Best Hyperparameters: {'max_depth': None, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 1000}

Classification Report:
              precision    recall  f1-score   support

           0       0.88      1.00      0.93       135
           1       0.00      0.00      0.00        19

    accuracy                           0.88       154
   macro avg       0.44      0.50      0.47       154
weighted avg       0.77      0.88      0.82       154


Accuracy : 0.8766233766233766

Feature Importance :
           Feature  Importance
0                0         0.0
1                1         0.0
2                2         0.0
3                3         0.0
4                4         0.0
..             ...         ...
124  CES2000000008         0.0
125  CES3000000008         0.0
126    DTCOLNVHFNM         0.0
127       DTCTHFNM         0.0
128         INVEST         0.0

[129 rows x 2 columns]
Fitting 5 folds for each of 18 candidates, totalling 90 fits


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Nombre de duplications : 7
Best Hyperparameters: {'max_depth': None, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 1000}

Classification Report:
              precision    recall  f1-score   support

           0       0.88      1.00      0.93       135
           1       0.00      0.00      0.00        19

    accuracy                           0.88       154
   macro avg       0.44      0.50      0.47       154
weighted avg       0.77      0.88      0.82       154


Accuracy : 0.8766233766233766

Feature Importance :
           Feature  Importance
0                0         0.0
1                1         0.0
2                2         0.0
3                3         0.0
4                4         0.0
..             ...         ...
124  CES2000000008         0.0
125  CES3000000008         0.0
126    DTCOLNVHFNM         0.0
127       DTCTHFNM         0.0
128         INVEST         0.0

[129 rows x 2 columns]
Fitting 5 folds for each of 18 candidates, totalling 90 fits


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Nombre de duplications : 10
Best Hyperparameters: {'max_depth': None, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 1000}

Classification Report:
              precision    recall  f1-score   support

           0       0.00      0.00      0.00       135
           1       0.12      1.00      0.22        19

    accuracy                           0.12       154
   macro avg       0.06      0.50      0.11       154
weighted avg       0.02      0.12      0.03       154


Accuracy : 0.12337662337662338

Feature Importance :
           Feature  Importance
0                0         0.0
1                1         0.0
2                2         0.0
3                3         0.0
4                4         0.0
..             ...         ...
124  CES2000000008         0.0
125  CES3000000008         0.0
126    DTCOLNVHFNM         0.0
127       DTCTHFNM         0.0
128         INVEST         0.0

[129 rows x 2 columns]
Fitting 5 folds for each of 18 candidates, totalling 90 fits

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Nombre de duplications : 13
Best Hyperparameters: {'max_depth': None, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 1000}

Classification Report:
              precision    recall  f1-score   support

           0       0.00      0.00      0.00       135
           1       0.12      1.00      0.22        19

    accuracy                           0.12       154
   macro avg       0.06      0.50      0.11       154
weighted avg       0.02      0.12      0.03       154


Accuracy : 0.12337662337662338

Feature Importance :
           Feature  Importance
0                0         0.0
1                1         0.0
2                2         0.0
3                3         0.0
4                4         0.0
..             ...         ...
124  CES2000000008         0.0
125  CES3000000008         0.0
126    DTCOLNVHFNM         0.0
127       DTCTHFNM         0.0
128         INVEST         0.0

[129 rows x 2 columns]
Fitting 5 folds for each of 18 candidates, totalling 90 fits

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Nombre de duplications : 16
Best Hyperparameters: {'max_depth': None, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 1000}

Classification Report:
              precision    recall  f1-score   support

           0       0.00      0.00      0.00       135
           1       0.12      1.00      0.22        19

    accuracy                           0.12       154
   macro avg       0.06      0.50      0.11       154
weighted avg       0.02      0.12      0.03       154


Accuracy : 0.12337662337662338

Feature Importance :
           Feature  Importance
0                0         0.0
1                1         0.0
2                2         0.0
3                3         0.0
4                4         0.0
..             ...         ...
124  CES2000000008         0.0
125  CES3000000008         0.0
126    DTCOLNVHFNM         0.0
127       DTCTHFNM         0.0
128         INVEST         0.0

[129 rows x 2 columns]
Fitting 5 folds for each of 18 candidates, totalling 90 fits

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
