In [1]:
import pandas as pd

In [2]:
# Read the CSV file (path relative to the working directory) into a DataFrame.
csv_path = 'df_fusionné.csv'
df_fusionné = pd.read_csv(csv_path)

In [3]:
# Work on a 20% random subset of the loaded frame. The seed makes the draw
# repeatable, so a fresh "Restart & Run All" always selects the same rows.
df_fusionné_binaire = df_fusionné.sample(frac=0.2, random_state=42)

# Binarize the label stored in the first column: 'N' -> 0, anything else -> 1
# (missing values compare unequal to 'N' and therefore also map to 1).
# `isetitem` replaces the column by position with a new integer array, whereas
# `.iloc[:, 0] = ...` may write into the existing column and keep its object dtype.
binary_labels = df_fusionné_binaire.iloc[:, 0].ne('N').astype(int)
df_fusionné_binaire.isetitem(0, binary_labels.to_numpy())

In [4]:
# Binary label (0 / 1) stored in the first column
labels = df_fusionné_binaire.iloc[:, 0]

# Count the occurrences of each class
counts = labels.value_counts()

# Number of rows in the minority class: every class is cut down to this size
min_count = counts.min()

# Create a balanced DataFrame through undersampling: draw `min_count` rows from
# each class. Both draws are seeded so the balanced set is identical on every run
# (the shuffle below was already seeded; the per-class draws were not).
df_balanced_under = pd.concat([
    df_fusionné_binaire[labels == 0].sample(min_count, random_state=42),
    df_fusionné_binaire[labels == 1].sample(min_count, random_state=42),
])

# Shuffle the rows so the two classes are interleaved instead of stacked
df_balanced_under = df_balanced_under.sample(frac=1, random_state=42)


In [5]:
# Tally the rows per class in the first column of the balanced frame and show the result
label_column = df_balanced_under.iloc[:, 0]
value_counts = label_column.value_counts()
print(value_counts)


ColumnName
0    3712
1    3712
Name: count, dtype: int64


In [6]:
import pandas as pd
from sklearn.model_selection import train_test_split

# The first column is the target; every remaining column is used as a feature.
y = df_balanced_under.iloc[:, 0]
X = df_balanced_under.iloc[:, 1:]

# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
split_parts = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = split_parts

In [7]:
# Sanity check: list the distinct label values present in the training target
train_label_values = y_train.unique()
print(train_label_values)


[1 0]


In [8]:
# Cast the train and test label Series to an integer dtype.
y_train, y_test = (labels.astype(int) for labels in (y_train, y_test))

In [9]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import f1_score

# Assuming X_train, y_train, X_test, y_test are already defined

# Step 1: Optimize for log loss
# splitter='random' together with max_features < 1.0 makes tree construction
# stochastic, so the estimator is seeded; without a seed the selected parameters
# and every reported score change from one run to the next.
model = DecisionTreeClassifier(random_state=42)

param_grid = {
    'criterion': ['entropy'],  # Keeping criterion fixed as entropy consistently performs well
    'max_depth': [4, 5, 6],  # Exploring depths immediately around the optimal depth
    'max_features': [0.3, 0.4, 0.5],  # Slightly broader range around the best feature ratio
    'min_samples_leaf': [2, 3, 4],  # Exploring immediate values around the optimal leaf size
    'min_samples_split': [14, 15, 16],  # Close range around the optimal split size
    'splitter': ['random']  # Keeping splitter fixed as random given its success
}

grid_search_log_loss = GridSearchCV(model, param_grid=param_grid, scoring='neg_log_loss', cv=3, n_jobs=-1)
grid_search_log_loss.fit(X_train, y_train)

best_params_log_loss = grid_search_log_loss.best_params_
print(f"Best Parameters for Log Loss: {best_params_log_loss}")
# The 'neg_log_loss' scorer returns the negated loss (so that higher is better);
# flip the sign so the printed number is the actual cross-validated log loss.
print(f"Best Log Loss Score: {-grid_search_log_loss.best_score_}")

# Step 2: Validate F1 Score with best parameters from log loss optimization
best_model_log_loss = grid_search_log_loss.best_estimator_
predictions = best_model_log_loss.predict(X_test)
f1 = f1_score(y_test, predictions, average='binary')  # adjust average based on your problem
print(f"F1 Score with Log Loss Optimization: {f1}")

# Step 3: Focused grid search scored on F1 (this always runs; it is not gated on
# the F1 value above). Only min_samples_leaf and min_samples_split vary, by +/- 1
# around the log-loss optimum and clamped to the smallest values scikit-learn
# accepts (1 and 2); every other parameter is pinned to its best value.
best_leaf = best_params_log_loss['min_samples_leaf']
best_split = best_params_log_loss['min_samples_split']

focused_param_grid = {
    'max_depth': [best_params_log_loss['max_depth']],
    'min_samples_leaf': [max(1, best_leaf - 1), best_leaf, best_leaf + 1],
    'min_samples_split': [max(2, best_split - 1), best_split, best_split + 1],
    'criterion': [best_params_log_loss['criterion']],
    'max_features': [best_params_log_loss['max_features']],
    'splitter': [best_params_log_loss['splitter']],
}

grid_search_f1 = GridSearchCV(model, param_grid=focused_param_grid, scoring='f1', cv=3, n_jobs=-1)
grid_search_f1.fit(X_train, y_train)

print(f"Best Parameters for F1 Score: {grid_search_f1.best_params_}")
print(f"Best F1 Score: {grid_search_f1.best_score_}")


Best Parameters for Log Loss: {'criterion': 'log_loss', 'max_depth': None, 'max_features': 0.5, 'min_samples_leaf': 4, 'min_samples_split': 2, 'splitter': 'random'}
Best Log Loss Score: 0.887526720531984
F1 Score with Log Loss Optimization: 0.8709677419354839
Best Parameters for F1 Score: {'criterion': 'log_loss', 'max_depth': None, 'max_features': 0.5, 'min_samples_leaf': 5, 'min_samples_split': 2, 'splitter': 'random'}
Best F1 Score: -2.8200912415807253


In [41]:
from sklearn.tree import DecisionTreeClassifier

# Final decision tree with fixed hyperparameters. splitter='random' and
# max_features=0.3 make fitting stochastic, so the estimator is seeded to keep
# the fitted tree, and every metric computed from it, identical across runs.
model = DecisionTreeClassifier(
    max_depth=5,
    min_samples_leaf=4,
    min_samples_split=16,
    criterion='entropy',
    max_features=0.3,
    splitter='random',
    random_state=42,
)
model.fit(X_train, y_train)

In [42]:
from sklearn.metrics import confusion_matrix, precision_score, recall_score, f1_score, roc_auc_score, roc_curve, log_loss

# Predictions on the test set: hard class labels, plus the second column of
# predict_proba (the probability assigned to the second class, i.e. label 1 here)
y_pred = model.predict(X_test)
y_pred_proba = model.predict_proba(X_test)[:, 1]

# Metrics derived from the hard class predictions
conf_matrix = confusion_matrix(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)

# Metrics derived from the predicted probabilities
roc_auc = roc_auc_score(y_test, y_pred_proba)
logloss = log_loss(y_test, y_pred_proba)

# Report: confusion matrix first, then each scalar metric rounded to two decimals
print("Confusion Matrix:\n", conf_matrix)
print(f"Precision: {precision:.2f}")
print(f"Recall: {recall:.2f}")
print(f"F1 Score: {f1:.2f}")
print(f"ROC AUC Score: {roc_auc:.2f}")
print(f"Log Loss: {logloss:.2f}")

Confusion Matrix:
 [[3446  320]
 [1396 2384]]
Precision: 0.88


Recall: 0.63
F1 Score: 0.74
ROC AUC Score: 0.80
Log Loss: 0.47
