In [1]:
import pandas as pd

In [2]:
df_fusionné = pd.read_csv('df_fusionné.csv')

In [3]:
df_fusionné_binaire = df_fusionné.sample(frac=0.1)
df_fusionné_binaire.iloc[:, 0] = df_fusionné_binaire.iloc[:, 0].apply(lambda x: 0 if x == 'N' else 1)

In [4]:
# Count the occurrences of each class
counts = df_fusionné_binaire.iloc[:, 0].value_counts()

# Find the number of instances in the minority class
min_count = counts.min()

# Create balanced DataFrame through undersampling
df_balanced_under = pd.concat([
    df_fusionné_binaire[df_fusionné_binaire.iloc[:, 0] == 0].sample(min_count),
    df_fusionné_binaire[df_fusionné_binaire.iloc[:, 0] == 1].sample(min_count)
])

# Shuffle the DataFrame to mix the classes
df_balanced_under = df_balanced_under.sample(frac=1, random_state=42)


In [5]:
# Count occurrences of 0 and 1 in the first column
value_counts = df_balanced_under.iloc[:, 0].value_counts()

# Display the counts
print(value_counts)


ColumnName
1    3757
0    3757
Name: count, dtype: int64


In [6]:
import pandas as pd
from sklearn.model_selection import train_test_split

X = df_balanced_under.iloc[:, 1:]  
y = df_balanced_under.iloc[:, 0] 

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [7]:
print(y_train.unique())


[0 1]


In [8]:
y_train = y_train.astype(int)
y_test = y_test.astype(int)

In [9]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV

model = DecisionTreeClassifier()

param_grid = {
    'max_depth': [None, 10, 30, 50, 70],  # Include None to consider unlimited depth
    'min_samples_leaf': [1, 2, 4, 6, 10, 15],  # Finer increments in the lower range
    'min_samples_split': [2, 5, 10, 15, 20, 25],  # Start from 2 (minimum valid value) and provide finer increments
    'criterion': ['gini', 'entropy'],  # Include 'entropy' for information gain criterion
    'max_features': [None, 'sqrt', 'log2', 0.5],  # Include None and a float (e.g., 0.5 for half of the features)
    'splitter': ['best', 'random'],  # Include 'random' for randomized splits
    'class_weight': [None, 'balanced']
}

grid_search = GridSearchCV(estimator=model, param_grid=param_grid, cv=3, scoring='f1', n_jobs=-1)

grid_search.fit(X_train, y_train)


print(f"Best Parameters: {grid_search.best_params_}")
print(f"Best Score: {grid_search.best_score_}")

best_estimator = grid_search.best_estimator_
predictions = best_estimator.predict(X_test)

Best Parameters: {'class_weight': None, 'criterion': 'entropy', 'max_depth': None, 'max_features': None, 'min_samples_leaf': 1, 'min_samples_split': 2, 'splitter': 'random'}
Best Score: 0.6923207316689265


In [12]:
from sklearn.tree import DecisionTreeClassifier

model = DecisionTreeClassifier(max_depth=None, min_samples_leaf=1, min_samples_split=2, criterion='entropy', max_features= None, splitter='random' )
model.fit(X_train, y_train)

In [13]:
from sklearn.metrics import confusion_matrix, precision_score, recall_score, f1_score, roc_auc_score, roc_curve, log_loss

# Prédiction sur l'ensemble de test
y_pred = model.predict(X_test)

# Confusion Matrix
conf_matrix = confusion_matrix(y_test, y_pred)
print("Confusion Matrix:\n", conf_matrix)

# Precision
precision = precision_score(y_test, y_pred)
print(f"Precision: {precision:.2f}")

# Recall
recall = recall_score(y_test, y_pred)
print(f"Recall: {recall:.2f}")

# F1 Score
f1 = f1_score(y_test, y_pred)
print(f"F1 Score: {f1:.2f}")

# ROC-AUC Score
y_pred_proba = model.predict_proba(X_test)[:, 1]
roc_auc = roc_auc_score(y_test, y_pred_proba)
print(f"ROC AUC Score: {roc_auc:.2f}")

# Log Loss
logloss = log_loss(y_test, y_pred_proba)
print(f"Log Loss: {logloss:.2f}")

Confusion Matrix:
 [[523 249]
 [243 488]]
Precision: 0.66
Recall: 0.67
F1 Score: 0.66
ROC AUC Score: 0.67
Log Loss: 11.80
