In [1]:
import pandas as pd

In [2]:
df_fusionné = pd.read_csv('df_fusionné.csv')

In [3]:
df_fusionné_binaire = df_fusionné.sample(frac=0.5)
df_fusionné_binaire.iloc[:, 0] = df_fusionné_binaire.iloc[:, 0].apply(lambda x: 0 if x == 'N' else 1)

In [4]:
# Count the occurrences of each class
counts = df_fusionné_binaire.iloc[:, 0].value_counts()

# Find the number of instances in the minority class
min_count = counts.min()

# Create balanced DataFrame through undersampling
df_balanced_under = pd.concat([
    df_fusionné_binaire[df_fusionné_binaire.iloc[:, 0] == 0].sample(min_count),
    df_fusionné_binaire[df_fusionné_binaire.iloc[:, 0] == 1].sample(min_count)
])

# Shuffle the DataFrame to mix the classes
df_balanced_under = df_balanced_under.sample(frac=1)


In [5]:
# Count occurrences of 0 and 1 in the first column
value_counts = df_balanced_under.iloc[:, 0].value_counts()

# Display the counts
print(value_counts)

ColumnName
1    18959
0    18959
Name: count, dtype: int64


In [6]:
import pandas as pd
from sklearn.model_selection import train_test_split

X = df_balanced_under.iloc[:, 1:]  
y = df_balanced_under.iloc[:, 0] 

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [7]:
print(y_train.unique())

[1 0]


In [8]:
y_train = y_train.astype(int)
y_test = y_test.astype(int)

In [9]:
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report, accuracy_score
from sklearn.svm import SVC


# Define the parameter grid: 
param_grid = {
    'C': [0.1, 1, 5],  # Regularization parameter
    'kernel': ['poly', 'rbf'],  # Type of kernel
    'gamma': ['scale', 'auto'],  # Kernel coefficient for 'rbf', 'poly', and 'sigmoid'
    'degree': [1,2, 3, 4]  # Degree of the polynomial kernel function ('poly'). Ignored by all other kernels.
}

# Initialize the SVM classifier
svm_classifier = SVC(random_state=42)

# Initialize the GridSearchCV object
grid_search = GridSearchCV(svm_classifier, param_grid, scoring='f1', cv=5, verbose=2, n_jobs=-1)

# Fit the grid search to the data
grid_search.fit(X_train, y_train)

# Print the best parameters and the best score
print("Best parameters found: ", grid_search.best_params_)
print("Best accuracy found: ", grid_search.best_score_)

# Evaluate on the test set with the best parameters
best_clf = grid_search.best_estimator_
y_pred = best_clf.predict(X_test)

print("Accuracy on test set:", accuracy_score(y_test, y_pred))
print("Classification report:")
print(classification_report(y_test, y_pred))

Fitting 5 folds for each of 48 candidates, totalling 240 fits
[CV] END ..........C=0.1, degree=1, gamma=scale, kernel=poly; total time=   7.6s
[CV] END ..........C=0.1, degree=1, gamma=scale, kernel=poly; total time=   7.6s
[CV] END ..........C=0.1, degree=1, gamma=scale, kernel=poly; total time=   7.6s
[CV] END ..........C=0.1, degree=1, gamma=scale, kernel=poly; total time=   7.8s
[CV] END ..........C=0.1, degree=1, gamma=scale, kernel=poly; total time=   7.8s
[CV] END ...........C=0.1, degree=1, gamma=scale, kernel=rbf; total time=   8.4s
[CV] END ...........C=0.1, degree=1, gamma=scale, kernel=rbf; total time=   8.4s
[CV] END ...........C=0.1, degree=1, gamma=scale, kernel=rbf; total time=   8.5s
[CV] END ...........C=0.1, degree=1, gamma=scale, kernel=rbf; total time=   8.5s
[CV] END ...........C=0.1, degree=1, gamma=scale, kernel=rbf; total time=   8.5s
[CV] END ...........C=0.1, degree=1, gamma=auto, kernel=poly; total time=   8.8s
[CV] END ...........C=0.1, degree=1, gamma=auto

Best parameters found:  {'C': 10, 'degree': 2, 'gamma': 'scale', 'kernel': 'rbf'}


In [9]:
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.metrics import classification_report, accuracy_score

svm_classifier = SVC(kernel='rbf',C=5, degree=1, gamma='scale', probability=True)

svm_classifier.fit(X_train, y_train)

In [None]:
!pip install pickle

In [None]:
import pickle

# Assuming svm_classifier is your trained model
with open('svm_classifier.pkl', 'wb') as file:
    pickle.dump(svm_classifier, file)

print("Model saved successfully.")


In [None]:
from sklearn.metrics import confusion_matrix, precision_score, recall_score, f1_score, roc_auc_score, roc_curve, log_loss

# Prédiction sur l'ensemble de test
y_pred = svm_classifier.predict(X_test)

In [None]:
# Confusion Matrix
conf_matrix = confusion_matrix(y_test, y_pred)
print("Confusion Matrix:\n", conf_matrix)

# Precision
precision = precision_score(y_test, y_pred)
print(f"Precision: {precision:.2f}")

# Recall
recall = recall_score(y_test, y_pred)
print(f"Recall: {recall:.2f}")

# F1 Score
f1 = f1_score(y_test, y_pred)
print(f"F1 Score: {f1:.2f}")

# ROC-AUC Score
y_pred_proba = svm_classifier.predict_proba(X_test)[:, 1]
roc_auc = roc_auc_score(y_test, y_pred_proba)
print(f"ROC AUC Score: {roc_auc:.2f}")

# Log Loss
logloss = log_loss(y_test, y_pred_proba)
print(f"Log Loss: {logloss:.2f}")

Confusion Matrix:
 [[1847   59]
 [ 134 1755]]
Precision: 0.97
Recall: 0.93
F1 Score: 0.95
ROC AUC Score: 0.98
Log Loss: 0.15
