In [10]:
import pandas as pd

In [11]:
df_fusionné = pd.read_csv('df_fusionné.csv')

In [12]:
df_fusionné_binaire = df_fusionné.sample(frac=0.2)
df_fusionné_binaire.iloc[:, 0] = df_fusionné_binaire.iloc[:, 0].apply(lambda x: 0 if x == 'N' else 1)

In [13]:
# Count the occurrences of each class
counts = df_fusionné_binaire.iloc[:, 0].value_counts()

# Find the number of instances in the minority class
min_count = counts.min()

# Create balanced DataFrame through undersampling
df_balanced_under = pd.concat([
    df_fusionné_binaire[df_fusionné_binaire.iloc[:, 0] == 0].sample(min_count),
    df_fusionné_binaire[df_fusionné_binaire.iloc[:, 0] == 1].sample(min_count)
])

# Shuffle the DataFrame to mix the classes
df_balanced_under = df_balanced_under.sample(frac=1)


In [14]:
# Count occurrences of 0 and 1 in the first column
value_counts = df_balanced_under.iloc[:, 0].value_counts()

# Display the counts
print(value_counts)

ColumnName
1    7612
0    7612
Name: count, dtype: int64


In [15]:
import pandas as pd
from sklearn.model_selection import train_test_split

X = df_balanced_under.iloc[:, 1:]  
y = df_balanced_under.iloc[:, 0] 

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [16]:
print(y_train.unique())

[0 1]


In [17]:
y_train = y_train.astype(int)
y_test = y_test.astype(int)

In [18]:
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report, accuracy_score
from sklearn.svm import SVC


# Define the parameter grid: 
param_grid = {
    'C': [0.01, 0.1, 1, 10],  # More expansive range for regularization parameter
    'kernel': ['rbf'],  # Expanded set of kernels to include 'linear' and 'sigmoid'
    'gamma': ['scale'],  # Broader range for gamma, including specific numerical values
    'degree': [1, 2, 3],  # Extended range for degree, applicable only for 'poly' kernel
}

# Initialize the SVM classifier
svm_classifier = SVC(random_state=42)

# Initialize the GridSearchCV object
grid_search = GridSearchCV(svm_classifier, param_grid, scoring='f1', cv=3, verbose=2, n_jobs=-1)

# Fit the grid search to the data
grid_search.fit(X_train, y_train)

# Print the best parameters and the best score
print("Best parameters found: ", grid_search.best_params_)
print("Best accuracy found: ", grid_search.best_score_)

# Evaluate on the test set with the best parameters
best_clf = grid_search.best_estimator_
y_pred = best_clf.predict(X_test)

print("Accuracy on test set:", accuracy_score(y_test, y_pred))
print("Classification report:")
print(classification_report(y_test, y_pred))

Fitting 3 folds for each of 12 candidates, totalling 36 fits
[CV] END ...........C=0.1, degree=1, gamma=scale, kernel=rbf; total time=  34.6s
[CV] END ...........C=0.1, degree=1, gamma=scale, kernel=rbf; total time=  34.9s
[CV] END ..........C=0.01, degree=2, gamma=scale, kernel=rbf; total time=  38.8s
[CV] END ..........C=0.01, degree=1, gamma=scale, kernel=rbf; total time=  39.0s
[CV] END ..........C=0.01, degree=2, gamma=scale, kernel=rbf; total time=  39.1s
[CV] END ..........C=0.01, degree=3, gamma=scale, kernel=rbf; total time=  39.0s
[CV] END ..........C=0.01, degree=3, gamma=scale, kernel=rbf; total time=  39.1s
[CV] END ..........C=0.01, degree=2, gamma=scale, kernel=rbf; total time=  39.5s
[CV] END ..........C=0.01, degree=1, gamma=scale, kernel=rbf; total time=  39.4s
[CV] END ..........C=0.01, degree=1, gamma=scale, kernel=rbf; total time=  39.9s
[CV] END ..........C=0.01, degree=3, gamma=scale, kernel=rbf; total time=  39.9s
[CV] END ...........C=0.1, degree=1, gamma=scale

In [19]:
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.metrics import classification_report, accuracy_score

svm_classifier = SVC(kernel='rbf',C=10, degree=1, gamma='scale', probability=True)

svm_classifier.fit(X_train, y_train)

In [24]:
import pickle

with open('svm_classifier.pkl', 'wb') as file:
    pickle.dump(svm_classifier, file)

print("Model saved successfully.")

Model saved successfully.


In [25]:
import pickle

with open('svm_classifier.pkl', 'rb') as file:
    svm_classifier = pickle.load(file)

# Assuming this print statement is for demonstration that the model was loaded successfully
print("Model loaded successfully.")


Model loaded successfully.


In [20]:
from sklearn.metrics import confusion_matrix, precision_score, recall_score, f1_score, roc_auc_score, roc_curve, log_loss

# Prédiction sur l'ensemble de test
y_pred = svm_classifier.predict(X_test)

In [21]:
prediction = svm_classifier.predict(X_test)

prediction = pd.DataFrame(prediction)
number = prediction.value_counts()
number

0
0    1871
1    1174
Name: count, dtype: int64

In [22]:
# Confusion Matrix
conf_matrix = confusion_matrix(y_test, y_pred)
print("Confusion Matrix:\n", conf_matrix)

# Precision
precision = precision_score(y_test, y_pred)
print(f"Precision: {precision:.2f}")

# Recall
recall = recall_score(y_test, y_pred)
print(f"Recall: {recall:.2f}")

# F1 Score
f1 = f1_score(y_test, y_pred)
print(f"F1 Score: {f1:.2f}")

# ROC-AUC Score
y_pred_proba = svm_classifier.predict_proba(X_test)[:, 1]
roc_auc = roc_auc_score(y_test, y_pred_proba)
print(f"ROC AUC Score: {roc_auc:.2f}")

# Log Loss
logloss = log_loss(y_test, y_pred_proba)
print(f"Log Loss: {logloss:.2f}")

Confusion Matrix:
 [[1360  174]
 [ 511 1000]]
Precision: 0.85
Recall: 0.66
F1 Score: 0.74
ROC AUC Score: 0.86
Log Loss: 0.49
