In [77]:
from sklearn.cluster import KMeans
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
from scipy.stats import chi2_contingency
import warnings
import re

warnings.filterwarnings('ignore')

In [78]:
df_X = pd.read_csv('clean_data.csv')
df_Y = pd.read_csv('Y_train_2_XPXJDyy.csv')
df_id=pd.read_csv('X_train_G3tdtEn.csv')

In [79]:
df_X = df_X.drop('Unnamed: 0', axis=1)

# Utiliser la colonne "ID" comme nouvel index
df_X = df_X.set_index(df_id['ID'])

df_Y=df_Y.drop('index', axis=1)
df_Y = df_Y.set_index(df_Y['ID'])

df_Y=df_Y.drop('ID', axis=1)

In [80]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(df_X, df_Y, random_state=42)

In [81]:
from imblearn.under_sampling import RandomUnderSampler
undersampler = RandomUnderSampler(random_state=42,sampling_strategy=0.75)
X_train, y_train = undersampler.fit_resample(X_train, y_train)

### Decision Tree

In [100]:
from sklearn.tree import DecisionTreeClassifier
tree_clf = DecisionTreeClassifier(max_depth=2, random_state=42)
tree_clf.fit(X_train, y_train)
y_pred = tree_clf.predict(X_test)

# Calculer la précision du classifieur sur les données de test
accuracy = accuracy_score(y_test, y_pred)
print("Précision : {:.2f}%".format(accuracy * 100))

Précision : 59.70%


### Random Forest

In [82]:
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.ensemble import BaggingClassifier 
from sklearn.tree import DecisionTreeClassifier

In [83]:
bag_clf = BaggingClassifier(
    DecisionTreeClassifier(splitter="random", max_leaf_nodes=16, random_state=42),
    n_estimators=500, max_samples=1.0, bootstrap=True, random_state=42)

In [84]:
bag_clf.fit(X_train, y_train)
y_pred = bag_clf.predict(X_test)
accuracy_score(y_test, y_pred)

0.7520475903095094

In [67]:
df = y_test.reset_index()
df1 = pd.DataFrame({ 
                    'ID': df['ID'], 
                    'pred_proba': y_pred})
df = df.rename(columns={'index': 'ID'})
pr_auc_score(df, df1)

0.035343050624743806

#### Test de cross Validation

In [68]:
bag_clf = BaggingClassifier(
    DecisionTreeClassifier(splitter="random", max_leaf_nodes=16, random_state=42),
    n_estimators=500, max_samples=1.0, bootstrap=True, random_state=42)

# Calculer les scores de validation croisée sur l'ensemble d'entraînement
scores = cross_val_score(bag_clf, X_train, y_train, cv=5)

# Afficher les scores moyens et l'écart-type
print("Scores de validation croisée : ", scores)
print("Score moyen : ", np.mean(scores))
print("Écart-type : ", np.std(scores))

Scores de validation croisée :  [0.76703297 0.75384615 0.72246696 0.7753304  0.75330396]
Score moyen :  0.7543960884930048
Écart-type :  0.01799388971922286


## Mise en place du voting soft classifer

In [69]:
from sklearn.ensemble import VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC

In [70]:
log_clf = LogisticRegression(solver="lbfgs", random_state=42)
rnd_clf = RandomForestClassifier(n_estimators=100, random_state=42)

svm_clf = SVC(kernel='rbf',gamma="scale", probability=True, random_state=42)

voting_clf = VotingClassifier(
    estimators=[('lr', log_clf), ('rf', rnd_clf), ('svc', svm_clf)],
    voting='soft')
voting_clf.fit(X_train, y_train)

In [101]:
from sklearn.metrics import accuracy_score

for clf in (log_clf, rnd_clf, svm_clf, voting_clf):
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    df = y_test.reset_index()
    df1 = pd.DataFrame({ 
                    'ID': df['ID'], 
                    'pred_proba': y_pred})
    df1 = df.rename(columns={'index': 'ID'})
    #print(clf.__class__.__name__, pr_auc_score(df, df1))
    print(clf.__class__.__name__, accuracy_score(y_test, y_pred))

LogisticRegression 0.5134063281317355
RandomForestClassifier 0.780498318820588
SVC 0.8220536253125269
VotingClassifier 0.7803258901629451


In [41]:
from sklearn.metrics import average_precision_score
def pr_auc_score(y_true, y_pred_proba):
    ''' 
    Return the area under the Precision-Recall curve.
    
    Args:
        - y_true (pd.DataFrame): Dataframe with a unique identifier for each observation (first column) and the ground truth observations (second column).
        - y_pred_proba (pd.DataFrame): Dataframe with a unique identifier for each observation (first column) and the predicted probabilities estimates for the minority class (second column).
        
    Returns:
        float
    '''
    
    y_true_sorted = y_true.sort_values(by='ID').reset_index(drop=True)
    y_pred_proba_sorted = y_pred_proba.sort_values(by='ID').reset_index(drop=True)
    pr_auc_score = average_precision_score(np.ravel(y_true_sorted.iloc[:, 1]), np.ravel(y_pred_proba_sorted.iloc[:, 1]))

    return pr_auc_score