In [1]:
from utils import *

In [2]:
x, y, x_train, x_test, y_train, y_test, variables = preparer_donnees('credit_scoring.csv')

print("Variables utilisées pour l'entrainement :", variables)
print("Dimensions de x_train:", x_train.shape)
print("Dimensions de y_train:", y_train.shape)
print("Dimensions de x_test:", x_test.shape)
print("Dimensions de y_test:", y_test.shape)

Répartition des données : 72.21% positives, 27.79% négatives
Données prêtes pour l'entrainement...
Variables utilisées pour l'entrainement : ['Seniority' 'Home' 'Time' 'Age' 'Marital' 'Records' 'Job' 'Expenses'
 'Income' 'Assets' 'Debt' 'Amount' 'Price']
Dimensions de x_train: (2187, 13)
Dimensions de y_train: (2187,)
Dimensions de x_test: (2188, 13)
Dimensions de y_test: (2188,)


## Comparaison train test

In [3]:
X_train, X_test, Y_train, Y_test = decoupage_train_test(x, y, 0.33)
apprentissage_train_test(X_train, X_test, Y_train, Y_test, clfs)

MLP -> Test Score: 73.36%
[[  37  364]
 [  25 1018]]

DT -> Test Score: 77.33%
[[212 189]
 [201 842]]

KNN -> Test Score: 73.76%
[[125 276]
 [143 900]]

CART -> Test Score: 79.45%
[[218 183]
 [162 881]]

ID3 -> Test Score: 80.06%
[[229 172]
 [167 876]]

Bagging -> Test Score: 77.97%
[[129 272]
 [ 53 990]]

AdaBoost -> Test Score: 79.34%
[[217 184]
 [163 880]]

RandomForest -> Test Score: 75.40%
[[  65  336]
 [  18 1025]]



In [4]:
X_train_norm, X_test_norm = normalisation(x_train, x_test)
apprentissage_train_test(X_train_norm, X_test_norm, y_train, y_test, clfs)

MLP -> Test Score: 79.60%
[[ 348  256]
 [ 278 1306]]

DT -> Test Score: 77.27%
[[ 322  282]
 [ 315 1269]]

KNN -> Test Score: 78.17%
[[ 286  318]
 [ 223 1361]]

CART -> Test Score: 76.73%
[[ 167  437]
 [  87 1497]]

ID3 -> Test Score: 76.58%
[[ 167  437]
 [  92 1492]]

Bagging -> Test Score: 78.13%
[[ 200  404]
 [  88 1496]]

AdaBoost -> Test Score: 77.81%
[[ 299  305]
 [ 257 1327]]

RandomForest -> Test Score: 75.84%
[[  98  506]
 [  17 1567]]



## Comparaison CV

In [5]:
def apprentissage_CV(X, Y, clfs):
    """
    Évalue plusieurs classifieurs avec validation croisée 10-fold
    Retourne le meilleur classifieur et son score
    
    Args:
        X: Features
        Y: Target
        clfs: Dictionnaire de classifieurs
        
    Returns:
        tuple: (nom_meilleur_classifieur, meilleur_score_moyen)
    """
    kf = KFold(n_splits=10, shuffle=True, random_state=0)
    
    best_score = 0
    best_clf_name = ""
    
    for i in clfs:
        clf = clfs[i]
        
        # Validation croisée avec notre score personnalisé
        cv_scores = cross_val_score(
            clf, X, Y, 
            cv = kf, 
            scoring = monscore,  
            n_jobs = 1
        )
        
        mean_score = np.mean(cv_scores)
        
        print("Score de {0} est: {1:.3f}".format(clf, mean_score))
        
        if mean_score > best_score:
            best_score = mean_score
            best_clf_name = clf
    
    return best_clf_name, best_score

In [6]:
def comparaison_CV(X, Y, clfs):
    """
    Compare plusieurs classifieurs et retourne le meilleur
    
    Args:
        X: Features
        Y: Target
        clfs: Dictionnaire de classifieurs
        
    Returns:
        str: Nom du meilleur classifieur
    """
    
    kf = KFold(n_splits=10, shuffle=True, random_state=0)
    best_score = 0
    best_clf_name = ""
    
    scores_dict = {}
    
    for clf_name in clfs:
        clf = clfs[clf_name]
        
        cv_scores = cross_val_score(
            clf, X, Y, 
            cv=kf, 
            scoring=monscore,
            n_jobs=-1
        )
        
        mean_score = np.mean(cv_scores)
        std_score = np.std(cv_scores)
        
        scores_dict[clf_name] = mean_score
        
        print(f"{clf_name:15s}: {mean_score:.3f} ± {std_score:.3f}")
        
        if mean_score > best_score:
            best_score = mean_score
            best_clf_name = clf_name
    
    print(f"\nMeilleur classifieur: {best_clf_name} avec score: {best_score:.3f}")
    
    return best_clf_name

In [7]:
best_classifieur, best_score = apprentissage_CV(x, y, clfs)
best_classifieur_norm, best_score_norm = apprentissage_CV(X_train_norm, y, clfs)
meilleur_original = comparaison_CV(x, y, clfs)
meilleur_norm = comparaison_CV(X_train_norm, Y_train, clfs)

print("SYNTHÈSE DES RÉSULTATS")
print(f"Meilleur sur données originales: {best_classifieur}")
print(f"Score: {best_score:.3f}")

print(f"\nMeilleur sur données normalisées: {best_classifieur_norm}")
print(f"Score: {best_score_norm:.3f}")

print("\n" + "="*60)
print("CONCLUSION")
print("="*60)
if best_score > best_score_norm:
    print("✓ LES DONNÉES ORIGINALES DONNENT DE MEILLEURS RÉSULTATS")
    print(f"  Algorithme à retenir: {best_classifieur}")
    print(f"  Score: {best_score:.3f}")
    print("\n→ STRATÉGIE: Ne pas normaliser les données")
else:
    print("LES DONNÉES NORMALISÉES DONNENT DE MEILLEURS RÉSULTATS")
    print(f"  Algorithme à retenir: {best_classifieur_norm}")
    print(f"  Score: {best_score_norm:.3f}")
    print("\n→ STRATÉGIE: Normaliser les données")


Score de MLPClassifier(hidden_layer_sizes=(20, 10), max_iter=1000, random_state=1) est: 0.758
Score de DecisionTreeClassifier(random_state=1) est: 0.767
Score de KNeighborsClassifier(n_jobs=1) est: 0.741
Score de DecisionTreeClassifier(max_depth=3, random_state=1) est: 0.778
Score de DecisionTreeClassifier(criterion='entropy', max_depth=3, random_state=1) est: 0.769
Score de BaggingClassifier(estimator=DecisionTreeClassifier(max_depth=3, random_state=1),
                  n_estimators=200, n_jobs=1, random_state=1) est: 0.783
Score de AdaBoostClassifier(estimator=DecisionTreeClassifier(max_depth=3,
                                                    random_state=1),
                   n_estimators=200, random_state=1) est: 0.781
Score de RandomForestClassifier(max_depth=3, n_estimators=200, n_jobs=1, random_state=1) est: 0.756


ValueError: Found input variables with inconsistent numbers of samples: [2187, 4375]