In [3]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, RobustScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
import joblib
import warnings
warnings.filterwarnings('ignore')

def load_data():
    print("🎯 Chargement des données")
    
    trans = pd.read_csv('C:\\Users\\ariel\\Desktop\\unh\\projet-annuel\\dataset\\train_transaction.csv')
    ident = pd.read_csv('C:\\Users\\ariel\\Desktop\\unh\\projet-annuel\\dataset\\train_identity.csv')
    data = pd.merge(trans, ident, on='TransactionID', how='left')
    
    fraud_data = data[data['isFraud'] == 1]
    n_samples = len(fraud_data)
    non_fraud_sample = data[data['isFraud'] == 0].sample(n=n_samples, random_state=42)
    
    balanced_data = pd.concat([fraud_data, non_fraud_sample])
    return balanced_data

def create_features(df):
    print("⚡ Création des features")
    
    df = df.copy()
    
    if 'TransactionDT' in df.columns:
        df['TransactionDT'] = pd.to_datetime(df['TransactionDT'], unit='s', errors='coerce')
        df['hour'] = df['TransactionDT'].dt.hour.fillna(-1)
        df['weekday'] = df['TransactionDT'].dt.weekday.fillna(-1)
        df['is_weekend'] = (df['weekday'] >= 5).astype(int)
    
    if 'TransactionAmt' in df.columns:
        df['TransactionAmt_log'] = np.log1p(df['TransactionAmt'])
        df['amount_high'] = (df['TransactionAmt'] > df['TransactionAmt'].quantile(0.8)).astype(int)
    
    v_important = ['V257', 'V244', 'V242', 'V246', 'V258', 'V52', 'V51', 'V87', 'V201', 'V86',
                   'V14', 'V12', 'V13', 'V11', 'V10', 'V1', 'V2', 'V3', 'V4']
    
    for col in v_important:
        if col in df.columns:
            df[col] = df[col].fillna(df[col].median())
    
    c_cols = ['C1', 'C2', 'C3', 'C4', 'C5', 'C6', 'C7', 'C8', 'C9', 'C10', 'C11', 'C12', 'C13', 'C14']
    d_cols = ['D1', 'D2', 'D3', 'D4', 'D5', 'D10']
    
    for col in c_cols + d_cols:
        if col in df.columns:
            df[col] = df[col].fillna(df[col].median())
    
    card_cols = ['card1', 'card2', 'card3', 'card5']
    for col in card_cols:
        if col in df.columns:
            df[col] = df[col].fillna(-1)
    
    feature_cols = [
        'TransactionAmt', 'TransactionAmt_log', 'amount_high',
        'hour', 'weekday', 'is_weekend',
        'card1', 'card2', 'card3', 'card5',
        'C1', 'C2', 'C3', 'C4', 'C5', 'C6', 'C7', 'C8', 'C9', 'C10', 'C11', 'C12', 'C13', 'C14',
        'D1', 'D2', 'D3', 'D4', 'D5', 'D10'
    ] + v_important
    
    final_cols = ['isFraud'] + [col for col in feature_cols if col in df.columns]
    df = df[final_cols]
    df = df.fillna(df.median())
    
    return df

def create_ensemble():
    xgb = XGBClassifier(
        n_estimators=1000,
        max_depth=7,
        learning_rate=0.02,
        subsample=0.8,
        colsample_bytree=0.8,
        reg_alpha=0.1,
        reg_lambda=0.1,
        use_label_encoder=False,
        eval_metric='logloss',
        random_state=42,
        n_jobs=-1
    )
    
    lgb = LGBMClassifier(
        n_estimators=1000,
        max_depth=7,
        learning_rate=0.02,
        subsample=0.8,
        colsample_bytree=0.8,
        reg_alpha=0.1,
        reg_lambda=0.1,
        random_state=42,
        n_jobs=-1,
        verbose=-1
    )
    
    rf = RandomForestClassifier(
        n_estimators=500,
        max_depth=15,
        min_samples_split=15,
        min_samples_leaf=8,
        max_features='sqrt',
        random_state=42,
        n_jobs=-1
    )
    
    ensemble = VotingClassifier(
        estimators=[('xgb', xgb), ('lgb', lgb), ('rf', rf)],
        voting='soft',
        weights=[2, 2, 1]
    )
    
    return ensemble

def find_best_threshold(y_true, y_proba):
    thresholds = np.linspace(0.35, 0.55, 200)
    best_threshold = 0.44
    best_score = -1
    
    for threshold in thresholds:
        y_pred = (y_proba >= threshold).astype(int)
        
        acc = accuracy_score(y_true, y_pred)
        prec = precision_score(y_true, y_pred, zero_division=0)
        rec = recall_score(y_true, y_pred, zero_division=0)
        f1 = f1_score(y_true, y_pred, zero_division=0)
        
        if all(m >= 0.90 for m in [acc, prec, rec, f1]):
            return threshold, {'accuracy': acc, 'precision': prec, 'recall': rec, 'f1': f1}
        
        current_min = min(acc, prec, rec, f1)
        score = current_min + (np.mean([acc, prec, rec, f1]) * 0.3)
        
        if score > best_score:
            best_score = score
            best_threshold = threshold
    
    return best_threshold, {}

def main():
    print("🚀 Entraînement du modèle de détection de fraude")
    
    df = load_data()
    df = create_features(df)
    
    features = [col for col in df.columns if col != 'isFraud']
    X = df[features].values
    y = df['isFraud'].values
    
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, stratify=y, random_state=42, shuffle=True
    )
    
    scaler = RobustScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)
    
    ensemble = create_ensemble()
    ensemble.fit(X_train_scaled, y_train)
    
    y_proba = ensemble.predict_proba(X_test_scaled)[:, 1]
    optimal_threshold, _ = find_best_threshold(y_test, y_proba)
    
    y_pred_final = (y_proba >= optimal_threshold).astype(int)
    
    acc = accuracy_score(y_test, y_pred_final)
    prec = precision_score(y_test, y_pred_final, zero_division=0)
    rec = recall_score(y_test, y_pred_final, zero_division=0)
    f1 = f1_score(y_test, y_pred_final, zero_division=0)
    auc = roc_auc_score(y_test, y_proba)
    
    print(f"\n📈 RÉSULTATS FINAUX:")
    print(f"Accuracy:  {acc:.4f}")
    print(f"Precision: {prec:.4f}")
    print(f"Recall:    {rec:.4f}")
    print(f"F1-score:  {f1:.4f}")
    print(f"ROC AUC:   {auc:.4f}")
    
    print(f"\n🎯 Seuil optimal: {optimal_threshold:.4f}")
    print(f"📊 Matrice de confusion:")
    print(confusion_matrix(y_test, y_pred_final))
    
    joblib.dump({
        'model': ensemble,
        'scaler': scaler,
        'features': features,
        'optimal_threshold': optimal_threshold
    }, 'fraud_model_github_poids.pkl')
    
    print("✅ Modèle sauvegardé!")

if __name__ == "__main__":
    main()

🚀 Entraînement du modèle de détection de fraude
🎯 Chargement des données
⚡ Création des features

📈 RÉSULTATS FINAUX:
Accuracy:  0.8524
Precision: 0.8529
Recall:    0.8517
F1-score:  0.8523
ROC AUC:   0.9285

🎯 Seuil optimal: 0.4405
📊 Matrice de confusion:
[[3526  607]
 [ 613 3520]]
✅ Modèle sauvegardé!
