<a href="https://colab.research.google.com/github/aymenchibouti/doctorat/blob/main/xai_deepseek.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
!pip install catboost
!pip install shap

Collecting catboost
  Downloading catboost-1.2.8-cp311-cp311-manylinux2014_x86_64.whl.metadata (1.2 kB)
Downloading catboost-1.2.8-cp311-cp311-manylinux2014_x86_64.whl (99.2 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m99.2/99.2 MB[0m [31m7.2 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: catboost
Successfully installed catboost-1.2.8


In [None]:
# Import required libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import (accuracy_score, classification_report, confusion_matrix,
                            roc_auc_score, roc_curve, precision_recall_curve,
                            average_precision_score, f1_score, make_scorer)
from imblearn.over_sampling import SMOTE, ADASYN
from imblearn.pipeline import Pipeline as imbpipeline
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier
import shap
from sklearn.inspection import PartialDependenceDisplay
import warnings
warnings.filterwarnings('ignore')

# Load and preprocess data
def load_and_preprocess(file_path):
    """Load the dataset and perform initial preprocessing"""
    data = pd.read_csv(file_path)

    # Drop identifier columns
    data = data.drop(['username', 'enrollment_id', 'course_id'], axis=1)

    # Check class distribution
    print("Class distribution:")
    print(data['dropout'].value_counts(normalize=True))

    return data

# Feature engineering
def create_features(data):
    """Create additional features from the raw data"""
    X = data.drop('dropout', axis=1)
    y = data['dropout']

    activities = ['access', 'problem', 'wiki', 'discussion', 'navigate', 'page_close', 'video']

    # Weekly aggregates
    for activity in activities:
        for week in range(4):
            start_day = week * 7 + 1
            end_day = (week + 1) * 7
            cols = [f'day_{d}_{activity}' for d in range(start_day, end_day+1)]
            X[f'week_{week+1}_{activity}_sum'] = X[cols].sum(axis=1)
            X[f'week_{week+1}_{activity}_mean'] = X[cols].mean(axis=1)

        # Total activity features
        cols = [f'day_{d}_{activity}' for d in range(1, 31)]
        X[f'total_{activity}_sum'] = X[cols].sum(axis=1)
        X[f'total_{activity}_mean'] = X[cols].mean(axis=1)

    # Activity duration features
    for activity in activities:
        activity_cols = [f'day_{d}_{activity}' for d in range(1, 31)]
        # Last day with activity
        last_active = X[activity_cols].gt(0).idxmax(axis=1)
        X[f'last_active_day_{activity}'] = last_active.str.extract('(\d+)').astype(float)
        # Days since last activity
        X[f'days_since_{activity}'] = 30 - X[f'last_active_day_{activity}']

    # Engagement pattern features
    X['total_engagement'] = X[[f'total_{act}_sum' for act in activities]].sum(axis=1)
    X['engagement_variance'] = X[[f'total_{act}_sum' for act in activities]].var(axis=1)

    return X, y

# Model evaluation
def evaluate_model(model, X_test, y_test, model_name=""):
    """Evaluate model performance"""
    y_pred = model.predict(X_test)
    y_proba = model.predict_proba(X_test)[:, 1]

    print(f"\n{model_name} Performance:")
    print("Accuracy:", accuracy_score(y_test, y_pred))
    print("Classification Report:")
    print(classification_report(y_test, y_pred))
    print(f"ROC AUC: {roc_auc_score(y_test, y_proba):.4f}")
    print(f"Average Precision: {average_precision_score(y_test, y_proba):.4f}")
    print("Confusion Matrix:")
    print(confusion_matrix(y_test, y_pred))

    return {
        'accuracy': accuracy_score(y_test, y_pred),
        'roc_auc': roc_auc_score(y_test, y_proba),
        'f1': f1_score(y_test, y_pred),
        'precision': average_precision_score(y_test, y_proba)
    }

# Hyperparameter tuning with cross-validation
def tune_model(X, y, model, param_grid, scoring='accuracy'):
    """Perform hyperparameter tuning with cross-validation"""
    cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

    # Create pipeline with SMOTE and scaler
    pipeline = imbpipeline([
        ('scaler', StandardScaler()),
        ('sampling', SMOTE(random_state=42)),
        ('model', model)
    ])

    grid_search = GridSearchCV(
        estimator=pipeline,
        param_grid={'model__' + key: value for key, value in param_grid.items()},
        scoring=scoring,
        cv=cv,
        n_jobs=-1,
        verbose=1
    )

    grid_search.fit(X, y)

    print("\nBest parameters found:")
    print(grid_search.best_params_)
    print(f"Best {scoring}: {grid_search.best_score_:.4f}")

    return grid_search.best_estimator_

# Main execution
def main():
    # Load and preprocess data
    data = load_and_preprocess('model1_210_features.csv')

    # Feature engineering
    X, y = create_features(data)

    # Split data
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42, stratify=y
    )

    # Define models and parameter grids for tuning
    models = {
        'RandomForest': {
            'model': RandomForestClassifier(class_weight='balanced', random_state=42),
            'params': {
                'n_estimators': [100, 200],
                'max_depth': [None, 10, 20],
                'min_samples_split': [2, 5],
                'max_features': ['sqrt', 'log2']
            }
        },
        'XGBoost': {
            'model': XGBClassifier(random_state=42, eval_metric='logloss'),
            'params': {
                'learning_rate': [0.01, 0.1],
                'max_depth': [3, 5, 7],
                'n_estimators': [100, 200],
                'subsample': [0.8, 1.0],
                'gamma': [0, 0.1]
            }
        },
        'LightGBM': {
            'model': LGBMClassifier(random_state=42),
            'params': {
                'learning_rate': [0.01, 0.1],
                'num_leaves': [31, 63],
                'max_depth': [-1, 5, 10],
                'n_estimators': [100, 200],
                'min_child_samples': [20, 50]
            }
        },
        'CatBoost': {
            'model': CatBoostClassifier(random_state=42, verbose=0),
            'params': {
                'iterations': [100, 200],
                'depth': [4, 6, 8],
                'learning_rate': [0.01, 0.1],
                'l2_leaf_reg': [1, 3]
            }
        }
    }

    # Scale positive weight for boosting algorithms
    scale_pos_weight = len(y_train[y_train==0])/len(y_train[y_train==1])

    # Train and evaluate models
    best_accuracy = 0
    best_model = None
    results = {}

    for name, config in models.items():
        print(f"\n{'='*50}")
        print(f"Training and tuning {name}")
        print(f"{'='*50}")

        # Adjust for class imbalance in boosting algorithms
        if name in ['XGBoost', 'LightGBM']:
            config['model'].set_params(scale_pos_weight=scale_pos_weight)

        # Tune model
        model = tune_model(X_train, y_train, config['model'], config['params'], scoring='accuracy')

        # Evaluate on test set
        test_results = evaluate_model(model, X_test, y_test, name)
        results[name] = test_results

        # Track best model
        if test_results['accuracy'] > best_accuracy:
            best_accuracy = test_results['accuracy']
            best_model = model

    # Print summary of results
    print("\n\nModel Comparison:")
    for name, res in results.items():
        print(f"{name}: Accuracy = {res['accuracy']:.4f}, ROC AUC = {res['roc_auc']:.4f}")

    # Explain best model
    print(f"\nBest model: {type(best_model.named_steps['model']).__name__}")
    print(f"Best accuracy: {best_accuracy:.4f}")

    # SHAP explanation
    print("\nGenerating SHAP explanations...")
    explainer = shap.TreeExplainer(best_model.named_steps['model'])
    X_train_scaled = best_model.named_steps['scaler'].transform(X_train)
    shap_values = explainer.shap_values(X_train_scaled)

    plt.figure(figsize=(10, 8))
    shap.summary_plot(shap_values, X_train_scaled, feature_names=X.columns, plot_type="bar")
    plt.title("Feature Importance (SHAP)")
    plt.tight_layout()
    plt.show()

    # Feature importance
    if hasattr(best_model.named_steps['model'], 'feature_importances_'):
        feature_importance = best_model.named_steps['model'].feature_importances_
        sorted_idx = np.argsort(feature_importance)

        plt.figure(figsize=(10, 12))
        plt.barh(range(len(sorted_idx[-20:])), feature_importance[sorted_idx[-20:]])
        plt.yticks(range(len(sorted_idx[-20:])), np.array(X.columns)[sorted_idx[-20:]])
        plt.xlabel("Feature Importance")
        plt.title("Top 20 Feature Importances")
        plt.tight_layout()
        plt.show()

    # Partial dependence plots
    top_features = np.array(X.columns)[sorted_idx[-5:]]
    print("\nPartial Dependence Plots for Top Features:", top_features)

    plt.figure(figsize=(12, 8))
    PartialDependenceDisplay.from_estimator(
        best_model,
        X_train,
        features=top_features,
        feature_names=X.columns,
        grid_resolution=20
    )
    plt.suptitle("Partial Dependence Plots")
    plt.tight_layout()
    plt.show()

    # Threshold optimization
    print("\nOptimizing decision threshold...")
    y_proba = best_model.predict_proba(X_test)[:, 1]

    thresholds = np.linspace(0.1, 0.9, 50)
    accuracies = []

    for thresh in thresholds:
        y_pred_thresh = (y_proba >= thresh).astype(int)
        accuracies.append(accuracy_score(y_test, y_pred_thresh))

    best_thresh = thresholds[np.argmax(accuracies)]
    print(f"Best threshold: {best_thresh:.2f} with accuracy: {max(accuracies):.4f}")

    # Evaluate with optimal threshold
    y_pred_opt = (y_proba >= best_thresh).astype(int)
    print("\nPerformance with optimized threshold:")
    print(classification_report(y_test, y_pred_opt))
    print("Confusion Matrix:")
    print(confusion_matrix(y_test, y_pred_opt))

if __name__ == "__main__":
    main()

Class distribution:
dropout
1    0.792927
0    0.207073
Name: proportion, dtype: float64

Training and tuning RandomForest
Fitting 5 folds for each of 24 candidates, totalling 120 fits

Best parameters found:
{'model__max_depth': 20, 'model__max_features': 'sqrt', 'model__min_samples_split': 5, 'model__n_estimators': 200}
Best accuracy: 0.8545

RandomForest Performance:
Accuracy: 0.8498900825417893
Classification Report:
              precision    recall  f1-score   support

           0       0.64      0.61      0.63      4992
           1       0.90      0.91      0.91     19117

    accuracy                           0.85     24109
   macro avg       0.77      0.76      0.77     24109
weighted avg       0.85      0.85      0.85     24109

ROC AUC: 0.8438
Average Precision: 0.9371
Confusion Matrix:
[[ 3059  1933]
 [ 1686 17431]]

Training and tuning XGBoost
Fitting 5 folds for each of 48 candidates, totalling 240 fits
