<a href="https://colab.research.google.com/github/aymenchibouti/newversion/blob/master/xgb_rand_logi_lime_shap_res_best.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install shap
!pip install lime
!pip install xgboost

Collecting lime
  Downloading lime-0.2.0.1.tar.gz (275 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m275.7/275.7 kB[0m [31m5.3 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: lime
  Building wheel for lime (setup.py) ... [?25l[?25hdone
  Created wheel for lime: filename=lime-0.2.0.1-py3-none-any.whl size=283834 sha256=65500d793900823c1b06a602e97967fe741c9c031380639c8f80265f584583d9
  Stored in directory: /root/.cache/pip/wheels/85/fa/a3/9c2d44c9f3cd77cf4e533b58900b2bf4487f2a17e8ec212a3d
Successfully built lime
Installing collected packages: lime
Successfully installed lime-0.2.0.1


In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import shap
import lime
import lime.lime_tabular
import xgboost as xgb
import matplotlib.pyplot as plt
from imblearn.over_sampling import SMOTE
from sklearn.utils.class_weight import compute_class_weight

# Load the dataset
file_path = 'model1_210_features.csv'
data = pd.read_csv(file_path)

# Drop non-numeric columns that are not useful for prediction
data = data.drop(columns=['username', 'course_id', 'enrollment_id'])

# Handle missing values (fill with 0 or use mean/median imputation as necessary)
data.fillna(0, inplace=True)

# Separate features and target variable
X = data.drop(columns=['dropout'])  # Features
y = data['dropout']  # Target variable

# Standardize the features (important for models like Logistic Regression and XGBoost)
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Handle class imbalance using SMOTE (oversampling the minority class)
smote = SMOTE(sampling_strategy='auto', random_state=42)
X_resampled, y_resampled = smote.fit_resample(X_scaled, y)

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, test_size=0.2, random_state=42)

# RandomizedSearchCV for hyperparameter tuning (for Random Forest)
param_dist_rf = {
    'n_estimators': [50, 100, 200],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'bootstrap': [True, False]
}

rf = RandomForestClassifier(random_state=42)
rf_random_search = RandomizedSearchCV(estimator=rf, param_distributions=param_dist_rf, n_iter=100, cv=3, random_state=42)
rf_random_search.fit(X_train, y_train)

# XGBoost RandomizedSearchCV (hyperparameter tuning for XGBoost)
param_dist_xgb = {
    'max_depth': [3, 6, 9],
    'learning_rate': [0.01, 0.1, 0.3],
    'n_estimators': [50, 100, 150],
    'subsample': [0.8, 0.9, 1.0],
    'colsample_bytree': [0.8, 0.9, 1.0]
}

xgb_model = xgb.XGBClassifier(random_state=42)
xgb_random_search = RandomizedSearchCV(estimator=xgb_model, param_distributions=param_dist_xgb, n_iter=100, cv=3, random_state=42)
xgb_random_search.fit(X_train, y_train)

# Logistic Regression with class weight to handle imbalance
lr = LogisticRegression(max_iter=1000, class_weight='balanced')
lr.fit(X_train, y_train)

# Best models from RandomizedSearchCV
best_rf = rf_random_search.best_estimator_
best_xgb = xgb_random_search.best_estimator_

# Predictions and evaluation for each model
models = [lr, best_rf, best_xgb]
model_names = ['Logistic Regression', 'Random Forest (Tuned)', 'XGBoost (Tuned)']

for model, name in zip(models, model_names):
    y_pred = model.predict(X_test)
    print(f"Evaluation for {name}:")
    print(f"Accuracy: {accuracy_score(y_test, y_pred):.4f}")
    print(f"Precision: {precision_score(y_test, y_pred):.4f}")
    print(f"Recall: {recall_score(y_test, y_pred):.4f}")
    print(f"F1 Score: {f1_score(y_test, y_pred):.4f}")
    print("-" * 50)

# Explainability using SHAP (SHAP can work with tree-based models like RandomForest and XGBoost)
explainer_rf = shap.TreeExplainer(best_rf)
shap_values_rf = explainer_rf.shap_values(X_test)

# SHAP summary plot for Random Forest
shap.summary_plot(shap_values_rf, X_test, feature_names=X.columns)

# LIME - Local Interpretable Model-Agnostic Explanations
explainer_lime = lime.lime_tabular.LimeTabularExplainer(
    training_data=X_train,
    training_labels=y_train,
    mode="classification",
    feature_names=X.columns,
    class_names=["No Dropout", "Dropout"],
    verbose=True,
    random_state=42
)

# Pick a single instance for LIME explanation
instance = X_test[0]  # First instance in the test set
explanation_lime = explainer_lime.explain_instance(instance, best_xgb.predict_proba)

# Visualize LIME explanation
explanation_lime.show_in_notebook()

# Feature Importance (using Random Forest)
feature_importance = best_rf.feature_importances_
# Visualize the feature importance
plt.figure(figsize=(10, 8))
plt.barh(X.columns, feature_importance)
plt.title("Feature Importance (Random Forest - Tuned)")
plt.xlabel("Importance")
plt.ylabel("Feature")
plt.show()


Evaluation for Logistic Regression:
Accuracy: 0.7635
Precision: 0.7125
Recall: 0.8836
F1 Score: 0.7889
--------------------------------------------------
Evaluation for Random Forest (Tuned):
Accuracy: 0.8848
Precision: 0.8644
Recall: 0.9129
F1 Score: 0.8880
--------------------------------------------------
Evaluation for XGBoost (Tuned):
Accuracy: 0.8798
Precision: 0.8351
Recall: 0.9465
F1 Score: 0.8873
--------------------------------------------------
