In [5]:
%pip install pandas numpy sklearn-pandas lightgbm xgboost catboost optuna kagglehub
import pandas as pd
import numpy as np
from sklearn.model_selection import StratifiedKFold, cross_val_score
from sklearn.metrics import classification_report, accuracy_score
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.ensemble import VotingClassifier
from lightgbm import LGBMClassifier
from xgboost import XGBClassifier
from catboost import CatBoostClassifier
import optuna

# Download latest version
# === Download Dataset ===
path = kagglehub.dataset_download("thedevastator/higher-education-predictors-of-student-retention")
csv_file = [f for f in os.listdir(path) if f.endswith(".csv")][0]
df = pd.read_csv(os.path.join(path, csv_file))

Collecting lightgbm
  Using cached lightgbm-4.6.0-py3-none-win_amd64.whl.metadata (17 kB)
Collecting xgboost
  Using cached xgboost-3.0.0-py3-none-win_amd64.whl.metadata (2.1 kB)
Collecting catboost
  Using cached catboost-1.2.8-cp313-cp313-win_amd64.whl.metadata (1.5 kB)
Collecting optuna
  Using cached optuna-4.3.0-py3-none-any.whl.metadata (17 kB)
Collecting kagglehub
  Downloading kagglehub-0.3.12-py3-none-any.whl.metadata (38 kB)
Collecting graphviz (from catboost)
  Using cached graphviz-0.20.3-py3-none-any.whl.metadata (12 kB)
Collecting matplotlib (from catboost)
  Using cached matplotlib-3.10.1-cp313-cp313-win_amd64.whl.metadata (11 kB)
Collecting plotly (from catboost)
  Using cached plotly-6.0.1-py3-none-any.whl.metadata (6.7 kB)
Collecting alembic>=1.5.0 (from optuna)
  Using cached alembic-1.15.2-py3-none-any.whl.metadata (7.3 kB)
Collecting colorlog (from optuna)
  Using cached colorlog-6.9.0-py3-none-any.whl.metadata (10 kB)
Collecting sqlalchemy>=1.4.2 (from optuna)
  U

ERROR: Could not install packages due to an OSError: [WinError 32] The process cannot access the file because it is being used by another process: 'c:\\Users\\Ibrah\\AppData\\Local\\Programs\\Python\\Python313\\Lib\\site-packages\\PIL\\FpxImagePlugin.py'
Consider using the `--user` option or check the permissions.


[notice] A new release of pip is available: 24.3.1 -> 25.1
[notice] To update, run: python.exe -m pip install --upgrade pip


ModuleNotFoundError: No module named 'lightgbm'

In [25]:
# Encode categorical columns
label_encoders = {}
for col in df.select_dtypes(include=['object']).columns:
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col].astype(str))
    label_encoders[col] = le

# Features and labels
X = df.drop(columns=['Target'])
y = LabelEncoder().fit_transform(df['Target'])  # target must be numerical

# Scale
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

In [26]:
# === Optuna Tuning for LightGBM ===
def objective_lgbm(trial):
    params = {
        'max_depth': trial.suggest_int('max_depth', 3, 12),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.2),
        'n_estimators': trial.suggest_int('n_estimators', 100, 1500),
        'reg_alpha': trial.suggest_float('reg_alpha', 0, 5),
        'reg_lambda': trial.suggest_float('reg_lambda', 0, 5),
        'subsample': trial.suggest_float('subsample', 0.4, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.4, 1.0),
        'num_leaves': trial.suggest_int('num_leaves', 15, 100)
    }
    model = LGBMClassifier(**params)
    return cross_val_score(model, X_scaled, y, cv=3, scoring='f1_macro').mean()

study_lgbm = optuna.create_study(direction='maximize')
study_lgbm.optimize(objective_lgbm, n_trials=30)
best_lgbm = LGBMClassifier(**study_lgbm.best_params)

# === Optuna Tuning for XGB ===
def objective_xgb(trial):
    params = {
        'max_depth': trial.suggest_int('max_depth', 3, 12),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.2),
        'n_estimators': trial.suggest_int('n_estimators', 100, 1500),
        'gamma': trial.suggest_float('gamma', 0, 5),
        'min_child_weight': trial.suggest_int('min_child_weight', 1, 50),
        'subsample': trial.suggest_float('subsample', 0.4, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.4, 1.0)
    }
    model = XGBClassifier(use_label_encoder=False, eval_metric='mlogloss', **params)
    return cross_val_score(model, X_scaled, y, cv=3, scoring='f1_macro').mean()

study_xgb = optuna.create_study(direction='maximize')
study_xgb.optimize(objective_xgb, n_trials=30)
best_xgb = XGBClassifier(use_label_encoder=False, eval_metric='mlogloss', **study_xgb.best_params)

# === CatBoost (without tuning) ===
best_cat = CatBoostClassifier(iterations=1000, depth=6, learning_rate=0.05, verbose=0)


[I 2025-04-25 08:20:23,924] A new study created in memory with name: no-name-a5e03301-b7a1-44cf-adab-8510f5aa8c65
[I 2025-04-25 08:20:29,794] Trial 0 finished with value: 0.707263545121316 and parameters: {'max_depth': 12, 'learning_rate': 0.05171451071010541, 'n_estimators': 473, 'reg_alpha': 1.4583125834784127, 'reg_lambda': 0.6775343638665765, 'subsample': 0.73796781078786, 'colsample_bytree': 0.6745349661167812, 'num_leaves': 99}. Best is trial 0 with value: 0.707263545121316.
[I 2025-04-25 08:20:31,851] Trial 1 finished with value: 0.7020652635075727 and parameters: {'max_depth': 4, 'learning_rate': 0.06556035065753119, 'n_estimators': 617, 'reg_alpha': 4.229671821639606, 'reg_lambda': 4.049574073853709, 'subsample': 0.6455297251477711, 'colsample_bytree': 0.6478339052826397, 'num_leaves': 87}. Best is trial 0 with value: 0.707263545121316.
[I 2025-04-25 08:20:35,957] Trial 2 finished with value: 0.708563269859439 and parameters: {'max_depth': 11, 'learning_rate': 0.05025339790157

In [22]:
# === StratifiedKFold CV with Voting Ensemble ===
kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
all_preds = []
all_true = []

for fold, (train_idx, val_idx) in enumerate(kf.split(X_scaled, y)):
    X_train, X_val = X_scaled[train_idx], X_scaled[val_idx]
    y_train, y_val = y[train_idx], y[val_idx]

    ensemble = VotingClassifier(
        estimators=[
            ('lgbm', best_lgbm),
            ('xgb', best_xgb),
            ('cat', best_cat)
        ],
        voting='soft'
    )
    ensemble.fit(X_train, y_train)
    preds = ensemble.predict(X_val)
    all_preds.extend(preds)
    all_true.extend(y_val)
    print(f"Fold {fold+1} Accuracy: {accuracy_score(y_val, preds):.4f}")

# === Final Evaluation ===
print("\n=== Overall Performance ===")
print(classification_report(all_true, all_preds))

=== Final Ensemble (ens_1) Metrics ===
              precision    recall  f1-score   support

           0       0.82      0.75      0.79       284
           1       0.52      0.43      0.47       159
           2       0.81      0.90      0.85       442

    accuracy                           0.77       885
   macro avg       0.72      0.69      0.70       885
weighted avg       0.76      0.77      0.76       885

[[214  31  39]
 [ 35  68  56]
 [ 12  31 399]]
