In [24]:
%pip install kaggle ucimlrepo pandas numpy scikit-learn optuna xgboost lightgbm


Note: you may need to restart the kernel to use updated packages.


In [32]:
import pandas as pd
import numpy as np
from ucimlrepo import fetch_ucirepo

from sklearn.model_selection import StratifiedKFold, cross_val_score
from sklearn.metrics import classification_report, accuracy_score
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.ensemble import VotingClassifier
from lightgbm import LGBMClassifier
from xgboost import XGBClassifier
from catboost import CatBoostClassifier
import optuna 

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix

# Download latest version
# === Download Dataset ===
# Fetch the dataset
dataset = fetch_ucirepo(id=697)
X = dataset.data.features
y = dataset.data.targets
print(f"Features shape: {X.shape}, Target shape: {y.shape}")

print(X.columns.tolist())


Features shape: (4424, 36), Target shape: (4424, 1)
['Marital Status', 'Application mode', 'Application order', 'Course', 'Daytime/evening attendance', 'Previous qualification', 'Previous qualification (grade)', 'Nacionality', "Mother's qualification", "Father's qualification", "Mother's occupation", "Father's occupation", 'Admission grade', 'Displaced', 'Educational special needs', 'Debtor', 'Tuition fees up to date', 'Gender', 'Scholarship holder', 'Age at enrollment', 'International', 'Curricular units 1st sem (credited)', 'Curricular units 1st sem (enrolled)', 'Curricular units 1st sem (evaluations)', 'Curricular units 1st sem (approved)', 'Curricular units 1st sem (grade)', 'Curricular units 1st sem (without evaluations)', 'Curricular units 2nd sem (credited)', 'Curricular units 2nd sem (enrolled)', 'Curricular units 2nd sem (evaluations)', 'Curricular units 2nd sem (approved)', 'Curricular units 2nd sem (grade)', 'Curricular units 2nd sem (without evaluations)', 'Unemployment rat

In [26]:
# Simple preprocessing: encode targets and split
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
y_enc = le.fit_transform(y)

# Train/test split (stratified)
X_train, X_test, y_train, y_test = train_test_split(
    X, y_enc, test_size=0.2, random_state=42, stratify=y_enc
)
print(f"Train: {X_train.shape}, Test: {X_test.shape}")

# After your split...
total = X.shape[0]
train_pct = X_train.shape[0] / total * 100
test_pct  = X_test.shape[0]  / total * 100

print(f"Train: {X_train.shape} ({train_pct:.1f}%), Test: {X_test.shape} ({test_pct:.1f}%)")


Train: (3539, 36), Test: (885, 36)
Train: (3539, 36) (80.0%), Test: (885, 36) (20.0%)


  y = column_or_1d(y, warn=True)


In [27]:
# Train Random Forest classifier with class weighting
clf = RandomForestClassifier(
    n_estimators=100,
    class_weight='balanced',
    random_state=42
)
clf.fit(X_train, y_train)


In [28]:
# Evaluate on test set
y_pred = clf.predict(X_test)
print("Classification Report:")
print(classification_report(y_test, y_pred, target_names=le.classes_))

print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))

Classification Report:
              precision    recall  f1-score   support

     Dropout       0.82      0.76      0.79       284
    Enrolled       0.59      0.35      0.44       159
    Graduate       0.79      0.94      0.86       442

    accuracy                           0.78       885
   macro avg       0.73      0.68      0.70       885
weighted avg       0.76      0.78      0.76       885

Confusion Matrix:
[[216  23  45]
 [ 38  56  65]
 [ 10  16 416]]


NameError: name 'df' is not defined

In [29]:
# === Optuna Tuning for LightGBM ===
def objective_lgbm(trial):
    params = {
        'max_depth': trial.suggest_int('max_depth', 3, 12),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.2),
        'n_estimators': trial.suggest_int('n_estimators', 100, 1500),
        'reg_alpha': trial.suggest_float('reg_alpha', 0, 5),
        'reg_lambda': trial.suggest_float('reg_lambda', 0, 5),
        'subsample': trial.suggest_float('subsample', 0.4, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.4, 1.0),
        'num_leaves': trial.suggest_int('num_leaves', 15, 100)
    }
    model = LGBMClassifier(**params)
    return cross_val_score(model, X_scaled, y, cv=3, scoring='f1_macro').mean()

study_lgbm = optuna.create_study(direction='maximize')
study_lgbm.optimize(objective_lgbm, n_trials=30)
best_lgbm = LGBMClassifier(**study_lgbm.best_params)

# === Optuna Tuning for XGB ===
def objective_xgb(trial):
    params = {
        'max_depth': trial.suggest_int('max_depth', 3, 12),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.2),
        'n_estimators': trial.suggest_int('n_estimators', 100, 1500),
        'gamma': trial.suggest_float('gamma', 0, 5),
        'min_child_weight': trial.suggest_int('min_child_weight', 1, 50),
        'subsample': trial.suggest_float('subsample', 0.4, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.4, 1.0)
    }
    model = XGBClassifier(use_label_encoder=False, eval_metric='mlogloss', **params)
    return cross_val_score(model, X_scaled, y, cv=3, scoring='f1_macro').mean()

study_xgb = optuna.create_study(direction='maximize')
study_xgb.optimize(objective_xgb, n_trials=30)
best_xgb = XGBClassifier(use_label_encoder=False, eval_metric='mlogloss', **study_xgb.best_params)

# === CatBoost (without tuning) ===
best_cat = CatBoostClassifier(iterations=1000, depth=6, learning_rate=0.05, verbose=0)


[I 2025-04-26 21:10:13,619] A new study created in memory with name: no-name-ecc406b5-5212-4975-94e4-2c6be2baf23b
[W 2025-04-26 21:10:13,623] Trial 0 failed with parameters: {'max_depth': 11, 'learning_rate': 0.028213924663320428, 'n_estimators': 1394, 'reg_alpha': 0.6264151255086797, 'reg_lambda': 3.9288750767411367, 'subsample': 0.41807808832757454, 'colsample_bytree': 0.9979961087837321, 'num_leaves': 58} because of the following error: NameError("name 'X_scaled' is not defined").
Traceback (most recent call last):
  File "c:\Users\brizz\AppData\Local\Programs\Python\Python312\Lib\site-packages\optuna\study\_optimize.py", line 197, in _run_trial
    value_or_values = func(trial)
                      ^^^^^^^^^^^
  File "C:\Users\brizz\AppData\Local\Temp\ipykernel_18844\452835764.py", line 14, in objective_lgbm
    return cross_val_score(model, X_scaled, y, cv=3, scoring='f1_macro').mean()
                                  ^^^^^^^^
NameError: name 'X_scaled' is not defined
[W 2025-04

NameError: name 'X_scaled' is not defined

In [None]:
# === StratifiedKFold CV with Voting Ensemble ===
kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
all_preds = []
all_true = []

for fold, (train_idx, val_idx) in enumerate(kf.split(X_scaled, y)):
    X_train, X_val = X_scaled[train_idx], X_scaled[val_idx]
    y_train, y_val = y[train_idx], y[val_idx]

    ensemble = VotingClassifier(
        estimators=[
            ('lgbm', best_lgbm),
            ('xgb', best_xgb),
            ('cat', best_cat)
        ],
        voting='soft'
    )
    ensemble.fit(X_train, y_train)
    preds = ensemble.predict(X_val)
    all_preds.extend(preds)
    all_true.extend(y_val)
    print(f"Fold {fold+1} Accuracy: {accuracy_score(y_val, preds):.4f}")

# === Final Evaluation ===
print("\n=== Overall Performance ===")
print(classification_report(all_true, all_preds))

=== Final Ensemble (ens_1) Metrics ===
              precision    recall  f1-score   support

           0       0.82      0.75      0.79       284
           1       0.52      0.43      0.47       159
           2       0.81      0.90      0.85       442

    accuracy                           0.77       885
   macro avg       0.72      0.69      0.70       885
weighted avg       0.76      0.77      0.76       885

[[214  31  39]
 [ 35  68  56]
 [ 12  31 399]]
