In [1]:
import json
import pickle
import numpy as np
import pandas as pd
from pathlib import Path
from sklearn.pipeline import make_pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
# from sklearn.tree import DecisionTreeClassifier
# from sklearn.ensemble import RandomForestClassifier
# from sklearn.linear_model import SGDClassifier
# from sklearn.svm import SVC
import xgboost as xgb
from sklearn.metrics import roc_auc_score
from hyperopt import fmin, tpe, hp, STATUS_OK, Trials
from hyperopt.pyll import scope
import mlflow

In [2]:
work_dir = Path.home() / 'Programming/Python/machine-learning-exercises/higher-education-students-performance-evaluation'
data_file = work_dir / 'data/students-performance.csv'

In [3]:
attribute_names_json_file = work_dir / 'attribute_names.json'
with open(attribute_names_json_file, 'rt') as f_in:
    attribute_names_json = json.load(f_in)

In [4]:
sp_df = pd.read_csv(data_file)

In [5]:
X = sp_df.drop(['STUDENT ID', 'GRADE'], axis=1)
y = sp_df['GRADE'].copy()

In [6]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, stratify=y, random_state=33)

In [7]:
X_train = X_train.reset_index(drop=True)
X_test = X_test.reset_index(drop=True)
y_train = y_train.reset_index(drop=True)
y_test = y_test.reset_index(drop=True)

In [8]:
cat_attribs = sp_df.columns[1:-1]
cat_pipeline = make_pipeline(OneHotEncoder(handle_unknown="ignore"))
preprocessing = ColumnTransformer([("cat", cat_pipeline, cat_attribs)])

In [9]:
X_train_tr = preprocessing.fit_transform(X_train)
X_test_tr = preprocessing.transform(X_test)

In [10]:
mlflow.set_tracking_uri('sqlite:///mlflow.db')
mlflow.set_experiment('higher-education-students-performance-evaluation');

In [11]:
def objective(params):
    with mlflow.start_run():
        mlflow.set_tag('model', 'XGBoost')
        mlflow.log_params(params)
        clf = xgb.XGBClassifier(
            **params,
            eval_metric='auc',
            early_stopping_rounds=50,
            n_jobs=-1,
        )
        clf.fit(
            X_train_tr,
            y_train,
            eval_set=[(X_test_tr, y_test)],
        )
        y_pred = clf.predict_proba(X_test_tr)
        auc = roc_auc_score(y_test, y_pred, multi_class='ovo')
        mlflow.log_metric("AUC", auc)

    return {'loss': auc, 'status': STATUS_OK}

In [12]:
xgboost_search_space = {
    'n_estimators': scope.int(hp.quniform('n_estimators', 4, 200, 1)),
    'max_depth': scope.int(hp.quniform('max_depth', 4, 50, 1)),
    'learning_rate': hp.loguniform('learning_rate', -3, 0),
    'reg_alpha': hp.loguniform('reg_alpha', -5, -1),
    'reg_lambda': hp.loguniform('reg_lambda', -6, -1),
    'min_child_weight': hp.loguniform('min_child_weight', -1, 3),
    'objective': 'binary:logistic',
    'seed': 42
}

In [13]:
xgboost_best_result = fmin(
    fn=objective,
    space=xgboost_search_space,
    algo=tpe.suggest,
    max_evals=30,
    trials=Trials()
)

[0]	validation_0-auc:0.54877                                                                                                                      
[1]	validation_0-auc:0.54771                                                                                                                      
[2]	validation_0-auc:0.57075                                                                                                                      
[3]	validation_0-auc:0.58199                                                                                                                      
[4]	validation_0-auc:0.57538                                                                                                                      
[5]	validation_0-auc:0.56409                                                                                                                      
[6]	validation_0-auc:0.55635                                                                                          

In [14]:
xgboost_best_params = {
    'learning_rate': 0.1822609570893024,
    'max_depth': 16,
    'min_child_weight': 0.4639113171017813,
    'n_estimators': 131,
    'objective': 'binary:logistic',
    'reg_alpha': 0.007860242176975434,
    'reg_lambda': 0.02768073078548693,
    'seed': 42,
}

In [15]:
mlflow.xgboost.autolog()
xgboost_clf = xgb.XGBClassifier(
    **xgboost_best_params,
    eval_metric='auc',
    early_stopping_rounds=20,
    n_jobs=-1)
xgboost_clf.fit(X_train_tr, y_train, eval_set=[(X_test_tr, y_test)],)

2025/07/12 20:19:53 INFO mlflow.utils.autologging_utils: Created MLflow autologging run with ID '10749f67ce0d40efb1e84199e56a9421', which will track hyperparameters, performance metrics, model artifacts, and lineage information for the current xgboost workflow


[0]	validation_0-auc:0.62236
[1]	validation_0-auc:0.64030
[2]	validation_0-auc:0.65390
[3]	validation_0-auc:0.68579
[4]	validation_0-auc:0.71081
[5]	validation_0-auc:0.72131
[6]	validation_0-auc:0.71703
[7]	validation_0-auc:0.72751
[8]	validation_0-auc:0.72040
[9]	validation_0-auc:0.70807
[10]	validation_0-auc:0.70215
[11]	validation_0-auc:0.69231
[12]	validation_0-auc:0.68800
[13]	validation_0-auc:0.68626
[14]	validation_0-auc:0.67731
[15]	validation_0-auc:0.67533
[16]	validation_0-auc:0.67299
[17]	validation_0-auc:0.66811
[18]	validation_0-auc:0.67082
[19]	validation_0-auc:0.67400
[20]	validation_0-auc:0.67182
[21]	validation_0-auc:0.67191
[22]	validation_0-auc:0.66840
[23]	validation_0-auc:0.66679
[24]	validation_0-auc:0.66564
[25]	validation_0-auc:0.66342
[26]	validation_0-auc:0.66268
[27]	validation_0-auc:0.65876




In [16]:
with open('models/xgb_cls.bin', 'wb') as f_out:
    pickle.dump(xgboost_clf, f_out)

In [17]:
log_reg_params = {
    'C': scope.int(hp.quniform('C', -3, 3, 1)),
    'class_weight': 'balanced',
    'seed': 42,
    'n_jobs': -1,
}

In [18]:
def objective(params):
    with mlflow.start_run():
        mlflow.set_tag('model', 'LogisticRegression')
        mlflow.log_params(params)
        log_reg = LogisticRegression()
        log_reg.fit(
            X_train_tr,
            y_train,
            )
        y_pred = log_reg.predict_proba(X_test_tr)
        auc = roc_auc_score(y_test, y_pred, multi_class='ovo')
        mlflow.log_metric("AUC", auc)
    return {'loss': auc, 'status': STATUS_OK}

In [19]:
log_reg_best_result = fmin(
    fn=objective,
    space=log_reg_params,
    algo=tpe.suggest,
    max_evals=20,
    trials=Trials()
)

100%|███████████████████████████████████████████████████████████████████████████| 20/20 [00:00<00:00, 35.72trial/s, best loss: 0.5896990311276026]
