In [1]:
import json
from pathlib import Path
import pickle
import numpy as np
import pandas as pd
from sklearn.pipeline import make_pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
# from sklearn.tree import DecisionTreeClassifier
# from sklearn.ensemble import RandomForestClassifier
# from sklearn.linear_model import SGDClassifier
from sklearn.svm import SVC
import xgboost as xgb
from sklearn.metrics import roc_auc_score
from hyperopt import fmin, tpe, hp, STATUS_OK, Trials
from hyperopt.pyll import scope
import mlflow

In [2]:
# Project root (under the current user's home directory) and the raw CSV
# dataset that lives inside it.
work_dir = Path.home().joinpath(
    'Programming/Python/machine-learning-exercises/higher-education-students-performance-evaluation'
)
data_file = work_dir.joinpath('data/students-performance.csv')

In [3]:
# Load the attribute metadata that sits next to the dataset.
# The encoding is pinned to UTF-8 (the JSON default) so the result does not
# depend on the platform's locale encoding.
attribute_names_json_file = work_dir / 'attribute_names.json'
with open(attribute_names_json_file, 'rt', encoding='utf-8') as f_in:
    attribute_names_json = json.load(f_in)

In [4]:
# Build the column-renaming map: keys '1'..'32' are looked up in
# attribute_names_json and mapped to that entry's 'name' field.
string_indexes = [str(number) for number in range(1, 33)]
labels_dict = {key: attribute_names_json[key]['name'] for key in string_indexes}
# Key '0' has no metadata lookup; it is labelled by hand.
labels_dict['0'] = 'STUDENT ID'

In [5]:
# Read the raw dataset and rename its columns through labels_dict in one step
# (column labels that are not keys of labels_dict are left untouched).
sp_df = pd.read_csv(data_file).rename(columns=labels_dict)

In [6]:
# Features: every column except the student identifier and the target.
X = sp_df.drop(columns=['STUDENT ID', 'GRADE'])
# Target: an independent copy of the GRADE column.
y = sp_df['GRADE'].copy()

In [7]:
# 70/30 hold-out split, stratified on the target, with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=33,
    stratify=y,
)

In [8]:
# Replace the shuffled row labels left by the split with a fresh 0..n-1 index
# on all four pieces.
X_train, X_test, y_train, y_test = (
    subset.reset_index(drop=True)
    for subset in (X_train, X_test, y_train, y_test)
)

In [9]:
# Every feature column is one-hot encoded.
# The column list is taken from the feature frame itself rather than by
# slicing sp_df.columns positionally, so it stays correct even if the id or
# target column is not the first/last column of the CSV.
cat_attribs = X_train.columns
# handle_unknown="ignore": categories unseen during fit encode as all zeros
# instead of raising at transform time.
cat_pipeline = make_pipeline(OneHotEncoder(handle_unknown="ignore"))
preprocessing = ColumnTransformer([("cat", cat_pipeline, cat_attribs)])

In [10]:
# Learn the encoding from the training split only, then apply that same
# fitted transformer to both splits.
preprocessing.fit(X_train)
X_train_tr = preprocessing.transform(X_train)
X_test_tr = preprocessing.transform(X_test)

In [11]:
# Point MLflow at a SQLite tracking database and select the experiment that
# all runs below are recorded under.
mlflow.set_tracking_uri(uri='sqlite:///mlflow.db')
# Trailing semicolon suppresses the notebook echo of the returned object.
mlflow.set_experiment(experiment_name='higher-education-students-performance-evaluation');

## XGBoost Classifier

In [12]:
def objective(params):
    """Train one XGBoost candidate and report its score to hyperopt.

    Each call is recorded as its own MLflow run, tagged ``model=XGBoost``,
    with the sampled parameters and the resulting AUC logged.

    Parameters
    ----------
    params : dict
        Keyword arguments forwarded to ``xgb.XGBClassifier``.

    Returns
    -------
    dict
        ``{'loss': -auc, 'status': STATUS_OK}``. hyperopt's ``fmin``
        *minimises* ``loss``, so the AUC is negated to make the search
        maximise it.
    """
    with mlflow.start_run():
        mlflow.set_tag('model', 'XGBoost')
        mlflow.log_params(params)
        clf = xgb.XGBClassifier(
            **params,
            eval_metric='auc',
            early_stopping_rounds=50,
            n_jobs=-1,
        )
        # NOTE(review): early stopping monitors the same split that is scored
        # below, so the reported AUC is optimistic. Consider carving a
        # separate validation split out of the training data.
        clf.fit(
            X_train_tr,
            y_train,
            eval_set=[(X_test_tr, y_test)],
        )
        y_pred = clf.predict_proba(X_test_tr)
        # One-vs-one multi-class AUC computed from the class-probability matrix.
        auc = roc_auc_score(y_test, y_pred, multi_class='ovo')
        mlflow.log_metric("AUC", auc)

    # Higher AUC is better, hyperopt minimises: hand back the negated score.
    return {'loss': -auc, 'status': STATUS_OK}

In [13]:
# hyperopt search space for the XGBoost classifier.
# hp.loguniform(label, low, high) samples exp(uniform(low, high)), so the
# bounds below are natural-log exponents.
xgboost_search_space = {
    'n_estimators': scope.int(hp.quniform('n_estimators', 4, 200, 1)),
    'max_depth': scope.int(hp.quniform('max_depth', 4, 50, 1)),
    'learning_rate': hp.loguniform('learning_rate', -3, 0),
    'reg_alpha': hp.loguniform('reg_alpha', -5, -1),
    'reg_lambda': hp.loguniform('reg_lambda', -6, -1),
    'min_child_weight': hp.loguniform('min_child_weight', -1, 3),
    # The target is scored as multi-class (roc_auc_score(..., multi_class='ovo')),
    # and XGBClassifier switches to a multi-class objective for more than two
    # classes regardless of what is passed. Record the objective that is
    # really used instead of the misleading 'binary:logistic'.
    'objective': 'multi:softprob',
    'seed': 42
}

In [14]:
# Run 50 TPE-guided evaluations of `objective` over the XGBoost search space.
# fmin minimises the 'loss' value that `objective` returns.
xgboost_trials = Trials()
xgboost_results = fmin(
    objective,
    xgboost_search_space,
    algo=tpe.suggest,
    max_evals=50,
    trials=xgboost_trials,
)

[0]	validation_0-auc:0.60260                                                                                                                      
[1]	validation_0-auc:0.62415                                                                                                                      
[2]	validation_0-auc:0.63764                                                                                                                      
[3]	validation_0-auc:0.64801                                                                                                                      
[4]	validation_0-auc:0.64034                                                                                                                      
[5]	validation_0-auc:0.62230                                                                                                                      
[6]	validation_0-auc:0.61635                                                                                          

In [15]:
# Fixed hyperparameters for the final XGBoost model.
# NOTE(review): presumably copied by hand from the best search trial — confirm
# against the MLflow runs before relying on them.
xgboost_best_params = {
    'learning_rate': 0.1822609570893024,
    'max_depth': 16,
    'min_child_weight': 0.4639113171017813,
    'n_estimators': 131,
    # The target is scored as multi-class below, and XGBClassifier uses a
    # multi-class objective for more than two classes regardless of this
    # value. Record the objective that is really used instead of
    # 'binary:logistic'.
    'objective': 'multi:softprob',
    'reg_alpha': 0.007860242176975434,
    'reg_lambda': 0.02768073078548693,
    'seed': 42,
}

In [16]:
# Train the final XGBoost model with the fixed best parameters, log the run
# to MLflow, and persist the model both as a pickle artifact and through the
# MLflow xgboost flavour.
with mlflow.start_run():
    mlflow.set_tag('model', 'XGBoost')
    mlflow.log_params(xgboost_best_params)
    xgboost_clf = xgb.XGBClassifier(
        **xgboost_best_params,
        eval_metric='auc',
        early_stopping_rounds=50,
        n_jobs=-1,
    )
    # NOTE(review): early stopping monitors the same split that is scored
    # below, so the reported AUC is optimistic.
    xgboost_clf.fit(
        X_train_tr,
        y_train,
        eval_set=[(X_test_tr, y_test)],
    )
    y_pred = xgboost_clf.predict_proba(X_test_tr)
    auc = roc_auc_score(y_test, y_pred, multi_class='ovo')
    mlflow.log_metric("AUC", auc)

    # open(..., 'wb') does not create missing directories; make sure the
    # output folder exists so a fresh checkout does not fail here.
    Path('models').mkdir(parents=True, exist_ok=True)
    with open('models/xgb_cls.bin', 'wb') as f_out:
        pickle.dump(xgboost_clf, f_out)

    mlflow.log_artifact('models/xgb_cls.bin', artifact_path='best_models')

    mlflow.xgboost.log_model(xgboost_clf, artifact_path='artifacts')

[0]	validation_0-auc:0.62236
[1]	validation_0-auc:0.64030
[2]	validation_0-auc:0.65390
[3]	validation_0-auc:0.68579
[4]	validation_0-auc:0.71081
[5]	validation_0-auc:0.72131
[6]	validation_0-auc:0.71703
[7]	validation_0-auc:0.72751
[8]	validation_0-auc:0.72040
[9]	validation_0-auc:0.70807
[10]	validation_0-auc:0.70215
[11]	validation_0-auc:0.69231
[12]	validation_0-auc:0.68800
[13]	validation_0-auc:0.68626
[14]	validation_0-auc:0.67731
[15]	validation_0-auc:0.67533
[16]	validation_0-auc:0.67299
[17]	validation_0-auc:0.66811
[18]	validation_0-auc:0.67082
[19]	validation_0-auc:0.67400
[20]	validation_0-auc:0.67182
[21]	validation_0-auc:0.67191
[22]	validation_0-auc:0.66840
[23]	validation_0-auc:0.66679
[24]	validation_0-auc:0.66564
[25]	validation_0-auc:0.66342
[26]	validation_0-auc:0.66268
[27]	validation_0-auc:0.65876
[28]	validation_0-auc:0.65653
[29]	validation_0-auc:0.65508
[30]	validation_0-auc:0.65281
[31]	validation_0-auc:0.65233
[32]	validation_0-auc:0.65164
[33]	validation_0-au



## Logistic Regression

In [17]:
log_reg_params = {
    'C': scope.int(hp.quniform('C', -3, 3, 1)),
    'class_weight': 'balanced',
    'seed': 42
}

In [18]:
def objective(params):
    with mlflow.start_run():
        mlflow.set_tag('model', 'LogisticRegression')
        mlflow.log_params(params)
        log_reg = LogisticRegression()
        log_reg.fit(
            X_train_tr,
            y_train,
            )
        y_pred = log_reg.predict_proba(X_test_tr)
        auc = roc_auc_score(y_test, y_pred, multi_class='ovo')
        mlflow.log_metric("AUC", auc)
        
        with open('models/log_reg.bin', 'wb') as f_out:
            pickle.dump(xgboost_clf, f_out)

        mlflow.log_artifact('models/log_reg.bin', artifact_path='best_models')

    return {'loss': auc, 'status': STATUS_OK}

In [19]:
# Run 50 TPE-guided evaluations of `objective` over the LogisticRegression
# search space. fmin minimises the 'loss' value that `objective` returns.
log_reg_trials = Trials()
log_reg_result = fmin(
    objective,
    log_reg_params,
    algo=tpe.suggest,
    max_evals=50,
    trials=log_reg_trials,
)

100%|███████████████████████████████████████████████████████████████████████████| 50/50 [00:01<00:00, 26.10trial/s, best loss: 0.5896990311276026]


## SVC

In [20]:
svc_params = {
    'C': scope.int(hp.quniform('C', -3, 3, 1)),
    'max_iter': scope.int(hp.quniform('max_iter', 1, 3, 1))
}

In [21]:
def objective(params):
    with mlflow.start_run():
        mlflow.set_tag('model', 'SVC')
        mlflow.log_params(params)
        svc_clf = SVC(probability=True)
        svc_clf.fit(
            X_train_tr,
            y_train,
            )
        y_pred = svc_clf.predict_proba(X_test_tr)
        auc = roc_auc_score(y_test, y_pred, multi_class='ovo')
        mlflow.log_metric("AUC", auc)

        with open('models/svc_cls.bin', 'wb') as f_out:
            pickle.dump(svc_clf, f_out)

        mlflow.log_artifact('models/svc_cls.bin', artifact_path='best_models')

        mlflow.sklearn.log_model(svc_clf, input_example=X_train_tr[[0]], artifact_path='artifacts')
    
    return {'loss': auc, 'status': STATUS_OK}

In [22]:
# Run 50 TPE-guided evaluations of `objective` over the SVC search space.
# fmin minimises the 'loss' value that `objective` returns.
svc_trials = Trials()
svc_result = fmin(
    objective,
    svc_params,
    algo=tpe.suggest,
    max_evals=50,
    trials=svc_trials,
)

  2%|█▌                                                                          | 1/50 [00:01<01:33,  1.91s/trial, best loss: 0.5951511286332715]

INFO:hyperopt.tpe:build_posterior_wrapper took 0.000369 seconds

INFO:hyperopt.tpe:TPE using 1/1 trials with best loss 0.595151



  4%|███                                                                         | 2/50 [00:03<01:31,  1.90s/trial, best loss: 0.5951511286332715]

INFO:hyperopt.tpe:build_posterior_wrapper took 0.000377 seconds

INFO:hyperopt.tpe:TPE using 2/2 trials with best loss 0.595151



  6%|████▌                                                                       | 3/50 [00:05<01:30,  1.92s/trial, best loss: 0.5951511286332715]

INFO:hyperopt.tpe:build_posterior_wrapper took 0.000419 seconds

INFO:hyperopt.tpe:TPE using 3/3 trials with best loss 0.595151



  8%|██████                                                                      | 4/50 [00:07<01:28,  1.92s/trial, best loss: 0.5951511286332715]

INFO:hyperopt.tpe:build_posterior_wrapper took 0.000382 seconds

INFO:hyperopt.tpe:TPE using 4/4 trials with best loss 0.595151



 10%|███████▌                                                                    | 5/50 [00:09<01:25,  1.91s/trial, best loss: 0.5951511286332715]

INFO:hyperopt.tpe:build_posterior_wrapper took 0.000395 seconds

INFO:hyperopt.tpe:TPE using 5/5 trials with best loss 0.595151



 12%|█████████                                                                   | 6/50 [00:11<01:24,  1.92s/trial, best loss: 0.5951511286332715]

INFO:hyperopt.tpe:build_posterior_wrapper took 0.000438 seconds

INFO:hyperopt.tpe:TPE using 6/6 trials with best loss 0.595151



 14%|██████████▋                                                                 | 7/50 [00:13<01:22,  1.91s/trial, best loss: 0.5951511286332715]

INFO:hyperopt.tpe:build_posterior_wrapper took 0.000384 seconds

INFO:hyperopt.tpe:TPE using 7/7 trials with best loss 0.595151



 16%|████████████▏                                                               | 8/50 [00:15<01:20,  1.91s/trial, best loss: 0.5951511286332715]

INFO:hyperopt.tpe:build_posterior_wrapper took 0.000398 seconds

INFO:hyperopt.tpe:TPE using 8/8 trials with best loss 0.595151



 18%|█████████████▋                                                              | 9/50 [00:17<01:19,  1.93s/trial, best loss: 0.5951511286332715]

INFO:hyperopt.tpe:build_posterior_wrapper took 0.000383 seconds

INFO:hyperopt.tpe:TPE using 9/9 trials with best loss 0.595151



 20%|███████████████                                                            | 10/50 [00:19<01:17,  1.94s/trial, best loss: 0.5951511286332715]

INFO:hyperopt.tpe:build_posterior_wrapper took 0.000392 seconds

INFO:hyperopt.tpe:TPE using 10/10 trials with best loss 0.595151



 22%|████████████████▌                                                          | 11/50 [00:21<01:15,  1.93s/trial, best loss: 0.5951511286332715]

INFO:hyperopt.tpe:build_posterior_wrapper took 0.000402 seconds

INFO:hyperopt.tpe:TPE using 11/11 trials with best loss 0.595151



 24%|██████████████████                                                         | 12/50 [00:23<01:13,  1.92s/trial, best loss: 0.5864951814058956]

INFO:hyperopt.tpe:build_posterior_wrapper took 0.000391 seconds

INFO:hyperopt.tpe:TPE using 12/12 trials with best loss 0.586495



 26%|███████████████████▌                                                       | 13/50 [00:24<01:11,  1.92s/trial, best loss: 0.5864951814058956]

INFO:hyperopt.tpe:build_posterior_wrapper took 0.000391 seconds

INFO:hyperopt.tpe:TPE using 13/13 trials with best loss 0.586495



 28%|█████████████████████                                                      | 14/50 [00:26<01:09,  1.94s/trial, best loss: 0.5864951814058956]

INFO:hyperopt.tpe:build_posterior_wrapper took 0.000402 seconds

INFO:hyperopt.tpe:TPE using 14/14 trials with best loss 0.586495



 30%|██████████████████████▌                                                    | 15/50 [00:28<01:08,  1.95s/trial, best loss: 0.5864951814058956]

INFO:hyperopt.tpe:build_posterior_wrapper took 0.000388 seconds

INFO:hyperopt.tpe:TPE using 15/15 trials with best loss 0.586495



 32%|████████████████████████                                                   | 16/50 [00:30<01:05,  1.94s/trial, best loss: 0.5864951814058956]

INFO:hyperopt.tpe:build_posterior_wrapper took 0.000415 seconds

INFO:hyperopt.tpe:TPE using 16/16 trials with best loss 0.586495



 34%|█████████████████████████▌                                                 | 17/50 [00:32<01:03,  1.93s/trial, best loss: 0.5864951814058956]

INFO:hyperopt.tpe:build_posterior_wrapper took 0.000394 seconds

INFO:hyperopt.tpe:TPE using 17/17 trials with best loss 0.586495



 36%|███████████████████████████                                                | 18/50 [00:34<01:01,  1.94s/trial, best loss: 0.5864951814058956]

INFO:hyperopt.tpe:build_posterior_wrapper took 0.000412 seconds

INFO:hyperopt.tpe:TPE using 18/18 trials with best loss 0.586495



 38%|████████████████████████████▌                                              | 19/50 [00:36<00:59,  1.93s/trial, best loss: 0.5864951814058956]

INFO:hyperopt.tpe:build_posterior_wrapper took 0.000395 seconds

INFO:hyperopt.tpe:TPE using 19/19 trials with best loss 0.586495



 40%|██████████████████████████████                                             | 20/50 [00:38<00:57,  1.93s/trial, best loss: 0.5864951814058956]

INFO:hyperopt.tpe:build_posterior_wrapper took 0.000389 seconds

INFO:hyperopt.tpe:TPE using 20/20 trials with best loss 0.586495



 42%|███████████████████████████████▌                                           | 21/50 [00:40<00:55,  1.93s/trial, best loss: 0.5864951814058956]

INFO:hyperopt.tpe:build_posterior_wrapper took 0.000390 seconds

INFO:hyperopt.tpe:TPE using 21/21 trials with best loss 0.586495



 44%|█████████████████████████████████                                          | 22/50 [00:42<00:53,  1.92s/trial, best loss: 0.5864951814058956]

INFO:hyperopt.tpe:build_posterior_wrapper took 0.000393 seconds

INFO:hyperopt.tpe:TPE using 22/22 trials with best loss 0.586495



 46%|██████████████████████████████████▌                                        | 23/50 [00:44<00:51,  1.91s/trial, best loss: 0.5864951814058956]

INFO:hyperopt.tpe:build_posterior_wrapper took 0.000391 seconds

INFO:hyperopt.tpe:TPE using 23/23 trials with best loss 0.586495



 48%|████████████████████████████████████                                       | 24/50 [00:46<00:49,  1.92s/trial, best loss: 0.5864951814058956]

INFO:hyperopt.tpe:build_posterior_wrapper took 0.000417 seconds

INFO:hyperopt.tpe:TPE using 24/24 trials with best loss 0.586495



 50%|█████████████████████████████████████▌                                     | 25/50 [00:48<00:47,  1.92s/trial, best loss: 0.5864951814058956]

INFO:hyperopt.tpe:build_posterior_wrapper took 0.000388 seconds

INFO:hyperopt.tpe:TPE using 25/25 trials with best loss 0.586495



 52%|███████████████████████████████████████                                    | 26/50 [00:50<00:46,  1.92s/trial, best loss: 0.5864951814058956]

INFO:hyperopt.tpe:build_posterior_wrapper took 0.000396 seconds

INFO:hyperopt.tpe:TPE using 26/26 trials with best loss 0.586495



 54%|████████████████████████████████████████▌                                  | 27/50 [00:51<00:44,  1.92s/trial, best loss: 0.5864951814058956]

INFO:hyperopt.tpe:build_posterior_wrapper took 0.000394 seconds

INFO:hyperopt.tpe:TPE using 27/27 trials with best loss 0.586495



 56%|██████████████████████████████████████████                                 | 28/50 [00:53<00:42,  1.92s/trial, best loss: 0.5864951814058956]

INFO:hyperopt.tpe:build_posterior_wrapper took 0.000388 seconds

INFO:hyperopt.tpe:TPE using 28/28 trials with best loss 0.586495



 58%|███████████████████████████████████████████▌                               | 29/50 [00:55<00:40,  1.92s/trial, best loss: 0.5864951814058956]

INFO:hyperopt.tpe:build_posterior_wrapper took 0.000400 seconds

INFO:hyperopt.tpe:TPE using 29/29 trials with best loss 0.586495



 60%|█████████████████████████████████████████████                              | 30/50 [00:57<00:38,  1.92s/trial, best loss: 0.5864951814058956]

INFO:hyperopt.tpe:build_posterior_wrapper took 0.000435 seconds

INFO:hyperopt.tpe:TPE using 30/30 trials with best loss 0.586495



 62%|██████████████████████████████████████████████▌                            | 31/50 [00:59<00:36,  1.94s/trial, best loss: 0.5864951814058956]

INFO:hyperopt.tpe:build_posterior_wrapper took 0.000417 seconds

INFO:hyperopt.tpe:TPE using 31/31 trials with best loss 0.586495



 64%|████████████████████████████████████████████████                           | 32/50 [01:01<00:34,  1.94s/trial, best loss: 0.5864951814058956]

INFO:hyperopt.tpe:build_posterior_wrapper took 0.000394 seconds

INFO:hyperopt.tpe:TPE using 32/32 trials with best loss 0.586495



 66%|█████████████████████████████████████████████████▌                         | 33/50 [01:03<00:32,  1.93s/trial, best loss: 0.5864951814058956]

INFO:hyperopt.tpe:build_posterior_wrapper took 0.000411 seconds

INFO:hyperopt.tpe:TPE using 33/33 trials with best loss 0.586495



 68%|███████████████████████████████████████████████████                        | 34/50 [01:05<00:30,  1.93s/trial, best loss: 0.5864951814058956]

INFO:hyperopt.tpe:build_posterior_wrapper took 0.000395 seconds

INFO:hyperopt.tpe:TPE using 34/34 trials with best loss 0.586495



 70%|████████████████████████████████████████████████████▌                      | 35/50 [01:07<00:28,  1.93s/trial, best loss: 0.5864951814058956]

INFO:hyperopt.tpe:build_posterior_wrapper took 0.000397 seconds

INFO:hyperopt.tpe:TPE using 35/35 trials with best loss 0.586495



 72%|██████████████████████████████████████████████████████                     | 36/50 [01:09<00:26,  1.92s/trial, best loss: 0.5864951814058956]

INFO:hyperopt.tpe:build_posterior_wrapper took 0.000406 seconds

INFO:hyperopt.tpe:TPE using 36/36 trials with best loss 0.586495



 74%|███████████████████████████████████████████████████████▌                   | 37/50 [01:11<00:24,  1.92s/trial, best loss: 0.5864951814058956]

INFO:hyperopt.tpe:build_posterior_wrapper took 0.000395 seconds

INFO:hyperopt.tpe:TPE using 37/37 trials with best loss 0.586495



 76%|█████████████████████████████████████████████████████████                  | 38/50 [01:13<00:23,  1.92s/trial, best loss: 0.5864951814058956]

INFO:hyperopt.tpe:build_posterior_wrapper took 0.000398 seconds

INFO:hyperopt.tpe:TPE using 38/38 trials with best loss 0.586495



 78%|██████████████████████████████████████████████████████████▌                | 39/50 [01:15<00:21,  1.92s/trial, best loss: 0.5864951814058956]

INFO:hyperopt.tpe:build_posterior_wrapper took 0.000394 seconds

INFO:hyperopt.tpe:TPE using 39/39 trials with best loss 0.586495



 80%|████████████████████████████████████████████████████████████               | 40/50 [01:16<00:19,  1.92s/trial, best loss: 0.5864951814058956]

INFO:hyperopt.tpe:build_posterior_wrapper took 0.000397 seconds

INFO:hyperopt.tpe:TPE using 40/40 trials with best loss 0.586495



 82%|█████████████████████████████████████████████████████████████▍             | 41/50 [01:18<00:17,  1.92s/trial, best loss: 0.5864951814058956]

INFO:hyperopt.tpe:build_posterior_wrapper took 0.000388 seconds

INFO:hyperopt.tpe:TPE using 41/41 trials with best loss 0.586495



 84%|███████████████████████████████████████████████████████████████            | 42/50 [01:20<00:15,  1.92s/trial, best loss: 0.5864951814058956]

INFO:hyperopt.tpe:build_posterior_wrapper took 0.000395 seconds

INFO:hyperopt.tpe:TPE using 42/42 trials with best loss 0.586495



 86%|████████████████████████████████████████████████████████████████▌          | 43/50 [01:22<00:13,  1.92s/trial, best loss: 0.5864951814058956]

INFO:hyperopt.tpe:build_posterior_wrapper took 0.000397 seconds

INFO:hyperopt.tpe:TPE using 43/43 trials with best loss 0.586495



 88%|██████████████████████████████████████████████████████████████████         | 44/50 [01:24<00:11,  1.92s/trial, best loss: 0.5864951814058956]

INFO:hyperopt.tpe:build_posterior_wrapper took 0.000406 seconds

INFO:hyperopt.tpe:TPE using 44/44 trials with best loss 0.586495



 90%|███████████████████████████████████████████████████████████████████▌       | 45/50 [01:26<00:09,  1.92s/trial, best loss: 0.5864951814058956]

INFO:hyperopt.tpe:build_posterior_wrapper took 0.000402 seconds

INFO:hyperopt.tpe:TPE using 45/45 trials with best loss 0.586495



 92%|█████████████████████████████████████████████████████████████████████      | 46/50 [01:28<00:07,  1.92s/trial, best loss: 0.5864951814058956]

INFO:hyperopt.tpe:build_posterior_wrapper took 0.000404 seconds

INFO:hyperopt.tpe:TPE using 46/46 trials with best loss 0.586495



 94%|██████████████████████████████████████████████████████████████████████▌    | 47/50 [01:30<00:05,  1.92s/trial, best loss: 0.5864951814058956]

INFO:hyperopt.tpe:build_posterior_wrapper took 0.000393 seconds

INFO:hyperopt.tpe:TPE using 47/47 trials with best loss 0.586495



 96%|████████████████████████████████████████████████████████████████████████   | 48/50 [01:32<00:03,  1.92s/trial, best loss: 0.5855725623582766]

INFO:hyperopt.tpe:build_posterior_wrapper took 0.001010 seconds

INFO:hyperopt.tpe:TPE using 48/48 trials with best loss 0.585573



 98%|█████████████████████████████████████████████████████████████████████████▌ | 49/50 [01:34<00:01,  1.92s/trial, best loss: 0.5818378942486085]

INFO:hyperopt.tpe:build_posterior_wrapper took 0.000415 seconds

INFO:hyperopt.tpe:TPE using 49/49 trials with best loss 0.581838



100%|███████████████████████████████████████████████████████████████████████████| 50/50 [01:36<00:00,  1.92s/trial, best loss: 0.5818378942486085]
