In [1]:
import sys
import os
from datetime import datetime 

import pandas as pd
from sklearn.model_selection import train_test_split 
from sklearn.feature_extraction import DictVectorizer 
import xgboost as xgb
from hyperopt import fmin, hp, tpe, Trials
from hyperopt.pyll import scope

import mlflow

from misc import init, supports

In [2]:
# initialize configuration
# Built by the project-local helper `misc.init.init_config`. Later cells read
# CONFIG["booster_rounds"] and CONFIG["mlflow_evals_nbr"] from this object.
# NOTE(review): "test" presumably selects a configuration profile — confirm in misc/init.
CONFIG = init.init_config("test")

In [3]:
# read data
# Load the dataset from a Parquet file; the path is relative to the kernel's
# working directory. The frame must contain a "political_engagement" column,
# which the next cell extracts as the prediction target.
data = pd.read_parquet("data/custom_wvs7_data.parquet")

In [4]:
# setup training context
# Produces the objects used by the optimisation cell: `xtrain` / `xtest`
# (xgb.DMatrix), `ytrain` / `ytest` (numpy arrays), the fitted `dv`
# vectorizer and the `feature_names` list.
target_col = "political_engagement"

# get datasets: 80/20 split with a fixed seed so the split is reproducible
train_df, test_df = train_test_split(data, test_size=0.2, random_state=99)

# get targets
ytrain = train_df[target_col].values
ytest = test_df[target_col].values

# Drop the target without mutating the split frames: an in-place drop on the
# frames returned by train_test_split can trigger SettingWithCopyWarning on
# some pandas versions, and non-mutating drops keep every stage inspectable.
train_features = train_df.drop(columns=[target_col])
test_features = test_df.drop(columns=[target_col])

# vectorize: fit the vectorizer on the training rows only, then reuse it
# for the test rows so both matrices share the same column layout
dv = DictVectorizer(sparse=False)
train_dict = train_features.to_dict(orient="records")
test_dict = test_features.to_dict(orient="records")
train_matrix = dv.fit_transform(train_dict)
test_matrix = dv.transform(test_dict)
feature_names = dv.get_feature_names_out().tolist()

# get dmatrix
xtrain = xgb.DMatrix(train_matrix, label=ytrain, feature_names=feature_names)
xtest = xgb.DMatrix(test_matrix, label=ytest, feature_names=feature_names)

In [5]:
# MLflow tracking setup.
# The MLflow UI can be launched from a shell (not from this notebook) with:
#   mlflow ui --backend-store-uri sqlite:///mlflow/mlflow.db --default-artifact-root mlflow
MLFLOW_TRACKING_URI = "sqlite:///mlflow/mlflow.db"
MLFLOW_EXPERIMENT_NAME = "political_engagement"

# Point the client at the SQLite backend store, then select the experiment
# that subsequent runs are logged under.
mlflow.set_tracking_uri(MLFLOW_TRACKING_URI)
mlflow.set_experiment(MLFLOW_EXPERIMENT_NAME)

<Experiment: artifact_location='/home/adi/projects/political-engagement-mlops/mlruns/1', creation_time=1726041395995, experiment_id='1', last_update_time=1726041395995, lifecycle_stage='active', name='political_engagement', tags={}>

In [6]:
# optimize
# Hyperopt search space for the XGBoost booster parameters.
# hp.loguniform(label, low, high) draws exp(uniform(low, high)), so the
# bounds below are expressed in log-space.
search_space = {
    # exp(-7) ~ 0.0009 up to exp(0) = 1. XGBoost documents the valid range
    # of eta / learning_rate as [0, 1]; the previous upper bound of 10
    # allowed values up to exp(10) ~ 22026.
    "learning_rate": hp.loguniform("learning_rate", -7, 0),
    # Integer depth in 1..100. The lower bound is 1 because max_depth=0
    # means "no depth limit" in XGBoost rather than a shallow tree.
    "max_depth": scope.int(hp.quniform("max_depth", 1, 100, 1)),
    # exp(-1) ~ 0.37 up to exp(4.6) ~ 99.5
    "min_child_weight": hp.loguniform("min_child_weight", -1, 4.6),
    # exp(-5) ~ 0.0067 up to exp(4.6) ~ 99.5
    "reg_alpha": hp.loguniform("reg_alpha", -5, 4.6),
    # exp(0) = 1 up to exp(4.6) ~ 99.5
    "scale_pos_weight": hp.loguniform("scale_pos_weight", 0, 4.6),
    # fixed (not searched) parameters
    "objective": "binary:logistic",
    "seed": 99
}

# Minimise the loss returned by the project helper `supports.objective`
# using the TPE algorithm. The lambda argument is named `params` so that it
# does not shadow the module-level `search_space` definition.
best_result = fmin(
    fn=lambda params: supports.objective(
        search_space=params,
        xtrain=xtrain,
        xtest=xtest,
        ytrain=ytrain,
        ytest=ytest,
        num_boost_round=CONFIG["booster_rounds"]
        ),
    space=search_space,
    algo=tpe.suggest,
    max_evals=CONFIG["mlflow_evals_nbr"],
    trials=Trials()
    )

# from previous run (hard-coded; not derived from `best_result` above)
# NOTE(review): max_depth is stored as a float here — presumably it needs an
# int() cast before being passed to XGBoost; confirm at the point of use.
best_run = {'learning_rate': 0.005132133544114984,
            'max_depth': 84.0,
            'min_child_weight': 9.971863457487139,
            'reg_alpha': 2.960929919747157,
            'scale_pos_weight': 1.0299401967261155}

[0]	test-logloss:1.35905                               
[1]	test-logloss:1.34969                               
[2]	test-logloss:1.34156                               
[3]	test-logloss:1.33425                               
[4]	test-logloss:1.32722                               
[5]	test-logloss:1.32088                               
[6]	test-logloss:1.31550                               
[7]	test-logloss:1.31049                               
[8]	test-logloss:1.30606                               
[9]	test-logloss:1.30200                               
[10]	test-logloss:1.29805                              
[11]	test-logloss:1.29489                              
[12]	test-logloss:1.29181                              
[13]	test-logloss:1.28886                              
[14]	test-logloss:1.28628                              
[15]	test-logloss:1.28404                              
[16]	test-logloss:1.28175                              
[17]	test-logloss:1.27994                       