In [1]:
import sys
import os
import pickle
from datetime import datetime 

import pandas as pd
from sklearn.model_selection import train_test_split 
from sklearn.feature_extraction import DictVectorizer 
from sklearn.metrics import log_loss
import xgboost as xgb
from hyperopt import fmin, hp, tpe, Trials
from hyperopt.pyll import scope

import mlflow

from misc import init, supports

In [2]:
# initialize configuration
MODE = "test"
CONFIG = init.init_config(MODE)

In [3]:
# # launch mlflow
# mlflow ui --backend-store-uri sqlite:///mlflow/mlflow.db --default-artifact-root mlflow
mlflow.set_tracking_uri("sqlite:///mlflow/mlflow.db")
mlflow.set_experiment("political_engagement")
# enable auto-log
mlflow.xgboost.autolog()

In [4]:
# read data
data = pd.read_parquet("data/custom_wvs7_data.parquet")

In [5]:
# setup training context
# get datasets
dftrain, dftest = train_test_split(data, test_size=0.2, random_state=99)

# get targets
ytrain = dftrain["political_engagement"].values
ytest = dftest["political_engagement"].values
dftrain.drop(columns=["political_engagement"], inplace=True)
dftest.drop(columns=["political_engagement"], inplace=True)

# vectorize
dv = DictVectorizer(sparse=False)
train_dict = dftrain.to_dict(orient="records")
test_dict = dftest.to_dict(orient="records")
xtrain = dv.fit_transform(train_dict)
xtest = dv.transform(test_dict)
feature_names = dv.get_feature_names_out().tolist()

# get dmatrix
xtrain = xgb.DMatrix(xtrain, label=ytrain, feature_names=feature_names)
xtest = xgb.DMatrix(xtest, label=ytest, feature_names=feature_names)

In [6]:
# optimize
search_space = {
    "learning_rate": hp.loguniform("learning_rate", -7, 10),
    "max_depth": scope.int(hp.quniform("max_depth", 0, 100, 1)),
    "min_child_weight": hp.loguniform("min_child_weight", -1, 4.6),
    "reg_alpha": hp.loguniform("reg_alpha", -5, 4.6), 
    "scale_pos_weight": hp.loguniform("scale_pos_weight", 0, 4.6),
    "objective": "binary:logistic",
    "seed": 99
}

best_result = fmin(
    fn=lambda search_space: supports.objective(
        search_space=search_space,
        xtrain=xtrain,
        xtest=xtest,
        ytrain=ytrain,
        ytest=ytest,
        num_boost_round=CONFIG["booster_rounds"]
        ),
    space=search_space,
    algo=tpe.suggest,
    max_evals=CONFIG["mlflow_evals_nbr"],
    trials=Trials()
    )

[0]	test-logloss:2.22936                             
[1]	test-logloss:12.54354                            
[2]	test-logloss:16.23009                            
[3]	test-logloss:18.25861                            
[4]	test-logloss:22.34129                            
[5]	test-logloss:22.34129                            
[6]	test-logloss:22.34129                            
[7]	test-logloss:22.34129                            
[8]	test-logloss:22.34129                            
[9]	test-logloss:22.34129                            
  0%|          | 0/2 [00:00<?, ?trial/s, best loss=?]





[0]	test-logloss:1.41819                                                      
[1]	test-logloss:1.42121                                                      
[2]	test-logloss:1.42426                                                      
[3]	test-logloss:1.42731                                                      
[4]	test-logloss:1.43037                                                      
[5]	test-logloss:1.43344                                                      
[6]	test-logloss:1.43648                                                      
[7]	test-logloss:1.43957                                                      
[8]	test-logloss:1.44263                                                      
[9]	test-logloss:1.44573                                                      
 50%|█████     | 1/2 [00:02<00:02,  2.37s/trial, best loss: 9.673245722625499]







100%|██████████| 2/2 [00:03<00:00,  1.93s/trial, best loss: 1.445730191505302]


In [7]:
# save best model from prod run
if MODE != "test":
    # define tag names (model & author) 
    model_prefix = "poleng"
    model_algo = "xgb"
    # yymmddhhmmss
    model_creation_time = datetime.now().strftime("%y%m%d%H%M%S")
    model_name = f"{model_prefix}_{model_algo}_{model_creation_time}"
    author = "andrei lupascu"
    tags = {
            "model": model_name,
            "author": author
    }
    # update best_result
    best_result["objective"] = search_space["objective"]
    best_result["seed"] = search_space["seed"]
    # format best results (some params need to be cast as int)
    int_params = ["max_depth"]
    for int_param in int_params:
        best_result[int_param] = int(best_result[int_param])

    with mlflow.start_run():
            mlflow.set_tags(tags)

            # train model
            booster = xgb.train(
            params=best_result,
            dtrain=xtrain,
            num_boost_round=CONFIG["booster_rounds"],
            evals=[(xtest, "test")],
            early_stopping_rounds=50
            )

            # predict
            ypred = booster.predict(xtest)
            lg_loss = log_loss(ytest, ypred)

            # log artifacts and metrics 
            with open("./mlflow/preprocessor.bin", "wb") as fout:
                    pickle.dump(dv, fout)
            mlflow.log_artifact("./mlflow/preprocessor.bin", artifact_path="preprocessor")
            mlflow.log_metric("test_log_loss", lg_loss)

[0]	test-logloss:1.41819
[1]	test-logloss:1.42121
[2]	test-logloss:1.42426
[3]	test-logloss:1.42731
[4]	test-logloss:1.43037
[5]	test-logloss:1.43344
[6]	test-logloss:1.43648
[7]	test-logloss:1.43957
[8]	test-logloss:1.44263
[9]	test-logloss:1.44573


