In [1]:
import sys
import os
import pickle
from datetime import datetime 

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split 
from sklearn.feature_extraction import DictVectorizer 
from sklearn.metrics import roc_auc_score, log_loss, root_mean_squared_error
import xgboost as xgb
from hyperopt import fmin, hp, tpe, Trials
from hyperopt.pyll import scope

import mlflow

from misc import cfg, supports, drift_handler

In [2]:
# initialize configuration
# mode=test will minimize parameters (ex: 20 VS 1000 booster runs)
# skip_optimization=True will skip model tuning
CONFIG = cfg.init_config(mode="test", skip_optimization=False)

# output config
print("---CONFIG:")
for k, v in CONFIG.items():
    print(f"---{k} > {v}")

---CONFIG:
---mode > test
---mlflow_evals_nbr > 2
---booster_rounds > 10
---skip_optimization > False
---mlflow_artifacts_path > /home/adi/projects/political-engagement-mlops/mlflow/mlruns/poleng


In [4]:
# # launch mlflow
# mlflow ui --backend-store-uri sqlite:///mlflow/mlflow.db --default-artifact-root mlflow
experiment = "political_engagement"
artifact_location = CONFIG["mlflow_artifacts_path"]

mlflow.set_tracking_uri("sqlite:///mlflow/mlflow.db")
mlflow.set_experiment(experiment)
supports.set_mlflow_artifact_location(
    "./mlflow/mlflow.db",
    experiment,
    artifact_location
)

In [5]:
# read data
data = drift_handler.get_data("./data/training_data/production_data.parquet")

In [6]:
# setup training context
# get datasets
dftrain, dftest = train_test_split(data, test_size=0.2, random_state=99)

# get targets
ytrain = dftrain["political_engagement"].values
ytest = dftest["political_engagement"].values
dftrain.drop(columns=["political_engagement"], inplace=True)
dftest.drop(columns=["political_engagement"], inplace=True)

# vectorize
dv = DictVectorizer(sparse=False)
train_dict = dftrain.to_dict(orient="records")
test_dict = dftest.to_dict(orient="records")
xtrain = dv.fit_transform(train_dict)
xtest = dv.transform(test_dict)

# get dmatrix
xtrain = xgb.DMatrix(xtrain, label=ytrain)
xtest = xgb.DMatrix(xtest, label=ytest)

In [7]:
# optimize
if not CONFIG["skip_optimization"]:
    search_space = {
        "learning_rate": hp.loguniform("learning_rate", -7, 10),
        "max_depth": scope.int(hp.quniform("max_depth", 0, 100, 1)),
        "min_child_weight": hp.loguniform("min_child_weight", -1, 4.6),
        "reg_alpha": hp.loguniform("reg_alpha", -5, 4.6), 
        "scale_pos_weight": hp.loguniform("scale_pos_weight", 0, 4.6),
        "objective": "binary:logistic",
        "seed": 99
    }

    best_result = fmin(
        fn=lambda search_space: supports.objective(
            search_space=search_space,
            xtrain=xtrain,
            xtest=xtest,
            ytrain=ytrain,
            ytest=ytest,
            num_boost_round=CONFIG["booster_rounds"]
            ),
        space=search_space,
        algo=tpe.suggest,
        max_evals=CONFIG["mlflow_evals_nbr"],
        trials=Trials()
        )

[0]	test-logloss:8.07765                             
[1]	test-logloss:17.56499                            
[2]	test-logloss:13.60351                            
[3]	test-logloss:14.44745                            
[4]	test-logloss:14.10848                            
[5]	test-logloss:14.34474                            
[6]	test-logloss:14.86733                            
[7]	test-logloss:15.21324                            
[8]	test-logloss:16.24017                            
[9]	test-logloss:15.67882                            
[0]	test-logloss:2.76156                                                      
[1]	test-logloss:14.02294                                                     
[2]	test-logloss:21.54712                                                     
[3]	test-logloss:18.53724                                                     
[4]	test-logloss:25.37838                                                     
[5]	test-logloss:25.37838                                        

In [8]:
# save the model with the best params
artifacts_path = "./mlflow"
tags = {
        "author": "andrei lupascu",
        "mode": CONFIG["mode"]
}
# update best_result
best_result["objective"] = search_space["objective"]
best_result["seed"] = search_space["seed"]
# format best results (some params need to be cast as int)
int_params = ["max_depth"]
for int_param in int_params:
        best_result[int_param] = int(best_result[int_param])

supports.objective(
     search_space=best_result,
     xtrain=xtrain,
     xtest=xtest,
     ytrain=ytrain,
     ytest=ytest,
     num_boost_round=CONFIG["booster_rounds"],
     tags=tags,
     save_artifacts=(True, artifacts_path, dv)
     )
    

[0]	test-logloss:8.07765
[1]	test-logloss:17.56499
[2]	test-logloss:13.60351
[3]	test-logloss:14.44745
[4]	test-logloss:14.10848
[5]	test-logloss:14.34474
[6]	test-logloss:14.86733
[7]	test-logloss:15.21324
[8]	test-logloss:16.24017
[9]	test-logloss:15.67882




{'loss': 6.802716098443426, 'status': 'ok'}

In [9]:
# with mlflow.start_run():
#     mlflow.set_tags(tags)
#     # train model
#     booster = xgb.train(
#     params=best_result,
#     dtrain=xtrain,
#     num_boost_round=CONFIG["booster_rounds"],
#     evals=[(xtest, "test")],
#     early_stopping_rounds=50
#     )

#     # get metrics
#     ytrain_pred = booster.predict(xtrain)
#     yval_pred = booster.predict(xtest)
#     # get auc
#     train_auc = roc_auc_score(ytrain, ytrain_pred)
#     test_auc = roc_auc_score(ytest, yval_pred)
#     # get loss
#     train_log_loss = log_loss(ytrain, ytrain_pred)
#     test_log_loss = log_loss(ytest, yval_pred)
#     # get rmse
#     train_rmse = root_mean_squared_error(ytrain, ytrain_pred)
#     test_rmse = root_mean_squared_error(ytest, yval_pred)
#     # store metrics
#     metrics = {
#             "train_auc": train_auc,
#             "test_auc": test_auc,
#             "train_log_loss": train_log_loss,
#             "test_log_loss": test_log_loss,
#             "train_rmse": train_rmse,
#             "test_rmse": test_rmse
#     }

#         # log metrics
#     for name, metric in metrics.items():
#         mlflow.log_metric(name, metric)

#     # log artifacts  
#     with open(f"./mlflow/{preprocessor_name}.bin", "wb") as fout:
#             pickle.dump(dv, fout)
#     booster.save_model(f"./mlflow/{model_name}.xgb")
#     mlflow.log_artifact(f"./mlflow/{preprocessor_name}.bin", artifact_path=preprocessor_name)
#     mlflow.log_artifact(f"./mlflow/{model_name}.xgb", artifact_path=model_name)