In [38]:
import sys
import os
import pickle
from datetime import datetime 

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split 
from sklearn.feature_extraction import DictVectorizer 
from sklearn.metrics import roc_auc_score, log_loss, root_mean_squared_error
import xgboost as xgb
from hyperopt import fmin, hp, tpe, Trials
from hyperopt.pyll import scope

import mlflow

from misc import init, supports, drift_handler

In [39]:
# initialize configuration
# mode=test will minimize parameters (ex: 20 VS 1000 booster runs)
# skip_optimization=True will skip model tuning
CONFIG = init.init_config(mode="test", skip_optimization=False)

# output config
print("---CONFIG:")
for k, v in CONFIG.items():
    print(f"---{k} > {v}")

---CONFIG:
---mode > test
---mlflow_evals_nbr > 2
---booster_rounds > 10
---skip_optimization > False


In [40]:
# # launch mlflow
# mlflow ui --backend-store-uri sqlite:///mlflow/mlflow.db --default-artifact-root mlflow
mlflow.set_tracking_uri("sqlite:///mlflow/mlflow.db")
mlflow.set_experiment("political_engagement")
# # enable auto-log
# mlflow.xgboost.autolog()

<Experiment: artifact_location='/home/adi/projects/political-engagement-mlops/mlruns/1', creation_time=1726041395995, experiment_id='1', last_update_time=1726041395995, lifecycle_stage='active', name='political_engagement', tags={}>

In [8]:
# read data
data = drift_handler.get_data("./data/training_data/production_data.parquet")

In [50]:
# setup training context
# get datasets
dftrain, dftest = train_test_split(data, test_size=0.2, random_state=99)

# get targets
ytrain = dftrain["political_engagement"].values
ytest = dftest["political_engagement"].values
dftrain.drop(columns=["political_engagement"], inplace=True)
dftest.drop(columns=["political_engagement"], inplace=True)

# vectorize
dv = DictVectorizer(sparse=False)
train_dict = dftrain.to_dict(orient="records")
test_dict = dftest.to_dict(orient="records")
xtrain = dv.fit_transform(train_dict)
xtest = dv.transform(test_dict)
feature_names = dv.get_feature_names_out().tolist()

# get dmatrix
xtrain = xgb.DMatrix(xtrain, label=ytrain, feature_names=feature_names)
xtest = xgb.DMatrix(xtest, label=ytest, feature_names=feature_names)

In [65]:
# optimize
if not CONFIG["skip_optimization"]:
    search_space = {
        "learning_rate": hp.loguniform("learning_rate", -7, 10),
        "max_depth": scope.int(hp.quniform("max_depth", 0, 100, 1)),
        "min_child_weight": hp.loguniform("min_child_weight", -1, 4.6),
        "reg_alpha": hp.loguniform("reg_alpha", -5, 4.6), 
        "scale_pos_weight": hp.loguniform("scale_pos_weight", 0, 4.6),
        "objective": "binary:logistic",
        "seed": 99
    }

    best_result = fmin(
        fn=lambda search_space: supports.objective(
            search_space=search_space,
            xtrain=xtrain,
            xtest=xtest,
            ytrain=ytrain,
            ytest=ytest,
            num_boost_round=CONFIG["booster_rounds"]
            ),
        space=search_space,
        algo=tpe.suggest,
        max_evals=CONFIG["mlflow_evals_nbr"],
        trials=Trials()
        )

[0]	test-logloss:10.88660                            
[1]	test-logloss:21.56619                            
[2]	test-logloss:25.39921                            
[3]	test-logloss:25.39921                            
[4]	test-logloss:25.39921                            
[5]	test-logloss:25.39921                            
[6]	test-logloss:25.39921                            
[7]	test-logloss:25.39921                            
[8]	test-logloss:25.39921                            
[9]	test-logloss:25.39921                            
[0]	test-logloss:1.00322                                                       
[1]	test-logloss:0.94623                                                       
[2]	test-logloss:0.91495                                                       
[3]	test-logloss:0.89154                                                       
[4]	test-logloss:0.87611                                                       
[5]	test-logloss:0.86297                                    

In [7]:
# save the model with the best params
artifacts_path = "./mlflow"
tags = {
        "author": "andrei lupascu",
        "mode": CONFIG["mode"]
}
# update best_result
best_result["objective"] = search_space["objective"]
best_result["seed"] = search_space["seed"]
# format best results (some params need to be cast as int)
int_params = ["max_depth"]
for int_param in int_params:
        best_result[int_param] = int(best_result[int_param])

supports.objective(
     search_space=best_result,
     xtrain=xtrain,
     xtest=xtest,
     ytrain=ytrain,
     ytest=ytest,
     num_boost_round=CONFIG["booster_rounds"],
     tags=tags,
     save_artifacts=(True, artifacts_path, dv)
     )
    

[0]	test-logloss:7.28818
[1]	test-logloss:15.16560
[2]	test-logloss:10.18008
[3]	test-logloss:13.07074
[4]	test-logloss:12.78388
[5]	test-logloss:14.20724
[6]	test-logloss:14.20950
[7]	test-logloss:16.14627
[8]	test-logloss:14.09731
[9]	test-logloss:20.01706




{'loss': 8.667792280455794, 'status': 'ok'}

In [67]:
int_params = ["max_depth"]
for int_param in int_params:
        best_result[int_param] = int(best_result[int_param])

booster = xgb.train(
params=best_result,
dtrain=xtrain,
num_boost_round=CONFIG["booster_rounds"],
evals=[(xtest, "test")],
early_stopping_rounds=50
)

# with mlflow.start_run():
#     mlflow.set_tags(tags)
#     # train model
#     booster = xgb.train(
#     params=best_result,
#     dtrain=xtrain,
#     num_boost_round=CONFIG["booster_rounds"],
#     evals=[(xtest, "test")],
#     early_stopping_rounds=50
#     )

#     # get metrics
#     ytrain_pred = booster.predict(xtrain)
#     yval_pred = booster.predict(xtest)
#     # get auc
#     train_auc = roc_auc_score(ytrain, ytrain_pred)
#     test_auc = roc_auc_score(ytest, yval_pred)
#     # get loss
#     train_log_loss = log_loss(ytrain, ytrain_pred)
#     test_log_loss = log_loss(ytest, yval_pred)
#     # get rmse
#     train_rmse = root_mean_squared_error(ytrain, ytrain_pred)
#     test_rmse = root_mean_squared_error(ytest, yval_pred)
#     # store metrics
#     metrics = {
#             "train_auc": train_auc,
#             "test_auc": test_auc,
#             "train_log_loss": train_log_loss,
#             "test_log_loss": test_log_loss,
#             "train_rmse": train_rmse,
#             "test_rmse": test_rmse
#     }

#         # log metrics
#     for name, metric in metrics.items():
#         mlflow.log_metric(name, metric)

#     # log artifacts  
#     with open(f"./mlflow/{preprocessor_name}.bin", "wb") as fout:
#             pickle.dump(dv, fout)
#     booster.save_model(f"./mlflow/{model_name}.xgb")
#     mlflow.log_artifact(f"./mlflow/{preprocessor_name}.bin", artifact_path=preprocessor_name)
#     mlflow.log_artifact(f"./mlflow/{model_name}.xgb", artifact_path=model_name)
#     mlflow.log_metric("titties", train_auc)

[0]	test-rmse:0.61440


[1]	test-rmse:0.56503
[2]	test-rmse:0.53274
[3]	test-rmse:0.51102
[4]	test-rmse:0.49719
[5]	test-rmse:0.48670
[6]	test-rmse:0.47958
[7]	test-rmse:0.47455
[8]	test-rmse:0.47023
[9]	test-rmse:0.46648


In [1]:
from misc import mageai_supports

In [4]:
batch = mageai_supports.predict(
        "./mlflow",
        "./data/new_batches",
        "./data/predictions"
        )

Reading: ./data/new_batches/prod_data_batch_1.parquet
Predicting...


ValueError: training data did not have the following fields: birth_country=0, birth_country=1, birth_country_father=0, birth_country_father=1, birth_country_mother=0, birth_country_mother=1, chief_earner=1, chief_earner=2, children_number, citizenship=1, citizenship=2, country=104, country=124, country=152, country=156, country=158, country=170, country=196, country=20, country=203, country=218, country=231, country=276, country=300, country=32, country=320, country=344, country=36, country=360, country=364, country=368, country=392, country=398, country=400, country=404, country=410, country=417, country=422, country=434, country=446, country=458, country=462, country=484, country=496, country=50, country=51, country=68, country=76, education, education_mother, education_spouse, employment=1, employment=2, employment=3, employment=4, employment=5, employment=6, employment=7, employment_sector=0, employment_sector=1, employment_sector=2, employment_sector=3, employment_spouse=0, employment_spouse=1, employment_spouse=2, employment_spouse=3, employment_spouse=4, employment_spouse=5, employment_spouse=6, employment_spouse=7, employment_spouse=8, generation, household_size, immigrant=1, immigrant=2, immigrant_father=1, immigrant_father=2, immigrant_mother=1, immigrant_mother=2, income_scale, intprivacy=1, intprivacy=2, lives_with_parents=1, lives_with_parents=2, lives_with_parents=3, lives_with_parents=4, marital_status=1, marital_status=2, marital_status=3, marital_status=4, marital_status=5, marital_status=6, mode=1, mode=2, mode=3, mode=4, profession=1, profession=10, profession=11, profession=12, profession=2, profession=3, profession=4, profession=5, profession=6, profession=7, profession=8, profession=9, profession_father=1, profession_father=10, profession_father=11, profession_father=12, profession_father=2, profession_father=3, profession_father=4, profession_father=5, profession_father=6, profession_father=7, profession_father=8, profession_father=9, profession_spouse=0, profession_spouse=1, profession_spouse=10, profession_spouse=11, profession_spouse=12, profession_spouse=2, profession_spouse=3, profession_spouse=4, profession_spouse=5, profession_spouse=6, profession_spouse=7, profession_spouse=8, profession_spouse=9, religion=1, religion=10, religion=2, religion=3, religion=4, religion=5, religion=6, religion=7, religion=8, religion=9, respint, savings=1, savings=2, savings=3, savings=4, settlement=1, settlement=2, settlement=3, settlement=4, settlement=5, sex=1, sex=2, subjective_social_class

In [96]:
b1 = pd.read_parquet("/home/adi/projects/political-engagement-mlops/data/batches/prod_data_batch_1.parquet")
n1 = pd.read_parquet("/home/adi/projects/political-engagement-mlops/data/batches/new_data_batch_1.parquet")

In [101]:
b1.drop(columns="subject_id", inplace=True)
b = mageai_supports.get_dmatrix(b1, dv)

In [102]:
b1["pred"] = booster.predict(b)

ValueError: training data did not have the following fields: birth_country=0, birth_country=1, birth_country_father=0, birth_country_father=1, birth_country_mother=0, birth_country_mother=1, chief_earner=1, chief_earner=2, children_number, citizenship=1, citizenship=2, country=104, country=124, country=152, country=156, country=158, country=170, country=196, country=20, country=203, country=218, country=231, country=276, country=300, country=32, country=320, country=344, country=36, country=360, country=364, country=368, country=392, country=398, country=400, country=404, country=410, country=417, country=422, country=434, country=446, country=458, country=462, country=484, country=496, country=50, country=51, country=68, country=76, education, education_mother, education_spouse, employment=1, employment=2, employment=3, employment=4, employment=5, employment=6, employment=7, employment_sector=0, employment_sector=1, employment_sector=2, employment_sector=3, employment_spouse=0, employment_spouse=1, employment_spouse=2, employment_spouse=3, employment_spouse=4, employment_spouse=5, employment_spouse=6, employment_spouse=7, employment_spouse=8, generation, household_size, immigrant=1, immigrant=2, immigrant_father=1, immigrant_father=2, immigrant_mother=1, immigrant_mother=2, income_scale, intprivacy=1, intprivacy=2, lives_with_parents=1, lives_with_parents=2, lives_with_parents=3, lives_with_parents=4, marital_status=1, marital_status=2, marital_status=3, marital_status=4, marital_status=5, marital_status=6, mode=1, mode=2, mode=3, mode=4, profession=1, profession=10, profession=11, profession=12, profession=2, profession=3, profession=4, profession=5, profession=6, profession=7, profession=8, profession=9, profession_father=1, profession_father=10, profession_father=11, profession_father=12, profession_father=2, profession_father=3, profession_father=4, profession_father=5, profession_father=6, profession_father=7, profession_father=8, profession_father=9, profession_spouse=0, profession_spouse=1, profession_spouse=10, profession_spouse=11, profession_spouse=12, profession_spouse=2, profession_spouse=3, profession_spouse=4, profession_spouse=5, profession_spouse=6, profession_spouse=7, profession_spouse=8, profession_spouse=9, religion=1, religion=10, religion=2, religion=3, religion=4, religion=5, religion=6, religion=7, religion=8, religion=9, respint, savings=1, savings=2, savings=3, savings=4, settlement=1, settlement=2, settlement=3, settlement=4, settlement=5, sex=1, sex=2, subjective_social_class