In [4]:
import sys
import os
from datetime import datetime 

import pandas as pd
from sklearn.model_selection import train_test_split 
from sklearn.feature_extraction import DictVectorizer 
import xgboost as xgb

import mlflow

# from misc import init

In [2]:
# # initialize configuration
# CONFIG = init.init_config()

---User configuration not found. Loading default settings.


In [5]:
# read data
# Load the dataset from a parquet file into `data`; the path is relative, so it
# resolves against the kernel's current working directory.
data = pd.read_parquet("data/custom_wvs7_data.parquet")

In [6]:
# setup training context
# Each stage gets its own name (DataFrame -> feature frame -> dense matrix ->
# DMatrix) so no variable changes type mid-cell and nothing is mutated in place.
# Names consumed by later cells are unchanged: xtrain / xtest (DMatrix),
# ytrain / ytest, dv, train_dict / test_dict, feature_names.
target_col = "political_engagement"

# get datasets (80/20 split, fixed seed for reproducibility)
train_df, test_df = train_test_split(data, test_size=0.2, random_state=99)

# get targets
ytrain = train_df[target_col].values
ytest = test_df[target_col].values

# Remove the target from the features with a non-mutating drop. The frames
# returned by train_test_split are derived from `data`, and dropping columns on
# them with inplace=True can raise pandas' SettingWithCopyWarning.
train_features = train_df.drop(columns=[target_col])
test_features = test_df.drop(columns=[target_col])

# vectorize: fit the vectorizer on the training records only, then reuse the
# fitted mapping for the test records so both matrices share the same columns
dv = DictVectorizer(sparse=False)
train_dict = train_features.to_dict(orient="records")
test_dict = test_features.to_dict(orient="records")
train_matrix = dv.fit_transform(train_dict)
test_matrix = dv.transform(test_dict)
feature_names = dv.get_feature_names_out().tolist()

# get dmatrix
xtrain = xgb.DMatrix(train_matrix, label=ytrain, feature_names=feature_names)
xtest = xgb.DMatrix(test_matrix, label=ytest, feature_names=feature_names)

In [7]:
# # launch mlflow
# mlflow ui --backend-store-uri sqlite:///mlflow/mlflow.db --default-artifact-root mlflow
# Use the same SQLite backend store as the UI command above, then select the
# experiment by name. The tracking URI is set first so that the experiment
# lookup goes against that store.
mlflow.set_tracking_uri("sqlite:///mlflow/mlflow.db")
mlflow.set_experiment("political_engagement")

<Experiment: artifact_location='/home/adi/projects/political-engagement-mlops/mlruns/1', creation_time=1726041395995, experiment_id='1', last_update_time=1726041395995, lifecycle_stage='active', name='political_engagement', tags={}>

In [16]:
from hyperopt import hp
from hyperopt.pyll import scope

In [17]:
# Hyperopt search spaces for the XGBoost parameters.
# hp.loguniform(label, low, high) draws exp(uniform(low, high)), so the bounds
# below are natural logarithms of the actual parameter values.
prod_search_space = {
    # exp(-7) ~ 0.0009 up to exp(0) = 1. The upper bound used to be 10, which
    # allowed learning rates up to exp(10) ~ 22026 -- far outside XGBoost's
    # documented eta range of [0, 1].
    "learning_rate": hp.loguniform("learning_rate", -7, 0),
    # integer depth sampled in steps of 1 from [0, 100]
    # NOTE(review): XGBoost documents max_depth=0 as "no limit"; confirm that
    # including 0 in the range is intended.
    "max_depth": scope.int(hp.quniform("max_depth", 0, 100, 1)),
    # exp(-1) ~ 0.37 up to exp(4.6) ~ 99.5
    "min_child_weight": hp.loguniform("min_child_weight", -1, 4.6),
    # exp(-5) ~ 0.0067 up to exp(4.6) ~ 99.5
    "reg_alpha": hp.loguniform("reg_alpha", -5, 4.6),
    # exp(-5) ~ 0.0067 up to exp(4.6) ~ 99.5
    "scale_pos_weight": hp.loguniform("scale_pos_weight", -5, 4.6),
}

# Reduced space that tunes only the learning rate; same bounds as the
# production space so results are comparable.
test_search_space = {
    "learning_rate": hp.loguniform("learning_rate", -7, 0),
}

In [13]:
# with mlflow.start_run():