In [1]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction import DictVectorizer 
from sklearn.metrics import roc_auc_score

import xgboost as xgb

import bentoml
import pickle

In [2]:
# Load the custom dataset, i.e. the final wrangle of the WVS data.
# Variable definitions are available in "misc/post_wrangle_variable_definitions.json".
WVS_DATA_PATH = "misc/custom_wvs7_data.parquet"
data = pd.read_parquet(WVS_DATA_PATH)

In [3]:
# Split the data into train and test sets: 20% of the rows are held out for
# testing, and the fixed random_state makes the split reproducible.
split_options = {"test_size": 0.2, "random_state": 23}
df_train, df_test = train_test_split(data, **split_options)

In [4]:
# Flag the categorical (nominal) variables; these are the columns that get
# cast to str and dummy encoded before training.
nominal = [
    "country",
    "mode",
    "settlement",
    "intprivacy",
    "sex",
    "immigrant",
    "immigrant_mother",
    "immigrant_father",
    "birth_country",
    "birth_country_mother",
    "birth_country_father",
    "citizenship",
    "lives_with_parents",
    "marital_status",
    "employment",
    "employment_spouse",
    "profession",
    "profession_spouse",
    "profession_father",
    "employment_sector",
    "chief_earner",
    "savings",
    "religion",
]

# Best XGBoost parameters (presumably selected during tuning done elsewhere;
# the search itself is not part of this notebook).
xgb_params = dict(
    eta=0.1,                       # learning rate
    max_depth=10,
    min_child_weight=10,
    objective="binary:logistic",   # binary target, predictions are probabilities
    nthread=-1,
    seed=23,
    verbosity=1,
    eval_metric="auc",
    scale_pos_weight=10,           # up-weights the positive class
    alpha=25,                      # L1 regularization on the weights
)

In [5]:
# Separate the target vector from the feature frame for each split.
TARGET = "political_engagement"
ytrain = df_train[TARGET].values
ytest = df_test[TARGET].values

# Cast the nominal columns to str so that DictVectorizer dummy encodes them
# (it one-hot encodes string values and passes numeric values through).
nominal_as_str = {column: str for column in nominal}
x_train = df_train.drop(columns=TARGET).astype(nominal_as_str)
x_test = df_test.drop(columns=TARGET).astype(nominal_as_str)

# One record (dict) per row is the input format DictVectorizer expects.
train_dict = x_train.to_dict(orient="records")
test_dict = x_test.to_dict(orient="records")

# Fit the vectorizer on the training records only and reuse it for the test
# records; the encoded matrices go straight into the xgb matrices.
dv = DictVectorizer(sparse=False)
xtrain = xgb.DMatrix(dv.fit_transform(train_dict), label=ytrain)
xtest = xgb.DMatrix(dv.transform(test_dict), label=ytest)
features = dv.get_feature_names_out().tolist()

In [6]:
# Train the booster for a fixed number of boosting rounds.
NUM_BOOST_ROUND = 200
xgb_model = xgb.train(
    params=xgb_params,
    dtrain=xtrain,
    num_boost_round=NUM_BOOST_ROUND,
)

# Predict on the test matrix and report ROC AUC as the cell output.
ypred = xgb_model.predict(xtest)
test_auc = roc_auc_score(y_true=ytest, y_score=ypred)
test_auc

0.7796568492162079

In [7]:
################## model persistence (commented out for convenience) ###########
# Uncomment one of the two options below to persist the trained model.
#
# # save model with bentoml (the fitted vectorizer and the nominal column list
# # are stored alongside the model as custom objects)
# bentoml.xgboost.save_model("poleng_xgb_binary", xgb_model,
#                            custom_objects={"dict_vectorizer": dv,
#                                            "nominal_variables": nominal})

# # save model traditionally (easier to deploy on aws lambda)
# # NOTE: the file is a pickle of (dv, xgb_model); only unpickle it from a
# # trusted source.
# with open("poleng_xgb.bin", "wb") as fout:
#     pickle.dump((dv, xgb_model), fout)
################## model persistence (commented out for convenience) ###########