In [31]:
import pickle
from typing import Dict, Tuple

import numpy as np
import pandas as pd
import xgboost as xgb
from sklearn.feature_extraction import DictVectorizer

# import mlflow

In [2]:
# # # launch mlflow
# # mlflow ui --backend-store-uri sqlite:///mlflow/mlflow.db --default-artifact-root mlflow
# mlflow.set_tracking_uri("sqlite:///mlflow/mlflow.db")
# mlflow.set_experiment("political_engagement")

In [2]:
# # retrieve best model
# client = mlflow.MlflowClient(tracking_uri=mlflow.get_tracking_uri())

# best_model = client.search_runs(
#     experiment_ids="1",
#     run_view_type=mlflow.entities.ViewType.ACTIVE_ONLY,
#     max_results=1,
#     order_by=["metrics.test_log_loss ASC"]
# )[0]

# model_id = best_model.info.run_id
# model_path = f"runs:/{model_id}/model"
# Restore the trained booster from its serialized file.
model = xgb.Booster()
model.load_model("./mlflow/model.xgb")

# Unpickle the preprocessor stored in the same folder as the model.
# NOTE: pickle can execute arbitrary code on load -- only open trusted files.
with open("./mlflow/preprocessor.bin", mode="rb") as fin:
    dv = pickle.load(fin)

# Read one batch and encode it: row dicts -> vectorized matrix -> DMatrix.
batch = pd.read_parquet("data/batches/wvs7_2024-01-01.parquet")
batch_dict = batch.to_dict("records")
batch_set = xgb.DMatrix(dv.transform(batch_dict))

# Round the model output to integer labels, then swap them for readable tags:
#   0 -> "-"        (subject doesn't need intervention)
#   1 -> "CONTACT"  (subject could benefit from intervention)
batch["prediction"] = np.round(model.predict(batch_set)).astype(int)
batch["prediction"] = batch["prediction"].replace({0: "-", 1: "CONTACT"})

In [None]:
import os
from datetime import datetime

In [50]:
def load_artifacts(model_path: str, vectorizer_path: str) -> Tuple[xgb.Booster, DictVectorizer]:
    """Read a saved XGBoost booster and its pickled vectorizer from disk.

    Args:
        model_path: Location of the serialized booster; passed to
            ``Booster.load_model``.
        vectorizer_path: Location of the pickle file holding the vectorizer.

    Returns:
        A ``(booster, vectorizer)`` pair, in that order.
    """
    booster = xgb.Booster()
    booster.load_model(model_path)

    # NOTE(security): unpickling can run arbitrary code -- only point this at trusted files.
    with open(vectorizer_path, mode="rb") as handle:
        vectorizer = pickle.load(handle)

    return booster, vectorizer


In [57]:
def get_dmatrix(df: pd.DataFrame, dv: DictVectorizer) -> xgb.DMatrix:
    """Encode a dataframe with ``dv`` and wrap the result in an XGBoost DMatrix.

    Args:
        df: Rows to encode; each row is converted to one record dict.
        dv: Vectorizer whose ``transform`` is applied to the records. It is
            only transformed with, never fitted, so it is assumed to be
            already fitted by the caller.

    Returns:
        An ``xgb.DMatrix`` built from the vectorized rows. No label is
        attached, so the result is meant for prediction.
    """
    records = df.to_dict(orient="records")
    features = dv.transform(records)
    return xgb.DMatrix(features)

In [65]:
def predict(
        model_path: str,
        vectorizer_path: str,
        new_batches_folder_path: str,
        predictions_output_path: str
        ) -> Dict[str, pd.DataFrame]:
    """Make predictions for every new parquet batch found in a folder.

    For each ``*.parquet`` file in ``new_batches_folder_path`` the batch is
    read, scored, labelled, reduced to the rows labelled ``"CONTACT"``, and
    written to ``predictions_output_path`` as ``<name>_predicted.parquet``.
    The input file is deleted once its predictions have been written.

    Args:
        model_path: Path of the serialized XGBoost model.
        vectorizer_path: Path of the pickled vectorizer.
        new_batches_folder_path: Folder scanned for batches to score. Entries
            that are not regular ``.parquet`` files are ignored.
        predictions_output_path: Folder receiving the predicted batches; it
            is created if it does not exist.

    Returns:
        A dict mapping each output file name to the dataframe written to it
        (only the ``"CONTACT"`` rows), kept for troubleshooting. The dict is
        empty when the folder holds no parquet batch.

    Raises:
        FileNotFoundError: If ``new_batches_folder_path`` does not exist.
    """
    # Load the artifacts once; they are shared by every batch.
    model, dv = load_artifacts(model_path, vectorizer_path)

    # Create the destination up front so a missing folder cannot fail a write.
    if predictions_output_path:
        os.makedirs(predictions_output_path, exist_ok=True)

    predictions = {}
    # os.listdir returns entries in arbitrary order; sort for reproducible runs.
    for batch in sorted(os.listdir(new_batches_folder_path)):
        new_batch_path = os.path.join(new_batches_folder_path, batch)
        # Skip sub-folders and stray files (.gitkeep, .DS_Store, ...): reading
        # them as parquet would abort the run after earlier inputs were
        # already deleted.
        if not (os.path.isfile(new_batch_path) and batch.lower().endswith(".parquet")):
            continue

        # splitext strips only the final extension, so names containing extra
        # dots are neither truncated nor made to collide.
        batch_name = os.path.splitext(batch)[0]
        output_batch_name = f"{batch_name}_predicted.parquet"
        predicted_batch_path = os.path.join(predictions_output_path, output_batch_name)

        # read and wrangle data
        print(f"Reading: {new_batch_path}")
        df = pd.read_parquet(new_batch_path)
        print("Predicting...")
        df_dmat = get_dmatrix(df, dv)

        # predict (0=subject doesn't need intervention (-),
        #          1=subject could benefit from intervention (CONTACT))
        # NOTE(review): assumes the model outputs scores in [0, 1] -- confirm.
        df["prediction"] = np.round(model.predict(df_dmat)).astype(int)
        df["prediction"] = df["prediction"].replace([0, 1], ["-", "CONTACT"])

        # make things easier for the agents by only keeping the target subjects
        df = df[df["prediction"] == "CONTACT"]

        # output predicted batch
        print(f"Writing predictions to: {predicted_batch_path}")
        df.to_parquet(predicted_batch_path, index=False)

        # Remove the consumed input only after the write above succeeded, so
        # a failed write never loses the batch.
        os.remove(new_batch_path)

        # store predictions (troubleshooting)
        predictions[output_batch_name] = df

    return predictions


In [66]:
# Score every pending batch with the saved artifacts and keep the returned
# dataframes for inspection.
d = predict(
    model_path="./mlflow/model.xgb",
    vectorizer_path="./mlflow/preprocessor.bin",
    new_batches_folder_path="./data/new_batches",
    predictions_output_path="./data/predictions",
)

Reading: ./data/new_batches/wvs7_2024-01-01.parquet
Predicting...
Writing predictions to: ./data/predictions/wvs7_2024-01-01_predicted.parquet
Reading: ./data/new_batches/wvs7_2024-02-01.parquet
Predicting...
Writing predictions to: ./data/predictions/wvs7_2024-02-01_predicted.parquet
