In [1]:
import os
import glob
import pandas as pd
import pickle
import matplotlib.pyplot as plt
import numpy as np
import random
from datetime import datetime, timedelta
from dateutil.relativedelta import relativedelta
import pprint
import pyspark
from pyspark.sql import SparkSession
import pyspark.sql.functions as F

from pyspark.sql.functions import col
from pyspark.sql.types import StringType, IntegerType, FloatType, DateType

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

import xgboost as xgb
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import make_scorer, f1_score, roc_auc_score, accuracy_score, precision_score, recall_score, precision_recall_curve, classification_report
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
print('done importing')

done importing


In [2]:
# Create the Spark session (or pick up an existing one) with 4 GB of driver memory.
spark = (
    SparkSession.builder
    .config("spark.driver.memory", "4g")
    .getOrCreate()
)

# Restrict Spark logging to ERROR so warnings do not clutter the notebook output.
spark.sparkContext.setLogLevel("ERROR")

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/06/27 14:50:35 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


# Set up configuration

In [3]:
def setup_config(snapshot_date_str="2024-01-01", model_name="credit_model_2024_09_01.pkl"):
    """Build the configuration dictionary for a monitoring run.

    Both values used to be hard-coded inside the function; they are now
    parameters whose defaults reproduce the previous behaviour, so calling
    ``setup_config()`` with no arguments returns the same configuration.

    Args:
        snapshot_date_str: Snapshot date formatted as ``YYYY-MM-DD``.
        model_name: File name of the model artefact inside the model bank.

    Returns:
        dict with the keys ``snapshot_date_str``, ``snapshot_date`` (a
        ``datetime`` parsed from ``snapshot_date_str``), ``model_name``,
        ``model_bank_directory`` and ``model_artefact_filepath``.

    Raises:
        ValueError: If ``snapshot_date_str`` does not match ``%Y-%m-%d``
            (raised by ``datetime.strptime``).
    """
    config = {}
    config["snapshot_date_str"] = snapshot_date_str
    config["snapshot_date"] = datetime.strptime(snapshot_date_str, "%Y-%m-%d")

    config["model_name"] = model_name
    config["model_bank_directory"] = "model_bank/"
    # The directory string already ends with "/", so plain concatenation
    # yields the artefact path.
    config["model_artefact_filepath"] = config["model_bank_directory"] + config["model_name"]

    print("Config setup complete")
    pprint.pprint(config)

    return config

def read_gold_table(table, gold_db, spark):
    """Read every entry of a gold-layer table folder as parquet.

    Args:
        table: Name of the table folder inside ``gold_db``.
        gold_db: Path of the gold datamart directory.
        spark: Object exposing ``read.parquet(*paths)``; the notebook passes
            its SparkSession.

    Returns:
        The result of ``spark.read.parquet`` called with all entries found
        directly under ``<gold_db>/<table>``.

    Raises:
        FileNotFoundError: If the table folder is missing or contains no
            entries, instead of letting the parquet reader fail with a less
            descriptive error on an empty path list.
    """
    folder_path = os.path.join(gold_db, table)
    # glob already returns paths prefixed with folder_path, so they can be
    # used directly; sorting makes the order passed to the reader deterministic.
    files_list = sorted(glob.glob(os.path.join(folder_path, '*')))
    if not files_list:
        raise FileNotFoundError(
            f"No files found for table '{table}' under '{folder_path}'"
        )
    # The "header" option only applies to CSV sources, so it is not set here.
    return spark.read.parquet(*files_list)

In [4]:
# ==== Setup configuration ====
# Build the run configuration; setup_config() also prints the resulting dict.
config = setup_config()

Config setup complete
{'model_artefact_filepath': 'model_bank/credit_model_2024_09_01.pkl',
 'model_bank_directory': 'model_bank/',
 'model_name': 'credit_model_2024_09_01.pkl',
 'snapshot_date': datetime.datetime(2024, 1, 1, 0, 0),
 'snapshot_date_str': '2024-01-01'}


# load label store

In [5]:
# ==== Load Label Store ====
# Read the gold label store through Spark, then bring it into pandas ordered
# by snapshot date.
y_spark = read_gold_table(table='label_store', gold_db='datamart/gold', spark=spark)
y_df = y_spark.toPandas()
y_df = y_df.sort_values(by='snapshot_date')
# Convert the snapshot column to pandas datetimes.
y_df['snapshot_date'] = pd.to_datetime(y_df['snapshot_date'])

                                                                                

# load inference table

In [6]:
# ==== Load Inference ========
# Predictions live under a folder named after the model file; the parquet
# name uses the model name without its last four characters (the ".pkl"
# suffix for the configured model) plus the snapshot date with underscores.
model_file = config['model_name']
snapshot_tag = config['snapshot_date_str'].replace('-', '_')
inference_path = f"datamart/gold/predictions/{model_file}/{model_file[:-4]}_predictions_{snapshot_tag}.parquet"

inference_reader = spark.read.option("header", "true")
y_inference = inference_reader.parquet(inference_path)

# Move to pandas, order by snapshot date, and convert that column to datetimes.
y_inference_pdf = y_inference.toPandas()
y_inference_pdf = y_inference_pdf.sort_values(by='snapshot_date')
y_inference_pdf['snapshot_date'] = pd.to_datetime(y_inference_pdf['snapshot_date'])

# Monitor model performance against actual labels

In [7]:
# Attach the actual labels to the predictions, matching on customer only.
# Both frames carry a snapshot_date column, so pandas suffixes them with
# "_x" (predictions) and "_y" (labels) in the merged result.
# NOTE(review): assumes at most one label row per customer; otherwise
# predictions are duplicated in the join -- confirm against the label store.
labels_pdf = y_df[['customer_id', 'snapshot_date', 'label']]
df_monitor = y_inference_pdf.merge(labels_pdf, how='inner', on='customer_id')

In [8]:
# Keep only the monitoring columns, exposing the prediction-side snapshot date
# (suffix "_x" from the merge) under its plain name.
# Renaming and then selecting builds a fresh frame instead of assigning a new
# column into a column-slice (which can trigger SettingWithCopyWarning), and
# the cell can be re-run safely: once "snapshot_date_x" is gone the rename is
# simply a no-op.
monitor_columns = ['customer_id', 'model_name', 'prediction', 'label', 'snapshot_date']
df_monitor = (
    df_monitor
    .rename(columns={'snapshot_date_x': 'snapshot_date'})
    .loc[:, monitor_columns]
    .copy()
)
df_monitor

Unnamed: 0,customer_id,model_name,prediction,label,snapshot_date
0,CUS_0x9778,credit_model_2024_09_01.pkl,0.247142,0,2024-01-01
1,CUS_0x305c,credit_model_2024_09_01.pkl,0.198256,0,2024-01-01
2,CUS_0x303a,credit_model_2024_09_01.pkl,0.371733,1,2024-01-01
3,CUS_0x2ff7,credit_model_2024_09_01.pkl,0.077434,0,2024-01-01
4,CUS_0x2fc5,credit_model_2024_09_01.pkl,0.690279,1,2024-01-01
...,...,...,...,...,...
480,CUS_0x7622,credit_model_2024_09_01.pkl,0.147215,0,2024-01-01
481,CUS_0x7500,credit_model_2024_09_01.pkl,0.078024,0,2024-01-01
482,CUS_0x7480,credit_model_2024_09_01.pkl,0.155120,0,2024-01-01
483,CUS_0x8449,credit_model_2024_09_01.pkl,0.134830,0,2024-01-01


In [9]:
# Scores at or above the 0.35 cut-off are classified as positive (1),
# everything below as negative (0).
threshold = 0.35
y_pred_train = df_monitor['prediction'].ge(threshold).astype(int)
df_monitor['pred_label'] = y_pred_train
df_monitor

Unnamed: 0,customer_id,model_name,prediction,label,snapshot_date,pred_label
0,CUS_0x9778,credit_model_2024_09_01.pkl,0.247142,0,2024-01-01,0
1,CUS_0x305c,credit_model_2024_09_01.pkl,0.198256,0,2024-01-01,0
2,CUS_0x303a,credit_model_2024_09_01.pkl,0.371733,1,2024-01-01,1
3,CUS_0x2ff7,credit_model_2024_09_01.pkl,0.077434,0,2024-01-01,0
4,CUS_0x2fc5,credit_model_2024_09_01.pkl,0.690279,1,2024-01-01,1
...,...,...,...,...,...,...
480,CUS_0x7622,credit_model_2024_09_01.pkl,0.147215,0,2024-01-01,0
481,CUS_0x7500,credit_model_2024_09_01.pkl,0.078024,0,2024-01-01,0
482,CUS_0x7480,credit_model_2024_09_01.pkl,0.155120,0,2024-01-01,0
483,CUS_0x8449,credit_model_2024_09_01.pkl,0.134830,0,2024-01-01,0


In [10]:
# Per-class precision / recall / F1 of the thresholded predictions against
# the actual labels.
monitoring_cr = classification_report(
    y_true=df_monitor['label'],
    y_pred=df_monitor['pred_label'],
)

print(f"Monitoring report:\n {monitoring_cr}")

Monitoring report:
               precision    recall  f1-score   support

           0       0.87      0.89      0.88       341
           1       0.73      0.67      0.70       144

    accuracy                           0.83       485
   macro avg       0.80      0.78      0.79       485
weighted avg       0.83      0.83      0.83       485



In [12]:
# Scalar monitoring metrics for the thresholded predictions; precision,
# recall and F1 use scikit-learn's defaults.
y_true = df_monitor['label']
y_pred = df_monitor['pred_label']

monitor_accuracy = accuracy_score(y_true, y_pred)
monitor_precision = precision_score(y_true, y_pred)
monitor_recall = recall_score(y_true, y_pred)
monitor_f1 = f1_score(y_true, y_pred)

for metric_label, metric_value in (
    ('accuracy:', monitor_accuracy),
    ('precision:', monitor_precision),
    ('recall:', monitor_recall),
    ('f1 score:', monitor_f1),
):
    print(metric_label, metric_value)

accuracy: 0.8288659793814434
precision: 0.7293233082706767
recall: 0.6736111111111112
f1 score: 0.700361010830325
