In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import HistGradientBoostingRegressor
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import make_column_transformer
from sklearn.pipeline import make_pipeline
from sklearn.base import TransformerMixin

## 1. Data preparation
##### Let's load the dataset, which contains two data samples: one from June 2023 and one from April 2024.
##### I select some features and my target variable - "comp_dol" representing estimated compensation for the role.
##### I split the data into train, test and production sets.
##### I will use them respectively to:
- train -> train my model
- test -> test my model and create reference dataset for monitoring purposes
- prod -> create analysis dataset that I will be monitoring


In [None]:
# Model inputs (all treated as categorical) and the numeric target column.
cat_cols = ['hours', 'remote', 'education', 'seniority', 'country']
num_cols = ['comp_dol']


def _sample_window(frame, sample, after, before):
    """Return the rows of `frame` whose `sample_date` equals `sample` and whose
    `job_published_at` lies strictly between `after` and `before`."""
    published = frame['job_published_at']
    keep = (frame['sample_date'] == sample) & (published > after) & (published < before)
    return frame[keep]


# Load the raw file and keep the features, the two date columns and the target.
df = pd.read_csv('data/comp_df.csv')
df = df[cat_cols + ['job_published_at', 'sample_date'] + num_cols]

# Cast the features to pandas categoricals and parse the publication timestamp.
df = df.astype({col: 'category' for col in cat_cols})
df['job_published_at'] = pd.to_datetime(df['job_published_at'])

# 2023 sample -> train/test pool; 2024 sample -> production ("analysis") pool.
train_test = _sample_window(df, 2023, '2022-12-31', '2023-06-03')
prod_data = _sample_window(df, 2024, '2023-11-30', '2024-04-03')

## 2. Quick modelling
##### I build a simple pipeline for my model. I'll use HistGradientBoostingRegressor as my default estimator.
##### I fit the model on the training data and make predictions on test data.
##### I will later use the test data and the test predictions to build my reference dataset.

In [2]:
# One-hot encode the categorical features. With handle_unknown='ignore',
# categories not seen during fit are encoded as all zeros instead of raising.
col_transformer = make_column_transformer(
    (OneHotEncoder(handle_unknown='ignore'), cat_cols)
)
scaler = StandardScaler()

X = train_test.drop(columns=['sample_date', 'comp_dol'])
target = train_test[['comp_dol']]

# Split BEFORE fitting the scaler: fitting it on the whole frame would let the
# held-out rows influence the mean/std used to scale the training target.
# Same test_size and random_state as before, so the row split is unchanged.
X_train, X_test, target_train, target_test = train_test_split(
    X, target, test_size=0.33, random_state=42
)

# Learn the scaling from the training rows only, then apply it everywhere.
scaler.fit(target_train)
y_train = scaler.transform(target_train)
y_test = scaler.transform(target_test)

# Scaled target for the full train/test frame, row-aligned with X.
y = scaler.transform(target)

In [3]:
class DenseTransformer(TransformerMixin):
    """Stateless pipeline step that converts its input to a dense NumPy array.

    Sparse matrices are densified; input that is already dense is passed
    through `np.asarray`, so the step works whether the preceding transformer
    returns a sparse or a dense result.
    """

    def fit(self, X, y=None, **fit_params):
        """Nothing to learn; return `self` so the step can be chained."""
        return self

    def transform(self, X, y=None, **fit_params):
        """Return `X` as a dense `numpy.ndarray`.

        `y` and `fit_params` are accepted for signature compatibility and ignored.
        """
        # SciPy sparse containers expose `toarray`; anything else is treated as
        # already dense instead of failing on a missing `todense` attribute.
        if hasattr(X, 'toarray'):
            return X.toarray()
        return np.asarray(X)

In [4]:
# Full model: one-hot encode -> densify -> gradient-boosted regressor.
# The estimator is seeded (same seed as the train/test split) so that any
# randomised internals, such as the validation split drawn when early stopping
# is active, give the same model on every Restart & Run All.
hist_pipe = make_pipeline(
    col_transformer,
    DenseTransformer(),
    HistGradientBoostingRegressor(random_state=42)
)

In [5]:
# The publication timestamp is kept in X_train but is not a model input;
# the scaled target is flattened to 1-D for the regressor.
hist_pipe.fit(
    X_train.drop(columns=['job_published_at']),
    y_train.flatten(),
)

In [6]:
# Score the held-out rows using the same feature columns the model was fitted on.
y_test_pred = hist_pipe.predict(
    X_test.drop(columns=['job_published_at'])
)

In [7]:
# Held-out performance on the (scaled) target.
mse_hist, r2_hist = (
    mean_squared_error(y_test, y_test_pred),
    r2_score(y_test, y_test_pred),
)
print(
    f'Baseline Model Mean Squared Error:  {mse_hist}\n',
    f'Baseline Model R2-score: {r2_hist}',
    sep='\n',
)

In [10]:
X_prod = prod_data.drop(columns=['sample_date', 'comp_dol'])

# Reuse the scaler that was already fitted on the train/test target: only
# `transform` here, never `fit_transform`. Refitting on production data would
# standardise the production target with its own mean/std, putting it on a
# different scale from what the model was trained on, hiding any shift in
# compensation between the samples, and overwriting the fitted scaler.
y_prod = scaler.transform(prod_data[['comp_dol']])

# The publication timestamp is not a model input.
y_prod_pred = hist_pipe.predict(X_prod.drop(columns=['job_published_at']))

In [11]:
# Same two metrics as for the test split, now on the production sample.
mse_prod, r2_prod = (
    mean_squared_error(y_prod, y_prod_pred),
    r2_score(y_prod, y_prod_pred),
)
print(
    f'Baseline Model Mean Squared Error:  {mse_prod}\n',
    f'Baseline Model R2-score: {r2_prod}',
    sep='\n',
)

## 3. Building the datasets for monitoring
##### Now I can put it all together.
##### I include both predictions and ground truth in the reference and analysis datasets.
##### However, I use scaled values for them, because I will later use PCA, which is sensitive to the scale of the data.
##### Now I have two datasets that I'll save as CSV files. I can use them to demonstrate covariate drift detection using NannyML.

In [15]:
# Reference set: test features plus the model prediction and the scaled
# ground truth. `assign` works on a copy, so X_test itself is left untouched.
reference = X_test.assign(
    y_pred=y_test_pred,
    comp_dol=y_test,
)

In [16]:
# Analysis set: production features plus the model prediction and the scaled
# ground truth. `assign` works on a copy, so X_prod itself is left untouched.
analysis = X_prod.assign(
    y_pred=y_prod_pred,
    comp_dol=y_prod,
)

In [17]:
# Persist both monitoring datasets without the pandas index column.
for frame, path in (
    (reference, 'data/reference.csv'),
    (analysis, 'data/analysis.csv'),
):
    frame.to_csv(path, index=False)