In [1]:
import datetime
import os
from pathlib import Path

import pandas as pd
from alibi_detect.cd import TabularDrift
from alibi_detect.saving import save_detector
from joblib import load

None of PyTorch, TensorFlow >= 2.0, or Flax have been found. Models won't be available and only tokenizers, configuration and file/data utilities can be used.


Load model

In [2]:
# The directory one level above the current working directory is used as the
# project root; the serialized classifier is read from <root>/models.
# NOTE: joblib.load unpickles the file, so only load artifacts you trust.
proj_path = Path.cwd().parent.absolute()
model_path = proj_path.joinpath('models', 'clf-model.joblib')
model = load(model_path)

Load train and test data

In [3]:
# Both feature frames are pickles stored under <proj_path>/data/processed.
processed_dir = proj_path/'data'/'processed'
X_train = pd.read_pickle(processed_dir/'X_train.pkl')
X_test = pd.read_pickle(processed_dir/'X_test.pkl')

In [4]:
# Preview the first five training rows.
X_train.head(n=5)

Unnamed: 0,CreditScore,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary
3979,720,26,10,51962.91,2,1,0,45507.24
4025,667,34,5,0.0,2,1,1,102908.63
2239,549,31,4,0.0,2,0,1,25684.85
2644,822,32,8,116358.0,1,1,0,108798.36
185,605,28,6,0.0,2,0,0,159508.52


In [5]:
# Ordered list of the training frame's column names; reused later to select
# the same features from new data and to label the per-feature p-values.
feat_cols = list(X_train.columns)
feat_cols

['CreditScore',
 'Age',
 'Tenure',
 'Balance',
 'NumOfProducts',
 'HasCrCard',
 'IsActiveMember',
 'EstimatedSalary']

Load data from new geography (Germany) that the model has never seen


In [9]:
# Read the raw Germany extract relative to the project root (like the other
# data loads in this notebook) instead of a machine-specific absolute path,
# so the cell runs from any checkout location.
df_germany = pd.read_csv(proj_path/'data'/'raw'/'Churn_Modelling_Germany.csv')

In [10]:
# (rows, columns) of the raw Germany frame.
df_germany.shape

(2509, 13)

In [11]:
# Keep only the columns the training frame has, in the same order.
X_germany = df_germany.loc[:, feat_cols]


Train drift detection model


In [12]:
# Everything in `model` except its last element is used as the preprocessing
# function of the drift detector below.
# NOTE(review): assumes `model` is a scikit-learn Pipeline whose final step is
# the classifier, so the slice is a preprocessing-only Pipeline — TODO confirm.
preprocessor = model[:-1]
preprocessor

In [15]:
# Build the drift detector with the training data as the reference set.
# `preprocess_fn` is the preprocessing part of the model, so drift is tested
# on the preprocessed features rather than on the raw columns.
#
# No `categories_per_feature` is passed, so every feature is tested as a
# numerical one. To test categorical features with the Chi-squared test
# instead, pass `categories_per_feature={column_index: None, ...}` keyed by the
# column positions of the preprocessed data (not by raw column names).
# NOTE(review): binary columns such as HasCrCard / IsActiveMember may be
# candidates for that — confirm their positions in the preprocessor output
# before enabling it, since it changes the test statistics.
cd = TabularDrift(X_train,
                  p_val=.05,
                  preprocess_fn=preprocessor.transform)



Will drift be detected on data the model has not seen (test data) but that comes from the same geographies (France and Spain)?

In [16]:
# Run the detector on the held-out test split. The `is_drift` flag is used
# as an index into `labels` to print a readable answer.
labels = ['No!', 'Yes!']
preds = cd.predict(X_test)
is_drift = preds['data']['is_drift']
print('Drift? {}'.format(labels[is_drift]))

Drift? No!


Will there be drift if the data comes from a different geography (Germany) than the data the model was trained on?

In [21]:
# Same check on the Germany batch; `preds` is kept because the following
# cells read the p-values from it.
labels = ['No!', 'Yes!']
preds = cd.predict(X_germany)
drift_label = labels[preds['data']['is_drift']]
print('Drift? {}'.format(drift_label))

Drift? Yes!


In [22]:
# p-values returned by the most recent prediction (the Germany batch).
drift_data = preds['data']
p_val = drift_data['p_val']


In [23]:
# Record the p-values as a single timestamped row (`datetime` is imported in
# the notebook's top import cell).
# NOTE(review): `now` is a naive local-time timestamp; consider a
# timezone-aware UTC value if these rows are compared across machines.
now = datetime.datetime.now()

# Labelling the p-values with `feat_cols` assumes the preprocessor outputs one
# column per raw feature, in the same order — TODO confirm against the
# preprocessor's output feature names.
df_p_val = pd.DataFrame([[now] + p_val.tolist()], columns=['time'] + feat_cols)
df_p_val

Unnamed: 0,time,CreditScore,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary
0,2025-09-04 22:38:11.054340,0.362236,2.077079e-08,0.418449,0.0,0.024508,0.997952,0.142224,0.254453


Save drift detector

In [24]:
# Persist the fitted detector next to the model under <proj_path>/models.
detector_path = proj_path.joinpath('models', 'drift_detector')
save_detector(cd, detector_path)