In [1]:
import datetime
import os
from pathlib import Path

import pandas as pd
from alibi_detect.cd import TabularDrift
from alibi_detect.saving import save_detector
from joblib import load

None of PyTorch, TensorFlow >= 2.0, or Flax have been found. Models won't be available and only tokenizers, configuration and file/data utilities can be used.


### Load model 

In [2]:
# The project root is taken to be the parent of the current working directory,
# so the notebook must be launched from a direct sub-folder of the project.
proj_path = Path.cwd().parent

# Load the serialized classifier from <project>/models.
model_path = proj_path.joinpath('models', 'clf-model.joblib')
model = load(model_path)

### Load train and test data

In [3]:
# Both processed feature splits are stored as pickles in the same folder.
processed_dir = proj_path / 'data' / 'processed'
X_train = pd.read_pickle(processed_dir / 'X_train.pkl')
X_test = pd.read_pickle(processed_dir / 'X_test.pkl')

In [4]:
# Preview the first rows of the training features.
X_train.head()

Unnamed: 0,CreditScore,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary
1442,635,38,0,103257.14,1,0,0,158344.63
1488,508,54,10,0.0,1,1,1,175749.36
4716,753,33,5,122568.05,2,1,1,82820.85
107,751,52,8,0.0,2,0,1,179291.85
2662,622,32,5,179305.09,1,1,1,149043.78


In [5]:
# Names of the raw feature columns, in the order they appear in X_train.
feat_cols = list(X_train.columns)
feat_cols

['CreditScore',
 'Age',
 'Tenure',
 'Balance',
 'NumOfProducts',
 'HasCrCard',
 'IsActiveMember',
 'EstimatedSalary']

### Load data from new geography (Germany) that the model has never seen

In [6]:
# Read the Germany-only extract and report its (rows, columns) size.
germany_csv = proj_path / 'data' / 'more_data' / 'Churn_Modelling_Germany.csv'
df_germany = pd.read_csv(germany_csv)
df_germany.shape

(2509, 13)

In [7]:
# Keep only the columns used as training features, in the same order as X_train.
X_germany = df_germany.loc[:, feat_cols]

### Train drift detection model

In [8]:
# The drift detector must see the data processed exactly as it was during
# training, so reuse the preprocessing part of the loaded sklearn pipeline:
# `model[:-1]` is every step except the last one.
preprocessor = model[:-1]

In [9]:
# TabularDrift applies a chi-squared test to the columns listed in
# `categories_per_feature` and a K-S test to all remaining columns.
# The detector sees the data *after* `preprocess_fn`, so the column indices
# must refer to the preprocessor's output. The raw names in `feat_cols` carry
# no 'cat__' prefix, so filtering on them always produced an empty dict and
# every column was silently treated as numerical.
# NOTE(review): assumes the categorical branch of the preprocessing
# ColumnTransformer is named 'cat' (hence the 'cat__' prefix) -- confirm.
try:
    transformed_cols = list(preprocessor.get_feature_names_out())
except AttributeError:
    # A step cannot report its output names (or the pipeline is not fitted);
    # fall back to the raw column names, which reproduces the old behaviour.
    transformed_cols = feat_cols

categories_per_feature = {
    idx: None  # None -> categories are inferred from the reference data
    for idx, name in enumerate(transformed_cols)
    if name.startswith('cat__')
}

# Reference data is the training split; drift is flagged at p_val=.05.
cd = TabularDrift(X_train,
                  p_val=.05,
                  preprocess_fn=preprocessor.transform,
                  categories_per_feature=categories_per_feature)

### Will there be drift if data was unseen by the model (test data), but it comes from the same geographies (France and Spain)?

In [10]:
# Compare the held-out test split against the training reference.
labels = ['No!', 'Yes!']
preds = cd.predict(X_test)
drift_flag = preds['data']['is_drift']  # 0 or 1, used as an index into `labels`
print('Drift? {}'.format(labels[drift_flag]))

Drift? No!


In [13]:
# Size of the reference (training) set as (rows, columns).
X_train.shape

(5618, 8)

In [11]:
# Show the full dictionary returned by the detector ('data' and 'meta' sections).
preds

{'data': {'is_drift': 0,
  'distance': array([0.03423763, 0.0079729 , 0.01403448, 0.01935878, 0.0134369 ,
         0.01553678, 0.00435666, 0.02729775], dtype=float32),
  'p_val': array([0.0725126 , 0.9999878 , 0.9411055 , 0.66084754, 0.95858634,
         0.8813387 , 1.        , 0.24153146], dtype=float32),
  'threshold': 0.00625},
 'meta': {'name': 'TabularDrift',
  'online': False,
  'data_type': None,
  'version': '0.12.0',
  'detector_type': 'drift'}}

In [14]:
# Show only the p-value array from the detector output.
preds['data']['p_val']

array([0.0725126 , 0.9999878 , 0.9411055 , 0.66084754, 0.95858634,
       0.8813387 , 1.        , 0.24153146], dtype=float32)

### Will there be drift if the data comes from a different geography (Germany) than what the model was trained on?

In [15]:
# Compare the Germany data against the training reference.
labels = ['No!', 'Yes!']
preds = cd.predict(X_germany)
drift_flag = preds['data']['is_drift']  # 0 or 1, used as an index into `labels`
print('Drift? {}'.format(labels[drift_flag]))

Drift? Yes!


In [16]:
# Show the full dictionary returned by the detector ('data' and 'meta' sections).
preds

{'data': {'is_drift': 1,
  'distance': array([0.01707729, 0.07053575, 0.01923655, 0.48504806, 0.03570642,
         0.01500501, 0.02252659, 0.02009157], dtype=float32),
  'p_val': array([6.8601173e-01, 6.0225517e-08, 5.3602523e-01, 0.0000000e+00,
         2.3450304e-02, 8.2404953e-01, 3.3738029e-01, 4.7969258e-01],
        dtype=float32),
  'threshold': 0.00625},
 'meta': {'name': 'TabularDrift',
  'online': False,
  'data_type': None,
  'version': '0.12.0',
  'detector_type': 'drift'}}

In [17]:
# Keep the p-value array from the latest prediction for the summary table below.
p_val = preds['data']['p_val']
p_val

array([6.8601173e-01, 6.0225517e-08, 5.3602523e-01, 0.0000000e+00,
       2.3450304e-02, 8.2404953e-01, 3.3738029e-01, 4.7969258e-01],
      dtype=float32)

In [18]:
# Build a one-row table: the time of this drift check followed by one p-value
# per column. (`datetime` is imported in the notebook's top import cell.)
now = datetime.datetime.now()  # naive local time

# NOTE(review): the p-values are labelled with the raw `feat_cols` order, which
# assumes the preprocessor keeps the same number and order of columns -- confirm.
df_p_val = pd.DataFrame([[now] + p_val.tolist()], columns=['time'] + feat_cols)
df_p_val

Unnamed: 0,time,CreditScore,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary
0,2024-09-24 13:28:59.547075,0.686012,6.022552e-08,0.536025,0.0,0.02345,0.82405,0.33738,0.479693


### Save drift detector

In [19]:
# Persist the fitted detector next to the classifier under <project>/models.
detector_path = proj_path.joinpath('models', 'drift_detector')
save_detector(cd, detector_path)