In [1]:
import json
import os
import time
from pathlib import Path

import numpy as np
import pandas as pd
import requests
import yaml
from box import ConfigBox
from sklearn.metrics import f1_score
from tqdm import tqdm


### Set web service URL

In [14]:
# local testing
# API_URL = "http://0.0.0.0:8080" 
# testing deployed service
API_URL = "https://open-source-mlops-e2e-starting-point.fly.dev" 
API_URL

'https://open-source-mlops-e2e-starting-point.fly.dev'

### Load feature columns and data

In [15]:
def load_params(params_path):
    with open(params_path, "r") as f:
        params = yaml.safe_load(f)
        params = ConfigBox(params)
    return params


proj_path = Path(os.getcwd()).parent.absolute()
params = load_params(proj_path/'params.yaml')
feat_cols = params.features.cols
targ_col = params.target
feat_cols, targ_col

(BoxList(['CreditScore', 'Age', 'Tenure', 'Balance', 'NumOfProducts', 'HasCrCard', 'IsActiveMember', 'EstimatedSalary']),
 'Exited')

In [19]:
df = pd.concat([pd.read_csv(proj_path/'data'/'raw'/f'Churn_Modelling_{country}.csv') for country in ['France', 'Spain']])
df.head()

Unnamed: 0,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,15634602,Hargrave,619,France,Female,42,2,0.0,1,1,1,101348.88,1
1,15619304,Onio,502,France,Female,42,8,159660.8,3,1,0,113931.57,1
2,15701354,Boni,699,France,Female,39,1,0.0,2,0,0,93826.63,0
3,15592531,Bartlett,822,France,Male,50,7,0.0,2,1,1,10062.8,0
4,15792365,He,501,France,Male,44,4,142051.07,2,0,1,74940.5,0


### Test `/predict` endpoint by sending one sample request

In [20]:
my_obj = {
    "data": [
            {
      "CreditScore": 619,
      "Age": 42,
      "Tenure": 2,
      "Balance": 0,
      "NumOfProducts": 1,
      "HasCrCard": 1,
      "IsActiveMember": 1,
      "EstimatedSalary": 101348.88
            }
        ]
    }
x = requests.post(API_URL + '/predict', json = my_obj)
if x.ok:
    probs = x.json()
else:
    x.raise_for_status()

probs

[0.761036968147323]

### Create a functions for calling `/predict` endpoint

In [21]:
def get_prob(obj):
    x = requests.post(API_URL + '/predict', json = obj)
    if x.ok:
        probs = x.json()
    else:
        x.raise_for_status()
    return probs

get_prob(my_obj)

[0.761036968147323]

In [7]:
# def get_prob(obj):
#     resp = requests.post(API_URL + "/predict", json=obj)
#     return resp.json()["probs"]  # Return only the list of floats


### Check what would f1-scores look like if we send the data from the same geographies that the model was trained on 

In [22]:
import time


def send_sample_requests(df):
    f1_score_list = []
    for _ in tqdm(range(50)):
        df_sample = df.sample(n=60)
        y_true = df_sample[targ_col].values
        obj = {"data": df_sample[feat_cols].to_dict('records')}
        probs = get_prob(obj)
        
        y_pred = np.array([prob < 0.5 for prob in probs]).astype(int)

        f1 = f1_score(y_true, y_pred)
        f1_score_list.append(f1)
        time.sleep(1)
    return f1_score_list
        
f1_score_list = send_sample_requests(df)
f1_score_list

  0%|          | 0/50 [00:00<?, ?it/s]

100%|██████████| 50/50 [01:00<00:00,  1.20s/it]


[0.7058823529411764,
 0.0,
 0.5454545454545454,
 0.4444444444444445,
 0.3333333333333333,
 0.5,
 0.6666666666666665,
 0.8,
 0.4444444444444444,
 0.7368421052631579,
 0.7058823529411764,
 0.4615384615384615,
 0.47058823529411764,
 0.6153846153846153,
 0.631578947368421,
 0.6363636363636364,
 0.3636363636363636,
 0.8571428571428571,
 0.888888888888889,
 0.5333333333333333,
 0.5333333333333333,
 0.625,
 0.5555555555555556,
 0.5555555555555556,
 0.6666666666666666,
 0.6,
 0.6,
 0.5454545454545454,
 0.33333333333333337,
 0.5333333333333333,
 0.6153846153846153,
 0.6666666666666666,
 0.7272727272727272,
 0.7058823529411764,
 0.7000000000000001,
 0.4615384615384615,
 0.6,
 0.5714285714285715,
 0.625,
 0.6,
 0.7857142857142857,
 0.7142857142857143,
 0.5454545454545454,
 0.6153846153846153,
 0.7272727272727273,
 0.8,
 0.8571428571428571,
 0.7777777777777778,
 0.4,
 0.7272727272727273]

### What would f1-scores look like for input data from new geography?

In [23]:
df_germany =  pd.read_csv(proj_path/'data'/'raw'/'Churn_Modelling_Germany.csv')
time.sleep(20)
f1_score_list = send_sample_requests(df_germany)
f1_score_list


100%|██████████| 50/50 [00:59<00:00,  1.20s/it]


[0.48275862068965514,
 0.4444444444444444,
 0.37037037037037035,
 0.3448275862068966,
 0.32000000000000006,
 0.4444444444444445,
 0.23076923076923078,
 0.5217391304347826,
 0.46153846153846156,
 0.19999999999999998,
 0.25,
 0.4,
 0.5294117647058824,
 0.5517241379310346,
 0.5384615384615384,
 0.5,
 0.3478260869565218,
 0.47058823529411764,
 0.2857142857142857,
 0.5517241379310345,
 0.6153846153846153,
 0.56,
 0.5333333333333333,
 0.48484848484848486,
 0.3846153846153846,
 0.5,
 0.4166666666666667,
 0.4444444444444445,
 0.6666666666666666,
 0.48,
 0.5454545454545454,
 0.5517241379310345,
 0.45454545454545453,
 0.5333333333333333,
 0.34782608695652173,
 0.5833333333333334,
 0.5714285714285715,
 0.3478260869565218,
 0.3870967741935483,
 0.45454545454545453,
 0.5454545454545454,
 0.5517241379310345,
 0.5517241379310346,
 0.5,
 0.5263157894736842,
 0.37037037037037035,
 0.5384615384615385,
 0.5333333333333333,
 0.44444444444444436,
 0.13333333333333333]

### ... as expected, the values are much lower.
### But, typically, we can't compute model metrics on production data right away 
### because ground truth labels might not be available until much-much later


In [25]:
API_URL

'https://open-source-mlops-e2e-starting-point.fly.dev'

### All we can do is try and look at "proxy" metrics that measure statistical differences (distances) between train data and production data

In [26]:
x = requests.get(API_URL + '/drift_data')
if x.ok:
    data = x.json()
else:
    x.raise_for_status()



In [11]:
obj = json.loads(data)                 # dict-of-dicts
df_p_vals = pd.DataFrame.from_dict(obj)    # rows indexed by "0","1","2",...
df_p_vals.index = df_p_vals.index.astype(int)     # make the index numeric
df_p_vals = df_p_vals.sort_index()
df_p_vals["time"] = pd.to_datetime(df_p_vals["time"])
num_cols = [c for c in df_p_vals.columns if c != "time"]
df_p_vals[num_cols] = df_p_vals[num_cols].apply(pd.to_numeric, errors="coerce")
df_p_vals

Unnamed: 0,time,CreditScore,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary
0,2025-09-09 20:49:03.150595,0.918567,0.863865,0.143013,0.848606,0.817897,0.908434,1.000000,0.599941
1,2025-09-09 20:49:04.406590,0.768535,0.310214,0.550226,0.447905,1.000000,1.000000,0.459908,0.613268
2,2025-09-09 20:49:05.587528,0.405358,0.996501,0.788216,0.987326,0.924585,0.828952,1.000000,0.580430
3,2025-09-09 20:49:06.874139,0.828620,0.455293,0.476074,0.025352,0.924585,1.000000,0.092570,0.292029
4,2025-09-09 20:49:08.075518,0.688048,0.890941,0.978125,0.630033,0.994599,0.999244,1.000000,0.745673
...,...,...,...,...,...,...,...,...,...
95,2025-09-09 20:54:23.488090,0.051168,0.935130,0.898065,0.000000,0.985062,1.000000,1.000000,0.197280
96,2025-09-09 20:54:24.641141,0.750580,0.324957,0.451180,0.000000,0.999996,1.000000,0.999849,0.453783
97,2025-09-09 20:54:25.709301,0.857514,0.367966,0.643598,0.000000,0.589860,0.999980,1.000000,0.441865
98,2025-09-09 20:54:26.867831,0.924858,0.354600,0.216913,0.000000,1.000000,0.828952,0.981324,0.369667


### Values below the threshold (e.g. 0.05) indicate data drift

In [27]:
import plotly_express as px

fig = px.line(df_p_vals, x='time', y=feat_cols)
fig.add_hline(y=0.05, line_color='red')