In [1]:
import json
import os
import time
from pathlib import Path

import numpy as np
import pandas as pd
import requests
import yaml
from box import ConfigBox
from sklearn.metrics import f1_score
from tqdm import tqdm


### Set web serice URL

In [2]:
# local testing
API_URL = "http://0.0.0.0:8080" 
# testing deployed service
# API_URL = "https://<YOUR_APP_NAME>.fly.dev" 

API_URL

'http://0.0.0.0:8080'

### Load feature columns and data

In [3]:
def load_params(params_path):
    with open(params_path, "r") as f:
        params = yaml.safe_load(f)
        params = ConfigBox(params)
    return params


proj_path = Path(os.getcwd()).parent.absolute()
params = load_params(proj_path/'params.yaml')
feat_cols = params.base.feat_cols
targ_col = params.base.targ_col
feat_cols, targ_col

(BoxList(['CreditScore', 'Age', 'Tenure', 'Balance', 'NumOfProducts', 'HasCrCard', 'IsActiveMember', 'EstimatedSalary']),
 'Exited')

In [4]:
df = pd.concat([pd.read_csv(proj_path/'data'/'raw'/f'Churn_Modelling_{country}.csv') for country in ['France', 'Spain']])
df.head()

Unnamed: 0,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,15634602,Hargrave,619,France,Female,42,2,0.0,1,1,1,101348.88,1
1,15619304,Onio,502,France,Female,42,8,159660.8,3,1,0,113931.57,1
2,15701354,Boni,699,France,Female,39,1,0.0,2,0,0,93826.63,0
3,15592531,Bartlett,822,France,Male,50,7,0.0,2,1,1,10062.8,0
4,15792365,He,501,France,Male,44,4,142051.07,2,0,1,74940.5,0


### Test `/predict` endpoint by sending one sample request

In [6]:
my_obj = {
    "data": [
            {
      "CreditScore": 619,
      "Age": 42,
      "Tenure": 2,
      "Balance": 0,
      "NumOfProducts": 1,
      "HasCrCard": 1,
      "IsActiveMember": 1,
      "EstimatedSalary": 101348.88
            }
        ]
    }
x = requests.post(API_URL + '/predict', json = my_obj)
if x.ok:
    probs = x.json()
else:
    x.raise_for_status()

print(probs)

[0.7238478660583496]


### Create a functions for calling `/predict` endpoint

In [8]:
def get_prob(obj):
    x = requests.post(API_URL + '/predict', json = obj)
    if x.ok:
        probs = x.json()
    else:
        x.raise_for_status()
    return probs

print(get_prob(my_obj))

[0.7238478660583496]


### Check what would f1-scores look like if we send the data from the same geographies that the model was trained on 

In [9]:
import time


def send_sample_requests(df):
    f1_score_list = []
    for _ in tqdm(range(50)):
        df_sample = df.sample(n=60)
        y_true = df_sample[targ_col].values
        obj = {"data": df_sample[feat_cols].to_dict('records')}
        probs = get_prob(obj)
        y_pred = np.array([prob < 0.5 for prob in probs]).astype(int)
        f1 = f1_score(y_true, y_pred)
        f1_score_list.append(f1)
        time.sleep(1)
    return f1_score_list
        
f1_score_list = send_sample_requests(df)
f1_score_list

100%|██████████| 50/50 [00:51<00:00,  1.02s/it]


[0.9411764705882353,
 0.8571428571428572,
 1.0,
 0.923076923076923,
 0.8666666666666666,
 0.8750000000000001,
 0.9333333333333333,
 0.8695652173913043,
 0.923076923076923,
 0.923076923076923,
 0.9166666666666666,
 0.8421052631578948,
 0.9523809523809523,
 0.8333333333333333,
 0.8235294117647058,
 0.888888888888889,
 0.8571428571428571,
 1.0,
 0.9,
 0.88,
 0.7499999999999999,
 0.9523809523809523,
 0.9600000000000001,
 1.0,
 0.9090909090909091,
 0.8571428571428571,
 0.8181818181818181,
 0.8799999999999999,
 0.875,
 0.9523809523809523,
 0.9090909090909091,
 0.875,
 1.0,
 0.75,
 0.8333333333333333,
 0.896551724137931,
 0.9333333333333333,
 0.888888888888889,
 0.9285714285714286,
 0.8235294117647058,
 0.9,
 0.9411764705882353,
 1.0,
 0.6666666666666665,
 0.8,
 0.923076923076923,
 0.9411764705882353,
 1.0,
 0.888888888888889,
 1.0]

In [10]:
print(f1_score_list)

[0.9411764705882353, 0.8571428571428572, 1.0, 0.923076923076923, 0.8666666666666666, 0.8750000000000001, 0.9333333333333333, 0.8695652173913043, 0.923076923076923, 0.923076923076923, 0.9166666666666666, 0.8421052631578948, 0.9523809523809523, 0.8333333333333333, 0.8235294117647058, 0.888888888888889, 0.8571428571428571, 1.0, 0.9, 0.88, 0.7499999999999999, 0.9523809523809523, 0.9600000000000001, 1.0, 0.9090909090909091, 0.8571428571428571, 0.8181818181818181, 0.8799999999999999, 0.875, 0.9523809523809523, 0.9090909090909091, 0.875, 1.0, 0.75, 0.8333333333333333, 0.896551724137931, 0.9333333333333333, 0.888888888888889, 0.9285714285714286, 0.8235294117647058, 0.9, 0.9411764705882353, 1.0, 0.6666666666666665, 0.8, 0.923076923076923, 0.9411764705882353, 1.0, 0.888888888888889, 1.0]


### What would f1-scores look like for input data from new geography?

In [11]:
df_germany =  pd.read_csv(proj_path/'data'/'more_data'/'Churn_Modelling_Germany.csv')
time.sleep(20) # this is to create a pause between requests. Later it'll be easier to distinguish the two types if requests
f1_score_list = send_sample_requests(df_germany)
f1_score_list


100%|██████████| 50/50 [00:51<00:00,  1.02s/it]


[0.4848484848484849,
 0.4516129032258065,
 0.4444444444444444,
 0.6206896551724137,
 0.6060606060606061,
 0.4137931034482759,
 0.3478260869565218,
 0.45454545454545453,
 0.48275862068965514,
 0.4761904761904762,
 0.5333333333333333,
 0.3478260869565218,
 0.45714285714285713,
 0.4666666666666666,
 0.32,
 0.42857142857142855,
 0.3448275862068966,
 0.608695652173913,
 0.47619047619047616,
 0.35294117647058826,
 0.56,
 0.5945945945945945,
 0.3448275862068966,
 0.43478260869565216,
 0.3448275862068966,
 0.43478260869565216,
 0.30769230769230765,
 0.5333333333333333,
 0.5161290322580645,
 0.42857142857142855,
 0.4864864864864865,
 0.5806451612903226,
 0.7567567567567567,
 0.5185185185185185,
 0.5185185185185185,
 0.5365853658536585,
 0.2,
 0.4761904761904762,
 0.5454545454545455,
 0.2727272727272727,
 0.5333333333333333,
 0.48,
 0.4705882352941177,
 0.380952380952381,
 0.42857142857142855,
 0.5142857142857143,
 0.4,
 0.8148148148148148,
 0.5806451612903226,
 0.3076923076923077]

In [12]:
print(f1_score_list)

[0.4848484848484849, 0.4516129032258065, 0.4444444444444444, 0.6206896551724137, 0.6060606060606061, 0.4137931034482759, 0.3478260869565218, 0.45454545454545453, 0.48275862068965514, 0.4761904761904762, 0.5333333333333333, 0.3478260869565218, 0.45714285714285713, 0.4666666666666666, 0.32, 0.42857142857142855, 0.3448275862068966, 0.608695652173913, 0.47619047619047616, 0.35294117647058826, 0.56, 0.5945945945945945, 0.3448275862068966, 0.43478260869565216, 0.3448275862068966, 0.43478260869565216, 0.30769230769230765, 0.5333333333333333, 0.5161290322580645, 0.42857142857142855, 0.4864864864864865, 0.5806451612903226, 0.7567567567567567, 0.5185185185185185, 0.5185185185185185, 0.5365853658536585, 0.2, 0.4761904761904762, 0.5454545454545455, 0.2727272727272727, 0.5333333333333333, 0.48, 0.4705882352941177, 0.380952380952381, 0.42857142857142855, 0.5142857142857143, 0.4, 0.8148148148148148, 0.5806451612903226, 0.3076923076923077]


### ... as expected, the values are much lower.
### But, typically, we can't compute model metrics on production data right away 
### because ground truth labels might not be available until much-much later


### All we can do is try and look at "proxy" metrics that measure statistical differences (distances) between train data and production data

In [13]:
x = requests.get(API_URL + '/drift_data')
if x.ok:
    data = x.json()
else:
    x.raise_for_status()

df_p_vals = pd.DataFrame(json.loads(data))
df_p_vals

Unnamed: 0,time,CreditScore,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary
0,2023-11-27 13:13:47.021087,0.700415,0.661835,0.566960,7.799870e-01,0.997024,0.043390,1.000000,0.971579
1,2023-11-27 13:13:48.045407,0.920968,0.154056,0.569331,1.213740e-01,0.925950,1.000000,0.248822,0.664539
2,2023-11-27 13:13:49.059733,0.668299,0.959062,0.999548,5.606151e-01,0.995000,0.387848,0.187120,0.247931
3,2023-11-27 13:13:50.083925,0.188857,0.682097,0.076325,6.547415e-01,0.226388,1.000000,0.999087,0.173309
4,2023-11-27 13:13:51.106598,0.383172,0.101785,0.390056,7.216960e-01,0.750037,0.518144,0.962630,0.113638
...,...,...,...,...,...,...,...,...,...
95,2023-11-27 13:16:41.611093,0.568474,0.438538,0.687936,0.000000e+00,0.995000,1.000000,1.000000,0.054008
96,2023-11-27 13:16:42.635475,0.045403,0.681922,0.494441,0.000000e+00,1.000000,0.993182,0.405653,0.663227
97,2023-11-27 13:16:43.659379,0.052973,0.370600,0.970229,2.640685e-16,0.374066,0.999993,1.000000,0.606455
98,2023-11-27 13:16:44.687270,0.595435,0.903408,0.475300,0.000000e+00,0.999997,0.993182,0.712223,0.680527


### Values below the threshold (e.g. 0.05) indicate data drift

In [14]:
import plotly_express as px

fig = px.line(df_p_vals, x='time', y=feat_cols)
fig.add_hline(y=0.05, line_color='red')

In [15]:
df['Balance'].describe()

count      7491.000000
mean      62001.873114
std       64163.149401
min           0.000000
25%           0.000000
50%       62092.900000
75%      121281.295000
max      250898.090000
Name: Balance, dtype: float64

In [16]:
df_germany['Balance'].describe()

count      2509.000000
mean     119730.116134
std       27022.006157
min       27288.430000
25%      102800.720000
50%      119703.100000
75%      137560.380000
max      214346.960000
Name: Balance, dtype: float64