In [6]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
# Data prep
from sklearn.model_selection import train_test_split
# Regression
from sklearn.linear_model import ElasticNet
# Classification
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.linear_model import LogisticRegression


In [7]:
def psi(score_initial, score_new, num_bins = 10, mode = 'fixed'):

    eps = 1e-4

    # Sort the data
    score_initial.sort()
    score_new.sort()

    # Prepare the bins
    min_val = min(min(score_initial), min(score_new))
    max_val = max(max(score_initial), max(score_new))
    if mode == 'fixed':
        bins = [min_val + (max_val - min_val)*(i)/num_bins for i in range(num_bins+1)]
    elif mode == 'quantile':
        bins = pd.qcut(score_initial, q = num_bins, retbins = True)[1] # Create the quantiles based on the initial population
    else:
        raise ValueError(f"Mode \'{mode}\' not recognized. Your options are \'fixed\' and \'quantile\'")
    bins[0] = min_val - eps # Correct the lower boundary
    bins[-1] = max_val + eps # Correct the higher boundary


    # Bucketize the initial population and count the sample inside each bucket
    bins_initial = pd.cut(score_initial, bins = bins, labels = range(1,num_bins+1))
    df_initial = pd.DataFrame({'initial': score_initial, 'bin': bins_initial})
    grp_initial = df_initial.groupby('bin').count()
    grp_initial['percent_initial'] = grp_initial['initial'] / sum(grp_initial['initial'])

    # Bucketize the new population and count the sample inside each bucket
    bins_new = pd.cut(score_new, bins = bins, labels = range(1,num_bins+1))
    df_new = pd.DataFrame({'new': score_new, 'bin': bins_new})
    grp_new = df_new.groupby('bin').count()
    grp_new['percent_new'] = grp_new['new'] / sum(grp_new['new'])

    # Compare the bins to calculate PSI
    psi_df = grp_initial.join(grp_new, on = "bin", how = "inner")

    # Add a small value for when the percent is zero
    psi_df['percent_initial'] = psi_df['percent_initial'].apply(lambda x: eps if x == 0 else x)
    psi_df['percent_new'] = psi_df['percent_new'].apply(lambda x: eps if x == 0 else x)

    # Calculate the psi
    psi_df['psi'] = (psi_df['percent_initial'] - psi_df['percent_new']) * np.log(psi_df['percent_initial'] / psi_df['percent_new'])

    # Return the psi values
    return psi_df['psi'].values

In [8]:
# from google.colab import files
# uploaded = files.upload()


df_test = pd.read_csv("data_test_regression.csv", sep = ";")
df_test.head()

Unnamed: 0,x1,x2,x3,y,pred
0,1.18562,3.008328,7.407531,1,1
1,-3.142256,3.452865,6.675259,1,1
2,-1.23723,3.430706,6.826375,1,1
3,2.198279,1.454357,2.784291,0,0
4,-1.372756,1.743652,4.267764,0,0


In [9]:
Z = pd.read_csv("Z_regression.csv", sep = ";")
Z.head()

Unnamed: 0,x1,x2,x3,pred,proba
0,-1.463379,2.656211,11.838128,1,0.000763
1,1.030837,3.172744,9.03465,1,0.0008
2,1.546625,3.824892,3.214466,0,0.00331
3,1.574219,2.045063,8.716976,1,0.003619
4,-1.578048,3.907165,6.199979,1,0.004701


In [10]:
sample_initial = df_test[['x1', 'x2', 'x3']]
sample_new = Z[['x1', 'x2', 'x3']]
# Fixed
print("CSI - Fixed size bins")
for col in sample_initial.columns:
    csi_values = psi(sample_initial[col].values, sample_new[col].values, mode = 'fixed')
    csi = np.mean(csi_values)
    print(f'{col} -> {csi=:.4f}')

# Quantile
print("\nCSI - Quantile bins")
for col in sample_initial.columns:
    csi_values = psi(sample_initial[col].values, sample_new[col].values, mode = 'quantile')
    csi = np.mean(csi_values)
    print(f'{col} -> {csi=:.4f}')

CSI - Fixed size bins
x1 -> csi=0.0019
x2 -> csi=0.0483
x3 -> csi=0.1701

CSI - Quantile bins
x1 -> csi=0.0004
x2 -> csi=0.0481
x3 -> csi=0.1272


  grp_initial = df_initial.groupby('bin').count()
  grp_new = df_new.groupby('bin').count()
  grp_initial = df_initial.groupby('bin').count()
  grp_new = df_new.groupby('bin').count()
  grp_initial = df_initial.groupby('bin').count()
  grp_new = df_new.groupby('bin').count()
  grp_initial = df_initial.groupby('bin').count()
  grp_new = df_new.groupby('bin').count()
  grp_initial = df_initial.groupby('bin').count()
  grp_new = df_new.groupby('bin').count()
  grp_initial = df_initial.groupby('bin').count()
  grp_new = df_new.groupby('bin').count()


In [11]:
df_test = pd.read_csv("data_test_classification.csv", sep = ";")
df_test.head()

Unnamed: 0,x1,x2,x3,y,pred,proba
0,1.18562,3.008328,7.407531,1,1,0.000516
1,-3.142256,3.452865,6.675259,1,1,0.001372
2,-1.23723,3.430706,6.826375,1,1,0.001794
3,2.198279,1.454357,2.784291,0,0,0.002741
4,-1.372756,1.743652,4.267764,0,0,0.004005


In [12]:
Z = pd.read_csv("Z_classification.csv", sep = ";")
Z.head()

Unnamed: 0,x1,x2,x3,pred,proba
0,-1.463379,2.656211,11.838128,1,0.000763
1,1.030837,3.172744,9.03465,1,0.0008
2,1.546625,3.824892,3.214466,0,0.00331
3,1.574219,2.045063,8.716976,1,0.003619
4,-1.578048,3.907165,6.199979,1,0.004701


In [13]:
sample_initial = df_test[['x1', 'x2', 'x3']]
sample_new = Z[['x1', 'x2', 'x3']]
# Fixed
for col in sample_initial.columns:
    csi_values = psi(sample_initial[col].values, sample_new[col].values, num_bins = 10, mode = 'fixed')
    csi = np.mean(csi_values)
    print(f'{col} -> {csi=:.4f}')

x1 -> csi=0.0019
x2 -> csi=0.0483
x3 -> csi=0.1701


  grp_initial = df_initial.groupby('bin').count()
  grp_new = df_new.groupby('bin').count()
  grp_initial = df_initial.groupby('bin').count()
  grp_new = df_new.groupby('bin').count()
  grp_initial = df_initial.groupby('bin').count()
  grp_new = df_new.groupby('bin').count()


In [14]:
# Quantile
for col in sample_initial.columns:
    csi_values = psi(sample_initial[col].values, sample_new[col].values, num_bins = 10, mode = 'quantile')
    csi = np.mean(csi_values)
    print(f'{col} -> {csi=:.4f}')

x1 -> csi=0.0004
x2 -> csi=0.0481
x3 -> csi=0.1272


  grp_initial = df_initial.groupby('bin').count()
  grp_new = df_new.groupby('bin').count()
  grp_initial = df_initial.groupby('bin').count()
  grp_new = df_new.groupby('bin').count()
  grp_initial = df_initial.groupby('bin').count()
  grp_new = df_new.groupby('bin').count()
