# Generate base data

In [1]:
import polars as pl
import numpy as np
from scipy.stats import norm
from scipy.stats import expon

seed = 50

# Synthetic "base" population: 1000 rows, one column per PSI scenario.
# Every random draw below is seeded with the same `seed`.
df_base = pl.DataFrame(
    {
        # Standard-normal draws, fully populated
        'col1_norm': norm.rvs(loc=0, size=1000, random_state=seed),
        # 950 normal draws followed by 50 true nulls
        'col2_norm_null': pl.concat(
            [
                pl.Series(norm.rvs(loc=0, size=950, random_state=seed)),
                pl.Series(np.full(50, None), dtype=pl.Float64),
            ],
            how='vertical',
        ),
        # 950 normal draws followed by 50 rows holding the default value -1
        'col3_norm_null_default': np.concatenate(
            [norm.rvs(loc=0, size=950, random_state=seed), np.full(50, -1)]
        ),
        # Categorical column
        'col4_str_abc': ['a'] * 250 + ['b'] * 500 + ['c'] * 250,
        # Categorical column with nulls
        'col5_str_abc_null': ['a'] * 490 + ['b'] * 240 + ['c'] * 240 + [None] * 30,
        # Binary categorical
        'col6_binary': [1] * 250 + [0] * 750,
        # Binary categorical with nulls
        'col7_binary_null': [1] * 250 + [0] * 715 + [None] * 35,
        # Overlapping bins: half of the rows sit exactly at 0, the rest are exponential draws
        'col8_stacked_at_0': np.concatenate(
            [[0] * 500, expon.rvs(size=500, random_state=seed)]
        ),
        # Overlapping bins: half of the rows sit exactly at 1, the rest are exponential draws
        'col9_stacked_at_1': np.concatenate(
            [[1] * 500, expon.rvs(size=500, random_state=seed)]
        ),
        # Numeric column that only takes the integer values 1 through 5
        'col10_discrete_numeric': [1] * 400 + [2] * 150 + [3] * 150 + [4] * 250 + [5] * 50,
        # Categorical column used for the "missing level" scenario (levels 1 and 0 here)
        'col11_cat_missing_level': [1] * 500 + [0] * 500,
        # Scenarios not generated yet:
        #   - constant numeric column
        #   - constant categorical column
        #   - extremely skewed numeric that shifts, e.g. only 1-2 bins defined
        #     for base while compare shifts up
        #   - columns with all values missing
    }
)

df_base.write_csv('./base_data.csv')

# Generate compare data

In [4]:
seed = 60

# Synthetic "compare" population: same columns as the base data, but drawn with
# a different seed and with shifted category / null / stacking proportions.
df_compare = pl.DataFrame(
    {
        # Standard-normal draws, fully populated
        'col1_norm': norm.rvs(loc=0, size=1000, random_state=seed),
        # 950 normal draws followed by 50 true nulls
        'col2_norm_null': pl.concat(
            [
                pl.Series(norm.rvs(loc=0, size=950, random_state=seed)),
                pl.Series(np.full(50, None), dtype=pl.Float64),
            ],
            how='vertical',
        ),
        # 950 normal draws followed by 50 rows holding the default value -1
        'col3_norm_null_default': np.concatenate(
            [norm.rvs(loc=0, size=950, random_state=seed), np.full(50, -1)]
        ),
        # Categorical column
        'col4_str_abc': ['a'] * 225 + ['b'] * 535 + ['c'] * 240,
        # Categorical column with nulls
        'col5_str_abc_null': ['a'] * 480 + ['b'] * 230 + ['c'] * 230 + [None] * 60,
        # Binary categorical
        'col6_binary': [1] * 230 + [0] * 770,
        # Binary categorical with nulls
        'col7_binary_null': [1] * 270 + [0] * 715 + [None] * 15,
        # Overlapping bins: 300 rows sit exactly at 0, the rest are exponential draws
        'col8_stacked_at_0': np.concatenate(
            [[0] * 300, expon.rvs(size=700, random_state=seed)]
        ),
        # Overlapping bins: 300 rows sit exactly at 1, the rest are exponential draws
        'col9_stacked_at_1': np.concatenate(
            [[1] * 300, expon.rvs(size=700, random_state=seed)]
        ),
        # Numeric column that only takes the integer values 1 through 5
        'col10_discrete_numeric': [1] * 300 + [2] * 225 + [3] * 125 + [4] * 300 + [5] * 50,
        # Categorical "missing level" scenario: only level 1 plus nulls appear here
        'col11_cat_missing_level': [1] * 500 + [None] * 500,
    }
)
df_compare.write_csv('./compare_data.csv')

# Descriptive stats

In [5]:
# Summary statistics for the base data, reporting every decile from 10% to 90%.
df_base.describe(percentiles=[decile / 10 for decile in range(1, 10)])

statistic,col1_norm,col2_norm_null,col3_norm_null_default,col4_str_abc,col5_str_abc_null,col6_binary,col7_binary_null,col8_stacked_at_0,col9_stacked_at_1,col10_discrete_numeric,col11_cat_missing_level
str,f64,f64,f64,str,str,f64,f64,f64,f64,f64,f64
"""count""",1000.0,950.0,1000.0,"""1000""","""970""",1000.0,965.0,1000.0,1000.0,1000.0,1000.0
"""null_count""",0.0,50.0,0.0,"""0""","""30""",0.0,35.0,0.0,0.0,0.0,0.0
"""mean""",-0.0236083956363207,-0.0314349049349975,-0.0798631596882476,,,0.25,0.2590673575129533,0.4703186205948364,0.9703186205948364,2.4,0.5
"""std""",1.0040113443251066,1.0009380471929,0.9981674249076232,,,0.4332293707583636,0.4383498392355631,0.7912180510200002,0.6367784893550478,1.3571447387223818,0.5002501876563868
"""min""",-3.809890377851563,-3.809890377851563,-3.809890377851563,"""a""","""a""",0.0,0.0,0.0,0.0023772891243267,1.0,0.0
"""10%""",-1.3010320527899553,-1.2894037585679976,-1.282292595889517,,,0.0,0.0,0.0,0.241628360943498,1.0,0.0
"""20%""",-0.8703601159613231,-0.8759740533938657,-1.0,,,0.0,0.0,0.0,0.484936157454177,1.0,0.0
"""30%""",-0.5407159807947849,-0.541114392946758,-0.6409140640980875,,,0.0,0.0,0.0,0.8774003168058225,1.0,0.0
"""40%""",-0.270646337937458,-0.274150965726486,-0.363433403196711,,,0.0,0.0,0.0,1.0,2.0,0.0
"""50%""",-0.0166698542668132,-0.0311109701201512,-0.1003958421916509,,,0.0,0.0,0.0023772891243267,1.0,2.0,1.0


In [6]:
# Summary statistics for the compare data, reporting every decile from 10% to 90%.
df_compare.describe(percentiles=[decile / 10 for decile in range(1, 10)])

statistic,col1_norm,col2_norm_null,col3_norm_null_default,col4_str_abc,col5_str_abc_null,col6_binary,col7_binary_null,col8_stacked_at_0,col9_stacked_at_1,col10_discrete_numeric,col11_cat_missing_level
str,f64,f64,f64,str,str,f64,f64,f64,f64,f64,f64
"""count""",1000.0,950.0,1000.0,"""1000""","""940""",1000.0,985.0,1000.0,1000.0,1000.0,500
"""null_count""",0.0,50.0,0.0,"""0""","""60""",0.0,15.0,0.0,0.0,0.0,500
"""mean""",0.009950459452314,0.0098351082582625,-0.0406566471546505,,,0.23,0.2741116751269035,0.6708127916315002,0.9708127916315,2.575,1
"""std""",1.016800292189073,1.0245424902861384,1.0225643398462667,,,0.4210430824479572,0.4462921403160287,0.9471390066926312,0.8392806102040424,1.3214087638278782,0
"""min""",-2.8424996813250627,-2.8424996813250627,-2.8424996813250627,"""a""","""a""",0.0,0.0,0.0,0.0019164800180277,1.0,1
"""10%""",-1.2843471524175325,-1.2843471524175325,-1.2471144105859766,,,0.0,0.0,0.0,0.1561146950500294,1.0,1
"""20%""",-0.8423970136433064,-0.8504479917190357,-1.0,,,0.0,0.0,0.0,0.3244191441711725,1.0,1
"""30%""",-0.5258911703835001,-0.531536954859921,-0.6444862859746747,,,0.0,0.0,0.0019164800180277,0.5451632081855768,2.0,1
"""40%""",-0.2487622292791448,-0.2562110164205865,-0.3457546965043969,,,0.0,0.0,0.1561146950500294,0.7865575078319345,2.0,1
"""50%""",-0.0154446813613589,-0.0186834377779913,-0.075531782104036,,,0.0,0.0,0.3244191441711725,1.0,2.0,1


# Interactive testing

In [2]:
import sys

# Extend the import path before importing `psi`
# (presumably the module sits in the parent directory — confirm against the repo layout).
sys.path.append('../')
from psi import psi
import polars as pl

# Display settings: no row truncation and full float precision in rendered tables.
pl.Config.set_tbl_rows(-1)
pl.Config.set_fmt_float('full')

# Reload the generated datasets from disk.
df_base, df_compare = pl.read_csv('./base_data.csv'), pl.read_csv('./compare_data.csv')

# Columns handed to psi() as numeric vs. categorical.
NUMERIC_COLUMNS = [
    'col1_norm',
    'col2_norm_null',
    'col8_stacked_at_0',
    'col9_stacked_at_1',
    'col10_discrete_numeric',
]
CATEGORICAL_COLUMNS = [
    'col4_str_abc',
    'col5_str_abc_null',
    'col6_binary',
    'col7_binary_null',
    'col11_cat_missing_level',
]

# psi() returns the PSI table plus the base and compare frequency tables.
df_psi, df_base_freq, df_compare_freq = psi(
    df_base=df_base,
    df_compare=df_compare,
    bins=10,
    numeric_columns=NUMERIC_COLUMNS,
    categorical_columns=CATEGORICAL_COLUMNS,
)

In [3]:
# Compare the PSI values produced by psi() against manually computed ones.
df_manual_psi = pl.read_csv('./manual_psi_results.csv')
df_manual_psi = df_manual_psi.rename({'psi': 'psi_manual'})

# strict=False keeps this cell re-runnable once 'psi' has already been renamed.
df_psi = df_psi.rename({'psi': 'psi_fn'}, strict=False)

df_manual_psi = df_manual_psi.join(df_psi, on=['attribute'], suffix='_fn')
df_manual_psi = df_manual_psi.with_columns(
    (
        # Absolute value of the *difference*: taking abs() of psi_fn alone would
        # let any psi_fn larger than psi_manual pass (the difference is negative).
        ((pl.col('psi_manual') - pl.col('psi_fn')).abs() <= 1e-8)
        # Both sides null counts as agreement (PSI undefined in both).
        | (pl.col('psi_manual').is_null() & pl.col('psi_fn').is_null())
    )
    # Exactly one side null leaves the expression null; report it as a mismatch.
    .fill_null(False)
    .alias('is_equal')
)
df_manual_psi

attribute,psi_manual,psi_fn,is_equal
str,f64,f64,bool
"""col1_norm""",0.008422534042,0.0084225340421378,True
"""col2_norm_null""",0.008447836307,0.0084478363065453,True
"""col8_stacked_at_0""",0.1775698374,0.1775698373678918,True
"""col9_stacked_at_1""",0.1807236258,0.1807236258182166,True
"""col10_discrete_numeric""",0.07285220711,0.072852207112837,True
"""col4_str_abc""",0.005410285533,0.0054102855332317,True
"""col5_str_abc_null""",0.02185180058,0.0218518005772016,True
"""col6_binary""",0.002193978345,0.0021939783451284,True
"""col7_binary_null""",0.01848517803,0.0184851780304666,True
"""col11_cat_missing_level""",,,True


In [3]:
# Display the base-data frequency table returned by the psi() call above.
df_base_freq

col1_norm,col2_norm_null,col8_stacked_at_0,col9_stacked_at_1,col10_discrete_numeric,col4_str_abc,col5_str_abc_null,col6_binary,col7_binary_null,col11_cat_missing_level
struct[2],struct[2],struct[2],struct[2],struct[2],struct[2],struct[2],struct[2],struct[2],struct[2]
"{""[-inf, -1.3013769564701396]"",100}","{""[-inf, -1.2895778994315654]"",95}","{""[-inf, 0]"",500}","{""[-inf, 0.24146240139778277]"",100}","{""[-inf, 1]"",400}","{""a"",250}","{""a"",490}","{""0"",750}","{""0"",715}","{""0"",500}"
"{""(-1.3013769564701396, -0.8714829034478315]"",100}","{""(-1.2895778994315654, -0.8761616817498066]"",95}","{""(0, 0.0011886445621633556]"",0}","{""(0.24146240139778277, 0.48452393915442477]"",100}","{""(1, 1.6000000000000227]"",0}","{""b"",500}","{""b"",240}","{""1"",250}","{""1"",250}","{""1"",500}"
"{""(-0.8714829034478315, -0.5408355044403768]"",100}","{""(-0.8761616817498066, -0.5417661688027671]"",95}","{""(0.0011886445621633556, 0.24063260366920644]"",100}","{""(0.48452393915442477, 0.8771514893399184]"",100}","{""(1.6000000000000227, 2]"",150}","{""c"",250}","{""c"",240}",,"{null,35}",
"{""(-0.5408355044403768, -0.2706955076429412]"",100}","{""(-0.5417661688027671, -0.2748712829586359]"",95}","{""(0.24063260366920644, 0.4834933934050441]"",100}","{""(0.8771514893399184, 1]"",528}","{""(2, 3]"",150}",,"{null,30}",,,
"{""(-0.2706955076429412, -0.019756045841092827]"",100}","{""(-0.2748712829586359, -0.03150085831307207]"",95}","{""(0.4834933934050441, 0.8767367768967448]"",100}","{""(1, 1.5235002676568767]"",72}","{""(3, 3.300000000000068]"",0}",,,,,
"{""(-0.019756045841092827, 0.23611511591281759]"",100}","{""(-0.03150085831307207, 0.2326668924514275]"",95}","{""(0.8767367768967448, 1.5235002676568767]"",100}","{""(1.5235002676568767, inf]"",100}","{""(3.300000000000068, 4]"",250}",,,,,
"{""(0.23611511591281759, 0.49443665687066946]"",100}","{""(0.2326668924514275, 0.48720290157795404]"",95}","{""(1.5235002676568767, inf]"",100}",,"{""(4, inf]"",50}",,,,,
"{""(0.49443665687066946, 0.8057441309517726]"",100}","{""(0.48720290157795404, 0.7925975400857329]"",95}",,,,,,,,
"{""(0.8057441309517726, 1.2633420474920432]"",100}","{""(0.7925975400857329, 1.2383348516011667]"",95}",,,,,,,,
"{""(1.2633420474920432, inf]"",100}","{""(1.2383348516011667, inf]"",95}",,,,,,,,


In [4]:
# Display the compare-data frequency table returned by the psi() call above.
df_compare_freq

col1_norm,col2_norm_null,col8_stacked_at_0,col9_stacked_at_1,col10_discrete_numeric,col4_str_abc,col5_str_abc_null,col6_binary,col7_binary_null,col11_cat_missing_level
struct[2],struct[2],struct[2],struct[2],struct[2],struct[2],struct[2],struct[2],struct[2],struct[2]
"{""[-inf, -1.3013769564701396]"",98}","{""[-inf, -1.2895778994315654]"",95}","{""[-inf, 0]"",300}","{""[-inf, 0.24146240139778277]"",152}","{""[-inf, 1]"",300}","{""a"",225}","{""a"",480}","{""0"",770}","{""0"",715}","{""1"",500}"
"{""(-1.3013769564701396, -0.8714829034478315]"",97}","{""(-1.2895778994315654, -0.8761616817498066]"",91}","{""(0, 0.0011886445621633556]"",0}","{""(0.24146240139778277, 0.48452393915442477]"",122}","{""(1, 1.6000000000000227]"",0}","{""b"",535}","{""b"",230}","{""1"",230}","{""1"",270}","{null,500}"
"{""(-0.8714829034478315, -0.5408355044403768]"",95}","{""(-0.8761616817498066, -0.5417661688027671]"",94}","{""(0.0011886445621633556, 0.24063260366920644]"",152}","{""(0.48452393915442477, 0.8771514893399184]"",164}","{""(1.6000000000000227, 2]"",225}","{""c"",240}","{""c"",230}",,"{null,15}",
"{""(-0.5408355044403768, -0.2706955076429412]"",103}","{""(-0.5417661688027671, -0.2748712829586359]"",96}","{""(0.24063260366920644, 0.4834933934050441]"",121}","{""(0.8771514893399184, 1]"",323}","{""(2, 3]"",125}",,"{null,60}",,,
"{""(-0.2706955076429412, -0.019756045841092827]"",105}","{""(-0.2748712829586359, -0.03150085831307207]"",90}","{""(0.4834933934050441, 0.8767367768967448]"",165}","{""(1, 1.5235002676568767]"",103}","{""(3, 3.300000000000068]"",0}",,,,,
"{""(-0.019756045841092827, 0.23611511591281759]"",100}","{""(-0.03150085831307207, 0.2326668924514275]"",99}","{""(0.8767367768967448, 1.5235002676568767]"",126}","{""(1.5235002676568767, inf]"",136}","{""(3.300000000000068, 4]"",300}",,,,,
"{""(0.23611511591281759, 0.49443665687066946]"",91}","{""(0.2326668924514275, 0.48720290157795404]"",86}","{""(1.5235002676568767, inf]"",136}",,"{""(4, inf]"",50}",,,,,
"{""(0.49443665687066946, 0.8057441309517726]"",83}","{""(0.48720290157795404, 0.7925975400857329]"",81}",,,,,,,,
"{""(0.8057441309517726, 1.2633420474920432]"",113}","{""(0.7925975400857329, 1.2383348516011667]"",102}",,,,,,,,
"{""(1.2633420474920432, inf]"",115}","{""(1.2383348516011667, inf]"",116}",,,,,,,,


In [12]:
# Reload both datasets from disk so this cell starts from the saved CSVs.
df_base, df_compare = (
    pl.read_csv(csv_path) for csv_path in ('./base_data.csv', './compare_data.csv')
)

# Inputs for stepping through the PSI computation by hand.
bins = 10
numeric_columns = [
    'col1_norm',
    'col2_norm_null',
    'col8_stacked_at_0',
    'col9_stacked_at_1',
    'col10_discrete_numeric',
]
categorical_columns = [
    'col4_str_abc',
    'col5_str_abc_null',
    'col6_binary',
    'col7_binary_null',
    'col11_cat_missing_level',
]

# Empty frames to hold the numeric and categorical frequency tables.
df_base_num_freq, df_compare_num_freq = pl.DataFrame(), pl.DataFrame()
df_base_cat_freq_structs, df_compare_cat_freq_structs = pl.DataFrame(), pl.DataFrame()

# Lazy views over both datasets.
ldf_base, ldf_compare = df_base.lazy(), df_compare.lazy()