In [None]:
import polars as pl
import numpy as np
import polars_ds as pld

In [None]:
import numpy as np

# Stand-in share for an empty bucket so the log ratio in the PSI sum stays finite.
_PSI_EMPTY_BUCKET_FRACTION = 0.0001


def _psi_single(expected_array, actual_array, buckettype, buckets):
    '''Calculate the PSI for a single variable.

    Args:
       expected_array: array of original values (flattened before use)
       actual_array: array of new values (flattened before use)
       buckettype: 'bins' or 'quantiles' (already validated by the caller)
       buckets: number of buckets (already validated by the caller)

    Returns:
       psi_value: calculated PSI value (numpy float)

    Raises:
       ValueError: if either array is empty
    '''
    expected_flat = np.ravel(expected_array)
    actual_flat = np.ravel(actual_array)
    if expected_flat.size == 0 or actual_flat.size == 0:
        raise ValueError('expected and actual must each contain at least one value')

    num_edges = int(buckets) + 1
    if buckettype == 'bins':
        # linspace pins both end points exactly, unlike rescaling 0..100 by hand.
        edges = np.linspace(np.min(expected_flat), np.max(expected_flat), num_edges)
    else:
        edges = np.percentile(expected_flat, np.linspace(0, 100, num_edges))

    # Open-ended outer buckets: values of `actual` that fall outside the range
    # seen in `expected` are counted in the first/last bucket instead of being
    # dropped by np.histogram (which would make the fractions sum to < 1).
    edges = np.array(edges, dtype=float)
    edges[0], edges[-1] = -np.inf, np.inf

    # Divide by .size (not len) so the fractions match the flattened histogram.
    expected_fractions = np.histogram(expected_flat, edges)[0] / expected_flat.size
    actual_fractions = np.histogram(actual_flat, edges)[0] / actual_flat.size

    expected_fractions = np.where(
        expected_fractions == 0, _PSI_EMPTY_BUCKET_FRACTION, expected_fractions
    )
    actual_fractions = np.where(
        actual_fractions == 0, _PSI_EMPTY_BUCKET_FRACTION, actual_fractions
    )

    return np.sum(
        (expected_fractions - actual_fractions)
        * np.log(expected_fractions / actual_fractions)
    )


def calculate_psi(expected, actual, buckettype='bins', buckets=10, axis=0):
    '''Calculate the PSI (population stability index) across all variables

    Bucket edges are always derived from `expected`. The two outer buckets are
    open-ended, so values of `actual` outside the range of `expected` are
    counted in the nearest edge bucket. Empty buckets are given a share of
    0.0001 before the log ratio is taken. NaN values are not handled specially.

    Args:
       expected: numpy array (or array-like) of original values
       actual: numpy array (or array-like) of new values
       buckettype: type of strategy for creating buckets, 'bins' splits the
           range of expected into equal-width buckets, 'quantiles' splits at
           quantiles of expected
       buckets: number of buckets to use (>= 1)
       axis: axis by which variables are defined, 0 for vertical (one variable
           per column), 1 for horizontal (one variable per row); ignored for
           1-D input

    Returns:
       psi_values: a single PSI value when the input is 1-D or holds exactly
           one variable, otherwise an ndarray with one PSI value per variable

    Raises:
       ValueError: for an unknown buckettype, buckets < 1, an axis other than
           0 or 1 on multi-dimensional input, or empty input

    Author:
       Matthew Burke
       github.com/mwburke
       mwburke.github.io.com
    '''
    if buckettype not in ('bins', 'quantiles'):
        raise ValueError(
            "buckettype must be 'bins' or 'quantiles', got {!r}".format(buckettype)
        )
    if buckets < 1:
        raise ValueError('buckets must be at least 1, got {!r}'.format(buckets))

    expected = np.asarray(expected)
    actual = np.asarray(actual)

    if expected.ndim < 2:
        return _psi_single(expected, actual, buckettype, buckets)

    if axis not in (0, 1):
        raise ValueError('axis must be 0 or 1, got {!r}'.format(axis))

    # axis=0 -> variables are columns (index along axis 1), and vice versa.
    variable_axis = 1 - axis
    n_variables = expected.shape[variable_axis]

    if n_variables == 1:
        # Single variable: return a scalar, as callers of the 1-D form expect.
        return _psi_single(expected, actual, buckettype, buckets)

    psi_values = np.empty(n_variables)
    for i in range(n_variables):
        psi_values[i] = _psi_single(
            np.take(expected, i, axis=variable_axis),
            np.take(actual, i, axis=variable_axis),
            buckettype,
            buckets,
        )

    return psi_values

In [None]:
# Demo data: two independent uniform [0, 1) samples of 5_000 rows each.
# A seeded generator keeps the frame (and every PSI value computed from it)
# identical across Restart Kernel -> Run All.
rng = np.random.default_rng(42)

df = pl.DataFrame({
    # "a": [1] * 1000 + list(range(1000)),  # disabled; 2_000 values would not match the 5_000-row columns
    "b": rng.random(size=5_000),
    "c": rng.random(size=5_000),
})
df.head()

In [None]:
# Manual walk-through of the quantile bucketing, with "c" as the expected
# sample and "b" as the actual sample.
buckets = 10
expected_array, actual_array = df["c"].to_numpy(), df["b"].to_numpy()

# Bucket edges: percentiles of the expected sample at ranks 0, 10, ..., 100.
breakpoints = np.stack([
    np.percentile(expected_array, pct)
    for pct in np.arange(0, buckets + 1) / (buckets) * 100
])

# Share of each sample that falls into every bucket.
expected_fractions = np.histogram(expected_array, bins=breakpoints)[0] / expected_array.size
actual_fractions = np.histogram(actual_array, bins=breakpoints)[0] / actual_array.size




In [None]:
# Expression: cut "b" into 10 quantile buckets, keep only the break point of
# each row's bucket, then count rows per break point and sort the result.
vc = (
    pl.col("b")
    .qcut(10, left_closed=False, allow_duplicates=True, include_breaks=True)
    .struct.field("brk")
    .value_counts()
    .sort()
)
df.select(vc)

In [None]:
# Evaluate the `num.psi` expression on column "b" with column "c" as its
# argument, and show each column's minimum next to it.
# NOTE(review): the `num` namespace is not defined in this file; presumably it
# is registered by `import polars_ds` — confirm. Which column is treated as
# the reference sample is not visible here.
df.select(
    pl.col("b").num.psi(pl.col("c")),
    pl.col("b").min().alias("b_min"),
    pl.col("c").min().alias("c_min"),
)

In [None]:
# Reference value from the numpy implementation: "c" is the expected sample,
# "b" the actual one, bucketed at quantiles of the expected sample.
calculate_psi(
    expected=df["c"].to_numpy(),
    actual=df["b"].to_numpy(),
    buckettype="quantiles",
)

In [None]:
# Count rows per qcut break point for column "a" (10 quantile buckets,
# duplicate edges allowed), sort, and unnest the resulting "brk" struct.
# NOTE(review): the `df` built in the visible cells above has only columns "b"
# and "c" (its "a" entry is commented out), so this presumably fails on a
# fresh Restart & Run All — confirm which frame this cell is meant for.
df.select(
    pl.col("a").qcut(10, allow_duplicates=True, include_breaks=True).struct.field("brk").value_counts().sort(),  # .value_counts() # .struct.field("brk").unique()
).unnest("brk")

In [None]:
# Distinct qcut break points for column "a" with 10 quantile buckets.
# NOTE(review): no `allow_duplicates` is passed here, unlike the cell above,
# and the `df` visible earlier in this notebook has no "a" column — confirm
# the intended frame and whether repeated quantile edges are expected.
df.select(
    pl.col("a").qcut(10, include_breaks=True).struct.field("brk").unique()
)

In [None]:
# Call `str2.similar_words` on column "a" with column "b" as the vocabulary,
# k = 2 and metric "lv"; the result is aliased "similar_words_from_vocab".
# NOTE(review): `str2` is not defined in this file (presumably a polars_ds
# namespace) and "lv" presumably means Levenshtein — confirm. Every `df`
# visible in this notebook has numeric columns only, so this cell looks like
# it expects a string frame that is not defined here.
df.select(
    pl.col("a").str2.similar_words(
        vocab = pl.col("b"),
        k = 2,
        metric = "lv"
    ).alias("similar_words_from_vocab"),
)

In [None]:
# Same call as the previous cell but with metric "hamming" and a threshold
# of 4 passed in addition to k = 2.
# NOTE(review): how `k` and `threshold` interact is not visible here — check
# the polars_ds documentation. The string frame this cell expects is not
# defined in the visible notebook.
df.select(
    pl.col("a").str2.similar_words(
        vocab = pl.col("b"),
        k = 2,
        threshold = 4, # <= threshold hamming distance away
        metric = "hamming"
    ).alias("similar_words_from_vocab"),
)

In [None]:
# Variant that passes the vocabulary as a literal Python list of four words
# instead of a column expression; k = 2, metric "lv".
# NOTE(review): column "a" must hold strings for this to be meaningful; no
# such frame is defined in the visible notebook — confirm.
df.select(
    pl.col("a").str2.similar_words(
        vocab = ["WORLD", "AAAAA", "ABCDEFG", "ZIV"],
        k = 2,
        metric = "lv"
    ).alias("similar_words_from_vocab"),
)

In [None]:
# Rebinds `df` to a frame with three columns "a", "b", "c", each holding the
# integers 0 .. 99_999. Cells below this point see this frame, not the random
# one built near the top of the notebook.
df = pl.DataFrame(
    {column_name: range(100_000) for column_name in ("a", "b", "c")}
)

In [None]:
# Rebinds `df` to a 5-row, 3-class toy example for the cross-entropy cells:
#   "y"    - one-hot rows (exactly one 1 per row)
#   "pred" - per-class scores; each row sums to 1
df = pl.DataFrame({
    "y": [[1,0,0],[0,1,0],[0,0,1],[1,0,0],[0,1,0]],
    "pred":[[0.1, 0.5, 0.4], [0.2, 0.6, 0.2], [0.4, 0.1, 0.5], [0.9, 0.05, 0.05], [0.2, 0.5, 0.3]]
})

In [None]:
# Index the "pred" list column with `list.get`, using column "y" as the index
# expression.
# NOTE(review): "y" is itself a list column (one-hot rows) in the frame built
# above, not a single integer per row; whether `list.get` accepts that is not
# visible here — confirm this cell runs, or derive a class index first.
df.select(
    pl.col("pred").list.get(pl.col("y"))
)

In [None]:
# Evaluate `metric.categorical_cross_entropy` on "y" with "pred" as argument
# (normalize=True, dense=False) and pull the single value at row 0, column 0
# out of the one-cell result.
# NOTE(review): the `metric` namespace is not defined in this file (presumably
# registered by polars_ds); the exact meaning of `normalize` and `dense` is
# not visible here — check the polars_ds documentation.
df.select(
    pl.col("y").metric.categorical_cross_entropy(pl.col("pred"), normalize=True, dense = False)
).item(0,0)