In [1]:
from __future__ import annotations
import polars as pl
import inspect
import numpy as np
from typing import Tuple

In [None]:
# Synthetic inputs: `pred` is 100,000 uniform random floats; `actual` rounds a second
# set of uniform draws to the nearest integer and stores them as int8.
# No seed is set, so the values change on every run.
pred = np.random.random(size=100_000) # .reshape(-1, 1) # Predictions
actual = np.round(np.random.random(size=100_000)).astype(np.int8)  # Actuals

In [None]:
def huber_loss(
    y_actual:np.ndarray
    , y_predicted:np.ndarray
    , delta:float
) -> float:
    """Mean Huber loss between actuals and predictions, computed with polars.

    Per element, with d = |y - p|:
        0.5 * d**2                 if d <= delta
        delta * (d - 0.5 * delta)  otherwise

    Parameters
    ----------
    y_actual
        Actual values; flattened with ``ravel()``.
    y_predicted
        Predicted values. A 2-D input contributes only its column at index 1;
        any other input is flattened with ``ravel()``.
    delta
        Threshold between the quadratic and the linear regime. Must be > 0.

    Returns
    -------
    The mean of the element-wise Huber loss.

    Raises
    ------
    ValueError
        If ``delta`` is not strictly positive.
    """
    y_a = y_actual.ravel()
    if y_predicted.ndim == 2:
        y_p = y_predicted[:, 1] # .ravel()
    else:
        y_p = y_predicted.ravel()
    
    if delta <= 0:
        raise ValueError("Delta in Huber loss must be positive.")
    
    huber = pl.from_records((y_a, y_p), schema=["y", "p"]).lazy().with_columns(
        abs_diff = (pl.col("y") - pl.col("p")).abs()
    ).select(
        pl.when(pl.col("abs_diff") <= delta).then(
            # Quadratic regime for small residuals.
            0.5 * pl.col("abs_diff").pow(2)
        ).otherwise(
            # Linear regime for large residuals.
            delta * (pl.col("abs_diff") - 0.5 * delta)
        ).alias("huber_ew").mean()
    ).collect().row(0)[0]

    return huber

def huber_loss2(y_actual:np.ndarray
    , y_predicted:np.ndarray
    , delta:float) -> float:
    """Mean Huber loss between actuals and predictions, computed with numpy.

    Per element, with d = |y_actual - y_predicted|:
        0.5 * d**2                 if d <= delta
        delta * (d - 0.5 * delta)  otherwise

    The two inputs are subtracted directly, so they follow numpy broadcasting.

    Parameters
    ----------
    y_actual
        Actual values.
    y_predicted
        Predicted values.
    delta
        Threshold between the quadratic and the linear regime. Must be > 0.

    Returns
    -------
    The mean of the element-wise Huber loss, accumulated in float64.

    Raises
    ------
    ValueError
        If ``delta`` is not strictly positive (same check as ``huber_loss``).
    """
    if delta <= 0:
        raise ValueError("Delta in Huber loss must be positive.")

    abs_diff = np.abs(y_actual - y_predicted)
    total_loss = np.where(
        abs_diff <= delta
        , 0.5 * (abs_diff**2)
        , delta * (abs_diff - 0.5 * delta)
    )

    # Accumulate in float64 regardless of the input dtype.
    return np.mean(total_loss, dtype=np.float64)


In [None]:
# Rebinds `pred` and `actual` to fresh, unseeded uniform draws. Here `actual` is
# left as continuous floats (no rounding, no int8 cast).
pred = np.random.random(size=100_000) # .reshape(-1, 1) # Predictions
actual = np.random.random(size=100_000) # Actuals

In [None]:
# Actuals go first, matching the (y_actual, y_predicted, delta) signature.
huber_loss(actual, pred, 1.35)

In [None]:
# Actuals go first, matching the (y_actual, y_predicted, delta) signature.
huber_loss2(actual, pred, 1.35)

In [None]:
%%timeit
# Time the polars implementation; actuals first, matching the signature.
huber_loss(actual, pred, 1.35)

In [None]:
%%timeit
# Time the numpy implementation; actuals first, matching the signature.
huber_loss2(actual, pred, 1.35)

In [None]:
np.array([[1,2], [3,4]]).ndim

In [None]:
from sklearn.metrics import roc_auc_score, roc_curve, log_loss

In [None]:
# NOTE(review): in top-to-bottom order `actual` was last rebound to continuous
# uniform floats rather than 0/1 labels — confirm this is the intended input.
log_loss(actual, pred)

In [None]:
%%timeit
# NOTE(review): `logloss` is not defined or imported in any visible cell (only
# sklearn's `log_loss` is) — confirm where it comes from before a fresh run.
logloss(actual, pred)

In [None]:
%%timeit
log_loss(actual, pred)

In [None]:
%%timeit
# NOTE(review): neither `roc_auc` nor `a` is defined in any visible cell —
# confirm where they come from before a fresh run.
roc_auc(actual, a)

In [None]:
# NOTE(review): sklearn documents roc_curve's return order as (fpr, tpr, thresholds),
# so `tp1` presumably receives the false-positive rates — confirm the name is intended.
# Assigning to `_` also shadows IPython's last-output variable. `a` is not defined
# in any visible cell.
tp1, _, thresholds = roc_curve(actual, a)

In [None]:
tp1

In [None]:
%%timeit
roc_auc_score(actual, a)

In [None]:
roc_auc(actual, a)

In [None]:
roc_auc_score(actual, a)

In [None]:
# `grid` holds the 100 values 0.00, 0.01, ..., 0.99; `base` is a 50,000 x 100
# boolean array filled with False.
grid = (np.arange(0,100,1)/100)
base = np.full(shape=(50_000, 100), fill_value=False)

In [None]:
# Row-by-row reference version: row i marks which grid values are strictly below a[i].
# NOTE(review): `a` is not defined in any visible cell; the loop indexes a[0..49_999],
# so it presumably has at least 50,000 entries — confirm.
base1 = np.full(shape=(50_000, 100), fill_value=False)
for i in range(50_000):
    base1[i, :] = grid.ravel() < a[i]

base1

In [None]:
# Same comparison as the row loop, written as one broadcast expression.
# NOTE(review): to match the (50_000, 100) shape of `base1`, `a` would presumably
# need to be a (50_000, 1) column — confirm.
base2 = grid < a
base2

In [None]:
# Number of positions where the loop result and the broadcast result agree.
np.sum(base1 == base2)

In [None]:
# NOTE(review): `dist` is not defined in any visible cell — confirm its origin.
dist

In [None]:
# Rebinds `pred` and `actual` again to fresh, unseeded uniform floats (both continuous).
pred = np.random.random(size=100_000) # .reshape(-1, 1) # Predictions
actual = np.random.random(size=100_000)  # Actuals

In [None]:

# R^2 = 1 - SS_res / SS_tot, with each sum of squares computed as the dot
# product of a difference vector with itself.
diff1 = actual - pred
ss_res = diff1.dot(diff1)  # residual sum of squares
diff2 = actual - np.mean(actual)
ss_tot = diff2.dot(diff2)  # total sum of squares around the mean of `actual`
ans = 1 - ss_res/ss_tot
ans

In [None]:
%%timeit 
# Same R^2 formula, with the sums of squares taken via np.sum(np.square(...))
# instead of dot products.
ss_res = np.sum(np.square(actual - pred))
ss_tot = np.sum(np.square(actual - np.mean(actual)))
ans = 1 - ss_res/ss_tot
ans

In [None]:
import numpy as np
from scipy import stats
# Two-sample Kolmogorov-Smirnov test between 100 uniform draws and 110 normal draws.
# The generator is created without a seed, so `res` differs between runs.
rng = np.random.default_rng()
sample1 = stats.uniform.rvs(size=100, random_state=rng)
sample2 = stats.norm.rvs(size=110, random_state=rng)
res = stats.ks_2samp(sample1, sample2)

In [None]:
res.statistic

In [None]:
import pandas as pd
df = pl.DataFrame({"a": range(20)}).lazy() # .to_pandas()

In [None]:
# Group by the boolean expression `a < 10` (named "is_smaller") and take the
# first value of every column within each group.
df.groupby((pl.col("a") < 10).alias("is_smaller")).agg(
    pl.all().first()
).collect()

In [None]:
# Three-row frame: string key column "a" and integer column "num".
# Rebinds `df`, which an earlier cell bound to a LazyFrame.
df = pl.DataFrame({
    "a":["x", "y", "z"]
    , "num":[1,2,3]
})
df

In [None]:
# Lookup table: each key in "a" maps to the replacement value in "to".
mapping = pl.DataFrame({
    "a":["x", "y", "z"]
    ,"to":[3,2,1]
})
mapping

In [None]:
# Remap "a" through the lookup table: join on "a", overwrite "a" with the joined
# "to" values, then drop the helper column.
df.join(mapping, on="a").with_columns(
    pl.col("to").alias("a")
).drop("to")

In [None]:
import polars as pl
import random 

df = pl.scan_csv("../data/test.csv")
    
# Define polars custom functions to apply
def add_position_column(df:pl.LazyFrame) -> pl.LazyFrame:
    """Add a string column "position" derived from the skill columns.

    Rows with ``defensive_skill`` > 50 get 'CB'; otherwise rows with
    ``offensive_skill`` > 50 get 'FW'; all remaining rows get 'bench'.

    The labels are wrapped in ``pl.lit`` so they are always treated as string
    literals and never resolved as column names.
    """
    df = df.with_columns( 
        pl.when(pl.col('defensive_skill') > 50).then(pl.lit('CB'))
        .when(pl.col('offensive_skill') > 50).then(pl.lit('FW'))
        .otherwise(pl.lit('bench')).alias("position")
    )
    return df

def add_squad_number_column(df:pl.LazyFrame) -> pl.LazyFrame:
    """Add a column "squad_number" based on the existing "position" column.

    'CB' rows get one number drawn from 2..5, 'FW' rows get one number drawn
    from 7..18, and every other row gets the string '-'.

    Each ``random.sample`` call runs once, when this function is called, so all
    rows of a given position share the same number.
    """
    df = df.with_columns( 
        # 'CB' is the defender label written by add_position_column; the previous
        # 'CD' comparison could never match.
        pl.when(pl.col('position') == 'CB').then(pl.lit(random.sample(range(2, 6), 1)[0], dtype=pl.Int8))
        .when(pl.col('position') == 'FW').then(pl.lit(random.sample(range(7, 19), 1)[0], dtype=pl.Int8))
        # NOTE(review): the fallback is a string while the other branches are Int8 —
        # confirm the resulting column dtype is what is wanted.
        .otherwise(pl.lit('-')).alias("squad_number")
    )
    return df

# Chain operations together using the pipe function


# Apply both column-adding steps through .pipe(), then call write_json on the outcome.
(
    df
    .pipe(add_position_column)
    .pipe(add_squad_number_column)
    .write_json("pipe.json")
)


In [None]:
# Read the JSON text back; the context manager closes the file even if read() raises.
with open("pipe.json", "r") as f:
    json_str = f.read()
df3 = pl.read_csv("../data/test.csv")
# Build a LazyFrame from the JSON text and execute it.
df2 = df3.lazy().from_json(json_str)
df2.collect()

In [None]:
# Round trip: rebuild a LazyFrame from `json_str` (via an empty LazyFrame instance)
# and write it back to pipe.json.
plan = pl.LazyFrame().from_json(json_str)
plan.write_json("pipe.json")

In [None]:
df2.collect()

In [None]:
test = ["a","b"]

"|".join(test)

In [None]:
def test() -> list[str]:
    """Return a one-element list holding the string "a"."""
    values = ["a"]
    return values

# NOTE(review): the first cell imports `annotations` from `__future__`; if that is in
# effect here, the stored annotation is the string "list[str]" and this comparison
# against the `list[str]` object is False — confirm.
test.__annotations__.get("return", "") == list[str]

In [None]:
from typing import Callable, Concatenate

tt:Callable[[], list]
tt = test

In [None]:
inspect.signature(tt).return_annotation == "list[str]"

In [None]:
# LazyFrame with one column "g" holding 0..999; input for the remapping benchmark.
df = pl.DataFrame({
    "g": list(range(1000))
}).lazy()

# test = pl.Series("x",["a", "b"])


In [None]:
# Lookup table as a LazyFrame: "g" = 0..999 paired with "g_mapped" = 2 * g.
join_df = pl.LazyFrame((list(range(1000)), list(i*2 for i in range(1000))), schema=["g", "g_mapped"])
join_df.collect()

In [None]:
%%timeit 
# Approach 1: remap "g" through a Python dict (i -> 2*i) with map_dict.
# The dict is rebuilt on every timed iteration.
df.with_columns(
    pl.col("g").map_dict({i:2*i for i in range(1000)})
).collect()

In [None]:
%%timeit 
# Approach 2: remap "g" by joining the lookup table, overwriting "g" with
# "g_mapped", and dropping the helper column.
df.join(join_df, on = "g").with_columns(
    pl.col("g_mapped").alias("g")
).drop(columns=["g_mapped"]).collect()

In [None]:
# Same join-based remap as the timed cell, but written to test.json instead of collected.
df.join(join_df, on = "g").with_columns(
    pl.col("g_mapped").alias("g")
).drop(columns=["g_mapped"]).write_json("test.json")

In [None]:
df = pl.DataFrame(
    {
        "a": list(range(1000)),
    }
)

In [None]:
import polars.functions as F

In [2]:
from dsds.sample import stratified_downsample

In [3]:
df = pl.read_csv("../data/advertising.csv")
df.head()

id,Daily Time Spent on Site,Age,Area Income,Daily Internet Usage,Ad Topic Line,City,Male,Country,Timestamp,Clicked on Ad,One_Hot_Test,Age Band,Daily Internet Usage Band,Area Income Band,Test_Constant,Test_Str_Constant,Test_BadColumn,Test_Binary
i64,f64,i64,f64,f64,str,str,i64,str,str,i64,str,i64,i64,i64,i64,str,i64,str
1,68.95,35,61833.9,256.09,"""Cloned 5thgene…","""Wrightburgh""",0,"""Tunisia""","""3/27/2016 0:53…",0,"""A""",30,12,12,1,"""SSS""",0.0,"""A"""
2,80.23,31,68441.85,193.77,"""Monitored nati…","""West Jodi""",1,"""Nauru""","""4/4/2016 1:39""",0,"""B""",30,9,13,1,"""SSS""",,"""B"""
3,69.47,26,59785.94,236.5,"""Organic bottom…","""Davidton""",0,"""San Marino""","""3/13/2016 20:3…",0,"""A""",20,11,11,1,"""SSS""",0.0,"""A"""
4,74.15,29,54806.18,245.89,"""Triple-buffere…","""West Terrifurt…",1,"""Italy""","""1/10/2016 2:31…",0,"""B""",20,12,10,1,"""SSS""",,"""B"""
5,68.37,35,73889.99,225.58,"""Robust logisti…","""South Manuel""",0,"""Iceland""","""6/3/2016 3:36""",0,"""A""",30,11,14,1,"""SSS""",0.0,"""A"""


In [4]:
# Downsample `df` with "One_Hot_Test" as the grouping column and keep=0.5.
# presumably `keep` is the fraction retained per group — verify against dsds.sample.
stratified_downsample(df, groupby=["One_Hot_Test"], keep=0.5)

id,Daily Time Spent on Site,Age,Area Income,Daily Internet Usage,Ad Topic Line,City,Male,Country,Timestamp,Clicked on Ad,One_Hot_Test,Age Band,Daily Internet Usage Band,Area Income Band,Test_Constant,Test_Str_Constant,Test_BadColumn,Test_Binary
i64,f64,i64,f64,f64,str,str,i64,str,str,i64,str,i64,i64,i64,i64,str,i64,str
1,68.95,35,61833.9,256.09,"""Cloned 5thgene…","""Wrightburgh""",0,"""Tunisia""","""3/27/2016 0:53…",0,"""A""",30,12,12,1,"""SSS""",0,"""A"""
6,59.99,23,59761.56,226.74,"""Sharable clien…","""Jamieberg""",1,"""Norway""","""5/19/2016 14:3…",0,"""B""",20,11,11,1,"""SSS""",,"""B"""
7,88.91,33,,208.36,"""Enhanced dedic…","""Brandonstad""",0,"""Myanmar""","""1/28/2016 20:5…",0,"""A""",30,10,,1,"""SSS""",0,"""A"""
8,66.0,48,24593.33,131.76,"""Reactive local…","""Port Jefferybu…",1,"""Australia""","""3/7/2016 1:40""",1,"""A""",40,6,4,1,"""SSS""",,"""B"""
9,74.53,30,68862.0,221.51,"""Configurable c…","""West Colin""",1,"""Grenada""","""4/18/2016 9:33…",0,"""A""",30,11,13,1,"""SSS""",,"""B"""
10,69.88,20,55642.32,183.82,"""Mandatory homo…","""Ramirezton""",1,"""Ghana""","""7/11/2016 1:42…",0,"""A""",20,9,11,1,"""SSS""",,"""B"""
11,47.64,49,45632.51,122.02,"""Centralized ne…","""West Brandonto…",0,"""Qatar""","""3/16/2016 20:1…",1,"""C""",40,6,9,1,"""SSS""",0,"""A"""
12,83.07,37,,230.87,"""Team-oriented …","""East Theresash…",1,"""Burundi""","""5/8/2016 8:10""",0,"""A""",30,11,,1,"""SSS""",,"""B"""
13,69.57,48,51636.92,113.12,"""Centralized co…","""West Katiefurt…",1,"""Egypt""","""6/3/2016 1:14""",1,"""A""",40,5,10,1,"""SSS""",,"""B"""
14,79.52,24,51739.63,214.23,"""Synergistic fr…","""North Tara""",0,"""Bosnia and Her…","""4/20/2016 21:4…",0,"""C""",20,10,10,1,"""SSS""",0,"""A"""
