In [None]:
import polars as pl
import numpy as np
import polars_ds as pld

In [None]:
# Number of rows in the synthetic demo frame.
size = 100_000
# Seeded generator so a fresh Restart & Run All reproduces the same random columns.
rng = np.random.default_rng(42)
df = pl.DataFrame({
    # Two columns of uniform draws in [0, 1).
    "a": rng.random(size = size)
    , "b": rng.random(size = size)
    # Deterministic integer ramps.
    , "x1" : range(size)
    , "x2" : range(size, size + size)
    , "y": range(-size, 0)
    # Random 0/1 labels: uniform draws rounded to the nearest integer, stored as int32.
    , "actual": np.round(rng.random(size=size)).astype(np.int32)
    # Random scores in [0, 1), drawn separately from the labels.
    , "predicted": rng.random(size=size)
    # Two unequal groups: "a" gets size//2 + 100 rows, "b" gets size//2 - 100 rows.
    , "segments":["a"] * (size//2 + 100) + ["b"] * (size//2 - 100)
})
# Bare last expression so the notebook renders the frame with its rich display.
df.head()

In [None]:
import scipy as sp

# Hand SciPy a plain NumPy array of column "a" and take its full complex FFT.
column_a_values = df["a"].to_numpy()
sp.fft.fft(column_a_values)

In [None]:
import pandas as pd
from sklearn.metrics import roc_auc_score

# Reference implementation: per-segment ROC AUC via pandas + scikit-learn.
df_pd = df.to_pandas()

# Materialise the (segment label, sub-frame) pairs once so both lists share the same order.
grouped_frames = list(df_pd.groupby("segments"))

segments = [label for label, _ in grouped_frames]
rocaucs = [
    roc_auc_score(frame["actual"], frame["predicted"])
    for _, frame in grouped_frames
]

# One row per segment with its ROC AUC.
report = pd.DataFrame({"segments": segments, "roc_auc": rocaucs})
print(report)

In [None]:
# Per-segment metrics expressed as polars expressions; the `.metric` namespace is
# presumably registered by the polars_ds import - confirm against the installed version.
segment_metrics = [
    pl.col("actual").metric.roc_auc(pl.col("predicted")).alias("roc_auc"),
    pl.col("actual").metric.log_loss(pl.col("predicted")).alias("log_loss"),
]

# Build the lazy query first, then execute it in one collect().
lazy_report = df.lazy().group_by("segments").agg(segment_metrics)
report = lazy_report.collect()
print(report)

In [None]:
# The demo frame built earlier has no "val1" column, so the benchmark input is taken
# from "a" (uniform random floats), the same column the SciPy FFT cell uses.
data = df["a"].to_numpy()

In [None]:
%%timeit
# Benchmark: NumPy's rfft applied to the `data` array prepared in an earlier cell.
np.fft.rfft(data)

In [None]:
# polars_ds counterpart of the NumPy rfft call. The demo frame has no "val1"
# column, so the transform is applied to "a" (uniform random floats) instead.
df.select(
    pl.col("a").num.rfft()
)

In [None]:
# The demo frame has no "val1" column; use "a" (uniform random floats) as the series
# that the following timing cell feeds to NumPy.
ts = df["a"].to_numpy()

In [None]:
%%timeit
# Benchmark: NumPy's rfft applied to the `ts` array prepared in an earlier cell.
np.fft.rfft(ts)

In [None]:
# Length of the real-FFT output (n // 2 + 1 for an input of length n).
# "val1" is not a column of the demo frame, so "a" is used; the explicit .to_numpy()
# matches how the other FFT cells pass data to NumPy/SciPy.
len(np.fft.rfft(df["a"].to_numpy()))

In [None]:
from scipy.fft import fft

# Length of the full complex FFT output, for comparison with the rfft length.
# "val1" is not a column of the demo frame, so "a" is used instead.
len(fft(df["a"].to_numpy()))

In [None]:
# Same polars_ds rfft call, this time passing n = 10.
# "val1" is not a column of the demo frame, so "a" is used instead.
df.select(
    pl.col("a").num.rfft(n = 10)
)

In [None]:
# Full complex FFT via the `fft` function imported from scipy.fft in an earlier cell.
# "val1" is not a column of the demo frame, so "a" is used instead.
fft(df["a"].to_numpy())

In [None]:
# Derive `df2` from `df` with two extra columns:
#   "edges"                - per-row list returned by query_radius_ptwise, minus its first entry
#   "connected node count" - number of entries in "edges"
# NOTE(review): this cell needs columns "id", "val1", "val2" and "val3", none of which
# exist in the demo frame built at the top of the notebook - confirm which frame it
# is meant to run against.
df2 = df.with_columns(
    pl.col("id").num.query_radius_ptwise(
        pl.col("val1"), pl.col("val2"), pl.col("val3"), # Columns used as the coordinates in n-d space
        r = 0.03, 
        dist = "cosine", # NOTE(review): earlier comment said "actually this is squared l2", which contradicts "cosine" - confirm the intended metric
        parallel = True
    ).list.slice(offset=1).alias("edges"), # slice(offset=1) drops the first element; presumably the row's own id - TODO confirm
).with_columns(
    pl.col("edges").list.len().alias("connected node count")
)
df2.head()

In [None]:
# Expression over the "edges" neighbour lists; 105 is the argument passed to
# shortest_path (presumably the target node - confirm against polars_ds docs).
shortest_path_expr = pl.col("edges").graph.shortest_path(105)
df2.select(shortest_path_expr)

In [None]:
# Remove each row's own id from its neighbour list.
# df2 has no "best friends" column; the neighbour lists are stored under the alias
# "edges" when df2 is built, so that column is used here.
df2.select(
    pl.col("edges").list.set_difference(pl.concat_list(pl.col("id")))
)

In [None]:
# Peek at the first five rows of the graph frame (5 is the default for head()).
df2.head(n=5)

In [None]:
# Row index with the largest eigen_centrality value.
# df2 has no "best friends" column; the neighbour lists are stored under the alias
# "edges" when df2 is built, so that column is used here.
df2.select(
    pl.col("edges").graph.eigen_centrality().arg_max()
)

In [None]:
# Small categorical demo frame (replaces the earlier `df`).
# Seeded generator so the random binary target is the same on every run.
rng = np.random.default_rng(42)
df = pl.DataFrame({
    "a": range(1000),
    # 200 + 500 + 300 = 1000 labels, matching the length of the other columns.
    "b": ["cat"] * 200 + ["dogs"] * 500 + ["lizards"] * 300,
    # Random 0/1 target; `high` is exclusive.
    "y": rng.integers(0, high = 2, size = 1000)
})
df.head()

In [None]:
# Regression demo frame (replaces the earlier `df`): y is a plus a little uniform noise.
n_rows = 5000
# Seeded generator so the noise term is the same on every run.
rng = np.random.default_rng(42)
# 0, 1, ..., n_rows - 1 as an array, reused for the sqrt feature and the target.
row_index = np.arange(n_rows)
df = pl.DataFrame({
    "a": range(n_rows),
    # Vectorised square root instead of a per-element Python loop.
    "b": np.sqrt(row_index),
    # Noise is uniform in [0, 0.1).
    "y": 0.1 * rng.random(size=n_rows) + row_index
})
df.head()

In [None]:

# polars_ds lstsq_report with "y" as the calling column and "a", "b" as arguments,
# called with add_bias = False. The result is a struct column, unnested for display.
lstsq_report_expr = pl.col("y").num.lstsq_report(
    pl.col("a"),
    pl.col("b"),
    add_bias = False,
)
report_frame = df.select(lstsq_report_expr.alias("report"))
report_frame.unnest("report")

In [None]:
from statsmodels.api import OLS

# NumPy inputs for a statsmodels comparison: a two-column matrix from "a" and "b",
# and the "y" column as a 1-D array.
data = df.select(["a", "b"]).to_numpy()
target = df.get_column("y").to_numpy()

In [None]:
# Calls polars_ds `psi` on column "a1" with "a2" as argument and extracts the single
# resulting value with item(0, 0).
# NOTE(review): neither "a1" nor "a2" exists in any frame built in this notebook -
# confirm which columns/frame this cell is meant to use.
df.select(
    pl.col("a1").num.psi(pl.col("a2"))
).item(0,0)

In [None]:
# polars_ds `psi_discrete` with "b" as the calling column and "a" as the argument.
psi_discrete_expr = pl.col("b").num.psi_discrete(pl.col("a"))
df.select(psi_discrete_expr)

In [None]:
# value_counts() yields a struct column named "a"; unnest it into separate columns.
value_counts_frame = df.select(pl.col("a").value_counts())
value_counts_frame.unnest("a")

In [None]:
# Builds a value_counts expression on "a" and reads its `.name` attribute; nothing is
# evaluated against a DataFrame here.
# NOTE(review): looks like leftover interactive exploration - confirm whether this
# cell is still needed.
pl.col("a").value_counts().name