In [None]:
import polars as pl
import polars_ds as pds

In [None]:
# Read the example parquet file (path is relative to the notebook's working
# directory) and display the first rows as the cell output.
df = pl.read_parquet("../examples/dependency.parquet")
df.head()

In [None]:
import polars.selectors as cs



# Combine an explicit list of column names with the string-dtype selector
# inside a lazy query, then materialize the result.
# NOTE(review): `&` between pl.col([...]) and cs.string() is presumably meant
# as a selector intersection (keep only the listed columns that are strings);
# confirm against the installed polars version.
# Alternative kept for reference:
#   pl.col("Gender").unique().drop_nulls().implode().list.sort()
candidate_columns = ['Gender', 'ID', 'City_Category', 'Employer_Category2']
df.lazy().select(pl.col(candidate_columns) & cs.string()).collect()

In [None]:
from polars_ds.pipeline import Pipeline
import polars.selectors as cs

# df.select(pl.col("Existing_EMI"))

# Extra features appended to the numeric columns before scaling.
derived_features = [
    pl.col("existing_emi").log1p().alias("existing_emi_log1p"),
    pl.col("loan_amount").log1p().alias("loan_amount_log1p"),
    pl.col("loan_amount").sqrt().alias("loan_amount_sqrt"),
]

# Pipeline steps, in order:
#   1. lowercase()  - presumably lowercases column names; the later steps
#                     refer to lowercase names such as "existing_emi".
#   2. impute       - median imputation for "existing_emi".
#   3. select       - keep numeric columns only.
#   4. append_expr  - add the derived features defined above.
#   5. scale        - min-max scale numeric columns except "var1" and
#                     "existing_emi_log1p".
#   6. remove       - drop "approved".
pipe = (
    Pipeline(df)
    .lowercase()
    .impute(["existing_emi"], method="median")
    .select(cs.numeric())
    .append_expr(derived_features)
    .scale(cs.numeric().exclude(["var1", "existing_emi_log1p"]), method="min_max")
    .remove(["approved"])
)

# Fit the pipeline, then show the transformed frame as the cell output.
pipe.fit()
pipe.transform()

In [None]:
# Display the graph of the pipeline built in the previous cell.
pipe.show_graph()

In [None]:
# Constant column: 50,000 ones followed by 50,000 twos, named "test".
group_flag = pl.Series([1] * 50_000 + [2] * 50_000).alias("test")

# Synthetic 100,000-row frame with two random integer columns generated by
# pds.random_int(0, 200), plus the constant column above.
# NOTE: this rebinds `df`, replacing the parquet data loaded earlier.
df = pds.random_data(size=100_000, n_cols=0).select(
    pds.random_int(0, 200).alias("x"),
    pds.random_int(0, 200).alias("y"),
    group_flag,
)
df.head()

In [None]:
# Least-squares report for y regressed on x with a bias term; pull the
# resulting "report" column out of the one-column frame for display.
lstsq_report = pds.query_lstsq_report("x", target="y", add_bias=True).alias("report")
df.select(lstsq_report).get_column("report")

In [None]:
# Cut x into 10 quantile bins (right-closed, duplicate breaks allowed), keep
# only the "brk" field of the resulting struct, then count and sort the
# distinct break values.
x_break_counts = (
    pl.col("x")
    .qcut(10, left_closed=False, allow_duplicates=True, include_breaks=True)
    .struct.field("brk")
    .value_counts()
    .sort()
)

# value_counts yields a struct column (named "brk" here); unnest it into
# separate columns for display.
df.select(x_break_counts).unnest("brk")

In [None]:
# Correlation between x and y, using pl.corr with its default method.
df.select(pl.corr("x", "y"))

In [None]:

# Kendall's tau between x and y, computed with polars_ds.
df.select(pds.kendall_tau("x", "y"))

In [None]:
from scipy.stats import kendalltau

# Export both columns as NumPy arrays for the SciPy benchmark cell.
x, y = (df[column].to_numpy() for column in ("x", "y"))

In [None]:
%%timeit
# Time SciPy's kendalltau on the NumPy arrays x and y, with NaNs omitted.
kendalltau(x,y, nan_policy="omit")

In [None]:
# Rank-based dependence measure of y on x. The formula matches the tie-aware
# form of Chatterjee's xi coefficient:
#
#     xi = 1 - n * sum_i |r_{i+1} - r_i| / (2 * sum_i l_i * (n - l_i))
#
# where rows are ordered by x, r_i is the "max" rank of y_i (number of rows
# with y <= y_i) and l_i is the "max" rank of -y_i (number of rows with
# y >= y_i).
#
# Ties in x are broken at random. A fixed seed is passed so the ordering, and
# therefore the result, is reproducible for a given `df`; without it every
# run of this cell could return a different value.
df.sort(pl.col("x").rank(method="random", seed=42)).select(
    "x",
    "y",
    pl.col("y").rank(method="max").cast(pl.Float64).alias("r"),
    (-pl.col("y")).rank(method="max").cast(pl.Float64).alias("l"),
).with_columns(
    # |r_{i+1} - r_i|; the first row has no predecessor and is null, which
    # sum() skips below.
    pl.col("r").diff().abs().alias("r_abs_diff"),
    # l_i * (n - l_i), with n = pl.len().
    (pl.col("l") * (pl.len() - pl.col("l"))).alias("l(n-l)"),
).select(
    1 - (pl.len() / 2) * (pl.col("r_abs_diff").sum() / pl.col("l(n-l)").sum())
)