In [None]:
import polars as pl
import polars_ds as pds

In [None]:
# Toy frame with nulls in both columns; the imputation cells below operate on it.
df = pl.DataFrame(
    {
        "a": [1, None, 2, 3],
        "b": [3, None, None, 3],
    }
)
df

In [None]:
import polars_ds.transforms as t

# Impute "a" and "b" through polars_ds' impute helper with method="mean";
# whatever the helper returns is handed straight to with_columns.
df.with_columns(t.impute(df, cols=["a", "b"], method="mean"))

In [None]:
from sklearn.impute import SimpleImputer

# The same frame run through scikit-learn for comparison. SimpleImputer is created with
# its default settings, fitted and applied on the pandas copy of `df`, and the resulting
# array is wrapped back into polars under the original column names.
imputer = SimpleImputer()
mat = imputer.fit_transform(df.to_pandas())
pl.from_numpy(mat, schema=df.columns)

In [None]:
from category_encoders import TargetEncoder

In [None]:
# Reference target encoding through category_encoders, for comparison with the
# polars_ds target_encode call in the following cell.
# NOTE(review): this cell reads "cat" and "target" columns, but the `df` built earlier in
# this notebook only has "a" and "b" — confirm which dataset this is meant to run on.
df_pd = df.to_pandas()
y = df_pd["target"]
cols = ["cat"]
enc = TargetEncoder(min_samples_leaf=20, smoothing=10.0, cols=cols)
enc.fit(df_pd[cols], y)
# Encode the same columns that were used for fitting, then bring the result back to polars.
df_transformed = enc.transform(df_pd[cols])
df_2 = pl.from_pandas(df_transformed[cols])
df_2

In [None]:
import polars_ds.transforms as t

# polars_ds target encoding, called with the same min_samples_leaf and smoothing values
# as the category_encoders cell.
# NOTE(review): expects "cat" and "target" columns in `df`, which the frame built earlier
# in this notebook ("a", "b") does not have — confirm the intended dataset.
df.select(
    t.target_encode(df, ["cat"], target="target", min_samples_leaf=20, smoothing=10.0)
)

In [None]:
# Load the example dataset from a path relative to this notebook and preview its first rows.
# This rebinds `df`, so the toy frame used by the cells above is no longer reachable
# under that name from here on.
df = pl.read_parquet("../examples/dependency.parquet")
df.head()

In [None]:
from polars_ds.pipeline import Pipeline
import polars.selectors as cs

# Build a preprocessing pipeline on the example data. The steps are chained in the order
# they should run and the chain is closed with .finish().
pipe = (
    Pipeline(df)
    .lowercase() # lowercase all column names; every later step refers to the lowercase names
    .impute(["existing_emi"], method = "median")
    # Keep the numeric columns plus the two categorical columns encoded further down.
    .select(cs.numeric() | cs.by_name(["gender", "employer_category1"]))
    # Derived features.
    .append_expr([
        pl.col("existing_emi").log1p().alias("existing_emi_log1p"),
        pl.col("loan_amount").log1p().alias("loan_amount_log1p"),
        pl.col("loan_amount").sqrt().alias("loan_amount_sqrt"),
    ])
    .scale(
        cs.numeric().exclude(["var1", "existing_emi_log1p"]), method = "standard"
    ) # Scale the columns up to this point. The columns below won't be scaled
    .append_expr(
        # Missing-value flag for employer_category1. The alias names the column that is
        # actually checked (it was previously labelled as a gender flag).
        pl.col("employer_category1").is_null().cast(pl.UInt8).alias("employer_category1_is_missing")
    )
    .one_hot_encode("gender", drop_first=True)
    .target_encode("employer_category1", target = "approved", min_samples_leaf = 20, smoothing = 10.0)
    .finish() # or .fit()
)

# Run the pipeline and ask for a non-lazy result.
pipe.transform(return_lazy=False)

In [None]:
# Same transform, this time without return_lazy=False, i.e. using the method's default
# return mode.
pipe.transform()

In [None]:
# Display the pipeline through its show_graph() helper.
pipe.show_graph()

In [None]:
# Synthetic 100,000-row frame: "x" and "y" come from pds.random_int(0, 200), and "test"
# is 1 for the first 50,000 rows and 2 for the remaining 50,000.
# This rebinds `df` again; the cells below work on this synthetic frame.
# NOTE(review): no seed is passed here, so the values presumably differ between runs —
# confirm whether reproducibility matters for the comparisons below.
df = pds.random_data(size=100_000, n_cols = 0).select(
    pds.random_int(0, 200).alias("x"),
    pds.random_int(0, 200).alias("y"),
    pl.Series([1] * 50_000 + [2] * 50_000).alias("test")
)
df.head()

In [None]:
# Least-squares report with "x" as the feature, "y" as the target and add_bias=True;
# only the resulting "report" column is displayed.
df.select(pds.query_lstsq_report("x", target="y", add_bias=True).alias("report"))["report"]

In [None]:
# Bin "x" with qcut(10, ...) while keeping the break points, take the "brk" field of the
# resulting struct, count the rows per break value, and unnest the counts into columns.
df.select(
    pl.col("x")
    .qcut(10, left_closed=False, allow_duplicates=True, include_breaks=True)
    .struct.field("brk")
    .value_counts()
    .sort()
).unnest("brk")

In [None]:
# Correlation between "x" and "y" via pl.corr, using its default method.
df.select(pl.corr("x", "y"))

In [None]:

# Kendall's tau between "x" and "y" as computed by polars_ds.
df.select(pds.kendall_tau("x", "y"))

In [None]:
from scipy.stats import kendalltau

# Pull both columns out as NumPy arrays so SciPy's kendalltau can be timed on them.
x = df["x"].to_numpy()
y = df["y"].to_numpy()

In [None]:
%%timeit
# Times SciPy's kendalltau on the x / y arrays. This comment sits below the magic line
# because the %%timeit cell magic has to stay on the first line of the cell.
kendalltau(x,y, nan_policy="omit")

In [None]:
# Rank-based dependence measure of "y" on "x". The formula below matches Chatterjee's xi
# coefficient in the form that allows ties (NOTE(review): confirm that is the intent).
#   1. Order the rows by "x", breaking ties in "x" at random. The rank is seeded so that
#      the tie-breaking, and therefore the result, is the same on every run.
#   2. r = max-rank of y, l = max-rank of -y, both cast to Float64.
#   3. xi = 1 - (n / 2) * sum(|r[i+1] - r[i]|) / sum(l * (n - l)), with n = pl.len().
df.sort(pl.col("x").rank(method="random", seed=42)).select(
    "x",
    "y",
    pl.col("y").rank(method="max").cast(pl.Float64).alias("r"),
    (-pl.col("y")).rank(method="max").cast(pl.Float64).alias("l"),
).with_columns(
    # Absolute change in r between consecutive rows of the x-ordered frame.
    pl.col("r").diff().abs().alias("r_abs_diff"),
    (pl.col("l") * (pl.len() - pl.col("l"))).alias("l(n-l)"),
).select(
    1 - (pl.len() / 2) * (pl.col("r_abs_diff").sum() / pl.col("l(n-l)").sum())
)