In [1]:
import polars as pl
import polars_ds as pds

In [2]:
df = pl.read_parquet("../examples/dependency.parquet")
df.head()

ID,Gender,DOB,Lead_Creation_Date,City_Code,City_Category,Employer_Code,Employer_Category1,Employer_Category2,Monthly_Income,Customer_Existing_Primary_Bank_Code,Primary_Bank_Type,Contacted,Source,Source_Category,Existing_EMI,Loan_Amount,Loan_Period,Interest_Rate,EMI,Var1,Approved
str,str,str,str,str,str,str,str,i64,f64,str,str,str,str,str,f64,i64,i64,f64,i64,i64,i64
"""APPC90493171225""","""Female""","""23/07/79""","""15/07/16""","""C10001""","""A""","""COM0044082""","""A""",4,2000.0,"""B001""","""P""","""N""","""S122""","""G""",0.0,,,,,0,0
"""APPD40611263344""","""Male""","""07/12/86""","""04/07/16""","""C10003""","""A""","""COM0000002""","""C""",1,3500.0,"""B002""","""P""","""Y""","""S122""","""G""",0.0,20000.0,2.0,13.25,953.0,10,0
"""APPE70289249423""","""Male""","""10/12/82""","""19/07/16""","""C10125""","""C""","""COM0005267""","""C""",4,2250.0,"""B003""","""G""","""Y""","""S143""","""B""",0.0,45000.0,4.0,,,0,0
"""APPF80273865537""","""Male""","""30/01/89""","""09/07/16""","""C10477""","""C""","""COM0004143""","""A""",4,3500.0,"""B003""","""G""","""Y""","""S143""","""B""",0.0,92000.0,5.0,,,7,0
"""APPG60994436641""","""Male""","""19/04/85""","""20/07/16""","""C10002""","""A""","""COM0001781""","""A""",4,10000.0,"""B001""","""P""","""Y""","""S134""","""B""",2500.0,50000.0,2.0,,,10,0


In [3]:
from polars_ds.pipeline import Pipeline
import polars.selectors as cs

# df.select(pl.col("Existing_EMI"))

pipe = (
    Pipeline(df)
    .lowercase() # lowercase all columns
    .impute(["existing_emi"], method = "median")
    .select(cs.numeric() | cs.by_name(["gender", "employer_category1"]))
    .append_expr([
        pl.col("existing_emi").log1p().alias("existing_emi_log1p"),
        pl.col("loan_amount").log1p().alias("loan_amount_log1p"),
        pl.col("loan_amount").sqrt().alias("loan_amount_sqrt"),
    ])
    .scale(
        cs.numeric().exclude(["var1", "existing_emi_log1p"]), method = "standard"
    ) # Scale the columns up to this point. The columns below won't be scaled
    .append_expr(
        pl.col("employer_category1").is_null().cast(pl.UInt8).alias("gender_is_missing")
    )
    .one_hot_encode("gender", drop_first=True)
    .target_encode("employer_category1", target = "approved", min_samples_leaf = 20, smoothing = 10.0)
    .finish() # or .fit()
)

pipe.transform(return_lazy=False)

employer_category2,monthly_income,existing_emi,loan_amount,loan_period,interest_rate,emi,var1,approved,employer_category1,existing_emi_log1p,loan_amount_log1p,loan_amount_sqrt,gender_is_missing,gender_Male
f64,f64,f64,f64,f64,f64,f64,i64,f64,f64,f64,f64,f64,u8,u8
0.346572,-0.020726,-0.157654,,,,,0,-0.121854,-0.031666,0.0,,,0,0
-3.369178,-0.012143,-0.157654,-0.63233,-1.619396,-1.019913,-0.197255,10,-0.121854,0.053992,0.0,-0.586098,-0.658017,0,1
0.346572,-0.019296,-0.157654,0.181271,0.09368,,,0,-0.121854,0.053992,0.0,0.53713,0.375943,0,1
0.346572,-0.012143,-0.157654,1.710841,0.950218,,,7,-0.121854,-0.031666,0.0,1.527677,1.709258,0,1
0.346572,0.025049,0.935146,0.343991,-1.619396,,,10,-0.121854,-0.031666,7.824446,0.683068,0.543732,0,1
…,…,…,…,…,…,…,…,…,…,…,…,…,…,…
-3.369178,-0.004133,-0.157654,,,,,10,-0.121854,-0.031666,0.0,,,0,0
0.346572,0.008971,0.47617,,,,,7,-0.121854,-0.031666,7.280008,,,0,0
0.346572,-0.023015,-0.157654,-0.502154,0.09368,2.785369,-0.210541,2,-0.121854,0.053992,0.0,-0.333565,-0.460644,0,0
-0.892011,0.024437,0.439452,1.320312,0.950218,,,10,-0.121854,0.053992,7.220374,1.334087,1.409903,0,1


In [None]:
pipe.transform()

In [None]:
pipe.show_graph()

In [None]:
df = pds.random_data(size=100_000, n_cols = 0).select(
    pds.random_int(0, 200).alias("x"),
    pds.random_int(0, 200).alias("y"),
    pl.Series([1] * 50_000 + [2] * 50_000).alias("test")
)
df.head()

In [None]:
df.select(
    pds.query_lstsq_report("x", target="y", add_bias=True).alias("report")
)["report"]

In [None]:
df.select(
    pl.col("x").qcut(10, left_closed=False, allow_duplicates=True, include_breaks=True)
        .struct.field("brk")
        .value_counts()
        .sort()
).unnest("brk")

In [None]:
df.select(
    pl.corr("x", "y")
)

In [None]:

df.select(
    pds.kendall_tau("x", "y")
)

In [None]:
from scipy.stats import kendalltau

x = df["x"].to_numpy()
y = df["y"].to_numpy()

In [None]:
%%timeit
kendalltau(x,y, nan_policy="omit")

In [None]:
df.sort(pl.col("x").rank(method="random")).select(
    "x",
    "y",
    pl.col("y").rank(method="max").cast(pl.Float64).alias("r"),
    (-pl.col("y")).rank(method="max").cast(pl.Float64).alias("l"),
).with_columns(
    pl.col("r").diff().abs().alias("r_abs_diff"),
    (pl.col("l") * (pl.len() - pl.col("l"))).alias("l(n-l)"),
).select(
    1 - (pl.len() / 2) * (pl.col("r_abs_diff").sum() / pl.col("l(n-l)").sum())
)