In [1]:
import polars as pl
import polars.selectors as cs
from polars_ds.modeling import Pipeline, Blueprint

# Builtin Pipeline Functions

To run this demo, use the latest release of polars_ds, or the latest commit if the latest release doesn't work.

Need v0.8 or above

In [2]:
df = pl.read_parquet("../examples/dependency.parquet")
df.head()

ID,Gender,DOB,Lead_Creation_Date,City_Code,City_Category,Employer_Code,Employer_Category1,Employer_Category2,Monthly_Income,Customer_Existing_Primary_Bank_Code,Primary_Bank_Type,Contacted,Source,Source_Category,Existing_EMI,Loan_Amount,Loan_Period,Interest_Rate,EMI,Var1,Approved
str,str,str,str,str,str,str,str,i64,f64,str,str,str,str,str,f64,i64,i64,f64,i64,i64,i64
"""APPC90493171225""","""Female""","""23/07/79""","""15/07/16""","""C10001""","""A""","""COM0044082""","""A""",4,2000.0,"""B001""","""P""","""N""","""S122""","""G""",0.0,,,,,0,0
"""APPD40611263344""","""Male""","""07/12/86""","""04/07/16""","""C10003""","""A""","""COM0000002""","""C""",1,3500.0,"""B002""","""P""","""Y""","""S122""","""G""",0.0,20000.0,2.0,13.25,953.0,10,0
"""APPE70289249423""","""Male""","""10/12/82""","""19/07/16""","""C10125""","""C""","""COM0005267""","""C""",4,2250.0,"""B003""","""G""","""Y""","""S143""","""B""",0.0,45000.0,4.0,,,0,0
"""APPF80273865537""","""Male""","""30/01/89""","""09/07/16""","""C10477""","""C""","""COM0004143""","""A""",4,3500.0,"""B003""","""G""","""Y""","""S143""","""B""",0.0,92000.0,5.0,,,7,0
"""APPG60994436641""","""Male""","""19/04/85""","""20/07/16""","""C10002""","""A""","""COM0001781""","""A""",4,10000.0,"""B001""","""P""","""Y""","""S134""","""B""",2500.0,50000.0,2.0,,,10,0


In [3]:
sql = """
select
*
, 'TEST' as test_col
from df
where loan_amount is not null
"""

In [4]:
# Create a blueprint first.
# A blueprint is a plan for a pipeline. No hard work will be done until the blueprint is materialized, which
# is when the transforms are fitted (e.g. scale learns the mean and std from base data).
# If target is specified for the blueprint, target will be excluded from all transformations that require a fit,
# and target will be auto-filled if the transformation requires a target field and when no target field is explicitly given.

bp = (
    Blueprint(df, name = "example", target = "approved", lowercase=True) # You can optionally put target of the ML model here
    # The SQL keeps every column (select *), adds a constant `test_col`,
    # and drops rows whose loan_amount is null.
    .sql_transform(sql) # Run a SQL transform on the df
    # Say you want to remove a population for your data pipeline.
    .filter( 
        "city_category is not null" # or equivalently, you can do: pl.col("city_category").is_not_null()
    )
    # The target is given explicitly here because loan_period is the column being imputed,
    # not the blueprint's prediction target.
    # Use a linear regression with x1 = var1, x2 = existing_emi to predict missing values in loan_period
    .linear_impute(features = ["var1", "existing_emi"], target = "loan_period") 
    .impute(["existing_emi"], method = "median")
    .append_expr( # generate some features
        pl.col("existing_emi").log1p().alias("existing_emi_log1p"),
        pl.col("loan_amount").log1p().alias("loan_amount_log1p"),
        # NOTE: despite the alias, this clips the raw loan_amount, not its log1p.
        pl.col("loan_amount").clip(lower_bound = 0, upper_bound = 1000).alias("loan_amount_log1p_clipped"),
        pl.col("loan_amount").sqrt().alias("loan_amount_sqrt"),
        # NOTE: shift(-1) pulls the NEXT row's value into the current row (a lead);
        # a true lag of one row would be shift(1). The alias is kept as-is.
        pl.col("loan_amount").shift(-1).alias("loan_amount_lag_1") # any kind of lag/lead transform
    )
    .scale( # target is numerical, but will be excluded automatically because bp is initialized with a target
        cs.numeric().exclude(["var1", "existing_emi_log1p"]), method = "standard"
    ) # Scale the columns up to this point. The columns below won't be scaled
    .append_expr(
        # Add missing flags
        pl.col("employer_category1").is_null().cast(pl.UInt8).alias("employer_category1_is_missing")
    )
    .one_hot_encode("gender", drop_first=True)
    .woe_encode("city_category") # No need to specify target because we initialized bp with a target
    .target_encode("employer_category1", min_samples_leaf = 20, smoothing = 10.0) # same as above
)

# Printing the blueprint shows its name, step count and expected input features.
print(bp)

Blueprint name: example
Column names: Lowercase all incoming columns.Blueprint current steps: 11
Features Expected: ['id', 'gender', 'dob', 'lead_creation_date', 'city_code', 'city_category', 'employer_code', 'employer_category1', 'employer_category2', 'monthly_income', 'customer_existing_primary_bank_code', 'primary_bank_type', 'contacted', 'source', 'source_category', 'existing_emi', 'loan_amount', 'loan_period', 'interest_rate', 'emi', 'var1', 'approved']



In [5]:
# Materialize the blueprint
pipe:Pipeline = bp.materialize()
# Text representation of the pipeline
pipe

Naive Query Steps: 

Step 1:
Run SQL: 
select
*
, 'TEST' as test_col
from df
where loan_amount is not null



Step 2:
col("city_category").is_not_null()

Step 3:
col("loan_period").fill_null([[(col("var1")) * (dyn float: 0.509810011759666)].sum_horizontal([[(col("existing_emi")) * (dyn float: -0.0000076040796537527814)]])]).alias("loan_period")

Step 4:
col("existing_emi").fill_null([dyn float: 0])

Step 5:
col("existing_emi").log1p().alias("existing_emi_log1p"),
col("loan_amount").log1p().alias("loan_amount_log1p"),
col("loan_amount").clip([dyn int: 0, dyn int: 1000]).alias("loan_amount_log1p_clipped"),
col("loan_amount").sqrt().alias("loan_amount_sqrt"),
col("loan_amount").shift([dyn int: -1]).alias("loan_amount_lag_1")

Step 6:
[([(col("employer_category2")) - (dyn float: 3.679269695227142)]) / (dyn float: 0.862458786067542)],
[([(col("monthly_income")) - (dyn float: 7463.797309780022)]) / (dyn float: 225051.54436104206)],
[([(col("existing_emi")) - (dyn float: 265.6902475240454)]) 

In [6]:
# If you want separation between features (X) and target (y)
# you can run the following:
# df_x, df_y = pipe.transform(df, separate=True)

df_transformed = pipe.transform(df)
df_transformed.head()

id,dob,lead_creation_date,city_code,city_category,employer_code,employer_category1,employer_category2,monthly_income,customer_existing_primary_bank_code,primary_bank_type,contacted,source,source_category,existing_emi,loan_amount,loan_period,interest_rate,emi,var1,approved,test_col,existing_emi_log1p,loan_amount_log1p,loan_amount_log1p_clipped,loan_amount_sqrt,loan_amount_lag_1,employer_category1_is_missing,gender_Male
str,str,str,str,f64,str,f64,f64,f64,str,str,str,str,str,f64,f64,f64,f64,f64,i64,i64,str,f64,f64,f64,f64,f64,u8,u8
"""APPD40611263344""","""07/12/86""","""04/07/16""","""C10003""",0.080959,"""COM0000002""",0.026603,-3.106548,-0.017613,"""B002""","""P""","""Y""","""S122""","""G""",-0.096348,-0.632338,-1.619415,-1.019936,-0.197259,10,0,"""TEST""",0.0,-0.586105,,-0.658025,0.181257,0,1
"""APPE70289249423""","""10/12/82""","""19/07/16""","""C10125""",-0.479553,"""COM0005267""",0.026603,0.371879,-0.023167,"""B003""","""G""","""Y""","""S143""","""B""",-0.096348,0.181273,0.093681,,,0,0,"""TEST""",0.0,0.537137,,0.375948,1.710834,0,1
"""APPF80273865537""","""30/01/89""","""09/07/16""","""C10477""",-0.479553,"""COM0004143""",0.014737,0.371879,-0.017613,"""B003""","""G""","""Y""","""S143""","""B""",-0.096348,1.710861,0.950229,,,7,0,"""TEST""",0.0,1.527696,,1.709278,0.343978,0,1
"""APPG60994436641""","""19/04/85""","""20/07/16""","""C10002""",0.080959,"""COM0001781""",0.014737,0.371879,0.011269,"""B001""","""P""","""Y""","""S134""","""B""",0.810234,0.343995,-1.619415,,,10,0,"""TEST""",7.824446,0.683076,,0.543738,2.947513,0,1
"""APPK80327232033""","""28/03/73""","""02/07/16""","""C10022""",-0.046475,"""COM0030526""",0.024336,0.371879,0.000161,"""B003""","""G""","""Y""","""S122""","""C""",-0.096348,2.947549,0.950229,-0.746291,2.631433,10,0,"""TEST""",0.0,2.00661,,2.546276,-0.306906,0,1


In [7]:
# Empty. Because we filtered this to not null.
df_transformed.filter(
    pl.col("city_category").is_null()
)

id,dob,lead_creation_date,city_code,city_category,employer_code,employer_category1,employer_category2,monthly_income,customer_existing_primary_bank_code,primary_bank_type,contacted,source,source_category,existing_emi,loan_amount,loan_period,interest_rate,emi,var1,approved,test_col,existing_emi_log1p,loan_amount_log1p,loan_amount_log1p_clipped,loan_amount_sqrt,loan_amount_lag_1,employer_category1_is_missing,gender_Male
str,str,str,str,f64,str,f64,f64,f64,str,str,str,str,str,f64,f64,f64,f64,f64,i64,i64,str,f64,f64,f64,f64,f64,u8,u8


In [8]:
# Empty. Because we filtered this to not null in the SQL
df_transformed.filter(
    pl.col("loan_amount").is_null()
)

id,dob,lead_creation_date,city_code,city_category,employer_code,employer_category1,employer_category2,monthly_income,customer_existing_primary_bank_code,primary_bank_type,contacted,source,source_category,existing_emi,loan_amount,loan_period,interest_rate,emi,var1,approved,test_col,existing_emi_log1p,loan_amount_log1p,loan_amount_log1p_clipped,loan_amount_sqrt,loan_amount_lag_1,employer_category1_is_missing,gender_Male
str,str,str,str,f64,str,f64,f64,f64,str,str,str,str,str,f64,f64,f64,f64,f64,i64,i64,str,f64,f64,f64,f64,f64,u8,u8


# Serialization Methods

Pickle + JSON support.

In [9]:
import pickle
# The pipe object can be pickled
with open("pipe.pickle", "wb") as f:
    pickle.dump(pipe, f)

In [10]:
# Load the pipeline back from the file written in the previous cell.
# NOTE: unpickling can execute arbitrary code, so only load pickle files you created or trust.
with open("pipe.pickle", "rb") as f:
    pipe2 = pickle.load(f)

pipe2

Naive Query Steps: 

Step 1:
Run SQL: 
select
*
, 'TEST' as test_col
from df
where loan_amount is not null



Step 2:
col("city_category").is_not_null()

Step 3:
col("loan_period").fill_null([[(col("var1")) * (dyn float: 0.509810011759666)].sum_horizontal([[(col("existing_emi")) * (dyn float: -0.0000076040796537527814)]])]).alias("loan_period")

Step 4:
col("existing_emi").fill_null([dyn float: 0])

Step 5:
col("existing_emi").log1p().alias("existing_emi_log1p"),
col("loan_amount").log1p().alias("loan_amount_log1p"),
col("loan_amount").clip([dyn int: 0, dyn int: 1000]).alias("loan_amount_log1p_clipped"),
col("loan_amount").sqrt().alias("loan_amount_sqrt"),
col("loan_amount").shift([dyn int: -1]).alias("loan_amount_lag_1")

Step 6:
[([(col("employer_category2")) - (dyn float: 3.679269695227142)]) / (dyn float: 0.862458786067542)],
[([(col("monthly_income")) - (dyn float: 7463.797309780022)]) / (dyn float: 225051.54436104206)],
[([(col("existing_emi")) - (dyn float: 265.6902475240454)]) 

In [11]:
df_transformed_2 = pipe2.transform(df)
df_transformed_2

id,dob,lead_creation_date,city_code,city_category,employer_code,employer_category1,employer_category2,monthly_income,customer_existing_primary_bank_code,primary_bank_type,contacted,source,source_category,existing_emi,loan_amount,loan_period,interest_rate,emi,var1,approved,test_col,existing_emi_log1p,loan_amount_log1p,loan_amount_log1p_clipped,loan_amount_sqrt,loan_amount_lag_1,employer_category1_is_missing,gender_Male
str,str,str,str,f64,str,f64,f64,f64,str,str,str,str,str,f64,f64,f64,f64,f64,i64,i64,str,f64,f64,f64,f64,f64,u8,u8
"""APPD40611263344""","""07/12/86""","""04/07/16""","""C10003""",0.080959,"""COM0000002""",0.026603,-3.106548,-0.017613,"""B002""","""P""","""Y""","""S122""","""G""",-0.096348,-0.632338,-1.619415,-1.019936,-0.197259,10,0,"""TEST""",0.0,-0.586105,,-0.658025,0.181257,0,1
"""APPE70289249423""","""10/12/82""","""19/07/16""","""C10125""",-0.479553,"""COM0005267""",0.026603,0.371879,-0.023167,"""B003""","""G""","""Y""","""S143""","""B""",-0.096348,0.181273,0.093681,,,0,0,"""TEST""",0.0,0.537137,,0.375948,1.710834,0,1
"""APPF80273865537""","""30/01/89""","""09/07/16""","""C10477""",-0.479553,"""COM0004143""",0.014737,0.371879,-0.017613,"""B003""","""G""","""Y""","""S143""","""B""",-0.096348,1.710861,0.950229,,,7,0,"""TEST""",0.0,1.527696,,1.709278,0.343978,0,1
"""APPG60994436641""","""19/04/85""","""20/07/16""","""C10002""",0.080959,"""COM0001781""",0.014737,0.371879,0.011269,"""B001""","""P""","""Y""","""S134""","""B""",0.810234,0.343995,-1.619415,,,10,0,"""TEST""",7.824446,0.683076,,0.543738,2.947513,0,1
"""APPK80327232033""","""28/03/73""","""02/07/16""","""C10022""",-0.046475,"""COM0030526""",0.024336,0.371879,0.000161,"""B003""","""G""","""Y""","""S122""","""C""",-0.096348,2.947549,0.950229,-0.746291,2.631433,10,0,"""TEST""",0.0,2.00661,,2.546276,-0.306906,0,1
…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…
"""APPS20215136404""","""04/03/86""","""30/09/16""","""C10002""",0.080959,"""COM0000003""",0.024336,-1.947072,-0.012725,"""B001""","""P""","""Y""","""S122""","""G""",-0.096348,-0.306893,-0.762867,-1.062693,-0.120198,10,0,"""TEST""",0.0,-0.024488,,-0.193265,-0.306906,0,1
"""APPT50870248519""","""03/03/91""","""30/09/16""","""C10041""",-0.479553,"""COM0000009""",0.014737,-3.106548,-0.022501,"""B003""","""G""","""Y""","""S122""","""G""",-0.096348,-0.306893,-0.762867,,,2,0,"""TEST""",0.0,-0.024488,,-0.193265,-0.502171,0,1
"""APPW50697209842""","""01/02/92""","""30/09/16""","""C10022""",-0.046475,"""COM0013284""",0.026603,0.371879,-0.026055,"""B030""","""P""","""Y""","""S122""","""G""",-0.096348,-0.50216,0.093681,2.785431,-0.210546,2,0,"""TEST""",0.0,-0.333569,,-0.46065,1.320303,0,0
"""APPY50870035036""","""27/06/78""","""30/09/16""","""C10002""",0.080959,"""COM0000098""",0.026603,-0.787597,0.010794,"""B002""","""P""","""Y""","""S122""","""G""",0.399009,1.320328,0.950229,,,10,0,"""TEST""",7.220374,1.334103,,1.40992,0.962317,0,1


In [12]:
from polars.testing import assert_frame_equal
# Passes silently when the frames are equal; raises AssertionError otherwise
assert_frame_equal(df_transformed, df_transformed_2)

In [13]:
pipe.to_dict()

{'name': 'example',
 'feature_names_in_': ['id',
  'gender',
  'dob',
  'lead_creation_date',
  'city_code',
  'city_category',
  'employer_code',
  'employer_category1',
  'employer_category2',
  'monthly_income',
  'customer_existing_primary_bank_code',
  'primary_bank_type',
  'contacted',
  'source',
  'source_category',
  'existing_emi',
  'loan_amount',
  'loan_period',
  'interest_rate',
  'emi',
  'var1',
  'approved'],
 'feature_names_out_': ['id',
  'dob',
  'lead_creation_date',
  'city_code',
  'city_category',
  'employer_code',
  'employer_category1',
  'employer_category2',
  'monthly_income',
  'customer_existing_primary_bank_code',
  'primary_bank_type',
  'contacted',
  'source',
  'source_category',
  'existing_emi',
  'loan_amount',
  'loan_period',
  'interest_rate',
  'emi',
  'var1',
  'approved',
  'test_col',
  'existing_emi_log1p',
  'loan_amount_log1p',
  'loan_amount_log1p_clipped',
  'loan_amount_sqrt',
  'loan_amount_lag_1',
  'employer_category1_is_missin

In [14]:
# To save the pipeline as JSON

pipe.to_json("test.json")
pipe3 = Pipeline.from_json("test.json")
# Passes silently when the frames are equal; raises AssertionError otherwise
assert_frame_equal(df_transformed, pipe3.transform(df))

# Combining Pipeline

In [15]:
bp1 = (
    Blueprint(df, name = "example", target = "approved", lowercase=True) 
    .append_expr(
        # Add missing flags
        pl.col("employer_category1").is_null().cast(pl.UInt8).alias("employer_category1_is_missing")
    )
    .one_hot_encode("gender", drop_first=True)
    .woe_encode("city_category") # No need to specify target because we initialized bp with a target
    .target_encode("employer_category1", min_samples_leaf = 20, smoothing = 10.0) # same as above
)
pipe1 = bp1.materialize()

In [16]:
pipe1.transform(df)

id,dob,lead_creation_date,city_code,city_category,employer_code,employer_category1,employer_category2,monthly_income,customer_existing_primary_bank_code,primary_bank_type,contacted,source,source_category,existing_emi,loan_amount,loan_period,interest_rate,emi,var1,approved,employer_category1_is_missing,gender_Male
str,str,str,str,f64,str,f64,i64,f64,str,str,str,str,str,f64,i64,i64,f64,i64,i64,i64,u8,u8
"""APPC90493171225""","""23/07/79""","""15/07/16""","""C10001""",0.114988,"""COM0044082""",0.010829,4,2000.0,"""B001""","""P""","""N""","""S122""","""G""",0.0,,,,,0,0,0,0
"""APPD40611263344""","""07/12/86""","""04/07/16""","""C10003""",0.114988,"""COM0000002""",0.021114,1,3500.0,"""B002""","""P""","""Y""","""S122""","""G""",0.0,20000,2,13.25,953,10,0,0,1
"""APPE70289249423""","""10/12/82""","""19/07/16""","""C10125""",-0.606987,"""COM0005267""",0.021114,4,2250.0,"""B003""","""G""","""Y""","""S143""","""B""",0.0,45000,4,,,0,0,0,1
"""APPF80273865537""","""30/01/89""","""09/07/16""","""C10477""",-0.606987,"""COM0004143""",0.010829,4,3500.0,"""B003""","""G""","""Y""","""S143""","""B""",0.0,92000,5,,,7,0,0,1
"""APPG60994436641""","""19/04/85""","""20/07/16""","""C10002""",0.114988,"""COM0001781""",0.010829,4,10000.0,"""B001""","""P""","""Y""","""S134""","""B""",2500.0,50000,2,,,10,0,0,1
…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…
"""APPU90955789628""","""31/07/83""","""30/09/16""","""C10006""",0.114988,"""COM0000010""",0.010829,1,4900.0,"""B002""","""P""","""N""","""S122""","""G""",0.0,,,,,10,0,0,0
"""APPV80989824738""","""27/01/71""","""30/09/16""","""C10116""",-0.606987,"""COM0045789""",0.010829,4,7190.1,"""B002""","""P""","""N""","""S122""","""G""",1450.0,,,,,7,0,0,0
"""APPW50697209842""","""01/02/92""","""30/09/16""","""C10022""",-0.091923,"""COM0013284""",0.021114,4,1600.0,"""B030""","""P""","""Y""","""S122""","""G""",0.0,24000,4,35.5,943,2,0,0,0
"""APPY50870035036""","""27/06/78""","""30/09/16""","""C10002""",0.114988,"""COM0000098""",0.021114,3,9893.0,"""B002""","""P""","""Y""","""S122""","""G""",1366.0,80000,5,,,10,0,0,1


In [17]:
bp2 = (
    Blueprint(df, target = "approved", lowercase=True)
    .impute(["existing_emi"], method = "median")
    .append_expr( # generate some features
        pl.col("existing_emi").log1p().alias("existing_emi_log1p"),
        pl.col("loan_amount").log1p().alias("loan_amount_log1p"),
    )
)
pipe2 = bp2.materialize()

In [18]:
# Appends pipe2's steps onto pipe1 in place (no reassignment): the pipe1.transform(df) call
# in the next cell also emits existing_emi_log1p and loan_amount_log1p.
# NOTE(review): presumably re-running this cell appends the steps a second time — confirm.
pipe1.append_pipeline(pipe2)

In [19]:
pipe1.transform(df)

id,dob,lead_creation_date,city_code,city_category,employer_code,employer_category1,employer_category2,monthly_income,customer_existing_primary_bank_code,primary_bank_type,contacted,source,source_category,existing_emi,loan_amount,loan_period,interest_rate,emi,var1,approved,employer_category1_is_missing,gender_Male,existing_emi_log1p,loan_amount_log1p
str,str,str,str,f64,str,f64,i64,f64,str,str,str,str,str,f64,i64,i64,f64,i64,i64,i64,u8,u8,f64,f64
"""APPC90493171225""","""23/07/79""","""15/07/16""","""C10001""",0.114988,"""COM0044082""",0.010829,4,2000.0,"""B001""","""P""","""N""","""S122""","""G""",0.0,,,,,0,0,0,0,0.0,
"""APPD40611263344""","""07/12/86""","""04/07/16""","""C10003""",0.114988,"""COM0000002""",0.021114,1,3500.0,"""B002""","""P""","""Y""","""S122""","""G""",0.0,20000,2,13.25,953,10,0,0,1,0.0,9.903538
"""APPE70289249423""","""10/12/82""","""19/07/16""","""C10125""",-0.606987,"""COM0005267""",0.021114,4,2250.0,"""B003""","""G""","""Y""","""S143""","""B""",0.0,45000,4,,,0,0,0,1,0.0,10.71444
"""APPF80273865537""","""30/01/89""","""09/07/16""","""C10477""",-0.606987,"""COM0004143""",0.010829,4,3500.0,"""B003""","""G""","""Y""","""S143""","""B""",0.0,92000,5,,,7,0,0,1,0.0,11.429555
"""APPG60994436641""","""19/04/85""","""20/07/16""","""C10002""",0.114988,"""COM0001781""",0.010829,4,10000.0,"""B001""","""P""","""Y""","""S134""","""B""",2500.0,50000,2,,,10,0,0,1,7.824446,10.819798
…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…
"""APPU90955789628""","""31/07/83""","""30/09/16""","""C10006""",0.114988,"""COM0000010""",0.010829,1,4900.0,"""B002""","""P""","""N""","""S122""","""G""",0.0,,,,,10,0,0,0,0.0,
"""APPV80989824738""","""27/01/71""","""30/09/16""","""C10116""",-0.606987,"""COM0045789""",0.010829,4,7190.1,"""B002""","""P""","""N""","""S122""","""G""",1450.0,,,,,7,0,0,0,7.280008,
"""APPW50697209842""","""01/02/92""","""30/09/16""","""C10022""",-0.091923,"""COM0013284""",0.021114,4,1600.0,"""B030""","""P""","""Y""","""S122""","""G""",0.0,24000,4,35.5,943,2,0,0,0,0.0,10.085851
"""APPY50870035036""","""27/06/78""","""30/09/16""","""C10002""",0.114988,"""COM0000098""",0.021114,3,9893.0,"""B002""","""P""","""Y""","""S122""","""G""",1366.0,80000,5,,,10,0,0,1,7.220374,11.289794


# Custom Transformations in Pipeline

Requires polars_ds version >= v0.4.6.

In [20]:
df = pl.read_parquet("../examples/dependency.parquet")
df.head()

ID,Gender,DOB,Lead_Creation_Date,City_Code,City_Category,Employer_Code,Employer_Category1,Employer_Category2,Monthly_Income,Customer_Existing_Primary_Bank_Code,Primary_Bank_Type,Contacted,Source,Source_Category,Existing_EMI,Loan_Amount,Loan_Period,Interest_Rate,EMI,Var1,Approved
str,str,str,str,str,str,str,str,i64,f64,str,str,str,str,str,f64,i64,i64,f64,i64,i64,i64
"""APPC90493171225""","""Female""","""23/07/79""","""15/07/16""","""C10001""","""A""","""COM0044082""","""A""",4,2000.0,"""B001""","""P""","""N""","""S122""","""G""",0.0,,,,,0,0
"""APPD40611263344""","""Male""","""07/12/86""","""04/07/16""","""C10003""","""A""","""COM0000002""","""C""",1,3500.0,"""B002""","""P""","""Y""","""S122""","""G""",0.0,20000.0,2.0,13.25,953.0,10,0
"""APPE70289249423""","""Male""","""10/12/82""","""19/07/16""","""C10125""","""C""","""COM0005267""","""C""",4,2250.0,"""B003""","""G""","""Y""","""S143""","""B""",0.0,45000.0,4.0,,,0,0
"""APPF80273865537""","""Male""","""30/01/89""","""09/07/16""","""C10477""","""C""","""COM0004143""","""A""",4,3500.0,"""B003""","""G""","""Y""","""S143""","""B""",0.0,92000.0,5.0,,,7,0
"""APPG60994436641""","""Male""","""19/04/85""","""20/07/16""","""C10002""","""A""","""COM0001781""","""A""",4,10000.0,"""B001""","""P""","""Y""","""S134""","""B""",2500.0,50000.0,2.0,,,10,0


In [21]:
from typing import Union, List

# Any custom function must satisfy the following function signature:
# func(df:Union[pl.DataFrame, pl.LazyFrame], cols: List[str], ...) -> List[pl.Expr]
# where ... means kwargs
# Here is a custom imputer

def smallest_abs_impute(df:Union[pl.DataFrame, pl.LazyFrame], cols: List[str], epsilon:float = 0.01) -> List[pl.Expr]:
    """
    Custom fit function: learn one fill value per column and return fill-null expressions.

    For every column in `cols`, the fill value is the minimum of the column's absolute
    values in `df`, plus `epsilon`. The returned expressions are in the same order as `cols`.
    """
    # A single lazy query yields a one-row frame holding min(|col|) + epsilon for each column.
    learned = df.lazy().select(pl.col(cols).abs().min() + epsilon).collect()
    fill_values = learned.row(0)

    exprs = []
    for name, value in zip(cols, fill_values):
        exprs.append(pl.col(name).fill_null(value))
    return exprs


In [22]:
bp = (
    Blueprint(df, name = "example", target = "approved", lowercase=True)
    .append_fit_func(smallest_abs_impute, ["var1", "existing_emi", "loan_amount"], epsilon = 0.5)
    # Use append_fit_func for custom transforms
)
# Notice the fitted fill values are min(|x|) + 0.5 per column: 0.5 for var1 and existing_emi (their min abs is 0) and 5000.5 for loan_amount (its min abs is 5000).
pipe:Pipeline = bp.materialize()
pipe

Naive Query Steps: 

Step 1:
col("var1").fill_null([dyn float: 0.5]),
col("existing_emi").fill_null([dyn float: 0.5]),
col("loan_amount").fill_null([dyn float: 5000.5])


In [23]:
pipe.transform(df).null_count().select(["var1", "existing_emi", "loan_amount"])

var1,existing_emi,loan_amount
u32,u32,u32
0,0,0


# Scriptable Steps in Pipeline

What is a scriptable step? It means we can encode steps in a json or yaml file easily. As long as we can turn the text into 
a valid Python dictionary, the step (the transformation) can be defined.

In [24]:
df.select(
    pl.col("Existing_EMI").null_count() 
)

Existing_EMI
u32
51


In [25]:
bp = Blueprint(df, name = "example", target = "approved")

# Takes in a dict with 3 fields: `name`, `args`, and `kwargs`. 
# Args and kwargs are optional depending on whether the method call needs certain arguments.
step_dict_1 = {
    "name": "impute",
    "kwargs": {"cols": ["Existing_EMI"], "method": "median"} 
}

step_dict_2 = {
    "name": "does_not_exist",
    "kwargs": {"test": 1} 
}

# filter_step = {
#     "name": "filter",
#     "args": ["Employer_Category1 is not null"]
# }

bp.append_step_from_dict(
    step_dict_1
)

# .append_step_from_dict(
#     filter_step
# )

# bp.append_step_from_dict(step_dict_2) # Will error
pipe = bp.materialize()
# 
df_transformed = pipe.transform(df)
df_transformed.select(
    pl.col("Existing_EMI").null_count() # Imputed. So 0
)

Existing_EMI
u32
0


# Custom Transforms as a Scriptable Step in Pipeline

You need to inherit from the Blueprint class. Once the blueprint is materialized (learned), you do not need this subclass any more, because the "learned" info should all be encoded as Polars expressions.

In [26]:
from polars_ds.modeling import Blueprint, FitStep
from typing import Union, List

from functools import partial

def smallest_abs_impute(df:Union[pl.DataFrame, pl.LazyFrame], cols: List[str], epsilon:float = 0.01) -> List[pl.Expr]:
    """
    Custom fit function: learn min(|column|) + epsilon for each column in `cols` and return
    fill-null expressions whose output names carry the "_imputed" suffix.

    Note: this redefines (shadows) the same-named function from the earlier
    "Custom Transformations" section, which had no output-name suffix.
    """
    # A single lazy query yields a one-row frame holding the learned fill value per column.
    learned = df.lazy().select(pl.col(cols).abs().min() + epsilon).collect()
    fill_values = learned.row(0)

    exprs = []
    for name, value in zip(cols, fill_values):
        exprs.append(pl.col(name).fill_null(value).name.suffix("_imputed"))
    return exprs

class ExtendedBlueprint(Blueprint):
    """
    Blueprint subclass that exposes the custom imputers as methods, so they can be
    referenced by method name in the dicts passed to `append_step_from_dict`.
    """

    def __init__(self, *args, **kwargs):
        # Pure pass-through to Blueprint's constructor.
        super().__init__(*args, **kwargs)

    def smallest_abs_impute(self, cols: List[str], epsilon:float = 0.01) -> "ExtendedBlueprint":
        """
        Queue a fit step that runs the module-level `smallest_abs_impute` on `cols`.

        Returns self so that calls can be chained.
        """
        # Bind every argument except df and cols. Inside this method the bare name
        # resolves to the module-level function, not to this method.
        # (An inner function works as well as functools.partial; see smallest_abs_impute2.)
        fit_func = partial(smallest_abs_impute, epsilon=epsilon)
        # NOTE(review): self.exclude is presumably the set of columns kept out of fitting
        # (e.g. the target) — confirm against Blueprint.
        step = FitStep(fit_func, cols, self.exclude)
        self._steps.append(step)
        return self

    def smallest_abs_impute2(self, cols: List[str], epsilon:float = 0.01) -> "ExtendedBlueprint":
        """
        Same imputation as `smallest_abs_impute`, but implemented with a closure and
        writing to columns suffixed "_imputed2".

        Returns self so that calls can be chained.
        """
        # The closure captures epsilon, leaving df and cols as the only parameters.
        def inner_func(df:Union[pl.DataFrame, pl.LazyFrame], cols: List[str]) -> List[pl.Expr]:
            fill_values = df.lazy().select(pl.col(cols).abs().min() + epsilon).collect().row(0)
            exprs = []
            for name, value in zip(cols, fill_values):
                exprs.append(pl.col(name).fill_null(value).name.suffix("_imputed2"))
            return exprs

        step = FitStep(inner_func, cols, self.exclude)
        self._steps.append(step)
        return self


In [27]:
bp = ExtendedBlueprint(df, name = "example", target = "approved")

# Takes in a dict with 3 fields: `name`, `args`, and `kwargs`. 
# Args and kwargs are optional depending on whether the method call needs certain arguments.
step_dict_1 = {
    "name": "smallest_abs_impute",
    "kwargs": {"cols": ["Existing_EMI"], "epsilon": 0.01} 
}

step_dict_2 = {
    "name": "smallest_abs_impute2",
    "kwargs": {"cols": ["Existing_EMI"], "epsilon": 0.01} 
}

bp.append_step_from_dict(
    step_dict_1
).append_step_from_dict(
    step_dict_2
)

pipe = bp.materialize()
df_transformed = pipe.transform(df)

df_transformed.with_columns(
    impute_value = pl.col("Existing_EMI").abs().min() + 0.01
).filter(
    pl.col("Existing_EMI").is_null()
).select(
    pl.col("Existing_EMI"),
    pl.col("Existing_EMI_imputed"),
    pl.col("Existing_EMI_imputed2"),
    pl.col("impute_value")
)

Existing_EMI,Existing_EMI_imputed,Existing_EMI_imputed2,impute_value
f64,f64,f64,f64
,0.01,0.01,0.01
,0.01,0.01,0.01
,0.01,0.01,0.01
,0.01,0.01,0.01
,0.01,0.01,0.01
…,…,…,…
,0.01,0.01,0.01
,0.01,0.01,0.01
,0.01,0.01,0.01
,0.01,0.01,0.01


# Wrapping PDS Pipeline (blueprint) Inside a Sklearn Pipeline

It is not recommended, but it is possible. If there is a sklearn pipeline transform you want, and that is not implemented in PDS, please submit a feature request.

In [28]:
import polars_ds.modeling as pm
# Aliased so that sklearn's Pipeline does not shadow the polars_ds `Pipeline` imported in the
# first cell; earlier cells (e.g. Pipeline.from_json) would otherwise break when re-run.
from sklearn.pipeline import Pipeline as SklearnPipeline
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.exceptions import NotFittedError

class CustomPDSTransformer(BaseEstimator, TransformerMixin):
    """
    Sklearn-compatible transformer that wraps a polars_ds blueprint.

    `fit` builds and materializes the blueprint on the given frame; `transform`
    applies the materialized pipeline.
    """

    def __init__(self):
        # Holds the materialized polars_ds pipeline; stays None until fit() is called.
        self.pipe = None

    def fit(self, df, y=None):
        """
        Build the blueprint from `df` and materialize it.

        `y` is accepted for sklearn API compatibility and is not used; the target
        column is named in the blueprint itself. Returns self.
        """
        # specify all the rules for the transform here
        bp = (
            pm.Blueprint(df, name = "example", target = "approved", lowercase=True)
            .filter(
                "city_category is not null" # or equivalently, you can do: pl.col("city_category").is_not_null()
            )
            # Only columns that exist in this frame are listed: no step in this blueprint
            # creates "test_col", so it is not selected here.
            .select(cs.numeric() | cs.by_name(["gender", "employer_category1", "city_category"]))
            .linear_impute(features = ["var1", "existing_emi"], target = "loan_period")
            .impute(["existing_emi"], method = "median")
        )
        self.pipe = bp.materialize()
        return self

    def transform(self, df, y=None):
        """
        Apply the fitted pipeline to `df`.

        Raises NotFittedError when called before fit().
        """
        if self.pipe is None:
            raise NotFittedError(
                "This CustomPDSTransformer instance is not fitted yet. Call 'fit' before using 'transform'."
            )
        return self.pipe.transform(df)

# ---------------------------------------------------------------

df = pl.read_parquet("../examples/dependency.parquet")

pipe = SklearnPipeline(
    steps=[
        ("CustomPDSTransformer", CustomPDSTransformer())
    ]
)
df_transformed = pipe.fit_transform(df)
df_transformed


gender,city_category,employer_category1,employer_category2,monthly_income,existing_emi,loan_amount,loan_period,interest_rate,emi,var1,approved
str,str,str,i64,f64,f64,i64,f64,f64,i64,i64,i64
"""Female""","""A""","""A""",4,2000.0,0.0,,0.0,,,0,0
"""Male""","""A""","""C""",1,3500.0,0.0,20000,2.0,13.25,953,10,0
"""Male""","""C""","""C""",4,2250.0,0.0,45000,4.0,,,0,0
"""Male""","""C""","""A""",4,3500.0,0.0,92000,5.0,,,7,0
"""Male""","""A""","""A""",4,10000.0,2500.0,50000,2.0,,,10,0
…,…,…,…,…,…,…,…,…,…,…,…
"""Female""","""A""","""A""",1,4900.0,0.0,,5.0981,,,10,0
"""Female""","""C""","""A""",4,7190.1,1450.0,,3.557644,,,7,0
"""Female""","""B""","""C""",4,1600.0,0.0,24000,4.0,35.5,943,2,0
"""Male""","""A""","""C""",3,9893.0,1366.0,80000,5.0,,,10,0
