In [1]:
import polars as pl
import polars.selectors as cs
from polars_ds.pipeline import Pipeline, Blueprint
from polars.testing import assert_frame_equal

# Builtin Pipeline Functions

You can use polars_ds pipelines for:

1. Data Science Pipelines
2. Data preparation, manipulation, wrangling pipelines

In [2]:
# Load the example data, leaving out the columns this walkthrough does not use.
df = (
    pl.read_parquet("../examples/dependency.parquet")
    .select(pl.exclude(["DOB", "Source", "Lead_Creation_Date", "City_Code", "Employer_Code"]))
)
df.head()

ID,Gender,City_Category,Employer_Category1,Employer_Category2,Monthly_Income,Customer_Existing_Primary_Bank_Code,Primary_Bank_Type,Contacted,Source_Category,Existing_EMI,Loan_Amount,Loan_Period,Interest_Rate,EMI,Var1,Approved
str,str,str,str,i64,f64,str,str,str,str,f64,i64,i64,f64,i64,i64,i64
"""APPC90493171225""","""Female""","""A""","""A""",4,2000.0,"""B001""","""P""","""N""","""G""",0.0,,,,,0,0
"""APPD40611263344""","""Male""","""A""","""C""",1,3500.0,"""B002""","""P""","""Y""","""G""",0.0,20000.0,2.0,13.25,953.0,10,0
"""APPE70289249423""","""Male""","""C""","""C""",4,2250.0,"""B003""","""G""","""Y""","""B""",0.0,45000.0,4.0,,,0,0
"""APPF80273865537""","""Male""","""C""","""A""",4,3500.0,"""B003""","""G""","""Y""","""B""",0.0,92000.0,5.0,,,7,0
"""APPG60994436641""","""Male""","""A""","""A""",4,10000.0,"""B001""","""P""","""Y""","""B""",2500.0,50000.0,2.0,,,10,0


## 1. A data science pipeline with data science transformers

In [3]:
# SQL text that is later handed to the blueprint's sql_transform step.
# It keeps every column, adds a constant 'TEST' column named test_col,
# and drops the rows whose loan_amount is null.
# Column names are written in lowercase; the blueprint that uses this query
# is created with lowercase=True.
sql = """
select
*
, 'TEST' as test_col
from df
where loan_amount is not null
"""

In [4]:
# Create a blueprint first.
# A blueprint is a plan for a pipeline. No hard work will be done until the blueprint is materialized, which
# is when the transforms are fitted (e.g. scale learns the mean and std from base data).
# If target is specified for the blueprint, target will be excluded from all transformations that require a fit,
# and target will be auto-filled if the transformation requires a target field and no target field is explicitly given.

bp = (
    Blueprint(df, name = "example", target = "approved", lowercase=True) # You can optionally put the target of the ML model here
    # Run the SQL defined above: adds test_col and keeps only rows with a non-null loan_amount
    .sql_transform(sql) # Run a SQL transform on the df
    # Say you want to remove a population for your data pipeline.
    .filter( 
        "city_category is not null" # or equivalently, you can do: pl.col("city_category").is_not_null()
    )
    # The target is given explicitly here, because loan_period is not the prediction target of the blueprint.
    # Use a linear regression with x1 = var1, x2 = existing_emi to predict missing values in loan_period
    .linear_impute(features = ["var1", "existing_emi"], target = "loan_period") 
    .impute(["existing_emi"], method = "median")
    .with_columns( # generate some features
        pl.col("existing_emi").log1p().alias("existing_emi_log1p"),
        pl.col("loan_amount").log1p().alias("loan_amount_log1p"),
        # NOTE(review): this clips the raw loan_amount to [0, 1000], not its log1p, although the alias
        # says "log1p_clipped" -- confirm which one is intended.
        pl.col("loan_amount").clip(lower_bound = 0, upper_bound = 1000).alias("loan_amount_log1p_clipped"),
        pl.col("loan_amount").sqrt().alias("loan_amount_sqrt"),
        # NOTE(review): shift(-1) presumably pulls the value of the following row (a lead rather than
        # a lag) -- confirm the intended direction.
        pl.col("loan_amount").shift(-1).alias("loan_amount_lag_1") # any kind of lag transform
    )
    .scale( # target is numerical, but will be excluded automatically because bp is initialized with a target
        cs.numeric().exclude(["var1", "existing_emi_log1p"]), method = "standard"
    ) # Scale the columns up to this point. The columns below won't be scaled
    .with_columns(
        # Add missing flags
        pl.col("employer_category1").is_null().cast(pl.UInt8).alias("employer_category1_is_missing")
    )
    # No need to specify target because we initialized bp with a target.
    # Here every column except id is passed; per the original note, cols = None would mean "encode all str columns".
    .woe_encode(cols = pl.exclude('id'))
    .sort(by = "monthly_income", descending=True)
    # Other encoders that could be used instead:
    # .one_hot_encode(cols=None, drop_first=True) # None means all str columns, or you can provide a list of columns
    # .target_encode("employer_category1", min_samples_leaf = 20, smoothing = 10.0) # same as above
)

# Prints a text summary of the blueprint (name, number of steps, expected features).
print(bp)

Blueprint name: example
Column names: Lowercase all incoming columns.
Blueprint current steps: 9
Features Expected: ['id', 'gender', 'city_category', 'employer_category1', 'employer_category2', 'monthly_income', 'customer_existing_primary_bank_code', 'primary_bank_type', 'contacted', 'source_category', 'existing_emi', 'loan_amount', 'loan_period', 'interest_rate', 'emi', 'var1', 'approved']



In [5]:
# Materialize the blueprint into a fitted pipeline.
pipe: Pipeline = bp.materialize(
    # Optional. It is passed on to .collect() whenever there is a fit step,
    # so the user may decide which query optimizations to use.
    optimizations=pl.QueryOptFlags(),
)
# Text representation of the pipeline
pipe

Pipeline(name='example', feature_names_in_=['id', 'gender', 'city_category', 'employer_category1', 'employer_category2', 'monthly_income', 'customer_existing_primary_bank_code', 'primary_bank_type', 'contacted', 'source_category', 'existing_emi', 'loan_amount', 'loan_period', 'interest_rate', 'emi', 'var1', 'approved'], feature_names_out_=['id', 'gender', 'city_category', 'employer_category1', 'employer_category2', 'monthly_income', 'customer_existing_primary_bank_code', 'primary_bank_type', 'contacted', 'source_category', 'existing_emi', 'loan_amount', 'loan_period', 'interest_rate', 'emi', 'var1', 'approved', 'test_col', 'existing_emi_log1p', 'loan_amount_log1p', 'loan_amount_log1p_clipped', 'loan_amount_sqrt', 'loan_amount_lag_1', 'employer_category1_is_missing'], transforms=[<polars_ds.pipeline._step.SQLStep object at 0x7ff62266e650>, <polars_ds.pipeline._step.ExprStep object at 0x7ff6117ad360>, <polars_ds.pipeline._step.ExprStep object at 0x7ff6117aca90>, <polars_ds.pipeline._step

In [6]:
# To get the features (X) and the target (y) back as separate frames,
# run this instead:
# df_x, df_y = pipe.transform(df, separate=True)

df_transformed = pipe.transform(
    df,
    # Optional. Can be used to tune performance during collect.
    optimizations=pl.QueryOptFlags(),
)
df_transformed.head()

id,gender,city_category,employer_category1,employer_category2,monthly_income,customer_existing_primary_bank_code,primary_bank_type,contacted,source_category,existing_emi,loan_amount,loan_period,interest_rate,emi,var1,approved,test_col,existing_emi_log1p,loan_amount_log1p,loan_amount_log1p_clipped,loan_amount_sqrt,loan_amount_lag_1,employer_category1_is_missing
str,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,i64,i64,f64,f64,f64,f64,f64,f64,u8
"""APPJ70184404909""",0.042296,0.080959,-0.318275,0.371879,170.5226,0.062811,0.172773,-0.001157,-0.221381,-0.096348,3.598437,0.093681,-0.977179,3.93218,7,0,-0.001157,0.0,2.204828,,2.937331,0.343978,0
"""APPD50002916548""",0.042296,0.080959,,,53.332446,-0.572202,-0.481119,-0.001157,0.075118,197.696903,0.343995,0.950229,-0.635123,0.134903,10,0,-0.001157,13.209344,0.683076,,0.543738,-0.632348,1
"""APPA60086062823""",0.042296,0.080959,-0.318275,0.371879,44.4011,0.185567,0.172773,-0.001157,0.075118,-0.096348,-0.957782,-2.475963,,,10,0,-0.001157,0.0,-1.546165,,-1.263712,-0.306906,0
"""APPO10994188022""",0.042296,0.080959,-0.318275,0.371879,44.4011,0.185567,0.172773,-0.001157,0.075118,-0.096348,3.598437,-2.475963,,,10,0,-0.001157,0.0,2.204828,,2.937331,-0.306906,0
"""APPO90710036421""",0.042296,0.080959,0.193157,0.371879,44.4011,0.185567,0.172773,-0.001157,0.042189,0.32068,3.598437,0.093681,,,10,0,-0.001157,7.048386,2.204828,,2.937331,-0.632348,0


In [7]:
import re


def _release_tuple(version: str) -> tuple:
    """
    Returns the leading numeric release components of a version string as a tuple of ints.

    For example "1.34.0" gives (1, 34, 0) and "1.35.0-beta.1" gives (1, 35, 0).
    Calling int() on every dot-separated piece would raise ValueError on the second form.
    An empty tuple is returned when the string does not start with a number.
    """
    match = re.match(r"(\d+)(?:\.(\d+))?(?:\.(\d+))?", version)
    if match is None:
        return ()
    return tuple(int(part) for part in match.groups() if part is not None)


# The batch-collection demo below is only run on polars >= 1.34.0.
if _release_tuple(pl.__version__) >= (1, 34, 0):
    batches = []
    for df_batch in pipe.transform(df, return_lazy=True).collect_batches():
        batches.append(df_batch)
        # Pass the batch to any ML model which can be updated. (Online training)

    # Here we test that the df combined from the batches is equivalent to the original df_transformed
    # up to reordering. If we don't sort by id, the frames are equal in terms of all the transformed records
    # but may not be equal in terms of the ordering.
    # The reason is that when collecting from batches, the ordering may not be
    # strictly the same as the original. In practice, the `sort` transform should
    # be avoided if you want the same output ordering without manually re-ordering the batches.

    df_transformed_from_batches = pl.concat(batches).sort("id")
    assert_frame_equal(df_transformed.sort("id"), df_transformed_from_batches)

In [8]:
# Expected to be empty: the blueprint's filter step keeps only rows with a non-null city_category.
df_transformed.filter(pl.col("city_category").is_null())

id,gender,city_category,employer_category1,employer_category2,monthly_income,customer_existing_primary_bank_code,primary_bank_type,contacted,source_category,existing_emi,loan_amount,loan_period,interest_rate,emi,var1,approved,test_col,existing_emi_log1p,loan_amount_log1p,loan_amount_log1p_clipped,loan_amount_sqrt,loan_amount_lag_1,employer_category1_is_missing
str,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,i64,i64,f64,f64,f64,f64,f64,f64,u8


In [9]:
# Expected to be empty: the SQL step keeps only rows where loan_amount is not null.
df_transformed.filter(pl.col("loan_amount").is_null())

id,gender,city_category,employer_category1,employer_category2,monthly_income,customer_existing_primary_bank_code,primary_bank_type,contacted,source_category,existing_emi,loan_amount,loan_period,interest_rate,emi,var1,approved,test_col,existing_emi_log1p,loan_amount_log1p,loan_amount_log1p_clipped,loan_amount_sqrt,loan_amount_lag_1,employer_category1_is_missing
str,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,i64,i64,f64,f64,f64,f64,f64,f64,u8


## 2. A data preparation and manipulation pipeline

In [10]:

bp2 = (
    # The target of the ML model can optionally be given here.
    Blueprint(df, name="example", target="approved", lowercase=True)
    # Equivalent expression form: pl.col("city_category").is_not_null()
    .filter("city_category is not null")
    # Generate some features
    .with_columns(
        pl.col("existing_emi").log1p().alias("existing_emi_log1p"),
        pl.col("loan_amount").log1p().alias("loan_amount_log1p"),
        pl.col("loan_amount").clip(lower_bound=0, upper_bound=1000).alias("loan_amount_log1p_clipped"),
        pl.col("loan_amount").sqrt().alias("loan_amount_sqrt"),
        # any kind of lag transform
        pl.col("loan_amount").shift(-1).alias("loan_amount_lag_1"),
    )
    # Aggregate loan_amount statistics per city_category
    .group_by_agg(
        by="city_category",
        agg=[
            pl.col("loan_amount").sqrt().mean().alias("loan_amount_sqrt_mean"),
            pl.col("loan_amount").min().alias("loan_amount_min"),
            pl.col("loan_amount").max().alias("loan_amount_max"),
        ],
    )
    .sort(by=["city_category"], descending=True)
)

print(bp2)

Blueprint name: example
Column names: Lowercase all incoming columns.
Blueprint current steps: 4
Features Expected: ['id', 'gender', 'city_category', 'employer_category1', 'employer_category2', 'monthly_income', 'customer_existing_primary_bank_code', 'primary_bank_type', 'contacted', 'source_category', 'existing_emi', 'loan_amount', 'loan_period', 'interest_rate', 'emi', 'var1', 'approved']



In [11]:
# Materialize the second blueprint into a pipeline, run it on df, and preview the result.
pipe2 = bp2.materialize()
df_transformed2 = pipe2.transform(df)
df_transformed2.head()

city_category,loan_amount_sqrt_mean,loan_amount_min,loan_amount_max
str,f64,i64,i64
"""C""",169.847635,5000,250000
"""B""",183.365209,5000,300000
"""A""",190.146432,5000,300000


In [12]:
# Load the stock price data from a CSV file in the working directory.
# try_parse_dates=True asks polars to try parsing date-like columns while reading.
df_app = pl.read_csv("apple_stock.csv", try_parse_dates=True)
df_app.head()

Date,Close
date,f64
1981-02-23,24.62
1981-05-06,27.38
1981-05-18,28.0
1981-09-25,14.25
1982-07-08,11.0


In [13]:
# Yearly aggregates of the closing price, computed on data sorted by date.
bp3 = (
    Blueprint(df_app)
    .sort(by=["Date"], descending=False)
    .group_by_dynamic_agg(
        index_column="Date",
        every="1y",
        agg=[
            pl.col("Close").sqrt().mean().alias("sqrt_mean"),
            pl.col("Close").min().alias("min"),
            pl.col("Close").max().alias("max"),
        ],
    )
)

pipe3 = bp3.materialize()
df_transformed3 = pipe3.transform(df_app)
df_transformed3.head()

Date,sqrt_mean,min,max
date,f64,f64,f64
1981-01-01,4.815216,14.25,28.0
1982-01-01,3.316625,11.0,11.0
1983-01-01,5.49082,23.13,40.0
1984-01-01,5.25184,27.13,28.12
1985-01-01,4.252269,14.75,20.0


# A Time Series Pipeline

In [14]:
import numpy as np
import random 

# Seed both random sources so that Restart & Run All reproduces the same synthetic data.
SEED = 42
rng = np.random.default_rng(SEED)
py_rng = random.Random(SEED)

# Synthetic time series data: three ids with 9, 15 and 6 observations respectively,
# each with its own 0-based integer timestamp, two random features and a random boolean target.
df_ts = pl.DataFrame(
    {
        "id": [1] * 9 + [2] * 15 + [3] * 6,
        "timestamp": list(range(9))
        + list(range(15))
        + list(range(6)),
        "var_1": rng.random(30),
        "var_2": rng.random(30),
        "target": py_rng.choices([False, True], k=30),
    }
)

bp4 = (
    Blueprint(df_ts)
    .scale(["var_1", "var_2"], method="standard")
    # Collect the rows of every id into lists (rescaling var_1 and var_2 on the way) ...
    .group_by_agg(
        by="id",
        maintain_order=True,
        agg=[
            "timestamp",
            pl.col("var_1").mul(10),
            pl.col("var_2").truediv(10),
            "target",
        ],
    )
    # ... and explode them back into one row per observation.
    .explode(columns=pl.exclude("id"))
    # Windows of 3 index units per id, each window starting at a data point.
    .group_by_dynamic_agg(
        index_column="timestamp",
        every="3i",
        group_by="id",
        start_by="datapoint",
        agg=[
            pl.concat_list("var_1", "var_2").alias("features"),
            pl.col("target").sum(),
        ],
    )
    # Turn the nested list into a fixed-shape (3, 2) array column.
    .with_columns(pl.col("features").cast(pl.Array(pl.Float64, (3, 2))))
)

pipe4 = bp4.materialize()
df_transformed4 = pipe4.transform(df_ts)
df_transformed4.head()

id,timestamp,features,target
i64,i64,"array[f64, (3, 2)]",u32
1,0,"[[-4.792937, -0.015359], [9.952815, -0.120919], [14.181205, 0.162681]]",1
1,3,"[[12.442229, -0.160611], [14.678651, -0.083225], [2.211781, 0.136785]]",2
1,6,"[[4.381065, 0.076757], [-6.968868, -0.03936], [-11.235111, -0.034404]]",2
2,0,"[[-14.00195, -0.0513], [6.935859, 0.053503], [-12.127647, 0.061455]]",1
2,3,"[[-14.697957, -0.068382], [-4.916968, 0.076963], [-8.356783, -0.059306]]",1


# Serialization Methods

You can always use pickle to preserve the pipeline, so I won't demonstrate that here. What's more exciting is that PDS pipelines can be serialized to JSON.

In [15]:
pipe.to_json("test.json")
# Read the file through a context manager so that the handle is always closed.
with open("test.json") as f:
    pipe_reload = Pipeline.from_json(f.read())
# assert_frame_equal raises if the frames differ, so no error means the reloaded
# pipeline reproduces the original output.
assert_frame_equal(df_transformed, pipe_reload.transform(df))

In [16]:
pipe2.to_json("test2.json")
# Read the file through a context manager so that the handle is always closed.
with open("test2.json") as f:
    pipe2_reload = Pipeline.from_json(f.read())
# assert_frame_equal raises if the frames differ, so no error means the reloaded
# pipeline reproduces the original output.
assert_frame_equal(df_transformed2, pipe2_reload.transform(df))

In [17]:
pipe3.to_json("test3.json")
# Read the file through a context manager so that the handle is always closed.
with open("test3.json") as f:
    pipe3_reload = Pipeline.from_json(f.read())
# assert_frame_equal raises if the frames differ, so no error means the reloaded
# pipeline reproduces the original output.
assert_frame_equal(df_transformed3, pipe3_reload.transform(df_app))

In [18]:
pipe4.to_json("test4.json")
# Read the file through a context manager so that the handle is always closed.
with open("test4.json") as f:
    pipe4_reload = Pipeline.from_json(f.read())
# assert_frame_equal raises if the frames differ, so no error means the reloaded
# pipeline reproduces the original output.
assert_frame_equal(df_transformed4, pipe4_reload.transform(df_ts))

# Custom Transformations in Pipeline

Need version >= v0.4.6 (Not released yet)

In [19]:
# Reload the dataset, this time keeping every column.
# NOTE(review): this rebinds `df`, which was first loaded with several columns excluded;
# earlier cells re-run after this point will see the wider frame.
df = pl.read_parquet("../examples/dependency.parquet")
df.head()

ID,Gender,DOB,Lead_Creation_Date,City_Code,City_Category,Employer_Code,Employer_Category1,Employer_Category2,Monthly_Income,Customer_Existing_Primary_Bank_Code,Primary_Bank_Type,Contacted,Source,Source_Category,Existing_EMI,Loan_Amount,Loan_Period,Interest_Rate,EMI,Var1,Approved
str,str,str,str,str,str,str,str,i64,f64,str,str,str,str,str,f64,i64,i64,f64,i64,i64,i64
"""APPC90493171225""","""Female""","""23/07/79""","""15/07/16""","""C10001""","""A""","""COM0044082""","""A""",4,2000.0,"""B001""","""P""","""N""","""S122""","""G""",0.0,,,,,0,0
"""APPD40611263344""","""Male""","""07/12/86""","""04/07/16""","""C10003""","""A""","""COM0000002""","""C""",1,3500.0,"""B002""","""P""","""Y""","""S122""","""G""",0.0,20000.0,2.0,13.25,953.0,10,0
"""APPE70289249423""","""Male""","""10/12/82""","""19/07/16""","""C10125""","""C""","""COM0005267""","""C""",4,2250.0,"""B003""","""G""","""Y""","""S143""","""B""",0.0,45000.0,4.0,,,0,0
"""APPF80273865537""","""Male""","""30/01/89""","""09/07/16""","""C10477""","""C""","""COM0004143""","""A""",4,3500.0,"""B003""","""G""","""Y""","""S143""","""B""",0.0,92000.0,5.0,,,7,0
"""APPG60994436641""","""Male""","""19/04/85""","""20/07/16""","""C10002""","""A""","""COM0001781""","""A""",4,10000.0,"""B001""","""P""","""Y""","""S134""","""B""",2500.0,50000.0,2.0,,,10,0


In [20]:
from typing import Union, List

# Any custom function must satisfy the following function signature:
# func(df:Union[pl.DataFrame, pl.LazyFrame], cols: List[str], ...) -> List[pl.Expr]
# where ... means kwargs
# Here is a custom imputer

def smallest_abs_impute(df:Union[pl.DataFrame, pl.LazyFrame], cols: List[str], epsilon:float = 0.01) -> List[pl.Expr]:
    """
    Builds one fill-null expression per column in cols. The fill value of a column is
    the minimum of its absolute values plus epsilon, computed on df.
    """
    min_abs_plus_eps = pl.col(cols).abs().min() + epsilon
    # A single row holding one fill value per column, in the order of cols.
    fill_values = df.lazy().select(min_abs_plus_eps).collect().row(0)
    exprs = []
    for name, value in zip(cols, fill_values):
        exprs.append(pl.col(name).fill_null(value))
    return exprs


In [21]:
bp = (
    Blueprint(df, name="example", target="approved", lowercase=True)
    # append_fit_func is the entry point for custom transforms
    .append_fit_func(
        smallest_abs_impute,
        ["var1", "existing_emi", "loan_amount"],
        epsilon=0.5,
    )
)
# Per the original note, the value to impute is 0.5: the smallest absolute value
# of these columns is 0, and epsilon is 0.5.
pipe: Pipeline = bp.materialize()
pipe

Pipeline(name='example', feature_names_in_=['id', 'gender', 'dob', 'lead_creation_date', 'city_code', 'city_category', 'employer_code', 'employer_category1', 'employer_category2', 'monthly_income', 'customer_existing_primary_bank_code', 'primary_bank_type', 'contacted', 'source', 'source_category', 'existing_emi', 'loan_amount', 'loan_period', 'interest_rate', 'emi', 'var1', 'approved'], feature_names_out_=['id', 'gender', 'dob', 'lead_creation_date', 'city_code', 'city_category', 'employer_code', 'employer_category1', 'employer_category2', 'monthly_income', 'customer_existing_primary_bank_code', 'primary_bank_type', 'contacted', 'source', 'source_category', 'existing_emi', 'loan_amount', 'loan_period', 'interest_rate', 'emi', 'var1', 'approved'], transforms=[<polars_ds.pipeline._step.ExprStep object at 0x7ff5f414a5f0>], ensure_features_in=False, ensure_features_out=True, lowercase=True, uppercase=False)

In [22]:
# Number of nulls left in the imputed columns after the transform.
pipe.transform(df).select(pl.col(["var1", "existing_emi", "loan_amount"]).null_count())

var1,existing_emi,loan_amount
u32,u32,u32
0,0,0


# Scriptable Steps in Pipeline

What is a scriptable step? It means a step can easily be encoded in a JSON or YAML file. As long as the text can be turned into
a valid Python dictionary, the step (the transformation) can be defined.

In [23]:
# Number of nulls in Existing_EMI before any imputation.
df.select(pl.col("Existing_EMI").null_count())

Existing_EMI
u32
51


In [24]:
bp = Blueprint(df, name="example", target="approved")

# A step is described by a dict with 3 fields: `name`, `args`, and `kwargs`.
# `args` and `kwargs` are optional, depending on whether the method call needs them.
step_dict_1 = dict(
    name="impute",
    kwargs=dict(cols=["Existing_EMI"], method="median"),
)

# A step whose name does not exist; appending it would error (see below).
step_dict_2 = dict(
    name="does_not_exist",
    kwargs=dict(test=1),
)

# A filter step could be described the same way and chained after the first one:
# filter_step = {
#     "name": "filter",
#     "args": ["Employer_Category1 is not null"]
# }
# bp.append_step_from_dict(step_dict_1).append_step_from_dict(filter_step)

bp.append_step_from_dict(step_dict_1)

# bp.append_step_from_dict(step_dict_2) # Will error
pipe = bp.materialize()

df_transformed = pipe.transform(df)
# Imputed. So 0
df_transformed.select(pl.col("Existing_EMI").null_count())

Existing_EMI
u32
0


# Custom Transforms as Scriptable Steps in Pipeline

You need to inherit from the Blueprint class. Once the blueprint is materialized (learned), you do not need this class any more, because the "learned" info should all be encoded as Polars expressions.

In [25]:
from polars_ds.pipeline import Blueprint, FitStep
from typing import Union, List

from functools import partial

def smallest_abs_impute(df:Union[pl.DataFrame, pl.LazyFrame], cols: List[str], epsilon:float = 0.01) -> List[pl.Expr]:
    """
    Builds one fill-null expression per column in cols. The fill value of a column is
    the minimum of its absolute values plus epsilon, computed on df. Each output
    column is named after its input column with the suffix "_imputed".

    Note: this redefines the smallest_abs_impute from earlier in the notebook; the
    earlier version does not add a suffix.
    """
    min_abs_plus_eps = pl.col(cols).abs().min() + epsilon
    # A single row holding one fill value per column, in the order of cols.
    fill_values = df.lazy().select(min_abs_plus_eps).collect().row(0)
    exprs = []
    for name, value in zip(cols, fill_values):
        exprs.append(pl.col(name).fill_null(value).name.suffix("_imputed"))
    return exprs

class ExtendedBlueprint(Blueprint):
    """
    A Blueprint with two extra imputation steps. Each step method appends a FitStep
    to the blueprint's steps and returns self, so calls can be chained.
    """

    def __init__(self, *args, **kwargs):
        # Nothing extra to set up: everything is forwarded to Blueprint.
        super().__init__(*args, **kwargs)

    def smallest_abs_impute(self, cols: List[str], epsilon:float = 0.01) -> "ExtendedBlueprint":
        """
        Appends a fit step that runs the module-level smallest_abs_impute with the given epsilon.
        """
        # Inside a method body the bare name refers to the module-level function, not to this method.
        # Everything except df and cols is bound here. An inner function works as well as
        # functools.partial (see smallest_abs_impute2).
        bound_fit = partial(smallest_abs_impute, epsilon=epsilon)
        step = FitStep(bound_fit, cols, self.exclude)
        self._steps.append(step)
        return self

    def smallest_abs_impute2(self, cols: List[str], epsilon:float = 0.01) -> "ExtendedBlueprint":
        """
        Same imputation as smallest_abs_impute, but written with an inner function instead of
        functools.partial. Output columns get the suffix "_imputed2".
        """
        def fit_with_epsilon(df:Union[pl.DataFrame, pl.LazyFrame], cols: List[str]) -> List[pl.Expr]:
            # epsilon is captured from the enclosing method call.
            fill_values = df.lazy().select(pl.col(cols).abs().min() + epsilon).collect().row(0)
            exprs = []
            for name, value in zip(cols, fill_values):
                exprs.append(pl.col(name).fill_null(value).name.suffix("_imputed2"))
            return exprs

        step = FitStep(fit_with_epsilon, cols, self.exclude)
        self._steps.append(step)
        return self


In [26]:
bp = ExtendedBlueprint(df, name="example", target="approved")

# A step is described by a dict with 3 fields: `name`, `args`, and `kwargs`.
# `args` and `kwargs` are optional, depending on whether the method call needs them.
step_dict_1 = dict(
    name="smallest_abs_impute",
    kwargs=dict(cols=["Existing_EMI"], epsilon=0.01),
)

step_dict_2 = dict(
    name="smallest_abs_impute2",
    kwargs=dict(cols=["Existing_EMI"], epsilon=0.01),
)

bp.append_step_from_dict(step_dict_1).append_step_from_dict(step_dict_2)

pipe = bp.materialize()
df_transformed = pipe.transform(df)

# Compare both imputed columns with the expected fill value on the rows that were null.
(
    df_transformed
    .with_columns(impute_value=pl.col("Existing_EMI").abs().min() + 0.01)
    .filter(pl.col("Existing_EMI").is_null())
    .select(
        pl.col("Existing_EMI"),
        pl.col("Existing_EMI_imputed"),
        pl.col("Existing_EMI_imputed2"),
        pl.col("impute_value"),
    )
)

Existing_EMI,Existing_EMI_imputed,Existing_EMI_imputed2,impute_value
f64,f64,f64,f64
,0.01,0.01,0.01
,0.01,0.01,0.01
,0.01,0.01,0.01
,0.01,0.01,0.01
,0.01,0.01,0.01
…,…,…,…
,0.01,0.01,0.01
,0.01,0.01,0.01
,0.01,0.01,0.01
,0.01,0.01,0.01


# Wrapping PDS Pipeline (blueprint) Inside a Sklearn Pipeline

It is not recommended, but it is possible. If there is a sklearn pipeline transform you want, and that is not implemented in PDS, please submit a feature request.

In [27]:
import polars_ds.pipeline as pm
# Imported under an alias so that it does not rebind `Pipeline`, which the rest of this
# notebook uses for polars_ds.pipeline.Pipeline (e.g. Pipeline.from_json).
from sklearn.pipeline import Pipeline as SklearnPipeline
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.exceptions import NotFittedError

class CustomPDSTransformer(BaseEstimator, TransformerMixin):
    """
    A sklearn-compatible transformer that wraps a PDS blueprint.

    fit builds the blueprint on the given frame and materializes it into a PDS pipeline;
    transform applies that pipeline.
    """

    def __init__(self):
        # Holds the materialized PDS pipeline. It stays None until fit is called.
        self.pipe = None

    def fit(self, df, y=None):
        """
        Builds and materializes the blueprint on df. y is accepted for sklearn
        compatibility and is not used. Returns self.
        """
        # specify all the rules for the transform here
        bp = (
            pm.Blueprint(df, name = "example", target = "approved", lowercase=True) 
            .filter( 
                "city_category is not null" # or equivalently, you can do: pl.col("city_category").is_not_null()
            )
            .select(cs.numeric() | cs.by_name(["gender", "employer_category1", "city_category"]))
            .linear_impute(features = ["var1", "existing_emi"], target = "loan_period") 
            .impute(["existing_emi"], method = "median")
        )
        self.pipe = bp.materialize()
        return self

    def transform(self, df, y=None):
        """
        Applies the fitted PDS pipeline to df. y is accepted for sklearn compatibility
        and is not used.

        Raises NotFittedError if fit has not been called yet.
        """
        if self.pipe is None:
            # Without this guard the failure is an opaque AttributeError on None.
            raise NotFittedError(
                "This CustomPDSTransformer instance is not fitted yet. Call 'fit' before using 'transform'."
            )
        return self.pipe.transform(df)

# ---------------------------------------------------------------

df = pl.read_parquet("../examples/dependency.parquet")

pipe = SklearnPipeline(
    steps=[
        ("CustomPDSTransformer", CustomPDSTransformer())    
    ]
)
df_transformed = pipe.fit_transform(df)
df_transformed


gender,city_category,employer_category1,employer_category2,monthly_income,existing_emi,loan_amount,loan_period,interest_rate,emi,var1,approved
str,str,str,i64,f64,f64,i64,f64,f64,i64,i64,i64
"""Female""","""A""","""A""",4,2000.0,0.0,,0.0,,,0,0
"""Male""","""A""","""C""",1,3500.0,0.0,20000,2.0,13.25,953,10,0
"""Male""","""C""","""C""",4,2250.0,0.0,45000,4.0,,,0,0
"""Male""","""C""","""A""",4,3500.0,0.0,92000,5.0,,,7,0
"""Male""","""A""","""A""",4,10000.0,2500.0,50000,2.0,,,10,0
…,…,…,…,…,…,…,…,…,…,…,…
"""Female""","""A""","""A""",1,4900.0,0.0,,5.0981,,,10,0
"""Female""","""C""","""A""",4,7190.1,1450.0,,3.557644,,,7,0
"""Female""","""B""","""C""",4,1600.0,0.0,24000,4.0,35.5,943,2,0
"""Male""","""A""","""C""",3,9893.0,1366.0,80000,5.0,,,10,0
