# ML Pipeline

In [None]:
from pathlib import Path

import pandas as pd
import numpy as np

from sklearn.pipeline import Pipeline, FunctionTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer

from sklearn import set_config
# Make every scikit-learn transformer return pandas DataFrames from transform(),
# so the column-name based helper functions below can be chained inside pipelines.
set_config(transform_output="pandas")

In [None]:
# Relative location of the training CSV (resolved against the current working directory).
train_path = Path("../datasets/train.csv")

# Load the raw training data into a DataFrame.
cars = pd.read_csv(train_path)

# Preview the first rows.
cars.head()

In [None]:
# Print the column dtypes and non-null counts of the raw data.
cars.info()

In [None]:
# Split the target from the features: keep an independent copy of `selling_price`
# as the labels, then remove that column from the feature frame.
cars_labels = cars['selling_price'].copy()
cars = cars.drop('selling_price', axis=1)

In [None]:
# Preview the first label values.
cars_labels.head()

In [None]:
# Preview the feature frame now that `selling_price` has been removed.
cars.head()

## Preprocessing Pipeline

### Custom Transformers

### Pipelines

#### Helper functions

In [None]:
def get_numeric_part(df: pd.DataFrame) -> pd.DataFrame:
    """Return a copy of *df* with `engine` and `max_power` reduced to numbers.

    Each value is split on whitespace and only its first token is kept; that
    token is then parsed as a number. Tokens that cannot be parsed (and
    missing values) become NaN. The input frame is left untouched.
    """
    result = df.copy()

    for column in ("engine", "max_power"):
        leading_token = result[column].str.split().str[0]
        result[column] = pd.to_numeric(leading_token, errors="coerce")

    return result

In [None]:
def normalize_mileage(df: pd.DataFrame) -> pd.DataFrame:
    """Return a copy of *df* with a numeric `mileage` and without `fuel`.

    The numeric part of each `mileage` value is extracted and parsed (NaN when
    nothing can be parsed). Rows whose mileage unit is ``km/kg`` are then
    divided by a per-fuel constant. The helper column `mileage_unit` and the
    `fuel` column are dropped from the result.

    Parameters
    ----------
    df : pd.DataFrame
        Must contain the columns `mileage` and `fuel`.

    Returns
    -------
    pd.DataFrame
        A new frame; the input is not modified.
    """
    # Divisor applied to km/kg readings, keyed by fuel type.
    # NOTE(review): these look like fuel densities in kg/l; converting km/kg to
    # km/l would normally *multiply* by density, so dividing may be inverted.
    # Values and direction are kept exactly as they were - confirm.
    km_per_kg_divisors = {
        "Petrol": 0.74,
        "Diesel": 0.832,
        "LPG": 0.54,
        "CNG": 0.128,
    }

    df = df.copy()

    # Stringify once so the .str accessor also works when the column holds no
    # strings at all (e.g. an all-NaN float column).
    mileage_text = df['mileage'].astype(str)
    df['mileage_unit'] = mileage_text.str.split().str[-1]

    # Force float so the in-place division below never targets an integer column.
    df['mileage'] = pd.to_numeric(
        mileage_text.str.extract(r'([\d\.]+)', expand=False),
        errors='coerce',
    ).astype(float)

    is_km_per_kg = df['mileage_unit'] == "km/kg"
    for fuel_type, divisor in km_per_kg_divisors.items():
        df.loc[is_km_per_kg & (df['fuel'] == fuel_type), 'mileage'] /= divisor

    return df.drop(columns=['mileage_unit', 'fuel'])

In [None]:
def group_seats(df : pd.DataFrame) -> pd.DataFrame:
    """Return a copy of *df* whose `seats` column is bucketed into categories.

    Buckets: ``less_than_five`` (< 5), ``five`` (== 5), ``more_than_five``
    (> 5). Rows matching none of the comparisons (e.g. NaN) get ``missing``.
    The resulting column has categorical dtype.
    """
    seat_count = df['seats']

    # The default is a string like the choices, so np.select does not have to
    # promote between incompatible dtypes.
    bucket = np.select(
        [seat_count < 5, seat_count == 5, seat_count > 5],
        ['less_than_five', 'five', 'more_than_five'],
        default='missing',
    )

    return df.assign(seats=bucket).astype({'seats': 'category'})

In [None]:
from category_encoders import TargetEncoder

# Target encoder restricted to the `name` column, with smoothing set to 10.
# Unknown and missing categories both use the encoder's "value" strategy.
# NOTE(review): this encoder is only referenced by `name_pip`, whose entry in the
# ColumnTransformer is currently commented out - confirm whether it is still needed.
target_encoder = TargetEncoder(
    cols=["name"],
    smoothing=10,
    handle_unknown="value",
    handle_missing="value"
)

In [None]:
def group_rare_fuel(df : pd.DataFrame) -> pd.DataFrame:
    """Return a copy of *df* where the `fuel` values CNG and LPG become ``other``.

    All other fuel values are kept as they are; the input is not modified.
    """
    rare_to_other = dict.fromkeys(('CNG', 'LPG'), 'other')
    return df.assign(fuel=df['fuel'].replace(rare_to_other))

In [None]:
def update_owner_grouping(df: pd.DataFrame) -> pd.DataFrame:
    """Return a copy of *df* with the `owner` categories regrouped.

    ``Third Owner`` and ``Fourth & Above Owner`` are merged into
    ``Third & Above Owner``; ``Test Drive Car`` is treated as ``First Owner``.
    Any other value is left unchanged.
    """
    merged_label = 'Third & Above Owner'
    regrouping = {
        'Third Owner': merged_label,
        'Fourth & Above Owner': merged_label,
        'Test Drive Car': 'First Owner',
    }
    return df.assign(owner=df['owner'].replace(regrouping))

In [None]:
from sklearn.preprocessing import OrdinalEncoder


# Ordinal encoder for the regrouped `owner` column. The position in `categories`
# is the encoded value: "Third & Above Owner" -> 0, "Second Owner" -> 1,
# "First Owner" -> 2. Values outside this list are encoded as -1 instead of raising.
ordinal_encoder = OrdinalEncoder(
    handle_unknown="use_encoded_value",
    unknown_value=-1,
    categories=[["Third & Above Owner", "Second Owner", "First Owner"]]
)

In [None]:
def convert_year_to_age(df: pd.DataFrame, reference_year: int = 2026) -> pd.DataFrame:
    """Return a copy of *df* with `year` replaced by an `age` column.

    Parameters
    ----------
    df : pd.DataFrame
        Must contain a `year` column.
    reference_year : int, default 2026
        Year the age is measured against (``age = reference_year - year``).
        The default keeps the previously hard-coded value, so existing callers
        get the same result.

    Returns
    -------
    pd.DataFrame
        A new frame with `age` appended and `year` removed; the input is not
        modified.
    """
    df = df.copy()
    df['age'] = reference_year - df['year']
    return df.drop(columns='year')

In [None]:
from sklearn.base import BaseEstimator, TransformerMixin


class BaseNumericFeatures(BaseEstimator, TransformerMixin):
    """Turn the raw columns into the numeric inputs used for interaction features.

    Expects a DataFrame with the columns `engine`, `mileage`, `fuel`, `year`
    and `owner`. Any other columns are passed through unchanged.

    The output has numeric `engine`, `mileage` and `owner` columns plus a new
    `age` column. `fuel` and `year` are consumed by the helper functions and
    are therefore absent from the result.
    """

    def fit(self, X, y=None):
        """Nothing is learned from the data; return the transformer itself."""
        return self

    def transform(self, X: pd.DataFrame) -> pd.DataFrame:
        """Return a transformed copy of *X* (the input is not modified)."""
        X = X.copy()

        # Keep only the leading token of `engine` and parse it as a number.
        X["engine"] = pd.to_numeric(
            X["engine"].str.split().str[0],
            errors="coerce"
        )

        # The helpers below take and return whole DataFrames, so they are
        # chained on the frame. Assigning their result to a single column, or
        # passing them a single Series, raises.
        X = normalize_mileage(X)        # numeric `mileage`; drops `fuel`
        X = convert_year_to_age(X)      # adds `age`; drops `year`
        X = update_owner_grouping(X)    # merges the owner categories

        # Encode ownership ordinally; labels missing from the mapping become NaN.
        X["owner"] = X["owner"].map({
            "First Owner": 2,
            "Second Owner": 1,
            "Third & Above Owner": 0
        })

        return X


In [None]:
class InteractionFeatures(BaseEstimator, TransformerMixin):
    """Append four interaction features built from numeric base columns.

    Expects a DataFrame with numeric `engine`, `mileage`, `age`, `km_driven`
    and `owner` columns. The input columns are kept; the new columns are
    appended to a copy of the input.
    """

    def fit(self, X, y=None):
        """Nothing is learned from the data; return the transformer itself."""
        return self

    def transform(self, X):
        """Return a copy of *X* with the interaction columns added."""
        # Small constant that keeps the denominators away from exactly zero.
        eps = 1e-6

        engine, mileage = X["engine"], X["mileage"]
        age, owner = X["age"], X["owner"]

        # NOTE(review): `mileage_per_year` offsets age by 1 while
        # `owner_per_year` offsets it by eps, so an age of 0 scales owner by
        # roughly 1e6 - confirm this asymmetry is intended.
        return X.assign(
            engine_mileage_interaction=engine / (mileage + eps),
            mileage_per_year=mileage / (age + 1),
            km_driven_by_age=X["km_driven"] * age,
            owner_per_year=owner / (age + eps),
        )


#### Pipelines

In [None]:
# One small pipeline per column group; they are combined in the
# ColumnTransformer defined after this cell.

# `engine` / `max_power`: leading number -> median imputation -> log1p -> scaling.
cat_to_num_pip = Pipeline(steps=[
    ("get_numeric_part", FunctionTransformer(get_numeric_part)),
    ("median_imputer", SimpleImputer(strategy="median")),
    ("log", FunctionTransformer(np.log1p)),
    ("standarize", StandardScaler()),
])

# `mileage` (+ `fuel` for the unit handling): normalize_mileage returns only the
# `mileage` column, which is what the feature_names_out callable declares.
mileage_pip = Pipeline(steps=[
    (
        "normalize_mileage",
        FunctionTransformer(
            normalize_mileage,
            feature_names_out=lambda transformer, input_features: ["mileage"],
        ),
    ),
    ("median_imputer", SimpleImputer(strategy="median")),
    ("log", FunctionTransformer(np.log1p)),
    ("standarize", StandardScaler()),
])

# `seats`: median imputation -> bucketing -> one-hot encoding.
seats_pip = Pipeline(steps=[
    ("median_imputer", SimpleImputer(strategy="median")),
    ("regroup", FunctionTransformer(group_seats)),
    ("one-hot encoding", OneHotEncoder(sparse_output=False)),
])

# `name`: target encoding -> scaling.
name_pip = Pipeline(steps=[
    ("target_encode", target_encoder),
    ("standarize", StandardScaler()),
])

# `fuel`: merge the rare fuel types -> one-hot encoding.
fuel_pip = Pipeline(steps=[
    ("group_rare", FunctionTransformer(group_rare_fuel)),
    ("one-hot encoding", OneHotEncoder(sparse_output=False)),
])

# Plain one-hot encoding for categorical columns that need no regrouping.
one_hot_pip = Pipeline(steps=[
    ("encoding", OneHotEncoder(sparse_output=False)),
])

# `owner`: regroup the categories -> ordinal encoding.
owner_pip = Pipeline(steps=[
    ("regroup", FunctionTransformer(update_owner_grouping)),
    ("ordinal_encoding", ordinal_encoder),
])

# `km_driven`: log1p -> scaling.
km_driven_pip = Pipeline(steps=[
    ("log", FunctionTransformer(np.log1p)),
    ("standarize", StandardScaler()),
])

# `year`: convert to age -> log1p -> scaling.
year_pip = Pipeline(steps=[
    ("convert_year_to_age", FunctionTransformer(convert_year_to_age)),
    ("log", FunctionTransformer(np.log1p)),
    ("standarize", StandardScaler()),
])

# Engineered features: numeric base columns first, then their interactions.
new_features_pip = Pipeline(steps=[
    ("base_numeric_features", BaseNumericFeatures()),
    ("interaction_features", InteractionFeatures()),
])


In [None]:
# Full preprocessing step: each entry is (name, pipeline, input columns).
# A column may feed several entries (e.g. 'fuel', 'mileage', 'year').
preproc = ColumnTransformer([
    
    ("cat_to_num", cat_to_num_pip, ['engine', 'max_power']),
    ("mileage", mileage_pip, ['mileage', 'fuel']),
    ("seats", seats_pip, ['seats']),
    # Target encoding of 'name' is currently disabled.
    # ("name", name_pip, ['name']),
    ("fuel", fuel_pip, ['fuel']),
    ("one_hot_encoding", one_hot_pip, ['seller_type', 'transmission']),
    ("owner", owner_pip, ['owner']),
    ("km_driven", km_driven_pip, ['km_driven']),
    ("year", year_pip, ['year']),
    # 'fuel' is included because BaseNumericFeatures calls normalize_mileage,
    # which reads the fuel column to decide how to rescale km/kg readings.
    ("new_features", new_features_pip, ['engine', 'mileage', 'year', 'km_driven', 'owner', 'fuel'])
    
])
# Fit on the training features and show the transformed frame as the cell output.
preproc.fit_transform(cars)