# ML Pipeline

In [None]:
from pathlib import Path

import pandas as pd
import numpy as np

from sklearn.pipeline import Pipeline, FunctionTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer

from sklearn import set_config
set_config(transform_output="pandas")

In [None]:
train_path = Path("../datasets/train.csv")

cars = pd.read_csv(train_path)

cars.head()

In [None]:
cars.info()

In [None]:
# Keep the original row count so the share of removed rows can be reported.
n_before = len(cars)

# Quartile-based fences on the target, using a 2.5 * IQR margin on each side.
Q1 = cars['selling_price'].quantile(0.25)
Q3 = cars['selling_price'].quantile(0.75)
IQR = Q3 - Q1
lower, upper = Q1 - 2.5 * IQR, Q3 + 2.5 * IQR

# True for rows whose price lies between the two fences.
mask = cars['selling_price'].between(lower, upper)
n_out = (~mask).sum()
print(f'Removing {n_out} outlier rows ({n_out/n_before:.2%})')

# overwrite cars with the filtered dataframe
cars = cars[mask].reset_index(drop=True)

In [None]:
# Separate the target from the features: cars_labels holds a copy of the
# prices, cars keeps every other column.
cars_labels = cars['selling_price'].copy()
cars = cars.drop('selling_price', axis=1)

In [None]:
cars_labels.head()

In [None]:
cars.head()

## Preprocessing Pipeline

### Helper functions

In [None]:
def normalize_mileage(df: pd.DataFrame) -> pd.Series:
    """Return the numeric ``mileage`` column with km/kg readings rescaled.

    The unit of each reading is the last whitespace-separated token of the
    ``mileage`` string; the value is the first run of digits/dots found in
    it (unparseable entries become NaN). Readings whose unit is ``"km/kg"``
    are divided by a constant selected by the row's ``fuel``; every other
    reading keeps its parsed value. The input frame is left untouched.
    """
    raw = df['mileage']
    unit = raw.str.split().str[-1]
    mileage = pd.to_numeric(
        raw.astype(str).str.extract(r'([\d\.]+)', expand=False),
        errors='coerce',
    )

    # Divisor applied to km/kg readings, keyed by fuel type.
    divisors = {"Petrol": 0.74, "Diesel": 0.832, "LPG": 0.54, "CNG": 0.128}
    per_kg = unit == "km/kg"
    for fuel_name, divisor in divisors.items():
        rows = per_kg & (df['fuel'] == fuel_name)
        mileage.loc[rows] /= divisor

    return mileage

In [None]:
def group_seats(df : pd.DataFrame) -> pd.DataFrame:
    """Bucket the numeric ``seats`` column into a categorical label.

    Returns a copy of ``df`` whose ``seats`` column holds one of
    ``'less_than_five'``, ``'five'`` or ``'more_than_five'``; rows that match
    none of the comparisons (e.g. NaN) are labelled ``'missing'``.
    """
    grouped = df.copy()
    seat_count = grouped['seats']

    # label -> rows that receive it
    buckets = {
        'less_than_five': seat_count < 5,
        'five': seat_count == 5,
        'more_than_five': seat_count > 5,
    }
    # The default is a string as well, so np.select builds one string array.
    labels = np.select(
        list(buckets.values()),
        list(buckets.keys()),
        default='missing',
    )
    grouped['seats'] = labels
    grouped['seats'] = grouped['seats'].astype('category')

    return grouped


In [None]:
from category_encoders import TargetEncoder

target_encoder = TargetEncoder(
    cols=["brand_effect"],
    smoothing=10,
    handle_unknown="value",
    handle_missing="value"
)

def group_rare_names(df: pd.DataFrame, threshold=10) -> pd.Series:
    """Return ``df['name']`` with infrequent values replaced by ``'other'``.

    A value is infrequent when it occurs fewer than ``threshold`` times in
    ``df['name']`` itself. The input frame is not modified.
    """
    names = df['name']
    frequency = names.value_counts()
    infrequent = frequency[frequency < threshold].index

    return names.replace(infrequent, 'other')

In [None]:
def group_rare_fuel(df : pd.DataFrame) -> pd.DataFrame:
    """Return a copy of ``df`` with the CNG and LPG fuels merged into ``'other'``."""
    merged_labels = {'CNG': 'other', 'LPG': 'other'}
    return df.assign(fuel=df['fuel'].replace(merged_labels))

In [None]:
def update_owner_grouping(df: pd.DataFrame) -> pd.DataFrame:
    """Return a copy of ``df`` with the ``owner`` labels regrouped.

    Third and fourth-or-later owners are merged into
    ``'Third & Above Owner'``; test-drive cars are counted as
    ``'First Owner'``. All other labels are kept.
    """
    relabel = {
        'Third Owner': 'Third & Above Owner',
        'Fourth & Above Owner': 'Third & Above Owner',
        'Test Drive Car': 'First Owner',
    }
    return df.assign(owner=df['owner'].replace(relabel))

In [None]:
from sklearn.preprocessing import OrdinalEncoder


# Encodes the regrouped owner labels by their position in `categories`
# (listed from "Third & Above Owner" up to "First Owner"). Labels that are not
# in the list are encoded as -1 instead of raising.
ordinal_encoder = OrdinalEncoder(
    handle_unknown="use_encoded_value",
    unknown_value=-1,
    categories=[["Third & Above Owner", "Second Owner", "First Owner"]]
)

In [None]:
def convert_year_to_age(df: pd.DataFrame, reference_year: int = 2026) -> pd.Series:
    """Return the vehicle age in years as a Series named ``age``.

    Args:
        df: Frame with a numeric ``year`` column.
        reference_year: Year the age is measured against. The default keeps
            the value that was previously hard-coded, so existing callers
            get unchanged results; pass another year to re-base the feature.

    Returns:
        ``reference_year - df['year']``, indexed like ``df``. The input
        frame is not modified.
    """
    return (reference_year - df['year']).rename('age')

### Custom Transformers

In [None]:
from sklearn.base import BaseEstimator, TransformerMixin


class BaseNumericFeatures(BaseEstimator, TransformerMixin):
    """Parse the raw numeric columns and derive the ``age`` feature.

    Expects a DataFrame with the columns ``engine``, ``max_power``, ``fuel``,
    ``year``, ``mileage`` and ``km_driven`` and returns the columns listed in
    ``_OUTPUT_COLUMNS``, in that order.

    The order matters: with ``set_config(transform_output="pandas")``
    scikit-learn relabels the returned columns positionally using
    ``get_feature_names_out()``. Returning the frame in whatever order the
    columns happened to be created would therefore attach the ``age`` label
    to the ``km_driven`` values and vice versa.
    """

    # Single source of truth for the output columns and their order.
    _OUTPUT_COLUMNS = ("engine", "max_power", "mileage", "age", "km_driven")

    def fit(self, X, y=None):
        """Nothing is learned from the data; returns ``self``."""
        return self

    def transform(self, X):
        """Return the parsed numeric features as a new DataFrame."""
        X = X.copy()

        # Keep the leading token of strings such as "1248 CC" / "74 bhp";
        # anything that is not a number becomes NaN.
        X["engine"] = pd.to_numeric(
            X["engine"].str.split().str[0],
            errors="coerce"
        )

        X["max_power"] = pd.to_numeric(
            X["max_power"].str.split().str[0],
            errors="coerce"
        )

        # normalize_mileage needs the `fuel` column, so derive the features
        # before the helper columns are discarded.
        X["mileage"] = normalize_mileage(X)
        X["age"] = convert_year_to_age(X)

        # Select by name so the column order always matches
        # get_feature_names_out(); this also discards `year` and `fuel`.
        return X[list(self._OUTPUT_COLUMNS)]

    def get_feature_names_out(self, input_features=None):
        """Return the output column names in the order produced by transform."""
        return list(self._OUTPUT_COLUMNS)

In [None]:
class InteractionFeatures(BaseEstimator, TransformerMixin):
    """Add two interaction features to the parsed numeric columns.

    Expects a DataFrame with the columns ``engine``, ``max_power``,
    ``mileage``, ``age`` and ``km_driven`` and returns the columns listed in
    ``_OUTPUT_COLUMNS``, in that order.

    The order matters: with ``set_config(transform_output="pandas")``
    scikit-learn relabels the returned columns positionally using
    ``get_feature_names_out()``. The new columns are created last, so
    returning the frame as-is while declaring them first would shift every
    label onto the wrong column.
    """

    # Single source of truth for the output columns and their order.
    _OUTPUT_COLUMNS = (
        "engine_mileage_ratio",
        "km_driven_age_interaction",
        "engine",
        "max_power",
        "mileage",
        "age",
        "km_driven",
    )

    def fit(self, X, y=None):
        """Nothing is learned from the data; returns ``self``."""
        return self

    def transform(self, X):
        """Return the input features plus the two interaction terms."""
        eps = 1e-6  # keeps the ratio finite when mileage is 0
        X = X.copy()

        X["engine_mileage_ratio"] = X["engine"] / (X["mileage"] + eps)
        X["km_driven_age_interaction"] = X["km_driven"] * X["age"]

        # Select by name so the column order always matches
        # get_feature_names_out().
        return X[list(self._OUTPUT_COLUMNS)]

    def get_feature_names_out(self, input_features=None):
        """Return the output column names in the order produced by transform."""
        return list(self._OUTPUT_COLUMNS)

In [None]:
class NameTranformation(BaseEstimator, TransformerMixin):
    """Reduce the car ``name`` to its brand and pool infrequent brands.

    The brand is the first whitespace-separated token of ``name``. During
    ``fit`` the brands seen at least ``threshold`` times are remembered;
    ``transform`` maps every other brand to ``'other'`` and renames the
    column to ``brand_effect``.

    Learning the frequent brands in ``fit`` (rather than recounting on each
    batch passed to ``transform``) keeps the encoding of a brand identical
    between training and prediction: recounting on a small prediction batch
    would push almost every brand below the threshold.

    Args:
        threshold: Minimum number of occurrences in the training data for a
            brand to keep its own label.
    """

    def __init__(self, threshold=5):  # no *args or **kwargs!
        self.threshold = threshold

    @staticmethod
    def _brand_of(X):
        """Return the first token of each ``name`` as a Series."""
        return X['name'].str.split().str[0]

    def fit(self, X, y=None):
        """Remember which brands occur at least ``threshold`` times in ``X``."""
        brand_counts = self._brand_of(X).value_counts()
        self.frequent_brands_ = brand_counts.index[
            (brand_counts >= self.threshold).to_numpy()
        ].tolist()
        return self

    def transform(self, X):
        """Return a copy of ``X`` with ``name`` replaced by ``brand_effect``."""
        X = X.copy()
        brands = self._brand_of(X)

        frequent = getattr(self, "frequent_brands_", None)
        if frequent is None:
            # Not fitted: fall back to counting on this batch, which is what
            # the transformer did before it learned anything in fit().
            brand_counts = brands.value_counts()
            frequent = brand_counts.index[
                (brand_counts >= self.threshold).to_numpy()
            ].tolist()

        # Missing names stay missing; only known-but-infrequent or unseen
        # brands are pooled.
        keep = brands.isin(frequent) | brands.isna()
        X['name'] = brands.where(keep, 'other')

        X.rename(columns={"name": "brand_effect"}, inplace=True)

        return X

    def get_feature_names_out(self, input_features=None):
        """Return the single output column name."""
        return ["brand_effect"]

### Pipelines

In [None]:
# Numeric branch: parse the raw columns, add interaction terms, impute with
# the median, log-compress and finally standardise.
num_inter_pip = Pipeline([
    ("base_numeric_features", BaseNumericFeatures()),
    ("interaction_features", InteractionFeatures()),
    ("median_imputer", SimpleImputer(strategy='median')),
    ("log", FunctionTransformer(np.log1p, feature_names_out='one-to-one')),
    ("standarize", StandardScaler()),
])

# All one-hot encoders below use handle_unknown="ignore": a category that is
# absent from the data seen in fit (e.g. a rare level missing from one CV
# training fold) is encoded as all zeros instead of raising at predict time.

# Seats: impute, bucket into three groups, one-hot encode.
seats_pip = Pipeline([
    ("median_imputer", SimpleImputer(strategy='median')),
    ("regroup", FunctionTransformer(group_seats, feature_names_out='one-to-one')),
    ("one-hot encoding", OneHotEncoder(sparse_output=False, handle_unknown="ignore"))
])

# Name: reduce to brand, target-encode, standardise. The step name "regroup"
# is referenced by the hyper-parameter search and must not change.
name_pip = Pipeline([
    ("regroup", NameTranformation()),
    ("target_encode", target_encoder),
    ("standarize", StandardScaler()),
])

# Fuel: merge the rare fuels, one-hot encode.
fuel_pip = Pipeline([
    ("group_rare", FunctionTransformer(group_rare_fuel, feature_names_out='one-to-one')),
    ("one-hot encoding", OneHotEncoder(sparse_output=False, handle_unknown="ignore"))
])

# Plain one-hot encoding for the remaining nominal columns.
one_hot_pip = Pipeline([
    ("encoding", OneHotEncoder(sparse_output=False, handle_unknown="ignore"))
])

# Owner: regroup the labels, then encode them ordinally.
owner_pip = Pipeline([
    ("regroup", FunctionTransformer(update_owner_grouping, feature_names_out='one-to-one')),
    ("ordinal_encoding", ordinal_encoder)
])


### Preprocessing Column Transformer

In [None]:
# Route each group of raw columns to its dedicated pipeline. `fuel` is passed
# twice: the numeric branch reads it while normalising mileage, the fuel
# branch encodes it. No `remainder` is given, so the ColumnTransformer default
# applies to columns not listed here.
preproc = ColumnTransformer([
    ("num_inter", num_inter_pip, ['engine', 'max_power', 'fuel', 'year', 'mileage', 'km_driven']),
    ("seats", seats_pip, ['seats']),
    ("name", name_pip, ['name']),
    ("fuel", fuel_pip, ['fuel']),
    ("one_hot_encoding", one_hot_pip, ['seller_type', 'transmission']),
    ("owner", owner_pip, ['owner']),
    
])
# preproc.fit_transform(cars)

## Full ML Pipeline

In [None]:
from sklearn.linear_model import Ridge

In [None]:
pipe = Pipeline([
    ("preproc", preproc),
    ("model", Ridge())
])

In [None]:
from sklearn.compose import TransformedTargetRegressor


full_pipe = TransformedTargetRegressor(
    regressor=pipe,
    func=np.log1p,
    inverse_func=np.expm1
)

In [None]:
full_pipe.fit(cars, cars_labels)

In [None]:
cars_predictions = full_pipe.predict(cars)
print(f" first seven pridictions: {cars_predictions[:7].round(2)}")
print(f" first seven real valuse: {cars_labels.iloc[:7].values}")

In [None]:
from sklearn.metrics import root_mean_squared_error


ridge_rmse = root_mean_squared_error(cars_labels, cars_predictions)
ridge_rmse

In [None]:
import matplotlib.pyplot as plt

residuals = cars_labels - cars_predictions

plt.scatter(cars_predictions, residuals, alpha=0.3)
plt.axhline(0, color="black")
plt.xlabel("Predicted")
plt.ylabel("Residuals")
plt.title("Residuals vs Predictions")
plt.show()

### Evaluate With Cross Validation

In [None]:
from sklearn.metrics import mean_absolute_error, root_mean_squared_error, r2_score, mean_absolute_percentage_error
from sklearn.model_selection import KFold, cross_validate

# Metrics collected by cross_validate. The error metrics use scikit-learn's
# "neg_" scorers, i.e. they are reported negated; print_cv_summery flips the
# sign of their means back.
scoring = {
    "mae": "neg_mean_absolute_error",
    "mape": "neg_mean_absolute_percentage_error",
    "rmse": "neg_root_mean_squared_error",
    "r2": "r2"
}

# 10 shuffled folds with a fixed seed, so every model is scored on the same splits.
cv = KFold(
    n_splits=10,
    shuffle=True,
    random_state=42
)

def print_cv_summery(cv_results):
    """Print the mean and standard deviation of the test metrics of a CV run.

    ``cv_results`` is the mapping returned by ``cross_validate`` and must
    contain ``test_mae``, ``test_mape``, ``test_rmse`` and ``test_r2``. The
    means of the three error metrics are sign-flipped before printing; the
    standard deviations are printed as computed. Returns ``None``.
    """
    scores = pd.DataFrame(cv_results)

    # (display label, cv_results column, flip the sign of the mean?)
    layout = [
        ("MAE", "test_mae", True),
        ("MAPE", "test_mape", True),
        ("RMSE", "test_rmse", True),
        ("R2", "test_r2", False),
    ]

    labels, means, spreads = [], [], []
    for label, column, flip in layout:
        average = scores[column].mean()
        labels.append(label)
        means.append(-average if flip else average)
        spreads.append(scores[column].std())

    print(pd.DataFrame({"metric": labels, "mean": means, "std": spreads}))

#### Ridge Model

In [None]:
# Cross-validate the target-transformed estimator (`full_pipe`), not the bare
# `pipe`: the model that is fitted and inspected in this notebook is trained
# on log1p(price), so the CV scores must come from the same estimator to be
# comparable.
cv_results = cross_validate(
    estimator=full_pipe,
    X=cars,
    y=cars_labels,
    scoring=scoring,
    cv=cv,
    return_train_score=True,
    n_jobs=-1
)

print_cv_summery(cv_results)

#### Decision Tree Regressor

In [None]:
from sklearn.tree import DecisionTreeRegressor

pipe = Pipeline([
    ("preproc", preproc),
    ("model", DecisionTreeRegressor(random_state=42))
])

# Train on log1p(price) and map predictions back with expm1.
full_pipe = TransformedTargetRegressor(
    regressor=pipe,
    func=np.log1p,
    inverse_func=np.expm1
)

# Score the target-transformed estimator built above. Passing the bare `pipe`
# would evaluate a model trained on raw prices and leave `full_pipe` unused.
cv_results = cross_validate(
    estimator=full_pipe,
    X=cars,
    y=cars_labels,
    scoring=scoring,
    cv=cv,
    return_train_score=True,
    n_jobs=-1
)

print_cv_summery(cv_results)

#### Random Forest Regressor

In [None]:
from sklearn.ensemble import RandomForestRegressor

pipe = Pipeline([
    ("preproc", preproc),
    ("model", RandomForestRegressor(random_state=42))
])

# Train on log1p(price) and map predictions back with expm1.
full_pipe = TransformedTargetRegressor(
    regressor=pipe,
    func=np.log1p,
    inverse_func=np.expm1
)

# Score the target-transformed estimator built above, the same one that is
# fitted on the full data afterwards, so CV and training metrics describe the
# same model.
cv_results = cross_validate(
    estimator=full_pipe,
    X=cars,
    y=cars_labels,
    scoring=scoring,
    cv=cv,
    return_train_score=True,
    n_jobs=-1
)

print_cv_summery(cv_results)

In [None]:
full_pipe.fit(cars, cars_labels)
cars_predictions = full_pipe.predict(cars)
forest_rmse = root_mean_squared_error(cars_labels, cars_predictions)
forest_rmse

In [None]:
# `cars_predictions` already holds the training predictions of the fitted
# forest (computed with the RMSE above); the forest is seeded, so fitting it
# a second time would only repeat the same work.
forest_mae = mean_absolute_error(cars_labels, cars_predictions)
forest_mae

In [None]:
forest_mape = mean_absolute_percentage_error(cars_labels, cars_predictions)
forest_mape

In [None]:
forest_r2 = r2_score(cars_labels, cars_predictions)
forest_r2

In [None]:
import matplotlib.pyplot as plt

residuals = cars_labels - cars_predictions

plt.scatter(cars_predictions, residuals, alpha=0.3)
plt.axhline(0, color="black")
plt.xlabel("Predicted")
plt.ylabel("Residuals")
plt.title("Residuals vs Predictions")
plt.show()

In [None]:
plt.scatter(cars_predictions, cars_labels, alpha=0.3)
# Reference line y = x: a point on it is predicted exactly, points above it
# are under-predicted. (A horizontal line at 0 carries no information here.)
plt.axline((0, 0), slope=1, color="black")
plt.xlabel("Predicted")
plt.ylabel("labels")
plt.title("labels vs Predictions")
plt.show()

In [None]:
plt.hist(residuals, bins=50)
plt.title("Residual Distribution")
plt.show()

#### Notes:
- Random Forest was the best model with CV
- The RF model's cross-validated performance was much worse than its performance on the training set => there is overfitting

### Fine Tune my RF model

In [None]:
from scipy.stats import randint


# Seed the forest: without it every candidate (and the final refit) draws
# different trees, so the search is not reproducible even though the sampler
# and the CV splitter are seeded.
pipe = Pipeline([
    ("preproc", preproc),
    ("random_forest", RandomForestRegressor(random_state=42))
])

# Train on log1p(price) and map predictions back with expm1.
full_pipe = TransformedTargetRegressor(
    regressor=pipe,
    func=np.log1p,
    inverse_func=np.expm1
)

# Search space. Keys follow the nesting
# TransformedTargetRegressor.regressor -> pipeline step -> ... -> parameter.
param_dist = {
    "regressor__preproc__name__regroup__threshold": randint(1, 31),

    "regressor__random_forest__n_estimators": randint(50, 800),
    "regressor__random_forest__max_depth": [2, 4, 8, 16, 24, 32],
    "regressor__random_forest__min_samples_leaf": randint(2, 150),
    "regressor__random_forest__max_features": ["sqrt", "log2", 0.3, 0.6, 0.8]
}

cv = KFold(
    n_splits=10,
    shuffle=True,
    random_state=42
)

In [None]:
from sklearn.model_selection import RandomizedSearchCV


# Randomised search over param_dist: 200 sampled candidates, each evaluated
# with the `cv` splitter and scored on negated RMSE (higher is better), with
# train scores kept for comparison.
search = RandomizedSearchCV(
    estimator=full_pipe,
    param_distributions=param_dist,
    n_iter=200,
    scoring='neg_root_mean_squared_error',
    cv=cv,
    n_jobs=-1,
    verbose=2,
    random_state=42,
    return_train_score=True,

)

search.fit(cars, cars_labels)

In [None]:
-search.best_score_

In [None]:
cv_res = pd.DataFrame(search.cv_results_)
cv_res.sort_values(by="mean_test_score", ascending=False, inplace=True)
# rename columns instead of replace (replace operates on values)
cv_res.rename(columns={
    'param_regressor__random_forest__n_estimators': 'rf_n_estimators',
    'param_regressor__random_forest__max_depth': 'rf_max_depth',
    'param_regressor__random_forest__min_samples_leaf': 'rf_min_samples_leaf',
    'param_regressor__random_forest__max_features': 'rf_max_features',
}, inplace=True)
cv_res.head()

#### Results
- best-score-condition |                                          median_validation_rmse  |                           median_train_rmse
- (the best) without outliers: |                                        93643              |                                 72000
- with outliers:       |                                               160000             |                                 130000
- without group fuel : |                                                200000             |                                 140000
- without group fuel and without outliers: |                            100000             |                                 90000
- without name-feature, without outliers, without new-features: |       94000              |                                 65000
- without name feature : |                                              175000             |                                 127000



In [None]:
search.best_params_

In [None]:
final_model = search.best_estimator_  # includes preprocessing
feature_importances = final_model.regressor_["random_forest"].feature_importances_

sorted(zip(feature_importances,
           final_model.regressor_["preproc"].get_feature_names_out()),
       reverse=True)

In [None]:
# NOTE(review): search.best_estimator_ should already be refit on the full
# data (RandomizedSearchCV's `refit` defaults to True), so this extra fit
# looks redundant — confirm before removing.
final_model.fit(cars, cars_labels)

In [None]:
cars_predictions = final_model.predict(cars)

In [None]:
print(cars_labels.head())
print(cars_predictions[:5])

In [None]:
root_mean_squared_error(cars_labels, cars_predictions)

In [None]:
r2_score(cars_labels, cars_predictions)

In [None]:
mean_absolute_error(cars_labels, cars_predictions)

In [None]:
plt.style.use('dark_background')

residuals = cars_labels - cars_predictions

plt.scatter(cars_predictions, residuals, alpha=0.3)
# White, not black: a black zero line is invisible on the dark background.
plt.axhline(0, color="white")
plt.xlabel("Predicted")
plt.ylabel("Residuals")
plt.title("Residuals vs Predictions")
plt.show()

In [None]:
plt.scatter(cars_predictions, cars_labels, alpha=0.3)
# Reference line y = x: a point on it is predicted exactly. Drawn in a colour
# that stays visible on both light and dark plot styles.
plt.axline((0, 0), slope=1, color="tab:red")
plt.xlabel("Predicted")
plt.ylabel("labels")
plt.title("labels vs Predictions")
plt.show()

In [None]:
plt.hist(residuals, bins=50)
plt.title("Residual Distribution")
plt.show()

We can see that:
- mileage, km_driven_age_interaction, engine_mileage_ratio, brand_effect, and age are the most important features
- r2 = 0.94 
- rmse_train = 72000
- rmse_test = 93643
- mae_train = 46117.5

### Save model to .pkl file

In [None]:
import joblib

# Persist the tuned model (preprocessing included). The target directory is
# created first so the dump does not fail on a fresh checkout where
# ../models does not exist yet.
model_path = Path("../models/final_car_price_model.pkl")
model_path.parent.mkdir(parents=True, exist_ok=True)
joblib.dump(final_model, model_path)