In this example, we will learn how to create a pipeline to train a model.

# Import libraries

In [1]:
import pandas as pd
import numpy as np
import warnings
import xgboost as xgb
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_absolute_error as mae
from sklearn.model_selection import train_test_split

# Suppress all warnings globally: the "ignore" action with no message,
# category or module filter matches every warning raised after this point.
# NOTE(review): this also hides warnings that may matter (e.g. library
# deprecations) — consider narrowing it to specific categories.
warnings.filterwarnings("ignore")

# Read datasets

In [2]:
# Load both CSV files into DataFrames; `train` carries the `price` column,
# `test` does not.
train = pd.read_csv(filepath_or_buffer="data/car-price-prediction/train.csv")
test = pd.read_csv(filepath_or_buffer="data/car-price-prediction/test.csv")

# Feature engineering
## Convert miles to km

In [3]:
def miles_to_km(x):
    """Convert a raw odometer reading to whole kilometres.

    Accepts readings such as ``"95,000 miles"``, ``"3000 km"``, ``"3000km"``
    or a bare number. Thousands separators, surrounding whitespace and the
    letter case of the unit are ignored.

    Parameters
    ----------
    x : str or number
        The reading; it is converted with ``str`` before parsing.

    Returns
    -------
    int
        Distance in kilometres, truncated toward zero.

    Raises
    ------
    ValueError
        If no number can be parsed from the reading (e.g. a missing value).
    """
    text = str(x).replace(',', '').strip().lower()
    if "mile" in text:
        # Everything before the unit is the quantity; 1 mile = 1.60934 km.
        kilometres = 1.60934 * float(text.split("mile")[0])
    else:
        # Drop the unit whether or not a space precedes it ("3000 km", "3000km").
        kilometres = float(text.replace("km", ""))
    # Truncate (not round) so integer inputs give the same result as before.
    return int(kilometres)

# Normalise the odometer column of both frames to integer kilometres.
for frame in (train, test):
    frame["running"] = frame["running"].apply(miles_to_km)

In [4]:
# Show the first rows of the training frame and the last rows of the test
# frame, each under its own caption.
for caption, preview in (("train data", train.head()), ("\ntest data", test.tail())):
    print(caption)
    display(preview)

train data


Unnamed: 0,model,year,motor_type,running,wheel,color,type,status,motor_volume,price
0,toyota,2022,petrol,3000,left,skyblue,sedan,excellent,2.0,24500
1,mercedes-benz,2014,petrol,132000,left,black,sedan,excellent,2.0,25500
2,kia,2018,petrol,152887,left,other,sedan,excellent,2.0,11700
3,mercedes-benz,2002,petrol,220479,left,golden,sedan,excellent,3.2,12000
4,mercedes-benz,2017,petrol,130000,left,black,sedan,good,2.0,26000



test data


Unnamed: 0,Id,model,year,motor_type,running,wheel,color,type,status,motor_volume
406,406,nissan,2021,petrol,33400,left,black,suv,excellent,2.0
407,407,hyundai,2017,petrol,96560,left,other,sedan,excellent,2.0
408,408,mercedes-benz,2012,petrol,218000,left,white,sedan,good,2.0
409,409,kia,2020,petrol,64373,left,red,sedan,good,2.0
410,410,hyundai,2017,petrol,193120,left,white,sedan,excellent,2.0


## Correlation
Compute the correlation between the features (X) and the target variable (y).

In [5]:
# Separate the target from the predictors.
y = train['price']
X = train.drop(columns='price')

# Column groups are taken from `train` rather than `X`, which keeps `price`
# among the numeric columns used for the correlation matrix.
num = train.select_dtypes(include=['int64', 'float64']).columns
col = train.select_dtypes(include=['object']).columns

display(num, col)

Index(['year', 'running', 'motor_volume', 'price'], dtype='object')

Index(['model', 'motor_type', 'wheel', 'color', 'type', 'status'], dtype='object')

In [6]:
# Pairwise correlations of the numeric columns, target included.
correlation = train.loc[:, num].corr()

# Correlation of every numeric column with the target: `running` is
# negatively correlated with `price`.
correlation.loc[:, 'price']

year            0.638699
running        -0.513133
motor_volume    0.000885
price           1.000000
Name: price, dtype: float64

## Build a heat map of correlations

In [7]:
# Colour the cells of the correlation matrix with the 'coolwarm' colormap.
correlation_styler = correlation.style
correlation_styler.background_gradient(cmap='coolwarm')

Unnamed: 0,year,running,motor_volume,price
year,1.0,-0.662845,-0.067525,0.638699
running,-0.662845,1.0,0.103905,-0.513133
motor_volume,-0.067525,0.103905,1.0,0.000885
price,0.638699,-0.513133,0.000885,1.0


## Create lineage features

In [8]:
def _safe_ratio(numerator: pd.Series, denominator: pd.Series) -> pd.Series:
    """Divide element-wise, yielding NaN instead of +/-inf where the denominator is zero."""
    return numerator / denominator.where(denominator != 0)


def create_lineage_features(df: pd.DataFrame, reference_year: int = 2024) -> None:
    """Add ratio, squared and age features derived from ``running`` and ``year``.

    The frame is modified in place; six columns are appended in this order:
    ``running_per_year``, ``year_per_running``, ``running_sq``, ``year_sq``,
    ``running_for_years`` and ``run_per_year``.

    Parameters
    ----------
    df : pd.DataFrame
        Frame with numeric ``running`` and ``year`` columns.
    reference_year : int, default 2024
        Year against which a car's age (``running_for_years``) is measured.

    Returns
    -------
    None

    Notes
    -----
    Ratios with a zero denominator (``running == 0``, ``year == 0`` or
    ``year == reference_year``) are NaN rather than infinite, so a later
    imputation step can fill them instead of failing on non-finite input.
    """
    running = df['running']
    year = df['year']
    age = reference_year - year

    df['running_per_year'] = _safe_ratio(running, year)
    df['year_per_running'] = _safe_ratio(year, running)
    df['running_sq'] = running ** 2
    df['year_sq'] = year ** 2
    df['running_for_years'] = age
    # Average distance per year of age; a car from the reference year has age 0.
    df['run_per_year'] = _safe_ratio(running, age)

# Add the derived columns in place to both the training features and the test frame.
for frame in (X, test):
    create_lineage_features(frame)

In [9]:
# Preview the first five rows, including the newly added feature columns.
X.head(n=5)

Unnamed: 0,model,year,motor_type,running,wheel,color,type,status,motor_volume,running_per_year,year_per_running,running_sq,year_sq,running_for_years,run_per_year
0,toyota,2022,petrol,3000,left,skyblue,sedan,excellent,2.0,1.48368,0.674,9000000,4088484,2,1500.0
1,mercedes-benz,2014,petrol,132000,left,black,sedan,excellent,2.0,65.541212,0.015258,17424000000,4056196,10,13200.0
2,kia,2018,petrol,152887,left,other,sedan,excellent,2.0,75.761645,0.013199,23374434769,4072324,6,25481.166667
3,mercedes-benz,2002,petrol,220479,left,golden,sedan,excellent,3.2,110.129371,0.00908,48610989441,4008004,22,10021.772727
4,mercedes-benz,2017,petrol,130000,left,black,sedan,good,2.0,64.452157,0.015515,16900000000,4068289,7,18571.428571


# Data Pipeline

In [10]:
# Partition the feature columns by dtype; each group is later passed to its
# own preprocessing branch.
numeric_dtypes = ['int64', 'float64']
num_cols = X.select_dtypes(include=numeric_dtypes).columns
cat_cols = X.select_dtypes(include=['object']).columns

In [11]:
# Hold out 20% of the labelled rows; the fixed random_state makes the split
# reproducible across runs.
holdout_split = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = holdout_split

In [12]:
# Numeric columns: fill missing values with the column median, then standardise.
numerical_steps = [
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
]
numerical_transformer = Pipeline(steps=numerical_steps)

# Categorical columns: fill missing values with the most frequent level, then
# one-hot encode; categories not seen during fitting are ignored, not raised on.
categorical_steps = [
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
]
categorical_transformer = Pipeline(steps=categorical_steps)

# Route each column group through its own transformer.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, num_cols),
        ('cat', categorical_transformer, cat_cols),
    ]
)

# Create model

In [13]:
# Hyperparameters for the XGBoost regressor. They are kept in a dict and
# unpacked because `lambda` is a reserved word in Python and cannot be written
# as a keyword argument.
# NOTE(review): the long decimals look like the output of an automated search
# — confirm their provenance.
xgb_params = {
    'objective': 'reg:squarederror',
    'eval_metric': 'rmse',
    'booster': 'dart',
    'learning_rate': 0.060224772079701436,
    'max_depth': 5,
    'subsample': 0.40449565258922265,
    'colsample_bytree': 0.7764568469218252,
    'gamma': 0.16797673007239033,
    'lambda': 0.9804682598571074,
    'alpha': 0.9489652654399305,
}

XGB_model = xgb.XGBRegressor(**xgb_params)

In [14]:
# Chain preprocessing and the XGBoost regressor into a single estimator, so the
# same transformations are applied at fit and predict time.
XGB_pipeline = Pipeline(
    steps=[
        ('preprocessor', preprocessor),
        ('model', XGB_model)
    ]
)

# Validate first: fit on the training split only and score the hold-out rows,
# so there is an error estimate on data the model has not seen.
XGB_pipeline.fit(X_train, y_train)
holdout_mae = mae(y_test, XGB_pipeline.predict(X_test))
print(f"Hold-out MAE: {holdout_mae:.2f}")

# Refit on every labelled row; this is the model used for the final predictions.
XGB_pipeline.fit(X, y)

# Visualize pipeline

In [15]:
from sklearn import set_config

# Switch scikit-learn's estimator display to the diagram representation.
set_config(display='diagram')

# Bare expression so the notebook renders the pipeline; click on the diagram
# below to see the details of each step.
XGB_pipeline

# Predict

In [16]:
# Predict prices for the test rows with the fitted pipeline.
pred = XGB_pipeline.predict(X=test)

In [17]:
# Wrap the prediction array in a Series for a compact, truncated display.
pd.Series(data=pred)

0      17255.683594
1      16813.558594
2      22906.082031
3      14545.666992
4       9849.642578
           ...     
406    27828.070312
407    14150.741211
408    13904.772461
409    16325.475586
410    13356.910156
Length: 411, dtype: float32