# Practical - 4: Model Saving with pipeline
A custom preprocessing pipeline is defined using the Scikit-Learn Pipeline class. This pipeline will include data preprocessing steps, standard scaling and the trained regression model.

In [1]:
# Import libraries
import numpy as np
import pandas as pd
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn.preprocessing import StandardScaler, OneHotEncoder, FunctionTransformer
from sklearn.pipeline import Pipeline
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.linear_model import ElasticNet
from sklearn.compose import ColumnTransformer
import joblib
import pickle
import re
import warnings

warnings.filterwarnings("ignore")

In [10]:
# Import dataset
df = pd.read_csv("C:/Users/razor/OneDrive/Desktop/Work/Semester 5/MD & CM/Practical/datasets/car details v4.csv")
pd.set_option('display.max_rows', 100)
pd.set_option('display.max_columns', 100)
pd.set_option('display.width', 100)

df = df.dropna(axis=0)
df.drop(list(df.loc[df['Engine'].isna()].index), inplace=True)
df['Drivetrain'].fillna(df['Drivetrain'].mode(), inplace=True)

for i in df.columns:
    if df[i].dtype != object:
        df[i].fillna(df[i].mean(), inplace=True)

In [11]:
# Split features and targets
X = df.drop('Price', axis=1)
y = df['Price']

# Split train and test dataset
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=101)

In [12]:
def transform(X):
    # Create dummy variables for specific features and drop those later
    X.drop(['Year','Model','Location','Make','Color','Fuel Type','Seller Type'], axis=1, inplace=True)

    # Modifying the 'Owner' feature values based on the number of previously owned owners 
    dict1 = {}
    counter1 = len(X['Owner'].unique())
    for i in X['Owner'].unique():
        dict1[i] = counter1
        counter1 -= 1
    X['Owner'] = X['Owner'].map(dict1)
    print(dict1)
    
    dict1 = {}
    counter1 = len(X['Drivetrain'].unique())
    for i in X['Drivetrain'].unique():
        if str(i) != 'nan':
            dict1[i] = counter1
            counter1 += 1
    print(dict1)


    X['Drivetrain'] = X['Drivetrain'].map(dict1)

    dict1 = {}
    counter1 = len(X['Drivetrain'].unique())
    for i in X['Drivetrain'].unique():
        if str(i) != 'nan':
            dict1[i] = counter1
            counter1 += 1
    print(dict1)

    X['Drivetrain'] = X['Drivetrain'].map(dict1)

    dict1 = {}
    counter1 = len(X['Transmission'].unique())
    for i in X['Transmission'].unique():
        if str(i) != 'nan':
            dict1[i] = counter1
            counter1 += 1
    print(dict1)

    X['Transmission'] = X['Transmission'].map(dict1)        

    # Creating a new feature 'Max Power' from two separate numerical values in a single feature 
    X['Power'] = X['Max Power'].str.extract(r'(\d+) bhp')
    X['RPM'] = X['Max Power'].str.extract(r'@ (\d+) rpm')

    power_values = X['Power'].tolist()
    rpm_values = X['RPM'].tolist()

    power_values1 = []
    rpm_values1 = []

    for i, j in enumerate(power_values):
        if type(power_values[i]) != float:
            power_values1.append(float(power_values[i]))
        elif type(power_values[i]) == float:
            power_values1.append(0)
    print(power_values1)
    for i, j in enumerate(rpm_values):
        if type(rpm_values[i]) != float:
            rpm_values1.append(float(rpm_values[i]))
        elif type(rpm_values[i]) == float:
            rpm_values1.append(0)
    print(rpm_values1)
    m_power = [power_values1[i] + rpm_values1[i] for i, j in enumerate(power_values1)]

    # Creating a new feature 'Max Torque' from two separate numerical values in a single feature
    X['C1'] = X['Max Torque'].str.extract(r'(\d+) Nm')
    X['C2'] = X['Max Torque'].str.extract(r'@ (\d+) rpm')

    c1 = X['Power'].tolist()
    c2 = X['RPM'].tolist()

    c11 = []
    c12 = []

    for i, j in enumerate(c1):
        if type(c1[i]) != float:
            c11.append(float(c1[i]))
        elif type(c1[i]) == float:
            c11.append(0)
    print(c11)
    for i, j in enumerate(c2):
        if type(c2[i]) != float:
            c12.append(float(c2[i]))
        elif type(c2[i]) == float:
            c12.append(0)
    print(c12)
    n_power = [c11[i] + c12[i] for i, j in enumerate(c11)]
    X['Max Power'] = m_power
    X['Max Torque'] = n_power
    X['Engine'] = X['Engine'].str.extract(r'(\d+)')
    X['Engine'] = X['Engine'].astype(int)
    X = X.fillna(X.median())
    print(n_power)
    return X

In [13]:
X_train = transform(X_train)

{'First': 4, 'Second': 3, 'UnRegistered Car': 2, 'Third': 1}
{'FWD': 3, 'AWD': 4, 'RWD': 5}
{3: 3, 4: 4, 5: 5}
{'Automatic': 2, 'Manual': 3}
[74.0, 121.0, 184.0, 177.0, 126.0, 138.0, 89.0, 91.0, 204.0, 67.0, 89.0, 99.0, 134.0, 80.0, 174.0, 117.0, 67.0, 85.0, 103.0, 81.0, 105.0, 188.0, 150.0, 99.0, 150.0, 117.0, 192.0, 67.0, 118.0, 47.0, 74.0, 295.0, 168.0, 453.0, 85.0, 103.0, 0, 170.0, 130.0, 82.0, 99.0, 160.0, 385.0, 184.0, 105.0, 62.0, 84.0, 89.0, 148.0, 188.0, 113.0, 244.0, 89.0, 82.0, 126.0, 58.0, 176.0, 0, 0, 188.0, 105.0, 188.0, 83.0, 85.0, 188.0, 108.0, 188.0, 0, 148.0, 81.0, 67.0, 67.0, 73.0, 227.0, 182.0, 113.0, 121.0, 117.0, 177.0, 118.0, 126.0, 168.0, 58.0, 85.0, 271.0, 147.0, 174.0, 0, 126.0, 89.0, 89.0, 85.0, 79.0, 182.0, 192.0, 74.0, 83.0, 126.0, 99.0, 120.0, 75.0, 82.0, 225.0, 58.0, 67.0, 188.0, 254.0, 67.0, 67.0, 190.0, 148.0, 0, 335.0, 67.0, 67.0, 241.0, 67.0, 81.0, 53.0, 68.0, 0, 74.0, 103.0, 182.0, 117.0, 67.0, 89.0, 184.0, 69.0, 117.0, 89.0, 87.0, 168.0, 126.0, 122.

In [6]:
scaler = StandardScaler()
# custom_transformer = CustomDataFrameTransformer()
# transformed_data = custom_transformer.transform(df)

In [7]:
X_train.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1499 entries, 243 to 951
Data columns (total 16 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   Kilometer           1499 non-null   int64  
 1   Transmission        1499 non-null   int64  
 2   Owner               1499 non-null   int64  
 3   Engine              1499 non-null   int32  
 4   Max Power           1499 non-null   float64
 5   Max Torque          1499 non-null   float64
 6   Drivetrain          1499 non-null   int64  
 7   Length              1499 non-null   float64
 8   Width               1499 non-null   float64
 9   Height              1499 non-null   float64
 10  Seating Capacity    1499 non-null   float64
 11  Fuel Tank Capacity  1499 non-null   float64
 12  Power               1499 non-null   object 
 13  RPM                 1499 non-null   object 
 14  C1                  1499 non-null   object 
 15  C2                  1499 non-null   object 
dtypes: float64

In [8]:
# Create pipeline
pipeline = Pipeline([('scaler', scaler),('regressor', ElasticNet(alpha=0.56, l1_ratio=0.01))])
# Fit pipeline on training dataset
pipeline.fit(X_train, y_train)

In [15]:
X_test.iloc[0]

Make                                                Hyundai
Model                 Elite i20 Sportz 1.4 CRDI [2016-2017]
Year                                                   2016
Kilometer                                             38000
Fuel Type                                            Diesel
Transmission                                         Manual
Location                                              Patna
Color                                                Silver
Owner                                                 First
Seller Type                                      Individual
Engine                                              1396 cc
Max Power                                 89 bhp @ 4000 rpm
Max Torque                                220 Nm @ 1500 rpm
Drivetrain                                              FWD
Length                                               3985.0
Width                                                1734.0
Height                                  

In [21]:
def test_transform(X_test):
    
    X_test.drop(['Year','Model','Location','Make','Color','Fuel Type','Seller Type'], inplace=True)
    X_test['Owner'] = X_test['Owner'].map({'First': 4, 'Second': 3, 'UnRegistered Car': 2, 'Third': 1})
    X_test['Drivetrain'] = X_test['Drivetrain'].map({'FWD': 3, 'AWD': 4, 'RWD': 5})
    X_test['Transmission'] = X_test['Transmission'].map({'Automatic': 2, 'Manual': 3})        

    # Creating a new feature 'Max Power' from two separate numerical values in a single feature 
    X_test['Power'] = X_test['Max Power'].str.extract(r'(\d+) bhp')
    X_test['RPM'] = X_test['Max Power'].str.extract(r'@ (\d+) rpm')

    power_values = X_test['Power'].tolist()
    rpm_values = X_test['RPM'].tolist()

    power_values1 = []
    rpm_values1 = []

    for i, j in enumerate(power_values):
        if type(power_values[i]) != float:
            power_values1.append(float(power_values[i]))
        elif type(power_values[i]) == float:
            power_values1.append(0)

    for i, j in enumerate(rpm_values):
        if type(rpm_values[i]) != float:
            rpm_values1.append(float(rpm_values[i]))
        elif type(rpm_values[i]) == float:
            rpm_values1.append(0)

    m_power = [power_values1[i] + rpm_values1[i] for i, j in enumerate(power_values1)]

    # Creating a new feature 'Max Torque' from two separate numerical values in a single feature
    X_test['C1'] = X_test['Max Torque'].str.extract(r'(\d+) Nm')
    X_test['C2'] = X_test['Max Torque'].str.extract(r'@ (\d+) rpm')

    c1 = X_test['Power'].tolist()
    c2 = X_test['RPM'].tolist()

    c11 = []
    c12 = []

    for i, j in enumerate(c1):
        if type(c1[i]) != float:
            c11.append(float(c1[i]))
        elif type(c1[i]) == float:
            c11.append(0)

    for i, j in enumerate(c2):
        if type(c2[i]) != float:
            c12.append(float(c2[i]))
        elif type(c2[i]) == float:
            c12.append(0)

    n_power = [c11[i] + c12[i] for i, j in enumerate(c11)]
    X_test['Max Power'] = m_power
    X_test['Max Torque'] = n_power
    X_test['Engine'] = X_test['Engine'].str.extract(r'(\d+)')
    X_test['Engine'] = X_test['Engine'].astype(int)
    X_test = X_test.fillna(X_test.median())
    return X_test
    

In [22]:
test_transform(X_test.iloc[0])

AttributeError: 'str' object has no attribute 'map'

In [9]:
# Predict using optimal parameters of the model
preds = pipeline.predict(X_test)

mse = mean_squared_error(y_test, preds)
mae = mean_absolute_error(y_test, preds)
rmse = np.sqrt(mse)

print("Mean Absolute Error (MAE):", mae)

ValueError: The feature names should match those that were passed during fit.
Feature names unseen at fit time:
- Color
- Fuel Type
- Location
- Make
- Model
- ...
Feature names seen at fit time, yet now missing:
- C1
- C2
- Power
- RPM


In [None]:
# Save pipeline file
with open('pipeline_prac_4_render.pkl', 'wb') as preprocessor_file:
    joblib.dump(pipeline, preprocessor_file)

In [None]:
preds[0]