# Practical - 4: Model Saving with pipeline
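This notebook builds a Scikit-Learn `Pipeline` that chains standard scaling with a trained ElasticNet regression model, and saves the fitted pipeline to disk. Feature preprocessing is performed by separate helper functions (`transform` for the training set, `test_transform` for a single row) and is not part of the saved pipeline, so raw rows must be preprocessed before calling `predict`.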

In [1]:
# Import libraries
import numpy as np
import pandas as pd
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn.preprocessing import StandardScaler, OneHotEncoder, FunctionTransformer
from sklearn.pipeline import Pipeline
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.linear_model import ElasticNet
from sklearn.compose import ColumnTransformer
import joblib
import pickle
import re
import warnings

warnings.filterwarnings("ignore")

In [2]:
# Import dataset
# NOTE: machine-specific absolute path -- point DATA_PATH at your local copy of the CSV.
DATA_PATH = "C:/Users/razor/OneDrive/Desktop/Work/Semester 5/MD & CM/Practical/datasets/car details v4.csv"

pd.set_option('display.max_rows', 100)
pd.set_option('display.max_columns', 100)
pd.set_option('display.width', 100)

# Keep only complete rows (any row with a missing value is discarded).
# The per-column imputation that used to follow (dropping rows without an Engine,
# mode-filling Drivetrain, mean-filling numeric columns) ran *after* this dropna,
# so it never had anything left to fill; it has been removed as dead code.
df = pd.read_csv(DATA_PATH).dropna(axis=0)

In [3]:
# Display the cleaned dataset (only rows without missing values remain)
df

Unnamed: 0,Make,Model,Price,Year,Kilometer,Fuel Type,Transmission,Location,Color,Owner,Seller Type,Engine,Max Power,Max Torque,Drivetrain,Length,Width,Height,Seating Capacity,Fuel Tank Capacity
0,Honda,Amaze 1.2 VX i-VTEC,505000,2017,87150,Petrol,Manual,Pune,Grey,First,Corporate,1198 cc,87 bhp @ 6000 rpm,109 Nm @ 4500 rpm,FWD,3990.0,1680.0,1505.0,5.0,35.0
1,Maruti Suzuki,Swift DZire VDI,450000,2014,75000,Diesel,Manual,Ludhiana,White,Second,Individual,1248 cc,74 bhp @ 4000 rpm,190 Nm @ 2000 rpm,FWD,3995.0,1695.0,1555.0,5.0,42.0
2,Hyundai,i10 Magna 1.2 Kappa2,220000,2011,67000,Petrol,Manual,Lucknow,Maroon,First,Individual,1197 cc,79 bhp @ 6000 rpm,112.7619 Nm @ 4000 rpm,FWD,3585.0,1595.0,1550.0,5.0,35.0
3,Toyota,Glanza G,799000,2019,37500,Petrol,Manual,Mangalore,Red,First,Individual,1197 cc,82 bhp @ 6000 rpm,113 Nm @ 4200 rpm,FWD,3995.0,1745.0,1510.0,5.0,37.0
4,Toyota,Innova 2.4 VX 7 STR [2016-2020],1950000,2018,69000,Diesel,Manual,Mumbai,Grey,First,Individual,2393 cc,148 bhp @ 3400 rpm,343 Nm @ 1400 rpm,RWD,4735.0,1830.0,1795.0,7.0,55.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2053,Maruti Suzuki,Ritz Vxi (ABS) BS-IV,245000,2014,79000,Petrol,Manual,Faridabad,White,Second,Individual,1197 cc,85 bhp @ 6000 rpm,113 Nm @ 4500 rpm,FWD,3775.0,1680.0,1620.0,5.0,43.0
2054,Mahindra,XUV500 W8 [2015-2017],850000,2016,90300,Diesel,Manual,Surat,White,First,Individual,2179 cc,138 bhp @ 3750 rpm,330 Nm @ 1600 rpm,FWD,4585.0,1890.0,1785.0,7.0,70.0
2055,Hyundai,Eon D-Lite +,275000,2014,83000,Petrol,Manual,Ahmedabad,White,Second,Individual,814 cc,55 bhp @ 5500 rpm,75 Nm @ 4000 rpm,FWD,3495.0,1550.0,1500.0,5.0,32.0
2056,Ford,Figo Duratec Petrol ZXI 1.2,240000,2013,73000,Petrol,Manual,Thane,Silver,First,Individual,1196 cc,70 bhp @ 6250 rpm,102 Nm @ 4000 rpm,FWD,3795.0,1680.0,1427.0,5.0,45.0


In [4]:
# Target is the sale price; every remaining column is a candidate feature
y = df['Price']
X = df.drop(columns=['Price'])

# Hold out 20% of the rows for testing (fixed seed keeps the split reproducible)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=101
)

In [5]:
def _encode_by_first_appearance(column):
    """Integer-encode a categorical column in order of first appearance.

    Codes start at the number of distinct values in ``column`` (a missing value
    counts as one of them) and grow by one for each further category. Missing
    values are left unmapped.
    """
    levels = column.unique()
    present = [level for level in levels if pd.notna(level)]
    codes = {level: len(levels) + offset for offset, level in enumerate(present)}
    return column.map(codes)


def _extract_number(column, pattern):
    """Return the first capture group of ``pattern`` in each string as a number (NaN if absent)."""
    return pd.to_numeric(column.str.extract(pattern, expand=False))


def transform(X):
    """Turn the raw car-listing features into the numeric matrix used for training.

    Parameters
    ----------
    X : pandas.DataFrame
        Raw features. Must contain 'Year', 'Model', 'Location', 'Make', 'Color',
        'Fuel Type', 'Seller Type' (all dropped), plus 'Owner', 'Drivetrain',
        'Transmission', 'Max Power', 'Max Torque' and 'Engine'.

    Returns
    -------
    pandas.DataFrame
        A new frame (the input is left untouched) holding the remaining original
        columns followed by 'Power', 'RPM', 'C1' and 'C2'. The extracted columns
        are numeric, and remaining NaNs are replaced by the column median.

    Notes
    -----
    The categorical codes depend on the order in which values first appear in
    ``X``, so this function is only meaningful on the training set.
    """
    # Work on a new frame so the caller's data is not modified.
    X = X.drop(columns=['Year', 'Model', 'Location', 'Make', 'Color', 'Fuel Type', 'Seller Type'])

    # Owner: codes count DOWN from the number of distinct values, in order of first appearance.
    owner_levels = X['Owner'].unique()
    owner_codes = {level: len(owner_levels) - pos for pos, level in enumerate(owner_levels)}
    X['Owner'] = X['Owner'].map(owner_codes)

    # Drivetrain / Transmission: codes count UP from the number of distinct values.
    # (The original ran the Drivetrain encoding twice; the second pass was an identity map.)
    X['Drivetrain'] = _encode_by_first_appearance(X['Drivetrain'])
    X['Transmission'] = _encode_by_first_appearance(X['Transmission'])

    # NOTE(review): the (\d+) patterns ignore decimals, e.g. "112.7619 Nm" yields 7619.
    # Left unchanged because test_transform() parses values the same way.
    X['Power'] = _extract_number(X['Max Power'], r'(\d+) bhp')
    X['RPM'] = _extract_number(X['Max Torque'], r'@ (\d+) rpm')

    # Combined feature: bhp plus the rpm quoted in 'Max Torque'; missing parts count as 0.
    combined = (X['Power'].fillna(0) + X['RPM'].fillna(0)).astype(float).to_numpy()

    # These must be extracted before 'Max Power' is overwritten below.
    X['C1'] = _extract_number(X['Max Torque'], r'(\d+) Nm')
    X['C2'] = _extract_number(X['Max Power'], r'@ (\d+) rpm')

    X['Max Power'] = combined
    # NOTE(review): 'Max Torque' is built from Power/RPM, not from C1/C2, so it duplicates
    # 'Max Power'. Kept as is because test_transform() and the fitted model rely on it;
    # fixing it means changing both functions and retraining.
    X['Max Torque'] = combined.copy()

    X['Engine'] = X['Engine'].str.extract(r'(\d+)', expand=False).astype(int)

    return X.fillna(X.median(numeric_only=True))

In [6]:
# Replace the raw training features with their engineered version.
# NOTE: re-running this cell without re-running the split cell fails, because the
# already-transformed frame no longer has the columns that transform() drops.
X_train = transform(X_train)

In [7]:
# Inspect the engineered training features
X_train

Unnamed: 0,Kilometer,Transmission,Owner,Engine,Max Power,Max Torque,Drivetrain,Length,Width,Height,Seating Capacity,Fuel Tank Capacity,Power,RPM,C1,C2
243,43000,2,4,1248,2074.0,2074.0,3,3840.0,1735.0,1530.0,5.0,37.0,74,2000,190,4000
1863,38000,2,4,1591,4321.0,4321.0,3,4370.0,1700.0,1475.0,5.0,43.0,121,4200,158,6300
1096,41000,2,3,1968,1934.0,1934.0,4,4385.0,1831.0,1608.0,5.0,64.0,184,1750,380,3500
1311,56000,2,4,1999,1927.0,1927.0,5,5067.0,2091.0,1457.0,5.0,66.0,177,1750,430,4000
858,80000,2,4,1582,2026.0,2026.0,3,4530.0,1775.0,1470.0,5.0,56.0,126,1900,259,4000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1752,62000,3,4,1591,4321.0,4321.0,3,4370.0,1700.0,1475.0,5.0,43.0,121,4200,158,6300
2045,75693,3,3,998,3558.0,3558.0,3,3599.0,1495.0,1700.0,5.0,35.0,58,3500,77,6200
1493,102000,2,4,2143,2970.0,2970.0,5,4596.0,1770.0,1447.0,5.0,66.0,170,2800,400,4200
1698,48568,3,4,1086,4568.0,4568.0,3,3585.0,1595.0,1550.0,5.0,35.0,68,4500,08,5500


In [8]:
# Standard scaler that becomes the first step of the pipeline
scaler = StandardScaler()

In [9]:
# Assemble the estimator: standardise the features, then apply an ElasticNet regressor
pipeline = Pipeline(
    steps=[
        ('scaler', scaler),
        ('regressor', ElasticNet(alpha=0.56, l1_ratio=0.01)),
    ]
)
# Train both steps on the engineered training features
pipeline.fit(X_train, y_train)

In [10]:
def test_transform(X_test):
    
    X_test.drop(['Year','Model','Location','Make','Color','Fuel Type','Seller Type'], inplace=True)
     
    
    X_test['Owner'] = pd.DataFrame(pd.Series({'First': 4, 'Second': 3, 'UnRegistered Car': 2, 'Third': 1}[X_test['Owner']]))
    X_test['Drivetrain'] = pd.DataFrame(pd.Series({'FWD': 3, 'AWD': 4, 'RWD': 5}[X_test['Drivetrain']]))
    X_test['Transmission'] = pd.DataFrame(pd.Series({'Automatic': 2, 'Manual': 3}[X_test['Transmission']]))      

    power_values = pd.DataFrame(pd.Series(X_test['Max Power']))[0].str.extract(r'(\d+) bhp')
    rpm_values = pd.DataFrame(pd.Series(X_test['Max Power']))[0].str.extract(r'@ (\d+) rpm')
    
    power_values1 = []
    rpm_values1 = []
    
    X_test['Power'] = pd.DataFrame(pd.Series(X_test['Max Power']))[0].str.extract(r'(\d+) bhp')
    X_test['RPM'] = pd.DataFrame(pd.Series(X_test['Max Torque']))[0].str.extract(r'@ (\d+) rpm')

    for i, j in enumerate(power_values):
        if type(power_values[i]) != float:
            power_values1.append(float(power_values[i]))
        elif type(power_values[i]) == float:
            power_values1.append(0)

    for i, j in enumerate(rpm_values):
        if type(rpm_values[i]) != float:
            rpm_values1.append(float(rpm_values[i]))
        elif type(rpm_values[i]) == float:
            rpm_values1.append(0)

    m_power = [power_values1[i] + rpm_values1[i] for i, j in enumerate(power_values1)]

    # Creating a new feature 'Max Torque' from two separate numerical values in a single feature
    X_test['C1'] = pd.DataFrame(pd.Series(X_test['Max Torque']))[0].str.extract(r'(\d+) Nm')
    X_test['C2'] = pd.DataFrame(pd.Series(X_test['Max Power']))[0].str.extract(r'@ (\d+) rpm')

    c1 = power_values
    c2 = rpm_values

    c11 = []
    c12 = []

    for i, j in enumerate(c1):
        if type(c1[i]) != float:
            c11.append(float(c1[i]))
        elif type(c1[i]) == float:
            c11.append(0)

    for i, j in enumerate(c2):
        if type(c2[i]) != float:
            c12.append(float(c2[i]))
        elif type(c2[i]) == float:
            c12.append(0)

    n_power = [c11[i] + c12[i] for i, j in enumerate(c11)]
    X_test['Max Power'] = m_power[0]
    X_test['Max Torque'] = n_power[0]
   
    print(X_test['Owner'].values[0][0])
    X_test['Transmission'] = X_test['Transmission'].values[0][0]
    X_test['Owner'] = X_test['Owner'].values[0][0]
    X_test['C1'] = X_test['C1'].values[0][0]
    X_test['C2'] = X_test['C2'].values[0][0]
    X_test['Drivetrain'] = X_test['Drivetrain'].values[0][0]
    X_test['Power'] = X_test['Power'].values[0][0]
    X_test['RPM'] = X_test['RPM'].values[0][0]

    X_test['Engine'] = pd.DataFrame(pd.Series(X_test['Engine']))[0].str.extract(r'(\d+)').astype(int).values[0][0]
    X_test = X_test.fillna(0)
    print('nigga')
    X_test = pd.DataFrame(X_test).T
    X_test = X_test[['Kilometer', 'Transmission', 'Owner', 'Engine', 'Max Power', 'Max Torque', 'Drivetrain',
       'Length', 'Width', 'Height', 'Seating Capacity', 'Fuel Tank Capacity', 'Power', 'RPM', 'C1',
       'C2']]
    return X_test

In [11]:
# Scratch check: extract the leading number from the first test row's 'Engine' string
pd.DataFrame(pd.Series(X_test.iloc[0]['Engine']))[0].str.extract(r'(\d+)').astype(int).values[0][0]

1396

In [12]:
nigga1 = test_transform(X_test.iloc[0])

4
nigga


In [13]:
nigga1

Unnamed: 0,Kilometer,Transmission,Owner,Engine,Max Power,Max Torque,Drivetrain,Length,Width,Height,Seating Capacity,Fuel Tank Capacity,Power,RPM,C1,C2
1211,38000,3,4,1396,4089.0,4089.0,3,3985.0,1734.0,1505.0,5.0,45.0,89,1500,220,4000


In [14]:
y_test.iloc[0]

575000

In [15]:
# Predict using optimal parameters of the model
preds = pipeline.predict(nigga1)

In [16]:
# Persist the fitted pipeline (scaler + regressor) to disk with joblib
with open('pipeline_prac_4_render.pkl', 'wb') as model_file:
    joblib.dump(pipeline, model_file)

In [17]:
# Predicted price for the sample row
preds[0]

838245.0314480967