In [1]:
import pickle

# Load the pickled (X_train, y_train, X_test, y_test) split.
# NOTE: pickle.load can execute arbitrary code -- only load files you trust.
with open('../data/unprocessed_data.pkl', 'rb') as data_file:
    X_train, y_train, X_test, y_test = pickle.load(data_file)

X_train

Unnamed: 0,Location,Kilometers_Driven,Fuel_Type,Transmission,Owner_Type,Mileage,Engine,Power,Seats,Brand,Model,Age
2952,Mumbai,22000,Petrol,Manual,First,16.47,1198.0,74.0,5.0,Volkswagen,Polo,5
1647,Pune,69000,Diesel,Automatic,First,,2987.0,165.0,5.0,Mercedes-Benz,M-Class,5
5301,Coimbatore,20026,Petrol,Automatic,First,19.00,1199.0,88.7,5.0,Honda,Jazz,2
657,Hyderabad,13000,Petrol,Automatic,First,22.00,1197.0,81.8,5.0,Maruti,Dzire,3
2273,Mumbai,122000,Diesel,Manual,Second,11.50,2982.0,171.0,7.0,Toyota,Fortuner,10
...,...,...,...,...,...,...,...,...,...,...,...,...
3772,Delhi,70000,Petrol,Manual,First,19.00,998.0,66.1,5.0,Maruti,A-Star,10
5191,Kolkata,28000,Petrol,Manual,First,18.90,1197.0,82.0,5.0,Hyundai,Grand,7
5226,Chennai,123000,Diesel,Manual,Second,12.55,2982.0,168.5,7.0,Toyota,Fortuner,7
5390,Hyderabad,78000,Petrol,Manual,Second,20.92,998.0,67.1,5.0,Maruti,Alto,9


In [2]:
# Column groups consumed by the preprocessing pipeline.

# Numerical features: impute missing values, then scale.
num_cols = [
    'Kilometers_Driven',
    'Mileage',
    'Engine',
    'Power',
    'Age',
    'Seats',
]

# Nominal categorical features: binary encoding.
nom_cat_cols = [
    'Location',
    'Fuel_Type',
    'Transmission',
    'Brand',
    'Model',
]

# Ordinal categorical features: ordinal encoding.
ord_cat_cols = ['Owner_Type']

In [3]:
from sklearn.preprocessing import PolynomialFeatures
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OrdinalEncoder
from sklearn.impute import SimpleImputer
from  category_encoders import BinaryEncoder

def prep(poly_degree):
    """Build the (unfitted) column-wise preprocessing step.

    Columns are routed using the notebook-level ``num_cols``,
    ``nom_cat_cols`` and ``ord_cat_cols`` lists.

    Parameters
    ----------
    poly_degree : int
        Degree handed to ``PolynomialFeatures`` for the numerical columns.

    Returns
    -------
    ColumnTransformer
        Numerical columns: median imputation -> polynomial expansion ->
        standard scaling. Nominal columns: binary encoding. Ordinal
        columns: ordinal encoding with an explicit category order.
    """
    # Numerical branch
    num_steps = [
        ('imputer', SimpleImputer(strategy='median')),
        ('poly', PolynomialFeatures(degree=poly_degree)),
        ('scaler', StandardScaler()),
    ]
    num_branch = Pipeline(steps=num_steps)

    # Categorical branches
    nom_branch = Pipeline(steps=[('binary', BinaryEncoder())])

    # Explicit category order: position in this list is the encoded value
    owner_levels = ['Fourth & Above', 'Third', 'Second', 'First']
    ord_branch = Pipeline(steps=[('ordinal', OrdinalEncoder(categories=[owner_levels]))])

    # Bundle the three branches, each applied to its own column group
    return ColumnTransformer(transformers=[
        ('num_prep', num_branch, num_cols),
        ('nom_prep', nom_branch, nom_cat_cols),
        ('ord_prep', ord_branch, ord_cat_cols),
    ])

# Cross Validation

In [4]:
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import cross_val_score

# Baseline: degree-2 polynomial features + ordinary least squares,
# scored with 5-fold cross-validated R2 on the training set.
baseline_steps = [('preprocessor', prep(2)), ('model', LinearRegression())]
model = Pipeline(steps=baseline_steps)

scores = cross_val_score(model, X_train, y_train, scoring='r2', cv=5)
scores

array([0.8535847 , 0.87717511, 0.8754437 , 0.87177333, 0.80642056])

In [5]:
# Summarise the fold scores: average and spread across the CV folds
mean_r2, std_r2 = scores.mean(), scores.std()
print('Mean CV R2:', mean_r2)
print('Std Dev of CV R2:', std_r2)

Mean CV R2: 0.8568794800678139
Std Dev of CV R2: 0.026590703417920063


# Hyperparameter Tuning

In [6]:
from sklearn.linear_model import Ridge
from sklearn.model_selection import GridSearchCV

# Reuse the shared preprocessing factory instead of re-declaring the same
# transformers here. Degree 2 equals the PolynomialFeatures default, and the
# grid below overrides it per candidate through the
# 'preprocessor__num_prep__poly__degree' parameter path.
preprocessor = prep(poly_degree=2)

model = Pipeline(steps=[('preprocessor', preprocessor),
                        ('model', Ridge())])

# Candidate polynomial degrees and Ridge regularisation strengths
param_grid = {
                'preprocessor__num_prep__poly__degree': [2, 3, 4],
                'model__alpha': [0.01, 0.1, 1, 10]
            }

# 5-fold CV over every combination, scored by R2, using all available cores
grid_search = GridSearchCV(model, param_grid, cv=5, scoring='r2', n_jobs=-1, verbose=1)
grid_search.fit(X_train, y_train)

Fitting 5 folds for each of 12 candidates, totalling 60 fits


0,1,2
,estimator,"Pipeline(step...l', Ridge())])"
,param_grid,"{'model__alpha': [0.01, 0.1, ...], 'preprocessor__num_prep__poly__degree': [2, 3, ...]}"
,scoring,'r2'
,n_jobs,-1
,refit,True
,cv,5
,verbose,1
,pre_dispatch,'2*n_jobs'
,error_score,
,return_train_score,False

0,1,2
,transformers,"[('num_prep', ...), ('nom_prep', ...), ...]"
,remainder,'drop'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,missing_values,
,strategy,'median'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,degree,4
,interaction_only,False
,include_bias,True
,order,'C'

0,1,2
,copy,True
,with_mean,True
,with_std,True

0,1,2
,verbose,0
,cols,
,mapping,
,drop_invariant,False
,return_df,True
,base,2
,handle_unknown,'value'
,handle_missing,'value'

0,1,2
,categories,"[['Fourth & Above', 'Third', ...]]"
,dtype,<class 'numpy.float64'>
,handle_unknown,'error'
,unknown_value,
,encoded_missing_value,
,min_frequency,
,max_categories,

0,1,2
,alpha,0.1
,fit_intercept,True
,copy_X,True
,max_iter,
,tol,0.0001
,solver,'auto'
,positive,False
,random_state,


In [7]:
# Parameter combination selected by the grid search (scoring='r2')
grid_search.best_params_

{'model__alpha': 0.1, 'preprocessor__num_prep__poly__degree': 4}

In [8]:
import pandas as pd

# Full cross-validation results as a table, one row per parameter combination
pd.DataFrame(grid_search.cv_results_)

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_model__alpha,param_preprocessor__num_prep__poly__degree,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,0.28532,0.213909,0.070872,0.061868,0.01,2,"{'model__alpha': 0.01, 'preprocessor__num_prep...",0.853566,0.877185,0.875448,0.871788,0.806424,0.856882,0.026594,11
1,0.22952,0.06691,0.053118,0.036155,0.01,3,"{'model__alpha': 0.01, 'preprocessor__num_prep...",0.86778,0.885851,0.896957,0.877181,0.818488,0.869251,0.027148,4
2,0.421228,0.158675,0.050893,0.010737,0.01,4,"{'model__alpha': 0.01, 'preprocessor__num_prep...",0.868231,0.879914,0.894522,0.897093,0.819585,0.871869,0.028146,2
3,0.323951,0.375576,0.033777,0.007285,0.1,2,"{'model__alpha': 0.1, 'preprocessor__num_prep_...",0.853403,0.87727,0.875483,0.871913,0.806444,0.856903,0.026622,10
4,0.158987,0.014875,0.038149,0.005116,0.1,3,"{'model__alpha': 0.1, 'preprocessor__num_prep_...",0.866992,0.885225,0.896442,0.876024,0.819592,0.868855,0.026498,5
5,0.371192,0.318611,0.046093,0.004862,0.1,4,"{'model__alpha': 0.1, 'preprocessor__num_prep_...",0.869879,0.887273,0.895252,0.889124,0.819381,0.872182,0.027714,1
6,0.284896,0.274922,0.052783,0.023023,1.0,2,"{'model__alpha': 1, 'preprocessor__num_prep__p...",0.852263,0.877738,0.875479,0.87277,0.806414,0.856933,0.026839,9
7,0.275538,0.144861,0.052403,0.019027,1.0,3,"{'model__alpha': 1, 'preprocessor__num_prep__p...",0.864266,0.88285,0.893148,0.878901,0.820533,0.86794,0.025451,6
8,0.328711,0.01197,0.068566,0.019573,1.0,4,"{'model__alpha': 1, 'preprocessor__num_prep__p...",0.868701,0.887915,0.89735,0.879875,0.820391,0.870846,0.026928,3
9,0.260352,0.115713,0.060341,0.025933,10.0,2,"{'model__alpha': 10, 'preprocessor__num_prep__...",0.849835,0.879009,0.873291,0.876271,0.805077,0.856696,0.027814,12


In [9]:
# Compact view: tuned parameters with the mean CV score and rank of each candidate
summary_cols = [
    'param_model__alpha',
    'param_preprocessor__num_prep__poly__degree',
    'mean_test_score',
    'rank_test_score',
]
pd.DataFrame(grid_search.cv_results_)[summary_cols]

Unnamed: 0,param_model__alpha,param_preprocessor__num_prep__poly__degree,mean_test_score,rank_test_score
0,0.01,2,0.856882,11
1,0.01,3,0.869251,4
2,0.01,4,0.871869,2
3,0.1,2,0.856903,10
4,0.1,3,0.868855,5
5,0.1,4,0.872182,1
6,1.0,2,0.856933,9
7,1.0,3,0.86794,6
8,1.0,4,0.870846,3
9,10.0,2,0.856696,12


In [10]:
# Pipeline configured with the best parameter combination (refit by the search)
grid_search.best_estimator_

0,1,2
,steps,"[('preprocessor', ...), ('model', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,transformers,"[('num_prep', ...), ('nom_prep', ...), ...]"
,remainder,'drop'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,missing_values,
,strategy,'median'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,degree,4
,interaction_only,False
,include_bias,True
,order,'C'

0,1,2
,copy,True
,with_mean,True
,with_std,True

0,1,2
,verbose,0
,cols,
,mapping,
,drop_invariant,False
,return_df,True
,base,2
,handle_unknown,'value'
,handle_missing,'value'

0,1,2
,categories,"[['Fourth & Above', 'Third', ...]]"
,dtype,<class 'numpy.float64'>
,handle_unknown,'error'
,unknown_value,
,encoded_missing_value,
,min_frequency,
,max_categories,

0,1,2
,alpha,0.1
,fit_intercept,True
,copy_X,True
,max_iter,
,tol,0.0001
,solver,'auto'
,positive,False
,random_state,


In [11]:
# GridSearchCV was run with refit=True (the default), so best_estimator_ has
# already been refit on the full training set with the best parameters.
# Calling fit again would only repeat that work.
model = grid_search.best_estimator_
model

0,1,2
,steps,"[('preprocessor', ...), ('model', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,transformers,"[('num_prep', ...), ('nom_prep', ...), ...]"
,remainder,'drop'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,missing_values,
,strategy,'median'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,degree,4
,interaction_only,False
,include_bias,True
,order,'C'

0,1,2
,copy,True
,with_mean,True
,with_std,True

0,1,2
,verbose,0
,cols,
,mapping,
,drop_invariant,False
,return_df,True
,base,2
,handle_unknown,'value'
,handle_missing,'value'

0,1,2
,categories,"[['Fourth & Above', 'Third', ...]]"
,dtype,<class 'numpy.float64'>
,handle_unknown,'error'
,unknown_value,
,encoded_missing_value,
,min_frequency,
,max_categories,

0,1,2
,alpha,0.1
,fit_intercept,True
,copy_X,True
,max_iter,
,tol,0.0001
,solver,'auto'
,positive,False
,random_state,


* Grid Search CV    
    * Ridge + Polynomial Features : Hyperparameter Tuning with CV >> Best Estimator (1)
    * Decision Tree Regressor : Hyperparameter Tuning with CV >> Best Estimator (2)
    * Random Forest Regressor : Hyperparameter Tuning with CV >> Best Estimator (3)
> All of the above models are trained on the training set, with cross-validation used for hyperparameter tuning.

* Choose the Best Model >> Final Evaluation on Test Set

In [12]:
# Final evaluation of the tuned pipeline on the held-out test set
print('Test R2:', model.score(X_test, y_test))

Test R2: 0.8800445695076108


# Log Transformation

In [13]:
import numpy as np

# Log Transform for the target
def log_transform(x):
    """Return ``log(1 + x)`` element-wise, computed with ``np.log1p``.

    Applied to the target before fitting; ``np.expm1`` is the inverse.
    """
    transformed = np.log1p(x)
    return transformed

# Apply the same log1p transform to the training and test targets
y_train_log, y_test_log = (log_transform(target) for target in (y_train, y_test))

In [14]:
# Worked example of what the transform computes: log(x + 1) for x = 100
x = 100
np.log(x + 1)

4.61512051684126

In [15]:
from  plotly.subplots  import make_subplots
import plotly.graph_objects as go

# Side-by-side histograms of the target before and after the log transform
panels = [
    ('Before Log Transform', y_train),
    ('After Log Transform', y_train_log),
]

fig = make_subplots(rows=1, cols=2, subplot_titles=[title for title, _ in panels])

for col_idx, (title, values) in enumerate(panels, start=1):
    fig.add_trace(go.Histogram(x=values, name=title), row=1, col=col_idx)

fig.show()

In [16]:
# Repeat the same grid search, this time fitting against the log-transformed target.
# Both tuned parameters of `model` are overridden by param_grid for every candidate.
grid_search = GridSearchCV(
    estimator=model,
    param_grid=param_grid,
    scoring='r2',
    cv=5,
    n_jobs=-1,
    verbose=1,
)
grid_search.fit(X_train, y_train_log)

Fitting 5 folds for each of 12 candidates, totalling 60 fits


0,1,2
,estimator,Pipeline(step...(alpha=0.1))])
,param_grid,"{'model__alpha': [0.01, 0.1, ...], 'preprocessor__num_prep__poly__degree': [2, 3, ...]}"
,scoring,'r2'
,n_jobs,-1
,refit,True
,cv,5
,verbose,1
,pre_dispatch,'2*n_jobs'
,error_score,
,return_train_score,False

0,1,2
,transformers,"[('num_prep', ...), ('nom_prep', ...), ...]"
,remainder,'drop'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,missing_values,
,strategy,'median'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,degree,4
,interaction_only,False
,include_bias,True
,order,'C'

0,1,2
,copy,True
,with_mean,True
,with_std,True

0,1,2
,verbose,0
,cols,
,mapping,
,drop_invariant,False
,return_df,True
,base,2
,handle_unknown,'value'
,handle_missing,'value'

0,1,2
,categories,"[['Fourth & Above', 'Third', ...]]"
,dtype,<class 'numpy.float64'>
,handle_unknown,'error'
,unknown_value,
,encoded_missing_value,
,min_frequency,
,max_categories,

0,1,2
,alpha,0.1
,fit_intercept,True
,copy_X,True
,max_iter,
,tol,0.0001
,solver,'auto'
,positive,False
,random_state,


In [17]:
# Parameter combination selected when fitting against the log-transformed target
grid_search.best_params_

{'model__alpha': 0.1, 'preprocessor__num_prep__poly__degree': 4}

In [18]:
# Mean cross-validated R2 of the best candidate (target is log1p(price))
grid_search.best_score_

0.9142203214934896

In [19]:
# GridSearchCV was run with refit=True (the default), so best_estimator_ has
# already been refit on (X_train, y_train_log) with the best parameters.
# Calling fit again would only repeat that work.
best_model = grid_search.best_estimator_
best_model

0,1,2
,steps,"[('preprocessor', ...), ('model', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,transformers,"[('num_prep', ...), ('nom_prep', ...), ...]"
,remainder,'drop'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,missing_values,
,strategy,'median'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,degree,4
,interaction_only,False
,include_bias,True
,order,'C'

0,1,2
,copy,True
,with_mean,True
,with_std,True

0,1,2
,verbose,0
,cols,
,mapping,
,drop_invariant,False
,return_df,True
,base,2
,handle_unknown,'value'
,handle_missing,'value'

0,1,2
,categories,"[['Fourth & Above', 'Third', ...]]"
,dtype,<class 'numpy.float64'>
,handle_unknown,'error'
,unknown_value,
,encoded_missing_value,
,min_frequency,
,max_categories,

0,1,2
,alpha,0.1
,fit_intercept,True
,copy_X,True
,max_iter,
,tol,0.0001
,solver,'auto'
,positive,False
,random_state,


In [20]:
from sklearn.metrics import r2_score

# R2 measured on the log1p(price) scale. This is NOT directly comparable to
# the 'Test R2' of the model trained on raw prices, which is on the price scale.
print('Test R2:', best_model.score(X_test, y_test_log))

# Back-transform the predictions with expm1 so the score is on the original
# price scale and can be compared fairly with the raw-target model.
price_pred = np.expm1(best_model.predict(X_test))
print('Test R2 (original price scale):', r2_score(y_test, price_pred))

Test R2: 0.9147093700159457


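`Log-transforming the target variable improves the test R2 (0.915 vs. 0.880). Note that the second score is computed on log(price + 1), so the two values are not measured on the same scale.`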

In [21]:
# iloc[[0]] selects the first row as a one-row DataFrame and keeps each
# column's dtype; iloc[0].to_frame().T goes through a Series and turns every
# column (including the numeric ones) into object dtype.
best_model.predict(X_test.iloc[[0]])  # this is log(price + 1) not price

array([1.93923523])

In [22]:
# iloc[[0]] keeps a one-row DataFrame with the original column dtypes
# (iloc[0].to_frame().T would cast every column to object dtype).
log_price = best_model.predict(X_test.iloc[[0]])
np.expm1(log_price)  # convert back to price

array([5.95343116])

In [23]:
# Ground truth for the first test row, to compare against the prediction
y_test.iloc[0]  # actual price

5.1

# Saving 

In [24]:
import pickle  # or import joblib

# Model Saving
# The context manager closes (and flushes) the file even if dump raises;
# a bare open() passed to pickle.dump leaves the handle unclosed.
with open('../models/ml_model.pkl', 'wb') as model_file:
    pickle.dump(best_model, model_file)

In [25]:
# Inferencing

# The context manager closes the file once the model has been read.
# NOTE: pickle.load can execute arbitrary code -- only load files you trust.
with open('../models/ml_model.pkl', 'rb') as model_file:
    loaded_model = pickle.load(model_file)

In [26]:
# New Data
# Draw one random row from the held-out features to stand in for unseen input
new_data = X_test.sample(n=1)
new_data

Unnamed: 0,Location,Kilometers_Driven,Fuel_Type,Transmission,Owner_Type,Mileage,Engine,Power,Seats,Brand,Model,Age
5083,Chennai,97000,Diesel,Automatic,First,11.5,2982.0,169.0,7.0,Toyota,Fortuner,8


In [27]:
# Actual price of the sampled row, looked up by its index label
y_test.loc[new_data.index]

5083    16.75
Name: Price, dtype: float64

In [28]:
# Preprocessing & Prediction
# The loaded pipeline preprocesses the raw row itself and predicts log(price + 1)
log_price = loaded_model.predict(new_data)

# Undo the log1p transform to get back to the original price scale
price = np.expm1(log_price)
predicted_price = price[0]

print('Price:', predicted_price)

Price: 17.75447808732715
