# Hyperparameter Tunning and Cross Validation

In [214]:
from warnings import filterwarnings
filterwarnings('ignore')

In [215]:
import pandas  as pd
df = pd.read_csv('Datasets/Cars93.csv')
df.head()

Unnamed: 0,Id,Manufacturer,Model,Type,Min.Price,Price,Max.Price,MPG.city,MPG.highway,AirBags,...,Passengers,Length,Wheelbase,Width,Turn.circle,Rear.seat.room,Luggage.room,Weight,Origin,Make
0,1,Acura,Integra,Small,12.9,15.9,18.8,25,31,,...,5,177,102,68,37,26.5,11.0,2705,non-USA,Acura Integra
1,2,Acura,Legend,Midsize,29.2,33.9,38.7,18,25,Driver & Passenger,...,5,195,115,71,38,30.0,15.0,3560,non-USA,Acura Legend
2,3,Audi,90,Compact,25.9,29.1,32.3,20,26,Driver only,...,5,180,102,67,37,28.0,14.0,3375,non-USA,Audi 90
3,4,Audi,100,Midsize,30.8,37.7,44.6,19,26,,...,6,193,106,70,37,31.0,17.0,3405,non-USA,Audi 100
4,5,BMW,535i,Midsize,23.7,30.0,36.2,22,30,Driver only,...,4,186,109,69,39,27.0,13.0,3640,non-USA,BMW 535i


## Seprate X and Y features

In [216]:
X = df.drop(columns=['Id','Weight'])
Y = df[['Weight']]
X.head()

Unnamed: 0,Manufacturer,Model,Type,Min.Price,Price,Max.Price,MPG.city,MPG.highway,AirBags,DriveTrain,...,Fuel.tank.capacity,Passengers,Length,Wheelbase,Width,Turn.circle,Rear.seat.room,Luggage.room,Origin,Make
0,Acura,Integra,Small,12.9,15.9,18.8,25,31,,Front,...,13.2,5,177,102,68,37,26.5,11.0,non-USA,Acura Integra
1,Acura,Legend,Midsize,29.2,33.9,38.7,18,25,Driver & Passenger,Front,...,18.0,5,195,115,71,38,30.0,15.0,non-USA,Acura Legend
2,Audi,90,Compact,25.9,29.1,32.3,20,26,Driver only,Front,...,16.9,5,180,102,67,37,28.0,14.0,non-USA,Audi 90
3,Audi,100,Midsize,30.8,37.7,44.6,19,26,,Front,...,21.1,6,193,106,70,37,31.0,17.0,non-USA,Audi 100
4,BMW,535i,Midsize,23.7,30.0,36.2,22,30,Driver only,Rear,...,21.1,4,186,109,69,39,27.0,13.0,non-USA,BMW 535i


In [217]:
Y.head()

Unnamed: 0,Weight
0,2705
1,3560
2,3375
3,3405
4,3640


## Sepearate cat con features

In [218]:
from PM6func import catconsep
cat, con = catconsep(X)

In [219]:
X.isna().sum()

Manufacturer           0
Model                  0
Type                   0
Min.Price              0
Price                  0
Max.Price              0
MPG.city               0
MPG.highway            0
AirBags               37
DriveTrain             0
Cylinders              0
EngineSize             0
Horsepower             0
RPM                    0
Rev.per.mile           0
Man.trans.avail        0
Fuel.tank.capacity     0
Passengers             0
Length                 0
Wheelbase              0
Width                  0
Turn.circle            0
Rear.seat.room         2
Luggage.room          11
Origin                 0
Make                   0
dtype: int64

## Preprocess the data

In [220]:
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer

num_pipeline = Pipeline(steps=[('impute',SimpleImputer(strategy='mean')),
                               ('scaler',StandardScaler())])
cat_pipeline = Pipeline(steps=[('impute',SimpleImputer(strategy='most_frequent')),
                               ('ohe',OneHotEncoder(handle_unknown='ignore'))])
pre = ColumnTransformer([('num',num_pipeline,con),
                         ('cat',cat_pipeline,cat)])

In [221]:
X_pre = pre.fit_transform(X).toarray()
X_pre

array([[-0.48578741, -0.37572014, -0.28246529, ...,  0.        ,
         0.        ,  0.        ],
       [ 1.38801699,  1.49784409,  1.53140881, ...,  0.        ,
         0.        ,  0.        ],
       [ 1.00865782,  0.99822696,  0.94805231, ...,  0.        ,
         0.        ,  0.        ],
       ...,
       [ 0.66378585,  0.39452293,  0.16416702, ...,  0.        ,
         0.        ,  0.        ],
       [ 0.53733279,  0.33207079,  0.14593713, ...,  0.        ,
         1.        ,  0.        ],
       [ 0.88220476,  0.7484184 ,  0.60168439, ...,  0.        ,
         0.        ,  1.        ]], shape=(93, 256))

In [222]:
cols = pre.get_feature_names_out()
cols

array(['num__Min.Price', 'num__Price', 'num__Max.Price', 'num__MPG.city',
       'num__MPG.highway', 'num__EngineSize', 'num__Horsepower',
       'num__RPM', 'num__Rev.per.mile', 'num__Fuel.tank.capacity',
       'num__Passengers', 'num__Length', 'num__Wheelbase', 'num__Width',
       'num__Turn.circle', 'num__Rear.seat.room', 'num__Luggage.room',
       'cat__Manufacturer_Acura', 'cat__Manufacturer_Audi',
       'cat__Manufacturer_BMW', 'cat__Manufacturer_Buick',
       'cat__Manufacturer_Cadillac', 'cat__Manufacturer_Chevrolet',
       'cat__Manufacturer_Chrylser', 'cat__Manufacturer_Chrysler',
       'cat__Manufacturer_Dodge', 'cat__Manufacturer_Eagle',
       'cat__Manufacturer_Ford', 'cat__Manufacturer_Geo',
       'cat__Manufacturer_Honda', 'cat__Manufacturer_Hyundai',
       'cat__Manufacturer_Infiniti', 'cat__Manufacturer_Lexus',
       'cat__Manufacturer_Lincoln', 'cat__Manufacturer_Mazda',
       'cat__Manufacturer_Mercedes-Benz', 'cat__Manufacturer_Mercury',
       'cat__Man

In [223]:
X_pre = pd.DataFrame(X_pre,columns=cols)
X_pre

Unnamed: 0,num__Min.Price,num__Price,num__Max.Price,num__MPG.city,num__MPG.highway,num__EngineSize,num__Horsepower,num__RPM,num__Rev.per.mile,num__Fuel.tank.capacity,...,cat__Make_Toyota Camry,cat__Make_Toyota Celica,cat__Make_Toyota Previa,cat__Make_Toyota Tercel,cat__Make_Volkswagen Corrado,cat__Make_Volkswagen Eurovan,cat__Make_Volkswagen Fox,cat__Make_Volkswagen Passat,cat__Make_Volvo 240,cat__Make_Volvo 850
0,-0.485787,-0.375720,-0.282465,0.471312,0.360925,-0.841022,-0.073484,1.717489,1.129530,-1.062184,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1.388017,1.497844,1.531409,-0.781032,-0.770514,0.515869,1.078322,0.369586,0.005661,0.409445,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,1.008658,0.998227,0.948052,-0.423219,-0.581941,0.128186,0.540813,0.369586,-0.105713,0.072197,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,1.571949,1.893374,2.069191,-0.602126,-0.581941,0.128186,0.540813,0.369586,0.410659,1.359872,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.755752,1.091905,1.303535,-0.065407,0.172352,0.806631,1.231897,0.706562,0.430909,1.359872,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
88,-0.060445,0.019810,0.073018,-0.959938,-1.524806,-0.162577,-0.668585,-1.315292,1.180155,1.359872,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
89,0.054512,0.051036,0.045673,-0.244313,0.172352,-0.647181,-0.188665,0.875050,0.714407,0.562740,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
90,0.663786,0.394523,0.164167,-0.781032,-0.770514,0.128186,0.655993,0.875050,0.106911,0.562740,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
91,0.537333,0.332071,0.145937,-0.244313,-0.204794,-0.356418,-0.572601,0.201098,-0.237337,-0.265051,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0


## Train test split

In [224]:
from sklearn.model_selection import train_test_split
xtrain, xtest, ytrain, ytest = train_test_split(X_pre,Y, test_size=0.2, random_state=10)
xtrain.shape


(74, 256)

In [225]:
xtest.shape

(19, 256)

## Create Simple Model

In [226]:
from sklearn.linear_model import LinearRegression
model = LinearRegression()
model.fit(xtrain,ytrain)

0,1,2
,fit_intercept,True
,copy_X,True
,tol,1e-06
,n_jobs,
,positive,False


In [227]:
from PM6func import evaluate_model
evaluate_model(xtrain,xtest,ytrain,ytest,model)

Metrics of Train Data
MSE: 0.00
MAE: 0.00
RMSE: 0.00
R2: 1.00


Metrics of Test Data
MSE: 35017.14
MAE: 142.93
RMSE: 187.13
R2: 0.85


## cross validation

In [228]:
from sklearn.model_selection import cross_val_score

r2_cv=cross_val_score(model,xtrain,ytrain,cv=5,scoring='r2')
r2_cv

array([0.97234143, 0.9603542 , 0.88368575, 0.9585701 , 0.93252518])

In [229]:
r2_cv.mean()

np.float64(0.9414953301243653)

## MSE cross validation

In [230]:
mse_cv = cross_val_score(model,xtrain,ytrain,cv=5,scoring='neg_mean_squared_error')
mse_cv

array([-10059.80950465, -18202.20595306, -37647.30549391, -17814.56223279,
       -17937.87230003])

In [231]:
-mse_cv.mean()

np.float64(20332.351096887276)

## RMSE cross validation

In [232]:
rmse_cv = cross_val_score(model,xtrain,ytrain,cv=5,scoring='neg_root_mean_squared_error')
rmse_cv

array([-100.29860171, -134.91555119, -194.02913568, -133.47120376,
       -133.93234225])

In [233]:
-rmse_cv.mean()

np.float64(139.32936691772403)

## MAE cross validation

In [234]:
mae_cv = cross_val_score(model,xtrain,ytrain,cv=5,scoring='neg_mean_absolute_error')
mae_cv

array([ -78.8406222 , -103.43081043, -157.39604617, -105.28839536,
       -101.68892746])

In [235]:
-mae_cv.mean()

np.float64(109.32896032358869)

## Hyperparameter Tuning

In [236]:
import numpy as np
params={'alpha': np.arange(0.1,100,0.1)}
params

{'alpha': array([ 0.1,  0.2,  0.3,  0.4,  0.5,  0.6,  0.7,  0.8,  0.9,  1. ,  1.1,
         1.2,  1.3,  1.4,  1.5,  1.6,  1.7,  1.8,  1.9,  2. ,  2.1,  2.2,
         2.3,  2.4,  2.5,  2.6,  2.7,  2.8,  2.9,  3. ,  3.1,  3.2,  3.3,
         3.4,  3.5,  3.6,  3.7,  3.8,  3.9,  4. ,  4.1,  4.2,  4.3,  4.4,
         4.5,  4.6,  4.7,  4.8,  4.9,  5. ,  5.1,  5.2,  5.3,  5.4,  5.5,
         5.6,  5.7,  5.8,  5.9,  6. ,  6.1,  6.2,  6.3,  6.4,  6.5,  6.6,
         6.7,  6.8,  6.9,  7. ,  7.1,  7.2,  7.3,  7.4,  7.5,  7.6,  7.7,
         7.8,  7.9,  8. ,  8.1,  8.2,  8.3,  8.4,  8.5,  8.6,  8.7,  8.8,
         8.9,  9. ,  9.1,  9.2,  9.3,  9.4,  9.5,  9.6,  9.7,  9.8,  9.9,
        10. , 10.1, 10.2, 10.3, 10.4, 10.5, 10.6, 10.7, 10.8, 10.9, 11. ,
        11.1, 11.2, 11.3, 11.4, 11.5, 11.6, 11.7, 11.8, 11.9, 12. , 12.1,
        12.2, 12.3, 12.4, 12.5, 12.6, 12.7, 12.8, 12.9, 13. , 13.1, 13.2,
        13.3, 13.4, 13.5, 13.6, 13.7, 13.8, 13.9, 14. , 14.1, 14.2, 14.3,
        14.4, 14.5, 14.6, 14.

In [237]:
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import Ridge

rr = Ridge()
gscv = GridSearchCV(rr, param_grid= params, cv=5, scoring='neg_mean_squared_error')
gscv.fit(xtrain,ytrain)

0,1,2
,estimator,Ridge()
,param_grid,"{'alpha': array([ 0.1, ..., 99.8, 99.9])}"
,scoring,'neg_mean_squared_error'
,n_jobs,
,refit,True
,cv,5
,verbose,0
,pre_dispatch,'2*n_jobs'
,error_score,
,return_train_score,False

0,1,2
,alpha,np.float64(3.3000000000000003)
,fit_intercept,True
,copy_X,True
,max_iter,
,tol,0.0001
,solver,'auto'
,positive,False
,random_state,


In [238]:
gscv.best_params_

{'alpha': np.float64(3.3000000000000003)}

In [239]:
gscv.best_score_

np.float64(-19485.692686101156)

In [240]:
best_ridge = gscv.best_estimator_
best_ridge

0,1,2
,alpha,np.float64(3.3000000000000003)
,fit_intercept,True
,copy_X,True
,max_iter,
,tol,0.0001
,solver,'auto'
,positive,False
,random_state,


In [241]:
best_ridge.score(xtrain,ytrain)

0.9916243669101344

In [242]:
best_ridge.score(xtest,ytest)

0.882535175103706

In [243]:
evaluate_model(xtrain, xtest, ytrain, ytest, best_ridge)

Metrics of Train Data
MSE: 3115.02
MAE: 42.27
RMSE: 55.81
R2: 0.99


Metrics of Test Data
MSE: 27291.18
MAE: 129.92
RMSE: 165.20
R2: 0.88


## Use lasso

In [244]:
from sklearn.linear_model import Lasso
ls = Lasso()
gscv2 = GridSearchCV(ls, param_grid=params, cv=5 , scoring='neg_mean_squared_error')
gscv2.fit(xtrain,ytrain)

0,1,2
,estimator,Lasso()
,param_grid,"{'alpha': array([ 0.1, ..., 99.8, 99.9])}"
,scoring,'neg_mean_squared_error'
,n_jobs,
,refit,True
,cv,5
,verbose,0
,pre_dispatch,'2*n_jobs'
,error_score,
,return_train_score,False

0,1,2
,alpha,np.float64(4.7)
,fit_intercept,True
,precompute,False
,copy_X,True
,max_iter,1000
,tol,0.0001
,warm_start,False
,positive,False
,random_state,
,selection,'cyclic'


In [245]:
gscv2.best_score_

np.float64(-20339.424655870538)

In [246]:
gscv2.best_params_

{'alpha': np.float64(4.7)}

In [247]:
best_lasso = gscv2.best_estimator_
best_lasso

0,1,2
,alpha,np.float64(4.7)
,fit_intercept,True
,precompute,False
,copy_X,True
,max_iter,1000
,tol,0.0001
,warm_start,False
,positive,False
,random_state,
,selection,'cyclic'


In [248]:
evaluate_model(xtrain,xtest,ytrain,ytest,best_lasso)

Metrics of Train Data
MSE: 10634.03
MAE: 82.63
RMSE: 103.12
R2: 0.97


Metrics of Test Data
MSE: 30737.45
MAE: 132.42
RMSE: 175.32
R2: 0.87


In [249]:
ypred_tr = best_ridge.predict(xtrain)
ypred_ts = best_ridge.predict(xtest)

In [250]:
ypred_tr[0:5]

array([3026.59948831, 2477.65873578, 3510.02215448, 3523.16230693,
       3078.93839347])

In [251]:
ytrain.head()

Unnamed: 0,Weight
42,3040
53,2440
21,3570
6,3470
26,3080


In [252]:
ypred_ts[0:5]

array([2900.55862069, 3168.8632676 , 3576.33227981, 3984.44889469,
       3475.47249895])

In [253]:
ytest.head()

Unnamed: 0,Weight
34,2710
90,2810
3,3405
35,3735
19,3515


## Test model on Sample dataset

In [254]:
sample = pd.read_csv('Datasets/sample.csv')
sample.head()

Unnamed: 0,Id,Manufacturer,Model,Type,Min.Price,Price,Max.Price,MPG.city,MPG.highway,AirBags,...,Passengers,Length,Wheelbase,Width,Turn.circle,Rear.seat.room,Luggage.room,Weight,Origin,Make
0,94,Honda,PreludeS,Sporty,17.9,18.6,20.5,26,32,Driver & Passenger,...,4,188,102,68,36,22.6,7.7,2997,non-USA,Honda PreludeS
1,95,Dodge,ColtZ,Small,8.0,9.0,11.6,31,34,,...,5,172,92,71,32,26.8,11.4,2106,USA,Dodge ColtZ
2,96,Mazda,MPVX,Van,16.7,18.0,23.6,18,25,,...,7,179,108,78,38,25.4,,3628,non-USA,Mazda MPVX
3,97,Pontiac,LeMansPlus,Small,7.8,9.3,8.9,30,39,,...,4,168,92,62,35,25.0,15.5,2234,USA,Pontiac LeMansPlus
4,98,Acura,IntegraS,Small,12.3,15.9,18.1,24,28,,...,5,168,95,68,41,25.1,11.4,2847,non-USA,Acura IntegraS


In [255]:
X_smp = sample.drop(columns=['Id','Weight'])
Y_smp = sample[['Weight']]

In [259]:
X_smpnew = pre.transform(X_smp).toarray()
X_smpnew

array([[ 0.08899922, -0.0946855 , -0.12751122, ...,  0.        ,
         0.        ,  0.        ],
       [-1.0490783 , -1.09391976, -0.93874135, ...,  0.        ,
         0.        ,  0.        ],
       [-0.04894957, -0.15713765,  0.15505208, ...,  0.        ,
         0.        ,  0.        ],
       ...,
       [-0.94561671, -1.02105893, -0.96608618, ...,  0.        ,
         0.        ,  0.        ],
       [-1.06057403, -0.91697203, -1.01166091, ...,  0.        ,
         0.        ,  0.        ],
       [-0.22138556,  0.12389699,  0.4011556 , ...,  0.        ,
         0.        ,  0.        ]], shape=(12, 256))

In [257]:
# new_cols = pre.get_feature_names_out()
# new_cols

In [260]:
X_smpnew = pd.DataFrame(X_smpnew, columns=[cols])
X_smpnew
# len(cols)

Unnamed: 0,num__Min.Price,num__Price,num__Max.Price,num__MPG.city,num__MPG.highway,num__EngineSize,num__Horsepower,num__RPM,num__Rev.per.mile,num__Fuel.tank.capacity,...,cat__Make_Toyota Camry,cat__Make_Toyota Celica,cat__Make_Toyota Previa,cat__Make_Toyota Tercel,cat__Make_Volkswagen Corrado,cat__Make_Volkswagen Eurovan,cat__Make_Volkswagen Fox,cat__Make_Volkswagen Passat,cat__Make_Volvo 240,cat__Make_Volvo 850
0,0.088999,-0.094686,-0.127511,0.650219,0.549498,-0.259498,0.483222,0.04272,0.894631,-0.6023,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,-1.049078,-1.09392,-0.938741,1.544751,0.926644,-1.228705,-1.071717,0.283657,2.208849,-1.368773,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,-0.04895,-0.157138,0.155052,-0.781032,-0.770514,0.515869,0.291254,-0.816568,-0.545136,0.83867,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,-1.07207,-1.062694,-1.184845,1.365844,1.86951,-1.131784,-1.321275,0.509431,1.858526,-1.246137,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,-0.554762,-0.37572,-0.34627,0.292406,-0.204794,-0.841022,-0.073484,0.765532,0.870331,-0.724935,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,-0.359334,-0.313268,-0.510339,0.1135,-0.581941,0.031265,-0.726175,-1.374263,0.27701,-0.01978,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,-0.600745,-0.760842,-0.656178,1.008032,1.115217,-1.131784,-0.879749,0.888529,1.540603,-1.18482,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,-0.566258,-0.687981,-0.665293,0.650219,0.738071,-0.356418,0.444829,1.921359,0.515958,-0.173075,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,2.503103,2.268087,1.677248,-0.959938,-0.959087,1.678918,3.189968,1.515303,-0.421611,1.206577,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,-0.945617,-1.021059,-0.966086,0.1135,0.360925,-0.937943,-0.380633,1.356925,1.13358,-1.307455,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [261]:
y_pred = best_ridge.predict(X_smpnew)
y_pred

array([2920.58449537, 2263.89572259, 3699.62098043, 2077.40243205,
       2777.22834489, 3099.035821  , 2553.27278297, 2945.31962564,
       3723.92264155, 2634.95280849, 2163.90582354, 4139.84756429])