In [1]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
%matplotlib inline
import seaborn as sns 

In [2]:
df = pd.read_csv("C:\\Users\\emman\\Desktop\\ML-Zoomcamp  projects\\car price prediction data.csv")
df.head()

Unnamed: 0,Make,Model,Year,Engine Fuel Type,Engine HP,Engine Cylinders,Transmission Type,Driven_Wheels,Number of Doors,Market Category,Vehicle Size,Vehicle Style,highway MPG,city mpg,Popularity,MSRP
0,BMW,1 Series M,2011,premium unleaded (required),335.0,6.0,MANUAL,rear wheel drive,2.0,"Factory Tuner,Luxury,High-Performance",Compact,Coupe,26,19,3916,46135
1,BMW,1 Series,2011,premium unleaded (required),300.0,6.0,MANUAL,rear wheel drive,2.0,"Luxury,Performance",Compact,Convertible,28,19,3916,40650
2,BMW,1 Series,2011,premium unleaded (required),300.0,6.0,MANUAL,rear wheel drive,2.0,"Luxury,High-Performance",Compact,Coupe,28,20,3916,36350
3,BMW,1 Series,2011,premium unleaded (required),230.0,6.0,MANUAL,rear wheel drive,2.0,"Luxury,Performance",Compact,Coupe,28,18,3916,29450
4,BMW,1 Series,2011,premium unleaded (required),230.0,6.0,MANUAL,rear wheel drive,2.0,Luxury,Compact,Convertible,28,18,3916,34500


In [3]:
df.columns

Index(['Make', 'Model', 'Year', 'Engine Fuel Type', 'Engine HP',
       'Engine Cylinders', 'Transmission Type', 'Driven_Wheels',
       'Number of Doors', 'Market Category', 'Vehicle Size', 'Vehicle Style',
       'highway MPG', 'city mpg', 'Popularity', 'MSRP'],
      dtype='object')

In [4]:
data = df.drop(['Engine Fuel Type','Driven_Wheels',
       'Number of Doors', 'Market Category', 'Vehicle Size', 'Popularity'], axis=1)

In [5]:
data.head()

Unnamed: 0,Make,Model,Year,Engine HP,Engine Cylinders,Transmission Type,Vehicle Style,highway MPG,city mpg,MSRP
0,BMW,1 Series M,2011,335.0,6.0,MANUAL,Coupe,26,19,46135
1,BMW,1 Series,2011,300.0,6.0,MANUAL,Convertible,28,19,40650
2,BMW,1 Series,2011,300.0,6.0,MANUAL,Coupe,28,20,36350
3,BMW,1 Series,2011,230.0,6.0,MANUAL,Coupe,28,18,29450
4,BMW,1 Series,2011,230.0,6.0,MANUAL,Convertible,28,18,34500


# Data Preparation:

#Select only the features from above and transform their names and records to lower cases

In [6]:
data.columns = data.columns.str.lower().str.replace(' ','_')
data.head()

Unnamed: 0,make,model,year,engine_hp,engine_cylinders,transmission_type,vehicle_style,highway_mpg,city_mpg,msrp
0,BMW,1 Series M,2011,335.0,6.0,MANUAL,Coupe,26,19,46135
1,BMW,1 Series,2011,300.0,6.0,MANUAL,Convertible,28,19,40650
2,BMW,1 Series,2011,300.0,6.0,MANUAL,Coupe,28,20,36350
3,BMW,1 Series,2011,230.0,6.0,MANUAL,Coupe,28,18,29450
4,BMW,1 Series,2011,230.0,6.0,MANUAL,Convertible,28,18,34500


In [7]:
categorical_features = list(data.dtypes[data.dtypes == 'object'].index)
categorical_features

['make', 'model', 'transmission_type', 'vehicle_style']

In [8]:
for c in categorical_features:
    data[c] = data[c].str.lower().str.replace(' ','_')

In [9]:
data.head()

Unnamed: 0,make,model,year,engine_hp,engine_cylinders,transmission_type,vehicle_style,highway_mpg,city_mpg,msrp
0,bmw,1_series_m,2011,335.0,6.0,manual,coupe,26,19,46135
1,bmw,1_series,2011,300.0,6.0,manual,convertible,28,19,40650
2,bmw,1_series,2011,300.0,6.0,manual,coupe,28,20,36350
3,bmw,1_series,2011,230.0,6.0,manual,coupe,28,18,29450
4,bmw,1_series,2011,230.0,6.0,manual,convertible,28,18,34500


#Fill in the missing values of the selected features with 0.

In [10]:
data.isnull().sum()             # missing values of features

make                  0
model                 0
year                  0
engine_hp            69
engine_cylinders     30
transmission_type     0
vehicle_style         0
highway_mpg           0
city_mpg              0
msrp                  0
dtype: int64

In [11]:
data = data.fillna(0)               # filled values of features
data.isnull().sum()

make                 0
model                0
year                 0
engine_hp            0
engine_cylinders     0
transmission_type    0
vehicle_style        0
highway_mpg          0
city_mpg             0
msrp                 0
dtype: int64

#Rename MSRP variable to price.

In [12]:
data = data.rename(columns={'msrp':'price'})
data.head()

Unnamed: 0,make,model,year,engine_hp,engine_cylinders,transmission_type,vehicle_style,highway_mpg,city_mpg,price
0,bmw,1_series_m,2011,335.0,6.0,manual,coupe,26,19,46135
1,bmw,1_series,2011,300.0,6.0,manual,convertible,28,19,40650
2,bmw,1_series,2011,300.0,6.0,manual,coupe,28,20,36350
3,bmw,1_series,2011,230.0,6.0,manual,coupe,28,18,29450
4,bmw,1_series,2011,230.0,6.0,manual,convertible,28,18,34500


#1. What is the most frequent observation (mode) for the column transmission_type?

In [13]:
data['transmission_type'].mode()

0    automatic
Name: transmission_type, dtype: object

#2. Create the correlation matrix for the numerical features of your dataset. In a correlation matrix, you compute the correlation coefficient between every pair of features in the dataset. What are the two features that have the biggest correlation in this dataset?

In [14]:
data.dtypes

make                  object
model                 object
year                   int64
engine_hp            float64
engine_cylinders     float64
transmission_type     object
vehicle_style         object
highway_mpg            int64
city_mpg               int64
price                  int64
dtype: object

In [15]:
numerical_features = ['year','engine_hp','engine_cylinders','highway_mpg','city_mpg']
numerical_features

['year', 'engine_hp', 'engine_cylinders', 'highway_mpg', 'city_mpg']

In [16]:
data[numerical_features].corrwith(data['price'])

year                0.227590
engine_hp           0.650095
engine_cylinders    0.526274
highway_mpg        -0.160043
city_mpg           -0.157676
dtype: float64

#Now we need to turn the price variable from numeric into a binary format.

#Let's create a variable above_average which is 1 if the price is above its mean value and 0 otherwise.

In [17]:
data['price'].mean()

40594.737032063116

In [18]:
data['above_average'] = data['price'] / data['price'].mean()
data['above_average'] = (data['above_average'] > 1).astype(int)
data.sample(7)

Unnamed: 0,make,model,year,engine_hp,engine_cylinders,transmission_type,vehicle_style,highway_mpg,city_mpg,price,above_average
10090,pontiac,sunfire,2004,140.0,4.0,manual,coupe,33,23,14930,0
412,bmw,4_series_gran_coupe,2015,240.0,4.0,automatic,sedan,34,23,40300,0
2671,chevrolet,c/k_2500_series,1998,255.0,8.0,manual,extended_cab_pickup,18,12,3360,0
1991,pontiac,bonneville,2005,275.0,8.0,automatic,sedan,22,15,35585,0
6447,subaru,loyale,1993,90.0,4.0,manual,wagon,26,21,2000,0
1773,subaru,b9_tribeca,2007,245.0,6.0,automatic,4dr_suv,21,16,35495,0
7774,infiniti,q50,2015,328.0,6.0,automatic,sedan,29,20,43650,1


#Split your data in train/val/test sets with 60%/20%/20% distribution.

#Use Scikit-Learn for that (the train_test_split function) and set the seed to 42.

#Make sure that the target value (above_average) is not in your dataframe.

In [19]:
from sklearn.model_selection import train_test_split

In [20]:
data_full_train, data_test = train_test_split(data, test_size=0.2, random_state=42)

In [21]:
len(data_full_train), len(data_test)

(9531, 2383)

In [22]:
data_train, data_val = train_test_split(data_full_train, test_size=0.25, random_state=42)

In [23]:
len(data_train), len(data_val)

(7148, 2383)

In [24]:
data_train = data_train.reset_index(drop=True)

In [25]:
data_val = data_val.reset_index(drop=True)

In [26]:
data_test = data_test.reset_index(drop=True)

In [27]:
data_train.head()

Unnamed: 0,make,model,year,engine_hp,engine_cylinders,transmission_type,vehicle_style,highway_mpg,city_mpg,price,above_average
0,mitsubishi,endeavor,2011,225.0,6.0,automatic,4dr_suv,19,15,33599,0
1,kia,borrego,2009,276.0,6.0,automatic,4dr_suv,21,17,26245,0
2,lamborghini,gallardo,2012,570.0,10.0,manual,convertible,20,12,248000,1
3,chevrolet,colorado,2016,200.0,4.0,automatic,crew_cab_pickup,27,20,24990,0
4,pontiac,vibe,2009,158.0,4.0,automatic,4dr_hatchback,26,20,20475,0


In [28]:
data_val.head()

Unnamed: 0,make,model,year,engine_hp,engine_cylinders,transmission_type,vehicle_style,highway_mpg,city_mpg,price,above_average
0,volkswagen,beetle,2015,210.0,4.0,manual,2dr_hatchback,31,23,28675,0
1,audi,sq5,2015,354.0,6.0,automatic,4dr_suv,24,17,60200,1
2,pontiac,grand_am,2005,140.0,4.0,automatic,sedan,31,22,20090,0
3,nissan,350z,2009,306.0,6.0,manual,convertible,24,17,39220,0
4,ford,e-150,1996,199.0,6.0,automatic,passenger_van,15,11,2000,0


In [29]:
data_test.head()

Unnamed: 0,make,model,year,engine_hp,engine_cylinders,transmission_type,vehicle_style,highway_mpg,city_mpg,price,above_average
0,gmc,envoy_xl,2005,275.0,6.0,automatic,4dr_suv,18,13,29695,0
1,volkswagen,passat,2016,170.0,4.0,automatic,sedan,38,25,30495,0
2,honda,odyssey,2016,248.0,6.0,automatic,passenger_minivan,28,19,37650,0
3,chevrolet,cruze,2015,138.0,4.0,manual,sedan,36,25,16170,0
4,volvo,740,1991,162.0,4.0,automatic,sedan,20,17,2000,0


In [30]:
y_train = data_train['above_average'].values
y_val = data_val['above_average'].values
y_test = data_test['above_average']. values

In [31]:
del data_train['above_average']
del data_val['above_average']
del data_test['above_average']

In [32]:
data_full_train = data_full_train.reset_index(drop=True)
data_full_train.isnull().sum()

make                 0
model                0
year                 0
engine_hp            0
engine_cylinders     0
transmission_type    0
vehicle_style        0
highway_mpg          0
city_mpg             0
price                0
above_average        0
dtype: int64

In [33]:
data_full_train.head()

Unnamed: 0,make,model,year,engine_hp,engine_cylinders,transmission_type,vehicle_style,highway_mpg,city_mpg,price,above_average
0,cadillac,ct6,2016,265.0,4.0,automatic,sedan,31,22,53495,1
1,mercedes-benz,gls-class,2017,449.0,8.0,automatic,4dr_suv,18,14,93850,1
2,kia,forte,2016,173.0,4.0,automatic,coupe,34,25,19890,0
3,dodge,ram_250,1993,180.0,6.0,manual,regular_cab_pickup,16,11,2000,0
4,hyundai,tiburon,2008,172.0,6.0,automatic,2dr_hatchback,24,17,21270,0


In [34]:
data_full_train['above_average'].value_counts()

0    6893
1    2638
Name: above_average, dtype: int64

In [35]:
data_full_train['above_average'].value_counts(normalize=True)

0    0.723219
1    0.276781
Name: above_average, dtype: float64

#3. Calculate the mutual information score between above_average and other categorical variables in our dataset. Use the training set only. Round the scores to 2 decimals using round(score, 2).

In [36]:
global_above_avg_rate = data_full_train['above_average'].mean()
round(global_above_avg_rate,2)

0.28

In [37]:
categorical_features = list(data_full_train.dtypes[data_full_train.dtypes == 'object'].index)
categorical_features

['make', 'model', 'transmission_type', 'vehicle_style']

In [38]:
from IPython.display import display

In [39]:
for x in categorical_features:
    print(x)
    data_group = data_full_train.groupby(x)['above_average'].agg(['mean','count'])
    data_group['diff'] = data_group['mean'] - global_above_avg_rate
    data_group['risk'] = data_group['mean'] / global_above_avg_rate
    display(data_group)
    print()
    print()

make


Unnamed: 0_level_0,mean,count,diff,risk
make,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
acura,0.391089,202,0.114308,1.412991
alfa_romeo,1.0,5,0.723219,3.612964
aston_martin,1.0,74,0.723219,3.612964
audi,0.654412,272,0.377631,2.364366
bentley,1.0,55,0.723219,3.612964
bmw,0.822064,281,0.545283,2.970088
bugatti,1.0,3,0.723219,3.612964
buick,0.123377,154,-0.153404,0.445755
cadillac,0.881988,322,0.605207,3.18659
chevrolet,0.181313,899,-0.095468,0.655076




model


Unnamed: 0_level_0,mean,count,diff,risk
model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
100,0.000000,11,-0.276781,0.000000
124_spider,0.000000,2,-0.276781,0.000000
190-class,0.000000,4,-0.276781,0.000000
1_series,0.416667,12,0.139886,1.505402
2,0.000000,10,-0.276781,0.000000
...,...,...,...,...
z3,0.000000,8,-0.276781,0.000000
z4,1.000000,6,0.723219,3.612964
z4_m,1.000000,4,0.723219,3.612964
z8,1.000000,2,0.723219,3.612964




transmission_type


Unnamed: 0_level_0,mean,count,diff,risk
transmission_type,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
automated_manual,0.474206,504,0.197425,1.713291
automatic,0.310017,6619,0.033236,1.120079
direct_drive,0.458333,48,0.181552,1.655942
manual,0.138652,2344,-0.138129,0.500944
unknown,0.0,16,-0.276781,0.0




vehicle_style


Unnamed: 0_level_0,mean,count,diff,risk
vehicle_style,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2dr_hatchback,0.0,421,-0.276781,0.0
2dr_suv,0.035398,113,-0.241383,0.127893
4dr_hatchback,0.046099,564,-0.230682,0.166555
4dr_suv,0.374619,1970,0.097838,1.353486
cargo_minivan,0.0,60,-0.276781,0.0
cargo_van,0.0,73,-0.276781,0.0
convertible,0.55538,632,0.278599,2.006567
convertible_suv,0.153846,26,-0.122935,0.555841
coupe,0.496257,935,0.219476,1.792958
crew_cab_pickup,0.337017,543,0.060236,1.217629






In [40]:
from sklearn.metrics import mutual_info_score

In [41]:
score = data_full_train[categorical_features].apply(lambda categorical_features : mutual_info_score(data_full_train['above_average'], categorical_features))
round(score.sort_values(ascending=False),2)

model                0.46
make                 0.24
vehicle_style        0.08
transmission_type    0.02
dtype: float64

#4. Now let's train a logistic regression.

#Remember that we have several categorical variables in the dataset. Include them using one-hot encoding.

#Fit the model on the training dataset.

#To make sure the results are reproducible across different versions of Scikit-Learn, fit the model with these parameters:

#model = LogisticRegression(solver='liblinear', C=10, max_iter=1000, random_state=42)

#Calculate the accuracy on the validation dataset and round it to 2 decimal digits.

In [42]:
dict_train = data_train[categorical_features + numerical_features].to_dict(orient='records')

In [87]:
from sklearn.feature_extraction import DictVectorizer

In [88]:
dv = DictVectorizer(sparse=False)

In [89]:
X_train = dv.fit_transform(dict_train)
X_train

array([[1.500e+01, 6.000e+00, 2.250e+02, ..., 0.000e+00, 0.000e+00,
        2.011e+03],
       [1.700e+01, 6.000e+00, 2.760e+02, ..., 0.000e+00, 0.000e+00,
        2.009e+03],
       [1.200e+01, 1.000e+01, 5.700e+02, ..., 0.000e+00, 0.000e+00,
        2.012e+03],
       ...,
       [1.700e+01, 6.000e+00, 2.600e+02, ..., 0.000e+00, 0.000e+00,
        2.012e+03],
       [1.900e+01, 4.000e+00, 1.360e+02, ..., 0.000e+00, 0.000e+00,
        1.993e+03],
       [1.700e+01, 6.000e+00, 3.650e+02, ..., 1.000e+00, 0.000e+00,
        2.015e+03]])

In [90]:
dict_val = data_val[categorical_features + numerical_features].to_dict(orient='records')

In [91]:
X_val = dv.transform(dict_val)
X_val

array([[2.300e+01, 4.000e+00, 2.100e+02, ..., 0.000e+00, 0.000e+00,
        2.015e+03],
       [1.700e+01, 6.000e+00, 3.540e+02, ..., 0.000e+00, 0.000e+00,
        2.015e+03],
       [2.200e+01, 4.000e+00, 1.400e+02, ..., 1.000e+00, 0.000e+00,
        2.005e+03],
       ...,
       [1.200e+01, 6.000e+00, 1.900e+02, ..., 0.000e+00, 0.000e+00,
        2.003e+03],
       [1.400e+01, 8.000e+00, 4.300e+02, ..., 0.000e+00, 0.000e+00,
        2.015e+03],
       [1.800e+01, 6.000e+00, 3.210e+02, ..., 1.000e+00, 0.000e+00,
        2.015e+03]])

In [92]:
from sklearn.linear_model import LogisticRegression

In [93]:
LogReg_model = LogisticRegression(solver='liblinear', C=10, max_iter=1000, random_state=42)

In [94]:
LogReg_model.fit(X_train, y_train)

LogisticRegression(C=10, max_iter=1000, random_state=42, solver='liblinear')

In [102]:
y_pred = LogReg_model.predict_proba(X_val)[:,1]
y_pred

array([4.47467556e-04, 9.97408254e-01, 9.69186415e-05, ...,
       1.16567405e-04, 9.89365634e-01, 9.89637335e-01])

In [103]:
y_pred = y_pred >= 0.5

In [106]:
accuracy = (y_val == y_pred).mean()
round(accuracy,2)

0.95

In [107]:
from sklearn.metrics import r2_score

In [109]:
round(r2_score(y_val, y_pred),2)      #sklearn to confirm the % of how well regression prediction fit the data.

0.73

In [110]:
from sklearn.metrics import accuracy_score    

In [112]:
round(accuracy_score(y_val, y_pred),2)         #using sklearn to confirm the accuracy of the model.

0.95

#5. Let's find the least useful feature using the feature elimination technique.

#Train a model with all these features (using the same parameters as in Q4).

#Now exclude each feature from this set and train a model without it. Record the accuracy for each model.

#For each feature, calculate the difference between the original accuracy and the accuracy without the feature.

In [113]:
categorical_features

['make', 'model', 'transmission_type', 'vehicle_style']

In [114]:
numerical_features

['year', 'engine_hp', 'engine_cylinders', 'highway_mpg', 'city_mpg']

In [115]:
feature_no_year = ['make', 'model', 'transmission_type', 'vehicle_style','engine_hp', 'engine_cylinders', 
                   'highway_mpg', 'city_mpg']

In [116]:
dict_no_year = data_train[feature_no_year].to_dict(orient='records')

In [117]:
X_no_year = dv.fit_transform(dict_no_year)

In [118]:
model_year = LogReg_model.fit(X_no_year,y_train)
model_year

LogisticRegression(C=10, max_iter=1000, random_state=42, solver='liblinear')

In [119]:
accuracy_year = model_year.score(X_no_year, y_train)
accuracy_year

0.9559317291550083

In [120]:
diff_in_accuracy_year = accuracy - accuracy_year
round(diff_in_accuracy_year,2)

-0.01

In [121]:
feature_no_engineHP = ['make', 'model', 'transmission_type', 'vehicle_style', 'year', 'engine_cylinders', 
                       'highway_mpg', 'city_mpg']

In [122]:
dict_no_engineHP = data_train[feature_no_engineHP].to_dict(orient='records')

In [123]:
X_no_engineHP = dv.fit_transform(dict_no_engineHP)

In [124]:
model_engineHP = LogReg_model.fit(X_no_engineHP,y_train)
model_engineHP

LogisticRegression(C=10, max_iter=1000, random_state=42, solver='liblinear')

In [125]:
accuracy_engineHP = model_engineHP.score(X_no_engineHP, y_train)
accuracy_engineHP

0.9383044208170117

In [126]:
diff_in_accuracy_engineHP = accuracy - accuracy_engineHP
round(diff_in_accuracy_engineHP,2)

0.01

In [127]:
feature_no_trans_type = ['year', 'engine_hp', 'engine_cylinders', 'highway_mpg', 'city_mpg', 'make', 
                         'model', 'vehicle_style']

In [128]:
dict_no_trans_type = data_train[feature_no_trans_type].to_dict(orient='records')

In [129]:
X_no_trans_type = dv.fit_transform(dict_no_trans_type)

In [130]:
model_trans_type = LogReg_model.fit(X_no_trans_type, y_train)
model_trans_type

LogisticRegression(C=10, max_iter=1000, random_state=42, solver='liblinear')

In [131]:
accuracy_trans_type = model_trans_type.score(X_no_trans_type, y_train)
accuracy_trans_type

0.9492165640738668

In [132]:
diff_in_accuracy_trans_type = accuracy - accuracy_trans_type
round(diff_in_accuracy_trans_type,2)

-0.0

In [133]:
feature_no_city_mpg = ['make', 'model', 'transmission_type', 'vehicle_style', 'year', 'engine_hp',
                       'engine_cylinders', 'highway_mpg']

In [134]:
dict_no_city_mpg = data_train[feature_no_city_mpg].to_dict(orient='records')

In [135]:
X_no_city_mpg = dv.fit_transform(dict_no_city_mpg)

In [136]:
model_city_mpg = LogReg_model.fit(X_no_city_mpg, y_train)
model_city_mpg

LogisticRegression(C=10, max_iter=1000, random_state=42, solver='liblinear')

In [137]:
accuracy_city_mpg = model_city_mpg.score(X_no_city_mpg, y_train)
accuracy_city_mpg

0.9552322327923894

In [145]:
diff_in_accuracy_city_mpg = accuracy - accuracy_city_mpg
round(diff_in_accuracy_city_mpg,2)

-0.01

In [146]:
smallest_diff = pd.DataFrame({'year' : diff_in_accuracy_year, 'engine_hp' : diff_in_accuracy_engineHP, 
                             'transmission_type' : diff_in_accuracy_trans_type, 'city_mpg' : diff_in_accuracy_city_mpg}, 
                             index = [0])

In [147]:
smallest_diff

Unnamed: 0,year,engine_hp,transmission_type,city_mpg
0,-0.010485,0.007142,-0.00377,-0.009785


#6. For this question, we'll see how to use a linear regression model from Scikit-Learn.

#We'll need to use the original column price. Apply the logarithmic transformation to this column.

#Fit the Ridge regression model on the training data with a solver 'sag'. Set the seed to 42.

#This model also has a parameter alpha. Let's try the following values: [0, 0.01, 0.1, 1, 10].

#Round your RMSE scores to 3 decimal digits.

In [148]:
data_train.head()

Unnamed: 0,make,model,year,engine_hp,engine_cylinders,transmission_type,vehicle_style,highway_mpg,city_mpg,price
0,mitsubishi,endeavor,2011,225.0,6.0,automatic,4dr_suv,19,15,33599
1,kia,borrego,2009,276.0,6.0,automatic,4dr_suv,21,17,26245
2,lamborghini,gallardo,2012,570.0,10.0,manual,convertible,20,12,248000
3,chevrolet,colorado,2016,200.0,4.0,automatic,crew_cab_pickup,27,20,24990
4,pontiac,vibe,2009,158.0,4.0,automatic,4dr_hatchback,26,20,20475


In [149]:
data_val.head()

Unnamed: 0,make,model,year,engine_hp,engine_cylinders,transmission_type,vehicle_style,highway_mpg,city_mpg,price
0,volkswagen,beetle,2015,210.0,4.0,manual,2dr_hatchback,31,23,28675
1,audi,sq5,2015,354.0,6.0,automatic,4dr_suv,24,17,60200
2,pontiac,grand_am,2005,140.0,4.0,automatic,sedan,31,22,20090
3,nissan,350z,2009,306.0,6.0,manual,convertible,24,17,39220
4,ford,e-150,1996,199.0,6.0,automatic,passenger_van,15,11,2000


In [150]:
data_train['price'] = np.log1p(data_train['price'])
data_val['price'] = np.log1p(data_val['price'])

In [151]:
data_train.head()

Unnamed: 0,make,model,year,engine_hp,engine_cylinders,transmission_type,vehicle_style,highway_mpg,city_mpg,price
0,mitsubishi,endeavor,2011,225.0,6.0,automatic,4dr_suv,19,15,10.422281
1,kia,borrego,2009,276.0,6.0,automatic,4dr_suv,21,17,10.175269
2,lamborghini,gallardo,2012,570.0,10.0,manual,convertible,20,12,12.421188
3,chevrolet,colorado,2016,200.0,4.0,automatic,crew_cab_pickup,27,20,10.126271
4,pontiac,vibe,2009,158.0,4.0,automatic,4dr_hatchback,26,20,9.927009


In [152]:
data_val.head()

Unnamed: 0,make,model,year,engine_hp,engine_cylinders,transmission_type,vehicle_style,highway_mpg,city_mpg,price
0,volkswagen,beetle,2015,210.0,4.0,manual,2dr_hatchback,31,23,10.263816
1,audi,sq5,2015,354.0,6.0,automatic,4dr_suv,24,17,11.005444
2,pontiac,grand_am,2005,140.0,4.0,automatic,sedan,31,22,9.908027
3,nissan,350z,2009,306.0,6.0,manual,convertible,24,17,10.576968
4,ford,e-150,1996,199.0,6.0,automatic,passenger_van,15,11,7.601402


In [153]:
new_dict_train = data_train.to_dict(orient='records')

In [154]:
X_train = dv.fit_transform(new_dict_train)

In [155]:
new_dict_val = data_val.to_dict(orient='records')

In [156]:
X_val = dv.transform(new_dict_val)

In [157]:
#y_full_train = np.concatenate([y_train, y_val])

In [158]:
from sklearn.linear_model import Ridge

In [159]:
from sklearn.metrics import mean_squared_error

In [160]:
alpha = [0, 0.01, 0.1, 1, 10]

In [161]:
# if alpha = 0 

model_ridge = Ridge(solver='sag', alpha=0, random_state=42)

In [162]:
model_ridge.fit(X_train, y_train)

Ridge(alpha=0, random_state=42, solver='sag')

In [163]:
y_pred_1 = model_ridge.predict(X_val)
y_pred_1

array([0.15482491, 0.6593281 , 0.08341906, ..., 0.07830357, 0.7682632 ,
       0.5931935 ])

In [164]:
mse_1 = mean_squared_error(y_val, y_pred_1)
mse_1

0.09110035402877921

In [165]:
rmse_1 = np.sqrt(mse_1)
rmse_1.round(3)

0.302

In [166]:
# if alpha = 0.01

model_ridge = Ridge(solver='sag', alpha=0.01, random_state=42)

In [167]:
model_ridge.fit(X_train, y_train)

Ridge(alpha=0.01, random_state=42, solver='sag')

In [168]:
y_pred_2 = model_ridge.predict(X_val)
y_pred_2

array([0.15482511, 0.65932776, 0.08341876, ..., 0.07830338, 0.7682632 ,
       0.59319313])

In [169]:
mse_2 = mean_squared_error(y_val, y_pred_2)
mse_2

0.09110038468158982

In [170]:
rmse_2 = np.sqrt(mse_2)
rmse_2.round(3)

0.302

In [171]:
# if alpha = 0.1

model_ridge = Ridge(solver='sag', alpha=0.1, random_state=42)

In [172]:
model_ridge.fit(X_train, y_train)

Ridge(alpha=0.1, random_state=42, solver='sag')

In [173]:
y_pred_3 = model_ridge.predict(X_val)
y_pred_3

array([0.15482695, 0.65932472, 0.0834161 , ..., 0.07830166, 0.76826324,
       0.59318973])

In [174]:
mse_3 = mean_squared_error(y_val, y_pred_3)
mse_3

0.09110066055672253

In [175]:
rmse_3 = np.sqrt(mse_3)
rmse_3.round(3)

0.302

In [176]:
# if alpha = 1

model_ridge = Ridge(solver='sag', alpha=1, random_state=42)

In [177]:
model_ridge.fit(X_train, y_train)

Ridge(alpha=1, random_state=42, solver='sag')

In [178]:
y_pred_4 = model_ridge.predict(X_val)
y_pred_4

array([0.15484531, 0.65929429, 0.08338948, ..., 0.07828444, 0.76826366,
       0.59315576])

In [179]:
mse_4 = mean_squared_error(y_val, y_pred_4)
mse_4

0.0911034191503324

In [180]:
rmse_4 = np.sqrt(mse_4)
rmse_4.round(3)

0.302

In [181]:
# if alpha = 10

model_ridge = Ridge(solver='sag', alpha=10, random_state=42)

In [182]:
model_ridge.fit(X_train, y_train)

Ridge(alpha=10, random_state=42, solver='sag')

In [183]:
y_pred_5 = model_ridge.predict(X_val)
y_pred_5

array([0.15532449, 0.65842449, 0.08291721, ..., 0.07788546, 0.76835162,
       0.59208133])

In [184]:
mse_5 = mean_squared_error(y_val, y_pred_5)
mse_5

0.09119821133749974

In [185]:
rmse_5 = np.sqrt(mse_5)
rmse_5.round(3)

0.302