In [1]:
import pandas as pd
import numpy as np

import seaborn as sns
from matplotlib import pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import mutual_info_score
import seaborn as sns
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import Ridge

1. Load the data into a DataFrame (`df`)
2. Clean the column names (lowercase, spaces replaced with '_')
3. Fill missing values with 0
4. Apply a log transform (`log1p`) to price
5. Split the data (train, val, test) using scikit-learn
6. Categorical + numerical features. One-hot encoding.
7. Set up the Ridge model
8. Train the model on the train df
9. Use the trained model to predict on the val df
10. Compute the RMSE on the val df

In [2]:
# Load the cars dataset; the path is relative to the notebook's working directory.
df = pd.read_csv('cars.csv')

In [3]:
# Display the loaded DataFrame to eyeball the columns and a sample of rows.
df

Unnamed: 0,Make,Model,Year,Engine Fuel Type,Engine HP,Engine Cylinders,Transmission Type,Driven_Wheels,Number of Doors,Market Category,Vehicle Size,Vehicle Style,highway MPG,city mpg,Popularity,MSRP
0,BMW,1 Series M,2011,premium unleaded (required),335.0,6.0,MANUAL,rear wheel drive,2.0,"Factory Tuner,Luxury,High-Performance",Compact,Coupe,26,19,3916,46135
1,BMW,1 Series,2011,premium unleaded (required),300.0,6.0,MANUAL,rear wheel drive,2.0,"Luxury,Performance",Compact,Convertible,28,19,3916,40650
2,BMW,1 Series,2011,premium unleaded (required),300.0,6.0,MANUAL,rear wheel drive,2.0,"Luxury,High-Performance",Compact,Coupe,28,20,3916,36350
3,BMW,1 Series,2011,premium unleaded (required),230.0,6.0,MANUAL,rear wheel drive,2.0,"Luxury,Performance",Compact,Coupe,28,18,3916,29450
4,BMW,1 Series,2011,premium unleaded (required),230.0,6.0,MANUAL,rear wheel drive,2.0,Luxury,Compact,Convertible,28,18,3916,34500
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11909,Acura,ZDX,2012,premium unleaded (required),300.0,6.0,AUTOMATIC,all wheel drive,4.0,"Crossover,Hatchback,Luxury",Midsize,4dr Hatchback,23,16,204,46120
11910,Acura,ZDX,2012,premium unleaded (required),300.0,6.0,AUTOMATIC,all wheel drive,4.0,"Crossover,Hatchback,Luxury",Midsize,4dr Hatchback,23,16,204,56670
11911,Acura,ZDX,2012,premium unleaded (required),300.0,6.0,AUTOMATIC,all wheel drive,4.0,"Crossover,Hatchback,Luxury",Midsize,4dr Hatchback,23,16,204,50620
11912,Acura,ZDX,2013,premium unleaded (recommended),300.0,6.0,AUTOMATIC,all wheel drive,4.0,"Crossover,Hatchback,Luxury",Midsize,4dr Hatchback,23,16,204,50920


In [4]:
# Columns kept for modelling (features plus the MSRP target); every other
# column of the raw frame is dropped.
keep_columns = [
    'Make', 'Model', 'Year',
    'Engine HP', 'Engine Cylinders',
    'Transmission Type', 'Vehicle Style',
    'highway MPG', 'city mpg',
    'MSRP',
]
df = df[keep_columns]

In [5]:
# Normalise the column labels: lower-case them and swap spaces for underscores.
df.columns = [label.lower().replace(' ', '_') for label in df.columns]

In [6]:
# Number of missing values in each column.
df.isna().sum()

make                  0
model                 0
year                  0
engine_hp            69
engine_cylinders     30
transmission_type     0
vehicle_style         0
highway_mpg           0
city_mpg              0
msrp                  0
dtype: int64

In [7]:
# Replace every missing value with 0 (the null count above shows only
# engine_hp and engine_cylinders contain any).
df = df.fillna(value=0)

In [8]:
# Give the target column a descriptive name: msrp -> price.
df = df.rename(columns={'msrp':'price'})

In [9]:
# Check which columns are strings (object) and which are numeric.
df.dtypes

make                  object
model                 object
year                   int64
engine_hp            float64
engine_cylinders     float64
transmission_type     object
vehicle_style         object
highway_mpg            int64
city_mpg               int64
price                  int64
dtype: object

### Train, validation and test dataset split

In [10]:
# Hold out 20% of the rows as the test set; random_state makes the split reproducible.
df_full_train_and_val, df_test = train_test_split(df, test_size=0.2, random_state=1)

In [11]:
# 25% of the remaining 80% becomes the validation set, i.e. a 60/20/20
# train/val/test split of the full data.
df_train, df_val = train_test_split(df_full_train_and_val, test_size=0.25, random_state=1)

In [12]:
# Discard the shuffled row labels left by the split so each partition is
# indexed 0..n-1 again.
df_train, df_val, df_test = (
    part.reset_index(drop=True) for part in (df_train, df_val, df_test)
)

### Applying the log transformation to the target

In [13]:
# Targets are log1p(price), extracted as plain NumPy arrays for each partition.
y_train, y_val, y_test = (
    np.log1p(part['price'].values) for part in (df_train, df_val, df_test)
)

In [14]:
# Remove the target from every feature frame so it cannot leak into the model.
for part in (df_train, df_val, df_test):
    del part['price']

In [15]:
# Row counts of the train / validation / test partitions.
tuple(len(part) for part in (df_train, df_val, df_test))

(7148, 2383, 2383)

In [16]:
# Feature columns grouped as numerical ...
numerical = [
    'engine_hp',
    'engine_cylinders',
    'highway_mpg',
    'city_mpg',
]

# ... and as categorical.
categorical = ['year', 'make', 'model', 'transmission_type', 'vehicle_style']

In [17]:
def rmse(y, y_pred):
    """Return the root-mean-squared error between targets and predictions.

    Parameters
    ----------
    y : array-like
        Observed target values (list, tuple, NumPy array or pandas Series).
    y_pred : array-like
        Predicted values; must be broadcastable against ``y``.

    Returns
    -------
    numpy.float64
        ``sqrt(mean((y - y_pred) ** 2))`` over all elements.

    Notes
    -----
    Both inputs are converted with ``np.asarray`` first, so plain Python
    sequences are accepted and pandas objects are compared by position
    rather than being aligned on their index labels.
    """
    # Casting to float keeps integer inputs out of integer arithmetic when squaring.
    residuals = np.asarray(y, dtype=float) - np.asarray(y_pred, dtype=float)
    return np.sqrt(np.mean(residuals ** 2))

### One-hot encoding

In [18]:
# Encode only the declared feature columns, so the `categorical` and
# `numerical` lists define exactly what the model sees.
features = categorical + numerical

# DictVectorizer one-hot encodes string-valued fields and passes numeric
# fields through unchanged.
# NOTE(review): `year` is listed in `categorical` but its dtype is int64, so it
# is treated as a plain numeric feature here — confirm that is intended.
dv = DictVectorizer(sparse=False)

# Learn the feature vocabulary from the training partition only ...
train_dict = df_train[features].to_dict(orient='records')
X_train = dv.fit_transform(train_dict)

# ... and reuse that fitted vocabulary for the validation and test partitions.
val_dict = df_val[features].to_dict(orient='records')
X_val = dv.transform(val_dict)

test_dict = df_test[features].to_dict(orient='records')
X_test = dv.transform(test_dict)

In [19]:
# (rows, encoded feature columns) of the validation matrix.
X_val.shape

(2383, 954)

In [20]:
# Ridge regression on the encoded training matrix; the targets are log1p(price).
# NOTE(review): scikit-learn documents that the 'sag' solver only converges
# quickly when features share a similar scale; nothing above rescales the
# features, so check whether this cell emits a ConvergenceWarning.
model = Ridge(solver='sag', alpha=1, random_state=42)
model.fit(X_train, y_train)



In [21]:
# Validation predictions; they are on the log1p(price) scale because the model
# was trained against y_train = log1p(price).
y_pred = model.predict(X_val)

In [22]:
# NOTE(review): the back-transform to the original price scale is disabled, so
# the RMSE values below are measured on log1p(price).
#y_pred = np.expm1(y_pred)
#y_val = np.expm1(y_val)

In [23]:
# alpha = 0
# Fit a dedicated model for this alpha so the cell reports the RMSE it is
# labelled with on a fresh Restart & Run All, instead of re-scoring whatever
# `y_pred` the kernel currently holds.
y_pred_alpha_0 = (
    Ridge(solver='sag', alpha=0, random_state=42)
    .fit(X_train, y_train)
    .predict(X_val)
)
round(rmse(y_val, y_pred_alpha_0), 3)

0.486

In [24]:
# alpha = 0.01
# Fit a dedicated model for this alpha so the cell reports the RMSE it is
# labelled with on a fresh Restart & Run All, instead of re-scoring whatever
# `y_pred` the kernel currently holds.
y_pred_alpha_0_01 = (
    Ridge(solver='sag', alpha=0.01, random_state=42)
    .fit(X_train, y_train)
    .predict(X_val)
)
round(rmse(y_val, y_pred_alpha_0_01), 3)

0.486

In [25]:
# alpha = 0.1
# Fit a dedicated model for this alpha so the cell reports the RMSE it is
# labelled with on a fresh Restart & Run All, instead of re-scoring whatever
# `y_pred` the kernel currently holds.
y_pred_alpha_0_1 = (
    Ridge(solver='sag', alpha=0.1, random_state=42)
    .fit(X_train, y_train)
    .predict(X_val)
)
round(rmse(y_val, y_pred_alpha_0_1), 3)

0.486

In [26]:
#alpha = 1
# Scores `y_pred` from the model fitted above with alpha=1 (assuming the cells
# are run in order).
round(rmse(y_val, y_pred),3)

0.486

In [27]:
# alpha = 10
# Fit a dedicated model for this alpha so the cell reports the RMSE it is
# labelled with on a fresh Restart & Run All, instead of re-scoring whatever
# `y_pred` the kernel currently holds.
y_pred_alpha_10 = (
    Ridge(solver='sag', alpha=10, random_state=42)
    .fit(X_train, y_train)
    .predict(X_val)
)
round(rmse(y_val, y_pred_alpha_10), 3)

0.486