# Project 2: Model Development

# Task:
- develop a ML model to predict the price of a car based on the features included in this dataset

# Imports

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import LabelEncoder, StandardScaler, OneHotEncoder
from sklearn.decomposition import PCA
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import make_pipeline
from sklearn.compose import make_column_selector, make_column_transformer
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor

# Function Definitions

In [2]:
def rmse(true, pred):
    return np.sqrt(np.mean(np.abs(pred - true)**2))

def r2(true, pred):
    return np.corrcoef(true, pred)[0][1]**2

def mae(true, pred):
    return np.mean(np.abs(pred - true))

def mse(true, pred):
    return np.mean(np.abs(pred - true)**2)

def reg_metrics(train_true, train_pred,  test_true, test_pred):
    d = {}
    
    d['Metric'] = ['MAE', 'MSE', 'RMSE', 'R2']
    d['Train'] = [mae(train_true,train_pred),
                  mse(train_true, train_pred), 
                  rmse(train_true, train_pred), 
                  r2(train_true, train_pred)]
    d['Test'] = [mae(test_true,test_pred),
                  mse(test_true, test_pred), 
                  rmse(test_true, test_pred), 
                  r2(test_true, test_pred)]
    
    return pd.DataFrame(d)
    

# Data

In [3]:
df = pd.read_csv('cars_multiple_linearR.csv')

In [4]:
df.head()

Unnamed: 0,Brand,Price,Body,Mileage,EngineV,Engine Type,Registration,Year,Model
0,BMW,4200.0,sedan,277,2.0,Petrol,yes,1991,320
1,Mercedes-Benz,7900.0,van,427,2.9,Diesel,yes,1999,Sprinter 212
2,Mercedes-Benz,13300.0,sedan,358,5.0,Gas,yes,2003,S 500
3,Audi,23000.0,crossover,240,4.2,Petrol,yes,2007,Q7
4,Toyota,18300.0,crossover,120,2.0,Petrol,yes,2011,Rav 4


In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4345 entries, 0 to 4344
Data columns (total 9 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   Brand         4345 non-null   object 
 1   Price         4173 non-null   float64
 2   Body          4345 non-null   object 
 3   Mileage       4345 non-null   int64  
 4   EngineV       4195 non-null   float64
 5   Engine Type   4345 non-null   object 
 6   Registration  4345 non-null   object 
 7   Year          4345 non-null   int64  
 8   Model         4345 non-null   object 
dtypes: float64(2), int64(2), object(5)
memory usage: 305.6+ KB


# Pre-Processing
- For explanations of steps, see Data_Exploration.ipynb

In [6]:
df.drop(columns='Model', inplace=True)
df.drop_duplicates(inplace=True)
df.dropna(inplace=True)
df['Engine Type'].replace({'Petrol':'Gas'}, inplace=True)
mile_filter = df['Mileage'] > 450
df.drop(index=df.loc[mile_filter].index, inplace=True)
non_comb = df['Engine Type'] == 'Other'
df.drop(index = df.loc[non_comb].index, inplace=True)

In [7]:
le = LabelEncoder()
df['Engine Type'] = le.fit_transform(df['Engine Type'])
df['Registration'] = le.fit_transform(df['Registration'])

In [8]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 3824 entries, 0 to 4342
Data columns (total 8 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   Brand         3824 non-null   object 
 1   Price         3824 non-null   float64
 2   Body          3824 non-null   object 
 3   Mileage       3824 non-null   int64  
 4   EngineV       3824 non-null   float64
 5   Engine Type   3824 non-null   int64  
 6   Registration  3824 non-null   int64  
 7   Year          3824 non-null   int64  
dtypes: float64(2), int64(4), object(2)
memory usage: 268.9+ KB


# Regression Models on Original Data

## Set up transformer
- OHE Brand and Body
- scale price, Mileage, EngineV, Year
- LE Engine Type, Registration

In [9]:
# Create column Selectors

## Numerical selector
num_sel = make_column_selector(dtype_include='number')

## Categorical Selector
cat_sel = make_column_selector(dtype_include='object')

In [10]:
# instantiate the le, ohe, and the scaler
ohe = OneHotEncoder(sparse=True, handle_unknown='ignore')
scaler = StandardScaler(with_mean=False)

In [11]:
# Build Tuples
cat_tuple = (ohe, cat_sel)
num_tuple = (scaler, num_sel)

In [12]:
# build column transformer
col_trans = make_column_transformer(cat_tuple, num_tuple, remainder='passthrough')
col_trans

## Validation Split

In [13]:
X = df.drop(columns='Price')
y = df['Price']

In [14]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

In [15]:
X_train

Unnamed: 0,Brand,Body,Mileage,EngineV,Engine Type,Registration,Year
735,Mercedes-Benz,van,170,2.2,0,1,2011
530,Audi,sedan,127,2.8,1,1,2010
2208,Mitsubishi,crossover,235,3.0,1,1,2007
3902,Volkswagen,sedan,87,1.6,1,1,2011
848,Mercedes-Benz,sedan,245,2.2,0,1,2007
...,...,...,...,...,...,...,...
1260,Volkswagen,sedan,162,1.4,1,1,2012
1451,Toyota,vagon,73,4.0,1,1,2009
962,Audi,sedan,122,2.0,1,1,2011
3980,Toyota,sedan,185,3.0,1,1,2004


## Linear Regression Model

In [16]:
lin_reg_pipe = make_pipeline(col_trans, LinearRegression())
lin_reg_pipe.fit(X_train, y_train)

In [17]:
lin_train_pred = lin_reg_pipe.predict(X_train)
lin_test_pred = lin_reg_pipe.predict(X_test)

In [18]:
print(reg_metrics(y_train, lin_train_pred, y_test, lin_test_pred))

  Metric         Train          Test
0    MAE  1.024803e+04  9.839420e+03
1    MSE  3.590936e+08  2.460905e+08
2   RMSE  1.894977e+04  1.568727e+04
3     R2  4.538637e-01  5.018795e-01


## Regression Tree

In [19]:
dec_tree = DecisionTreeRegressor(random_state=42)
dec_pipe = make_pipeline(col_trans, dec_tree)

In [20]:
dec_pipe.fit(X_train, y_train)

In [21]:
dec_train_pred = dec_pipe.predict(X_train)
dec_test_pred = dec_pipe.predict(X_test)
print(reg_metrics(y_train, dec_train_pred, y_test, dec_test_pred))

  Metric         Train          Test
0    MAE  1.909020e+02  3.886836e+03
1    MSE  3.155772e+06  6.580379e+07
2   RMSE  1.776449e+03  8.111954e+03
3     R2  9.952000e-01  8.731693e-01


### Tune

In [22]:
dec_pipe.get_params()

{'memory': None,
 'steps': [('columntransformer',
   ColumnTransformer(remainder='passthrough',
                     transformers=[('onehotencoder',
                                    OneHotEncoder(handle_unknown='ignore'),
                                    <sklearn.compose._column_transformer.make_column_selector object at 0x7fde098b7ee0>),
                                   ('standardscaler',
                                    StandardScaler(with_mean=False),
                                    <sklearn.compose._column_transformer.make_column_selector object at 0x7fde098b7910>)])),
  ('decisiontreeregressor', DecisionTreeRegressor(random_state=42))],
 'verbose': False,
 'columntransformer': ColumnTransformer(remainder='passthrough',
                   transformers=[('onehotencoder',
                                  OneHotEncoder(handle_unknown='ignore'),
                                  <sklearn.compose._column_transformer.make_column_selector object at 0x7fde098b7ee0>),
      

In [23]:
dec_pipe.named_steps['decisiontreeregressor'].get_depth()

26

In [24]:
dec_pipe.named_steps['decisiontreeregressor'].get_n_leaves()

2701

In [25]:
np.arange(1000, 3100, 100)

array([1000, 1100, 1200, 1300, 1400, 1500, 1600, 1700, 1800, 1900, 2000,
       2100, 2200, 2300, 2400, 2500, 2600, 2700, 2800, 2900, 3000])

In [26]:
model = make_pipeline(col_trans, dec_tree)

param_grid = {'decisiontreeregressor__max_depth': np.arange(2, 20, 1),
              'decisiontreeregressor__max_leaf_nodes': np.arange(10, 110, 10)}

In [27]:
dec_grid_search = GridSearchCV(model, param_grid)

In [28]:
dec_grid_search.fit(X_train, y_train)

In [29]:
dec_grid_search.best_params_

{'decisiontreeregressor__max_depth': 10,
 'decisiontreeregressor__max_leaf_nodes': 50}

In [30]:
best_dec = make_pipeline(col_trans, DecisionTreeRegressor(random_state=42, max_depth=10, max_leaf_nodes=50))

In [31]:
best_dec.fit(X_train, y_train)

In [32]:
best_dec_train_pred = best_dec.predict(X_train)
best_dec_test_pred = best_dec.predict(X_test)
print(reg_metrics(y_train, best_dec_train_pred, y_test, best_dec_test_pred))

  Metric         Train          Test
0    MAE  4.209583e+03  4.606865e+03
1    MSE  4.312894e+07  6.381964e+07
2   RMSE  6.567263e+03  7.988719e+03
3     R2  9.343996e-01  8.732378e-01


## Random Forest

In [33]:
for_pipe = make_pipeline(col_trans, RandomForestRegressor(random_state=42))

In [34]:
for_pipe.fit(X_train, y_train)

In [35]:
for_train_pred = for_pipe.predict(X_train)
for_test_pred = for_pipe.predict(X_test)
print(reg_metrics(y_train, for_train_pred, y_test, for_test_pred))

  Metric         Train          Test
0    MAE  1.448475e+03  3.197355e+03
1    MSE  1.784059e+07  5.401731e+07
2   RMSE  4.223812e+03  7.349647e+03
3     R2  9.740337e-01  8.970010e-01


### Tune

In [36]:
for_pipe.get_params()

{'memory': None,
 'steps': [('columntransformer',
   ColumnTransformer(remainder='passthrough',
                     transformers=[('onehotencoder',
                                    OneHotEncoder(handle_unknown='ignore'),
                                    <sklearn.compose._column_transformer.make_column_selector object at 0x7fde098b7ee0>),
                                   ('standardscaler',
                                    StandardScaler(with_mean=False),
                                    <sklearn.compose._column_transformer.make_column_selector object at 0x7fde098b7910>)])),
  ('randomforestregressor', RandomForestRegressor(random_state=42))],
 'verbose': False,
 'columntransformer': ColumnTransformer(remainder='passthrough',
                   transformers=[('onehotencoder',
                                  OneHotEncoder(handle_unknown='ignore'),
                                  <sklearn.compose._column_transformer.make_column_selector object at 0x7fde098b7ee0>),
      

In [37]:
for_pipe.named_steps['randomforestregressor']

In [38]:
for_model = make_pipeline(col_trans, RandomForestRegressor())

for_params_grid = {'randomforestregressor__max_depth': np.arange(2, 40, 1),
               'randomforestregressor__min_samples_leaf':np.arange(1,10, 1)}

In [39]:
for_grid_search = GridSearchCV(for_model, for_params_grid)

In [40]:
for_grid_search.fit(X_train, y_train)

In [41]:
for_grid_search.best_params_

{'randomforestregressor__max_depth': 24,
 'randomforestregressor__min_samples_leaf': 2}

In [44]:
best_for = make_pipeline(col_trans, RandomForestRegressor(random_state=42, max_depth=26, min_samples_leaf=3))

In [45]:
best_for.fit(X_train, y_train)

In [46]:
best_for_train_pred = best_for.predict(X_train)
best_for_test_pred = best_for.predict(X_test)
print(reg_metrics(y_train, best_for_train_pred, y_test, best_for_test_pred))

  Metric         Train          Test
0    MAE  2.496773e+03  3.329876e+03
1    MSE  5.305999e+07  5.689991e+07
2   RMSE  7.284229e+03  7.543203e+03
3     R2  9.213431e-01  8.881568e-01


the test R2 here is lower than in my first example, I havent found the best parameter combination, but have run out of time for now and further searching takes a long time

# Feature Engineering
- Current best model is random forest with default hyperparameters
- try engineering
    - country of origin
    - age instead of year
- Then test random forest again and see if that changes

In [47]:
df2 = df.copy()

In [48]:
df2.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 3824 entries, 0 to 4342
Data columns (total 8 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   Brand         3824 non-null   object 
 1   Price         3824 non-null   float64
 2   Body          3824 non-null   object 
 3   Mileage       3824 non-null   int64  
 4   EngineV       3824 non-null   float64
 5   Engine Type   3824 non-null   int64  
 6   Registration  3824 non-null   int64  
 7   Year          3824 non-null   int64  
dtypes: float64(2), int64(4), object(2)
memory usage: 268.9+ KB


In [49]:
df2['Country of Origin'] = df['Brand'].copy()
df2.replace({'BMW':'Germany',
                'Mercedes-Benz':'Germany',
                'Audi':'Germany',
                'Volkswagen':'Germany',
                'Toyota':'Japan',
                'Mitsubishi':'Japan',
                'Renault':'France'}, inplace=True)

In [50]:
df2.head()

Unnamed: 0,Brand,Price,Body,Mileage,EngineV,Engine Type,Registration,Year,Country of Origin
0,Germany,4200.0,sedan,277,2.0,1,1,1991,Germany
1,Germany,7900.0,van,427,2.9,0,1,1999,Germany
2,Germany,13300.0,sedan,358,5.0,1,1,2003,Germany
3,Germany,23000.0,crossover,240,4.2,1,1,2007,Germany
4,Japan,18300.0,crossover,120,2.0,1,1,2011,Japan


In [51]:
df2['Year'] = 2022 - df['Year']

In [52]:
df2['Year'].describe()

count    3824.000000
mean       15.648274
std         6.633643
min         6.000000
25%        11.000000
50%        15.000000
75%        19.000000
max        53.000000
Name: Year, dtype: float64

In [53]:
X2 = df.drop(columns='Price')
y2 = df['Price']

In [54]:
X2_train, X2_test, y2_train, y2_test = train_test_split(X2, y2, random_state=42)

In [55]:
for_pipe_fe = make_pipeline(col_trans, RandomForestRegressor(random_state=42))

In [56]:
for_pipe_fe.fit(X2_train, y2_train)

In [57]:
for_fe_train_pred = for_pipe_fe.predict(X2_train)
for_fe_test_pred = for_pipe_fe.predict(X2_test)
print(reg_metrics(y2_train, for_fe_train_pred, y2_test, for_fe_test_pred))

  Metric         Train          Test
0    MAE  1.448475e+03  3.197355e+03
1    MSE  1.784059e+07  5.401731e+07
2   RMSE  4.223812e+03  7.349647e+03
3     R2  9.740337e-01  8.970010e-01


This did not change the outcome

In [59]:
y2_test.head()

3532    11850.0
4216     3100.0
2205     8700.0
2706    12500.0
1650     8400.0
Name: Price, dtype: float64

In [62]:
for_fe_test_pred[0:5]

array([11967.91,  3294.5 ,  6334.27, 12339.46,  8700.48])

In [64]:
y2_test.mean()

18297.50530334728

In [66]:
X2_test.head()

Unnamed: 0,Brand,Body,Mileage,EngineV,Engine Type,Registration,Year
3532,Mercedes-Benz,van,177,2.2,0,1,2011
4216,Volkswagen,sedan,390,1.8,1,1,1990
2205,BMW,other,212,3.0,0,0,2004
2706,Volkswagen,sedan,84,2.0,1,1,2007
1650,Renault,van,260,1.9,0,1,2005
