# Model training

In this notebook I:  
- Trained a linear regression model to predict the optimal rental price per day based on multiple features
- Regularized the model for any overfitting  
- Fine tuned the best hyperparameters to use  
- Evaluated model robustness via a cross validation

## Import libraries

In [2]:
import os
import joblib
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import  StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import cross_validate, GridSearchCV

In [3]:
DATASET_PATH = "../data/get_around_pricing_project_cleaned.csv"
MODELS_FOLDER = "models"

## Import data

In [4]:
# Load CSV file of cleaned data
df = pd.read_csv(DATASET_PATH)
print(df.shape)
print(df.columns)

(4842, 14)
Index(['model_key', 'mileage', 'engine_power', 'fuel', 'paint_color',
       'car_type', 'private_parking_available', 'has_gps',
       'has_air_conditioning', 'automatic_car', 'has_getaround_connect',
       'has_speed_regulator', 'winter_tires', 'rental_price_per_day'],
      dtype='object')


## 1. Train test split

In [5]:
# Separate target from explanatory variables
y = df["rental_price_per_day"]
X = df.drop(["rental_price_per_day"], axis=1)

print('Target lenght:', y.shape)
print('Explanatory variables df shape:', X.shape)

Target lenght: (4842,)
Explanatory variables df shape: (4842, 13)


In [6]:
#Split train and test
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

print('X train shape:', X_train.shape)
print('y train shape', y_train.shape)
print('X test shape:', X_test.shape)
print('y test shape', y_test.shape)

X train shape: (3873, 13)
y train shape (3873,)
X test shape: (969, 13)
y test shape (969,)


In [7]:
# Convert dataframes to numpy arrays for pre-processing
X_train = X_train.values
X_test = X_test.values
y_train = y_train.tolist()
y_test = y_test.tolist()

print(X_train[0:2,:])
print(X_test[0:2,:])
print()
print(y_train[0:2])
print(y_test[0:2])

[['Citroën' 234365 135 'diesel' 'black' 'estate' True True False False
  True False True]
 ['Volkswagen' 57344 70 'diesel' 'grey' 'hatchback' False True False
  False False False True]]
[['Toyota' 193657 85 'diesel' 'silver' 'van' False False False False
  False False True]
 ['Audi' 178112 170 'petrol' 'silver' 'sedan' True True True False False
  False True]]

[127, 109]
[94, 37]


## 2. Pre-processing

In [8]:
## Script re-used from scripts given during Jedha Bootcamp course 

# Automatically detect positions of numeric/categorical features in explanatory variables dataframe
idx = 0
numeric_features = []
numeric_indices = []
categorical_features = []
categorical_indices = []
for i,t in X.dtypes.iteritems():
    if ('float' in str(t)) or ('int' in str(t)) :
        numeric_features.append(i)
        numeric_indices.append(idx)
    else :
        categorical_features.append(i)
        categorical_indices.append(idx)

    idx = idx + 1

print('Found numeric features ', numeric_features,' at positions ', numeric_indices)
print('Found categorical features ', categorical_features,' at positions ', categorical_indices)

Found numeric features  ['mileage', 'engine_power']  at positions  [1, 2]
Found categorical features  ['model_key', 'fuel', 'paint_color', 'car_type', 'private_parking_available', 'has_gps', 'has_air_conditioning', 'automatic_car', 'has_getaround_connect', 'has_speed_regulator', 'winter_tires']  at positions  [0, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12]


In [9]:
# Pipeline for pre-processing categorical features and standardizing numerical features

# Normalization
numeric_transformer = StandardScaler()

# One hot encoding
categorical_transformer = OneHotEncoder(drop='first', handle_unknown='ignore')

featureencoder = ColumnTransformer(
    transformers=[
        ('cat', categorical_transformer, categorical_indices),    
        ('num', numeric_transformer, numeric_indices)
        ]
    )

## 3. Linear regression model

In [10]:
# Define full pipeline including pre-processing and basline linear regressor
baseline_regressor = Pipeline([
        ('preprocessing', featureencoder),
        ('lin_reg', LinearRegression())
    ])

regressor_ridge = Pipeline([
        ('preprocessing', featureencoder),
        ('lin_reg', Ridge(alpha=1.5))
    ])

regressor_lasso = Pipeline([
        ('preprocessing', featureencoder),
        ('lin_reg', Lasso(alpha=1.5))
    ])

regressor_rf = Pipeline([
        ('preprocessing', featureencoder),
        ('lin_reg', RandomForestRegressor())
    ])

### Model performance  
Evaluate performance of a baseline regressor model and compare to other regressors

In [11]:
# Fit models and evaluate performance with coefficient of determination R^2

print('Baseline regressor')
baseline_regressor.fit(X_train, y_train)
print("R2 score on training set: ", baseline_regressor.score(X_train,y_train))
print("R2 score on test set: ", baseline_regressor.score(X_test,y_test))
print('##################')
print('Rigdge regressor')
regressor_ridge.fit(X_train, y_train)
print("R2 score on training set: ", regressor_ridge.score(X_train,y_train))
print("R2 score on test set: ", regressor_ridge.score(X_test,y_test))
print('##################')
print('Lasso regressor')
regressor_lasso.fit(X_train, y_train)
print("R2 score on training set: ", regressor_lasso.score(X_train,y_train))
print("R2 score on test set: ", regressor_lasso.score(X_test,y_test))
print('##################')
print('Random Forest regressor')
regressor_rf.fit(X_train, y_train)
print("R2 score on training set: ", regressor_rf.score(X_train,y_train))
print("R2 score on test set: ", regressor_rf.score(X_test,y_test))

Baseline regressor
R2 score on training set:  0.7164492031260445
R2 score on test set:  0.6516031988969857
##################
Rigdge regressor
R2 score on training set:  0.7140680243100068
R2 score on test set:  0.6866344732555352
##################
Lasso regressor
R2 score on training set:  0.6123740505563878
R2 score on test set:  0.607668530839915
##################
Random Forest regressor
R2 score on training set:  0.9659780533559799
R2 score on test set:  0.7592812287945734


### Best model hyperparameter tuning  
Evaluate best hyperparameters to use for the model having the best performance  

First use RandomizedSearch to narrow down the hyperparameters to use
Secondly use GridSearch to fine tune on search for hyperparameters

Randomized search code based on Hyperparameter tuning publication by Will Koehrsen

In [12]:
# Randomized search: create random grid

# Number of trees in random forest
n_estimators = [int(x) for x in np.linspace(start = 200, stop = 2000, num = 10)]
# Number of features to consider at every split
max_features = ['sqrt']
# Maximum number of levels in tree
max_depth = [int(x) for x in np.linspace(10, 110, num = 11)]
max_depth.append(None)
# Minimum number of samples required to split a node
min_samples_split = [2, 5, 10]
# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 4]
# Method of selecting samples for training each tree
bootstrap = [True, False]
# Create the random grid
random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}

In [13]:
# Use the random grid to search for best hyperparameters

# First create the base model to tune
regressor_rf = RandomForestRegressor()
# Random search of parameters, using 3 fold cross validation, 
# search across 100 different combinations, and use all available cores
randomsearch = RandomizedSearchCV(estimator = regressor_rf, param_distributions = random_grid, n_iter = 100, cv = 3, verbose=0, random_state=42, n_jobs = -1)

X_train_t = featureencoder.fit_transform(X_train)
X_test_t = featureencoder.transform(X_test)

# Fit the random search model
randomsearch.fit(X_train_t, y_train)



In [14]:
# Best hyperparameters found through randomized search
print("Best hyperparameters for Random Forest regressor: ", randomsearch.best_params_)

Best hyperparameters for Random Forest regressor:  {'n_estimators': 2000, 'min_samples_split': 5, 'min_samples_leaf': 1, 'max_features': 'sqrt', 'max_depth': 20, 'bootstrap': False}


In [15]:
# Fine tune hyperparameter search

# Perform Grid Search for best parameters for regressors
regressor_rf = RandomForestRegressor()

# Grid of values to be tested: 
params = {
    'max_depth': [70,80, 90, 100],
    'min_samples_split': [3, 4, 5],
    'n_estimators': [1000,1200,1400]
}
gridsearch_rf = GridSearchCV(regressor_rf, param_grid = params, cv=5 ,verbose=0, n_jobs = -1)
gridsearch_rf.fit(X_train_t, y_train)

print("Best hyperparameters for Random Forest regressor: ", gridsearch_rf.best_params_)
print("Best R2 score Random Forest : ", gridsearch_rf.best_score_)



Best hyperparameters for Random Forest regressor:  {'max_depth': 70, 'min_samples_split': 5, 'n_estimators': 1000}
Best R2 score Random Forest :  0.7548610467394663


### Best model crossvalidation

In [16]:
# Define final pipeline of best model using identified hyperparameters
best_regressor_rf = Pipeline([
        ('preprocessing', featureencoder),
        ('lin_reg', RandomForestRegressor(n_estimators=1200, 
        min_samples_split=5, min_samples_leaf=1, max_features='sqrt', max_depth=80, bootstrap=False))
    ])

In [17]:
best_regressor_rf.fit(X_train, y_train)
print("R2 score on training set: ", best_regressor_rf.score(X_train,y_train))
print("R2 score on test set: ", best_regressor_rf.score(X_test,y_test))

R2 score on training set:  0.9551267458505919
R2 score on test set:  0.7431710778917816


In [18]:
# Cross validate to obtain test and train scores as well as fit and score time
scores = cross_validate(best_regressor_rf, X, y, return_train_score=True, cv=10)
scores_df = pd.DataFrame(scores).reset_index().rename(columns={'index':'cv'})
scores_df.head()



Unnamed: 0,cv,fit_time,score_time,test_score,train_score
0,0,9.789462,0.122772,0.567467,0.95855
1,1,9.783456,0.097236,0.618854,0.958226
2,2,9.703144,0.096081,0.664858,0.955867
3,3,9.733331,0.101031,0.619632,0.957399
4,4,9.779461,0.087963,0.726593,0.956218


In [19]:
print('Cross validation results for linear regressor with Random Forest Regressor:')
print("%0.2f mean accuracy with a mean standard deviation of %0.2f" % (scores_df.test_score.mean(), scores_df.test_score.std()))

Cross validation results for linear regressor with Random Forest Regressor:
0.69 mean accuracy with a mean standard deviation of 0.08


### Compare to cross validation of Ridge

In [20]:
# Confirm best hyperparameters of Ridge through GridSearch
regressor_ridge = Ridge()

# Grid of values to be tested: (alpha ==0 is equivalent to ordinary least squares solved by a linear regressor; tol = precision of the solution - default 1e-03)
params = {
    'alpha': [0.0, 0.1, 0.5, 1.0, 1.5],
    'tol': [1e-05, 1e-04, 1e-03]
}
gridsearch = GridSearchCV(regressor_ridge, param_grid = params, cv = 10)
gridsearch.fit(X_train_t, y_train)

print("Best hyperparameters for Ridge regressor: ", gridsearch.best_params_)
print("Best R2 score : ", gridsearch.best_score_)

Best hyperparameters for Ridge regressor:  {'alpha': 1.5, 'tol': 1e-05}
Best R2 score :  0.7062874256314109


In [21]:
# Cross validate to obtain test and train scores as well as fit and score time
best_regressor_ridge = Pipeline([
        ('preprocessing', featureencoder),
        ('lin_reg', Ridge(alpha=1.5, tol=1e-05))
    ])


scores = cross_validate(best_regressor_ridge, X, y, return_train_score=True, cv=10)
scores_df = pd.DataFrame(scores).reset_index().rename(columns={'index':'cv'})
scores_df.head()



Unnamed: 0,cv,fit_time,score_time,test_score,train_score
0,0,0.018976,0.003355,0.623242,0.715776
1,1,0.016574,0.003057,0.599449,0.716558
2,2,0.0179,0.002736,0.627115,0.713049
3,3,0.015859,0.002559,0.616235,0.709882
4,4,0.015953,0.002752,0.730865,0.706684


In [22]:
print('Cross validation results for linear regressor with Ridge Regressor:')
print("%0.2f mean accuracy with a mean standard deviation of %0.2f" % (scores_df.test_score.mean(), scores_df.test_score.std()))

Cross validation results for linear regressor with Ridge Regressor:
0.65 mean accuracy with a mean standard deviation of 0.07


## 3. Export of best model  

Whilst at first the Random Forest Regressor appeared to perform better, following cross validation I preferred to choose the Rige model which has similar stability (lower mean sd)as we can better interpret it regarding feature contribution


In [23]:
best_regressor_ridge.fit(X_train, y_train)
joblib.dump(best_regressor_ridge, "../models/reg_model.joblib")

['../models/reg_model.joblib']

## 4. Summary  
The best model identified was the Ridge regressor with a mean accuracy score of 0.65 and a mean standard deviation of 0.07 across a 10-fold crossvalidation 

In [24]:
best_regressor_ridge.get_params


<bound method Pipeline.get_params of Pipeline(steps=[('preprocessing',
                 ColumnTransformer(transformers=[('cat',
                                                  OneHotEncoder(drop='first',
                                                                handle_unknown='ignore'),
                                                  [0, 3, 4, 5, 6, 7, 8, 9, 10,
                                                   11, 12]),
                                                 ('num', StandardScaler(),
                                                  [1, 2])])),
                ('lin_reg', Ridge(alpha=1.5, tol=1e-05))])>