# Model training

In this notebook I:  
- Trained a linear regression model to predict the optimal rental price per day based on multiple features
- Regularized the model to limit overfitting  
- Fine-tuned the hyperparameters  
- Evaluated model robustness via cross-validation

## Import libraries

In [80]:
import os

import joblib
import pandas as pd
from sklearn.base import clone
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.metrics import r2_score
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import  StandardScaler, OneHotEncoder

In [58]:
# Notebook configuration.
# Relative path to the cleaned pricing dataset (CSV) loaded below with pd.read_csv.
DATASET_PATH = "../data/get_around_pricing_project_cleaned.csv"
# NOTE(review): not referenced in the visible cells; presumably the folder
# where fitted models are meant to be saved (joblib is imported) — confirm.
MODELS_FOLDER = "models"

## Import data

In [59]:
# Read the cleaned dataset, then show its dimensions and its column names
df = pd.read_csv(DATASET_PATH)
for summary in (df.shape, df.columns):
    print(summary)

(4842, 14)
Index(['model_key', 'mileage', 'engine_power', 'fuel', 'paint_color',
       'car_type', 'private_parking_available', 'has_gps',
       'has_air_conditioning', 'automatic_car', 'has_getaround_connect',
       'has_speed_regulator', 'winter_tires', 'rental_price_per_day'],
      dtype='object')


## 1. Train test split

In [60]:
# The daily rental price is the target; every other column is an explanatory variable
y = df["rental_price_per_day"]
X = df.drop(columns=["rental_price_per_day"])

# Sanity check: both objects must have the same number of rows
print('Target lenght:', y.shape)
print('Explanatory variables df shape:', X.shape)

Target lenght: (4842,)
Explanatory variables df shape: (4842, 13)


In [61]:
# Hold out 20% of the rows as a test set; the fixed seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Report the dimensions of each of the four resulting objects
for label, part in (
    ('X train shape:', X_train),
    ('y train shape', y_train),
    ('X test shape:', X_test),
    ('y test shape', y_test),
):
    print(label, part.shape)

X train shape: (3873, 13)
y train shape (3873,)
X test shape: (969, 13)
y test shape (969,)


In [62]:
# Switch from pandas objects to numpy arrays / plain lists for pre-processing.
# Caution: this cell overwrites its own inputs, so it cannot be run twice in a
# row (the arrays and lists produced here no longer have these pandas methods).
X_train, X_test = X_train.to_numpy(), X_test.to_numpy()
y_train, y_test = y_train.tolist(), y_test.tolist()

# Preview the first two rows of each feature array...
for features in (X_train, X_test):
    print(features[:2])
print()
# ...and the first two target values of each split
for targets in (y_train, y_test):
    print(targets[:2])

[['Citroën' 234365 135 'diesel' 'black' 'estate' True True False False
  True False True]
 ['Volkswagen' 57344 70 'diesel' 'grey' 'hatchback' False True False
  False False False True]]
[['Toyota' 193657 85 'diesel' 'silver' 'van' False False False False
  False False True]
 ['Audi' 178112 170 'petrol' 'silver' 'sedan' True True True False False
  False True]]

[127, 109]
[94, 37]


## 2. Pre-processing

In [63]:
## Script re-used from scripts given during Jedha Bootcamp course

# Automatically detect the positions of numeric/categorical features in the
# explanatory variables dataframe. Positions (not only names) are recorded
# because the ColumnTransformer is applied to plain numpy arrays, whose columns
# can only be addressed by index.
numeric_features, numeric_indices = [], []
categorical_features, categorical_indices = [], []

# Series.items() replaces Series.iteritems(), which was removed in pandas 2.0
for idx, (column, dtype) in enumerate(X.dtypes.items()):
    # int/float dtypes are numeric; everything else (strings, booleans, ...)
    # is handled as a categorical feature
    if ('float' in str(dtype)) or ('int' in str(dtype)):
        numeric_features.append(column)
        numeric_indices.append(idx)
    else:
        categorical_features.append(column)
        categorical_indices.append(idx)

print('Found numeric features ', numeric_features,' at positions ', numeric_indices)
print('Found categorical features ', categorical_features,' at positions ', categorical_indices)

Found numeric features  ['mileage', 'engine_power']  at positions  [1, 2]
Found categorical features  ['model_key', 'fuel', 'paint_color', 'car_type', 'private_parking_available', 'has_gps', 'has_air_conditioning', 'automatic_car', 'has_getaround_connect', 'has_speed_regulator', 'winter_tires']  at positions  [0, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12]


In [64]:
# Pre-processing: one-hot encode the categorical features and standardize the
# numerical ones. Columns are selected by position (see the index lists built
# in the previous cell).

# One-hot encoding, dropping the first level of every categorical feature
categorical_transformer = OneHotEncoder(drop='first')

# Standardization of the numerical features
numeric_transformer = StandardScaler()

# Encoded categorical columns come first in the output, followed by the scaled numeric ones
featureencoder = ColumnTransformer(transformers=[
    ('cat', categorical_transformer, categorical_indices),
    ('num', numeric_transformer, numeric_indices),
])

## 3. Linear regression model

In [81]:
def build_regressor(estimator):
    """Return a Pipeline chaining the pre-processing with ``estimator``.

    Each pipeline receives its own unfitted copy of ``featureencoder``.
    Pipeline does not clone its steps, so sharing a single ColumnTransformer
    instance would make every pipeline depend on whichever one was fitted last.

    Parameters
    ----------
    estimator : scikit-learn regressor
        Final step of the pipeline.

    Returns
    -------
    sklearn.pipeline.Pipeline
        Unfitted pipeline with steps 'preprocessing' and 'lin_reg'.
    """
    return Pipeline([
        ('preprocessing', clone(featureencoder)),
        # The step keeps the name 'lin_reg' for every estimator so that
        # parameter paths (e.g. 'lin_reg__alpha') stay unchanged.
        ('lin_reg', estimator)
    ])


# Baseline: ordinary least squares linear regression
regressor = build_regressor(LinearRegression())

# L2-regularized linear regression
regressor_ridge = build_regressor(Ridge(alpha=2, tol=1e-04))

# L1-regularized linear regression
regressor_lasso = build_regressor(Lasso(alpha=1.5))

# Random forest; seeded so that scores are reproducible from one run to the next
regressor_rf = build_regressor(RandomForestRegressor(random_state=42))

### Model performance

In [None]:
# Baseline linear regression: train on the training set, then predict both splits
regressor.fit(X_train, y_train)

y_pred_train, y_pred_test = (
    regressor.predict(features) for features in (X_train, X_test)
)

# A large gap between the two scores is a sign of overfitting
print("R2 score on training set: ", r2_score(y_train, y_pred_train))
print("R2 score on test set: ", r2_score(y_test, y_pred_test))

R2 score on training set:  0.7164492031260445
R2 score on test set:  0.6516031988969857


In [75]:
# Ridge (L2) regularization
# fit() returns the fitted pipeline, so training and the first prediction are chained
y_pred_train = regressor_ridge.fit(X_train, y_train).predict(X_train)
y_pred_test = regressor_ridge.predict(X_test)

train_score = r2_score(y_train, y_pred_train)
test_score = r2_score(y_test, y_pred_test)
print("R2 score on training set: ", train_score)
print("R2 score on test set: ", test_score)

R2 score on training set:  0.7139992796034528
R2 score on test set:  0.6863492622374188


In [78]:
# Lasso (L1) regularization
# Train the pipeline, then predict the target on both splits
regressor_lasso.fit(X_train, y_train)

y_pred_train = regressor_lasso.predict(X_train)
y_pred_test = regressor_lasso.predict(X_test)

# Score each split against its true target values
for message, truth, predicted in (
    ("R2 score on training set: ", y_train, y_pred_train),
    ("R2 score on test set: ", y_test, y_pred_test),
):
    print(message, r2_score(truth, predicted))

R2 score on training set:  0.6123740505563878
R2 score on test set:  0.607668530839915


In [82]:
# Random forest regressor
# Train the pipeline, then predict the target on the training and test sets
regressor_rf.fit(X_train, y_train)

y_pred_train, y_pred_test = map(regressor_rf.predict, (X_train, X_test))

# Compare training and test R2 to gauge how well the forest generalizes
print("R2 score on training set: ", r2_score(y_train, y_pred_train))
print("R2 score on test set: ", r2_score(y_test, y_pred_test))

R2 score on training set:  0.9646749846603689
R2 score on test set:  0.7556731715860856


In [None]:
# TODO: carry out a grid search to fine-tune the hyperparameters of the random forest (RF)