# import libraries

In [28]:
import pandas as pd
import numpy as np
import datetime
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import  OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.metrics import accuracy_score, r2_score, mean_squared_error, mean_absolute_error

import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning) # to avoid deprecation warnings

from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LinearRegression , Lasso , Ridge, RidgeCV
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, BaggingRegressor, AdaBoostRegressor, VotingRegressor, StackingRegressor, GradientBoostingRegressor
from xgboost import XGBRegressor 
from sklearn.svm import SVR, LinearSVR
import plotly.express as px


import matplotlib.pyplot as plt

import joblib

# Preprocessing
# Loading the dataset

In [9]:
data = pd.read_csv('../Data/get_around_pricing_project.csv')
data = data.drop("Unnamed: 0", axis=1)

data.head()


Unnamed: 0,model_key,mileage,engine_power,fuel,paint_color,car_type,private_parking_available,has_gps,has_air_conditioning,automatic_car,has_getaround_connect,has_speed_regulator,winter_tires,rental_price_per_day
0,Citroën,140411,100,diesel,black,convertible,True,True,False,False,True,True,True,106
1,Citroën,13929,317,petrol,grey,convertible,True,True,False,False,False,True,True,264
2,Citroën,183297,120,diesel,white,convertible,False,False,False,False,True,False,True,101
3,Citroën,128035,135,diesel,red,convertible,True,True,False,False,True,True,True,158
4,Citroën,97097,160,diesel,silver,convertible,True,True,False,False,False,True,True,183


In [14]:
x = data.drop('rental_price_per_day', axis=1)
y = data['rental_price_per_day']

categorical_indices = [0,3,4,5,6,7,8,9,10,11,12]
numeric_indices = [1,2]

In [15]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)

In [17]:
# Create pipeline for numeric features
numeric_transformer = Pipeline(steps=[
    # ('imputer', SimpleImputer(strategy='mean')), # missing values will be replaced by columns' median
    ('scaler', StandardScaler())
])

# Create pipeline for categorical features
categorical_transformer = Pipeline(
    steps=[
    # ('imputer', SimpleImputer(strategy='most_frequent')), # missing values will be replaced by most frequent value
    ('encoder', OneHotEncoder(drop='first',categories='auto',handle_unknown='ignore')) # first column will be dropped to avoid creating correlations between features, auto for representations of all catego
    ])

# Applying pipeline on X variables depending on columns
feature_encoder = ColumnTransformer(
    transformers=[
        ('cat', categorical_transformer, categorical_indices),    
        ('num', numeric_transformer, numeric_indices)
        ]
    )

# Preprocessings on train set
print("Performing preprocessings on train set...")
print(x_train.head())
X_train = feature_encoder.fit_transform(x_train)
print('...Done.')
print(X_train[0:5,:])
print()

# Preprocessings on test set
print("Performing preprocessings on test set...")
print(x_test.head())
X_test = feature_encoder.transform(x_test) # Don't fit again !!
print('...Done.')
print(X_test[0:5,:])
print()



Y_train = y_train.to_numpy().reshape(-1,1)
Y_test = y_test.to_numpy().reshape(-1,1)

scaler = StandardScaler()

print("Preprocessing Y_train and Y_test")
Y_train = scaler.fit_transform(Y_train)
print(Y_train[0:5,:])
Y_test = scaler.transform(Y_test)
print(Y_test[0:5,:])

Performing preprocessings on train set...
     model_key  mileage  engine_power    fuel paint_color car_type  \
1215   Renault   119515           135  diesel        grey   estate   
432    Citroën   234365           135  diesel       black   estate   
4244       BMW    77356           105  diesel       black      suv   
289    Peugeot   181297           105  diesel       brown   estate   
2585   Citroën   144089           137  petrol       black    sedan   

      private_parking_available  has_gps  has_air_conditioning  automatic_car  \
1215                      False     True                 False          False   
432                        True     True                 False          False   
4244                      False     True                 False          False   
289                       False     True                 False          False   
2585                       True     True                 False          False   

      has_getaround_connect  has_speed_regulator  

## train model

In [18]:
# Train model
print("Train model...")
regressor = LinearRegression()
regressor.fit(X_train, Y_train)
print("...Done.")

Train model...
...Done.


In [19]:
# Predictions on training set
print("Predictions on training set...")
Y_train_pred = regressor.predict(X_train)
print("...Done.")
print(Y_train_pred)
print()

Predictions on training set...
...Done.
[[ 0.26561441]
 [-0.71378027]
 [ 0.24161018]
 ...
 [ 2.17111656]
 [ 0.48648941]
 [ 0.53893442]]



In [20]:
# Predictions on test set
print("Predictions on test set...")
Y_test_pred = regressor.predict(X_test)
print("...Done.")
print(Y_test_pred)
print()

Predictions on test set...
...Done.
[[ 5.81153348e-01]
 [-2.62499156e-01]
 [ 8.57094854e-01]
 [-1.38810481e-01]
 [ 2.05277500e-01]
 [-1.21522419e+00]
 [-6.50717730e-01]
 [ 3.44494539e-01]
 [ 6.76750578e-01]
 [-2.88763432e-01]
 [-7.29358959e-01]
 [-3.47135031e-01]
 [-4.87982831e-01]
 [ 7.87654468e-01]
 [ 1.03003877e-01]
 [-4.00818504e-01]
 [ 9.90201689e-01]
 [ 1.61690442e+00]
 [-8.97053993e-01]
 [ 3.37001853e-01]
 [ 2.04022032e+00]
 [-2.09993847e-01]
 [-5.95519155e-01]
 [ 4.63525409e-01]
 [-4.23073160e-01]
 [-1.14354482e+00]
 [-6.84520920e-01]
 [-6.27137581e-01]
 [-5.64775834e-02]
 [ 3.88435076e-01]
 [-1.00761375e+00]
 [-8.10850890e-01]
 [-3.78894985e-01]
 [-5.80874360e-01]
 [ 3.37365781e-01]
 [ 2.90102409e-02]
 [ 8.32849500e-03]
 [ 1.15960553e-01]
 [-2.82814639e-01]
 [ 2.25268774e+00]
 [-2.36453512e-01]
 [ 3.79388079e-01]
 [ 6.17541550e-01]
 [-6.40020350e-01]
 [ 7.49875683e-01]
 [-7.77804323e-01]
 [ 3.52431111e-02]
 [ 9.82688415e-01]
 [-2.06580315e-01]
 [-7.72270744e-01]
 [ 1.01554142e

In [21]:
# Print R^2 scores
print("R2 score on training set : ", r2_score(Y_train, Y_train_pred))
print("R2 score on test set : ", r2_score(Y_test, Y_test_pred))

R2 score on training set :  0.7140101651551811
R2 score on test set :  0.6937162271679334


In [22]:
regressor.coef_

array([[ 0.31755568,  0.16260864,  0.10366532,  0.42669274, -1.03396102,
        -0.42196799, -0.895914  ,  0.22701834,  0.04696934,  0.28214116,
         0.8684494 , -1.23523409,  0.62930979, -0.08979258,  0.60483277,
        -0.06588882,  0.73626773, -0.33341769,  0.22230413,  1.51114029,
         0.42275028,  1.02973652,  0.61050955,  1.21895256,  0.87595104,
         0.65688589,  1.03169321, -0.63183602,  0.22244846, -0.57451386,
         0.02015646, -0.0722922 ,  0.04734818, -0.74538989, -0.00715918,
        -0.16227441,  0.06811635, -0.08558327,  0.12865841,  0.04203293,
        -0.34458222, -0.26914207, -0.15815915, -0.21151219,  0.0639038 ,
        -0.72880666,  0.04045978,  0.34662027,  0.0229489 ,  0.11998646,
         0.14792427,  0.13005793, -0.09630185, -0.38662501,  0.41885712]])

In [24]:
column_names = []
for name, pipeline, features_list in feature_encoder.transformers_: # loop over pipelines
    if name == 'num': # if pipeline is for numeric variables
        features = features_list # just get the names of columns to which it has been applied
    else: # if pipeline is for categorical variables
        features = pipeline.named_steps['encoder'].get_feature_names_out() # get output columns names from OneHotEncoder
    column_names.extend(features) # concatenate features names
        
print("Names of columns corresponding to each coefficient: ", column_names)

Names of columns corresponding to each coefficient:  ['model_key_Audi', 'model_key_BMW', 'model_key_Citroën', 'model_key_Ferrari', 'model_key_Fiat', 'model_key_Ford', 'model_key_Honda', 'model_key_KIA Motors', 'model_key_Lamborghini', 'model_key_Lexus', 'model_key_Maserati', 'model_key_Mazda', 'model_key_Mercedes', 'model_key_Mini', 'model_key_Mitsubishi', 'model_key_Nissan', 'model_key_Opel', 'model_key_PGO', 'model_key_Peugeot', 'model_key_Porsche', 'model_key_Renault', 'model_key_SEAT', 'model_key_Subaru', 'model_key_Suzuki', 'model_key_Toyota', 'model_key_Volkswagen', 'model_key_Yamaha', 'fuel_electro', 'fuel_hybrid_petrol', 'fuel_petrol', 'paint_color_black', 'paint_color_blue', 'paint_color_brown', 'paint_color_green', 'paint_color_grey', 'paint_color_orange', 'paint_color_red', 'paint_color_silver', 'paint_color_white', 'car_type_coupe', 'car_type_estate', 'car_type_hatchback', 'car_type_sedan', 'car_type_subcompact', 'car_type_suv', 'car_type_van', 'private_parking_available_Tr

In [25]:
# Create a pandas DataFrame
coefs = pd.DataFrame(index = column_names, data = regressor.coef_.transpose(), columns=["coefficients"])
coefs

Unnamed: 0,coefficients
model_key_Audi,0.317556
model_key_BMW,0.162609
model_key_Citroën,0.103665
model_key_Ferrari,0.426693
model_key_Fiat,-1.033961
model_key_Ford,-0.421968
model_key_Honda,-0.895914
model_key_KIA Motors,0.227018
model_key_Lamborghini,0.046969
model_key_Lexus,0.282141


In [26]:
# Compute abs() and sort values
feature_importance = abs(coefs).sort_values(by = 'coefficients')
feature_importance

Unnamed: 0,coefficients
paint_color_grey,0.007159
paint_color_black,0.020156
has_air_conditioning_True,0.022949
private_parking_available_True,0.04046
car_type_coupe,0.042033
model_key_Lamborghini,0.046969
paint_color_brown,0.047348
car_type_suv,0.063904
model_key_Nissan,0.065889
paint_color_red,0.068116


In [29]:
# Plot coefficients
fig = px.bar(feature_importance, orientation = 'h')
fig.update_layout(showlegend = False, 
                  margin = {'l': 120} # to avoid cropping of column names
                 )
fig.show()