In [1]:
import pandas as pd

import plotly.express as px
import plotly.graph_objects as go

from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score

In [2]:
# Read the raw pricing data into a dataframe
df = pd.read_csv(filepath_or_buffer="get_around_pricing_project.csv")

# Preview the first five rows of the dataset
df.head(n=5)

Unnamed: 0.1,Unnamed: 0,model_key,mileage,engine_power,fuel,paint_color,car_type,private_parking_available,has_gps,has_air_conditioning,automatic_car,has_getaround_connect,has_speed_regulator,winter_tires,rental_price_per_day
0,0,Citroën,140411,100,diesel,black,convertible,True,True,False,False,True,True,True,106
1,1,Citroën,13929,317,petrol,grey,convertible,True,True,False,False,False,True,True,264
2,2,Citroën,183297,120,diesel,white,convertible,False,False,False,False,True,False,True,101
3,3,Citroën,128035,135,diesel,red,convertible,True,True,False,False,True,True,True,158
4,4,Citroën,97097,160,diesel,silver,convertible,True,True,False,False,False,True,True,183


In [3]:
# Basic statistics
# Print the shape explicitly: a notebook only displays the last expression of a
# cell, so a bare `df.shape` placed before `describe` is silently discarded.
print("Dataset shape (rows, columns):", df.shape)

# Summary statistics for every column (numeric and non-numeric alike)
df.describe(include="all")

Unnamed: 0.1,Unnamed: 0,model_key,mileage,engine_power,fuel,paint_color,car_type,private_parking_available,has_gps,has_air_conditioning,automatic_car,has_getaround_connect,has_speed_regulator,winter_tires,rental_price_per_day
count,4843.0,4843,4843.0,4843.0,4843,4843,4843,4843,4843,4843,4843,4843,4843,4843,4843.0
unique,,28,,,4,10,8,2,2,2,2,2,2,2,
top,,Citroën,,,diesel,black,estate,True,True,False,False,False,False,True,
freq,,969,,,4641,1633,1606,2662,3839,3865,3881,2613,3674,4514,
mean,2421.0,,140962.8,128.98823,,,,,,,,,,,121.214536
std,1398.198007,,60196.74,38.99336,,,,,,,,,,,33.568268
min,0.0,,-64.0,0.0,,,,,,,,,,,10.0
25%,1210.5,,102913.5,100.0,,,,,,,,,,,104.0
50%,2421.0,,141080.0,120.0,,,,,,,,,,,119.0
75%,3631.5,,175195.5,135.0,,,,,,,,,,,136.0


The minimum and maximum values in the 'mileage' column, as well as the minimum value in the 'engine_power' column, look suspicious. We will have a closer look at these cases during further analysis.

In [4]:
# Checking data types: show the dtype pandas assigned to each column
df.dtypes

Unnamed: 0                    int64
model_key                    object
mileage                       int64
engine_power                  int64
fuel                         object
paint_color                  object
car_type                     object
private_parking_available      bool
has_gps                        bool
has_air_conditioning           bool
automatic_car                  bool
has_getaround_connect          bool
has_speed_regulator            bool
winter_tires                   bool
rental_price_per_day          int64
dtype: object

In [5]:
# List the column labels of the dataframe
df.columns

Index(['Unnamed: 0', 'model_key', 'mileage', 'engine_power', 'fuel',
       'paint_color', 'car_type', 'private_parking_available', 'has_gps',
       'has_air_conditioning', 'automatic_car', 'has_getaround_connect',
       'has_speed_regulator', 'winter_tires', 'rental_price_per_day'],
      dtype='object')

In [6]:
# Dropping the useless 'Unnamed: 0' column
# (it appears to be the row index that was saved into the CSV file).
# errors='ignore' keeps this cell re-runnable: without it, executing the cell a
# second time raises a KeyError because the column is already gone.
df = df.drop(columns=['Unnamed: 0'], errors='ignore')
df.head()

Unnamed: 0,model_key,mileage,engine_power,fuel,paint_color,car_type,private_parking_available,has_gps,has_air_conditioning,automatic_car,has_getaround_connect,has_speed_regulator,winter_tires,rental_price_per_day
0,Citroën,140411,100,diesel,black,convertible,True,True,False,False,True,True,True,106
1,Citroën,13929,317,petrol,grey,convertible,True,True,False,False,False,True,True,264
2,Citroën,183297,120,diesel,white,convertible,False,False,False,False,True,False,True,101
3,Citroën,128035,135,diesel,red,convertible,True,True,False,False,True,True,True,158
4,Citroën,97097,160,diesel,silver,convertible,True,True,False,False,False,True,True,183


In [7]:
# Percentage of missing values in each column
missing_counts = df.isna().sum()
100 * missing_counts / len(df)

model_key                    0.0
mileage                      0.0
engine_power                 0.0
fuel                         0.0
paint_color                  0.0
car_type                     0.0
private_parking_available    0.0
has_gps                      0.0
has_air_conditioning         0.0
automatic_car                0.0
has_getaround_connect        0.0
has_speed_regulator          0.0
winter_tires                 0.0
rental_price_per_day         0.0
dtype: float64

In [8]:
# Univariate analysis
# Histogram of each quantitative variable
num_features = ['mileage', 'engine_power', 'rental_price_per_day']
for feature in num_features:
    # One figure per numeric column
    fig = px.histogram(df[feature])
    fig.show()

There are indeed outliers in the 'mileage' column (a negative value and a very large value) and in the 'engine_power' column (a minimum value of zero).

In [9]:
# Univariate analysis
# Bar plot of the category frequencies of each qualitative variable
cat_features = ['model_key', 'fuel',
       'paint_color', 'car_type', 'private_parking_available', 'has_gps',
       'has_air_conditioning', 'automatic_car', 'has_getaround_connect',
       'has_speed_regulator', 'winter_tires']
for feature in cat_features:
    # Plot the number of rows per category. Passing the raw column to px.bar
    # would draw one bar per dataframe row instead of one bar per category.
    category_counts = df[feature].value_counts()
    fig = px.bar(category_counts, title=feature)
    fig.update_traces(dict(marker_line_width=0))
    fig.show()

In [10]:
# Share (in %) of each car model in the dataset
model_shares = df['model_key'].value_counts(normalize=True)
model_shares * 100

Citroën        20.008259
Renault        18.913896
BMW            17.076192
Peugeot        13.256246
Audi           10.861037
Nissan          5.678299
Mitsubishi      4.769771
Mercedes        2.002891
Volkswagen      1.342143
Toyota          1.094363
SEAT            0.949824
Subaru          0.908528
Opel            0.681396
Ferrari         0.681396
PGO             0.681396
Maserati        0.371670
Suzuki          0.165187
Porsche         0.123890
Ford            0.103242
KIA Motors      0.061945
Alfa Romeo      0.061945
Fiat            0.041297
Lexus           0.041297
Lamborghini     0.041297
Mini            0.020648
Mazda           0.020648
Honda           0.020648
Yamaha          0.020648
Name: model_key, dtype: float64

In [11]:
# Number of cars per model, rarest models first
df['model_key'].value_counts(ascending=True)

Yamaha           1
Mini             1
Mazda            1
Honda            1
Lexus            2
Lamborghini      2
Fiat             2
KIA Motors       3
Alfa Romeo       3
Ford             5
Porsche          6
Suzuki           8
Maserati        18
Ferrari         33
Opel            33
PGO             33
Subaru          44
SEAT            46
Toyota          53
Volkswagen      65
Mercedes        97
Mitsubishi     231
Nissan         275
Audi           526
Peugeot        642
BMW            827
Renault        916
Citroën        969
Name: model_key, dtype: int64

For several models, there is only one car present in the dataset. This will create a problem when we do a train/test split (e.g. a Honda could end up in the test set but not in the train set), so the machine learning model would not be able to make predictions for a car model it has not seen during training.

In [12]:
# Keep only the rows whose model appears more than once in the dataset
appears_more_than_once = df['model_key'].duplicated(keep=False)
df = df.loc[appears_more_than_once]

# Checking the result: per-model counts, rarest first
df['model_key'].value_counts(ascending=True)

Lexus            2
Lamborghini      2
Fiat             2
KIA Motors       3
Alfa Romeo       3
Ford             5
Porsche          6
Suzuki           8
Maserati        18
PGO             33
Opel            33
Ferrari         33
Subaru          44
SEAT            46
Toyota          53
Volkswagen      65
Mercedes        97
Mitsubishi     231
Nissan         275
Audi           526
Peugeot        642
BMW            827
Renault        916
Citroën        969
Name: model_key, dtype: int64

In [13]:
# The ten rows with the lowest mileage
df.nsmallest(n=10, columns="mileage")

Unnamed: 0,model_key,mileage,engine_power,fuel,paint_color,car_type,private_parking_available,has_gps,has_air_conditioning,automatic_car,has_getaround_connect,has_speed_regulator,winter_tires,rental_price_per_day
2938,Renault,-64,230,diesel,black,sedan,True,True,False,True,False,False,True,274
2409,Opel,476,120,diesel,blue,hatchback,True,True,False,False,False,True,True,174
4372,Mitsubishi,612,183,diesel,black,suv,True,True,False,False,False,False,True,222
3935,Mitsubishi,706,155,diesel,black,suv,True,True,False,True,True,True,True,204
3687,BMW,2399,85,diesel,white,subcompact,False,False,False,False,False,False,True,132
4146,Suzuki,2970,423,petrol,red,suv,True,True,True,False,False,False,True,287
471,Peugeot,3142,100,diesel,blue,estate,True,True,False,False,False,False,True,146
781,Peugeot,3478,100,diesel,blue,estate,True,True,False,False,False,False,True,146
4731,Toyota,4530,230,diesel,silver,suv,True,True,False,True,False,False,False,270
2998,Audi,5117,160,diesel,blue,sedan,True,True,True,False,True,True,True,187


In [14]:
# Dropping rows with a negative mileage.
# Filtering on the rule (mileage >= 0) instead of matching the single observed
# value -64 removes any negative mileage, not just that one row.
df = df[df["mileage"] >= 0]

# Checking the result
df.nsmallest(10, "mileage", keep='first')

Unnamed: 0,model_key,mileage,engine_power,fuel,paint_color,car_type,private_parking_available,has_gps,has_air_conditioning,automatic_car,has_getaround_connect,has_speed_regulator,winter_tires,rental_price_per_day
2409,Opel,476,120,diesel,blue,hatchback,True,True,False,False,False,True,True,174
4372,Mitsubishi,612,183,diesel,black,suv,True,True,False,False,False,False,True,222
3935,Mitsubishi,706,155,diesel,black,suv,True,True,False,True,True,True,True,204
3687,BMW,2399,85,diesel,white,subcompact,False,False,False,False,False,False,True,132
4146,Suzuki,2970,423,petrol,red,suv,True,True,True,False,False,False,True,287
471,Peugeot,3142,100,diesel,blue,estate,True,True,False,False,False,False,True,146
781,Peugeot,3478,100,diesel,blue,estate,True,True,False,False,False,False,True,146
4731,Toyota,4530,230,diesel,silver,suv,True,True,False,True,False,False,False,270
2998,Audi,5117,160,diesel,blue,sedan,True,True,True,False,True,True,True,187
2678,Audi,5578,195,diesel,grey,sedan,True,True,True,False,False,True,True,250


In [15]:
# The ten rows with the highest mileage
df.nlargest(n=10, columns="mileage")

Unnamed: 0,model_key,mileage,engine_power,fuel,paint_color,car_type,private_parking_available,has_gps,has_air_conditioning,automatic_car,has_getaround_connect,has_speed_regulator,winter_tires,rental_price_per_day
3732,Citroën,1000376,90,diesel,black,subcompact,True,False,False,False,False,False,True,37
557,Renault,484615,120,diesel,blue,estate,True,True,False,False,False,False,True,91
2350,Peugeot,477571,85,diesel,grey,hatchback,False,True,False,False,False,True,False,35
2829,Audi,439060,105,diesel,silver,sedan,False,False,True,False,False,False,True,10
3198,Citroën,405816,100,diesel,blue,sedan,False,False,False,False,False,False,True,22
1573,Citroën,400654,110,diesel,black,estate,False,False,True,False,False,False,True,42
1003,Audi,398043,130,diesel,grey,estate,False,False,False,False,False,False,True,59
2912,Renault,394135,135,diesel,black,sedan,True,True,False,False,True,False,True,118
1521,Audi,391309,190,diesel,black,estate,False,True,False,True,True,False,True,116
3554,Audi,388616,180,diesel,grey,sedan,True,True,False,False,True,False,True,121


In [16]:
# A mileage of one million km (or miles) or more looks very unlikely.
# Keep only the rows below that threshold rather than matching one exact
# outlier value, so any similarly implausible row is removed as well.
df = df[df["mileage"] < 1_000_000]

# Checking the result
df.nlargest(10, "mileage", keep='first')

Unnamed: 0,model_key,mileage,engine_power,fuel,paint_color,car_type,private_parking_available,has_gps,has_air_conditioning,automatic_car,has_getaround_connect,has_speed_regulator,winter_tires,rental_price_per_day
557,Renault,484615,120,diesel,blue,estate,True,True,False,False,False,False,True,91
2350,Peugeot,477571,85,diesel,grey,hatchback,False,True,False,False,False,True,False,35
2829,Audi,439060,105,diesel,silver,sedan,False,False,True,False,False,False,True,10
3198,Citroën,405816,100,diesel,blue,sedan,False,False,False,False,False,False,True,22
1573,Citroën,400654,110,diesel,black,estate,False,False,True,False,False,False,True,42
1003,Audi,398043,130,diesel,grey,estate,False,False,False,False,False,False,True,59
2912,Renault,394135,135,diesel,black,sedan,True,True,False,False,True,False,True,118
1521,Audi,391309,190,diesel,black,estate,False,True,False,True,True,False,True,116
3554,Audi,388616,180,diesel,grey,sedan,True,True,False,False,True,False,True,121
3444,Peugeot,384698,85,diesel,blue,sedan,True,False,False,False,False,False,True,32


In [17]:
# The ten rows with the highest engine power
df.nlargest(n=10, columns="engine_power")

Unnamed: 0,model_key,mileage,engine_power,fuel,paint_color,car_type,private_parking_available,has_gps,has_air_conditioning,automatic_car,has_getaround_connect,has_speed_regulator,winter_tires,rental_price_per_day
4146,Suzuki,2970,423,petrol,red,suv,True,True,True,False,False,False,True,287
1,Citroën,13929,317,petrol,grey,convertible,True,True,False,False,False,True,True,264
72,Citroën,69410,317,petrol,white,coupe,True,True,False,False,False,True,True,232
2944,Peugeot,33670,317,petrol,blue,sedan,True,True,False,False,False,True,True,226
3092,Peugeot,39250,317,petrol,black,sedan,True,True,False,False,False,True,True,229
67,Peugeot,29925,309,petrol,silver,coupe,True,True,False,False,True,True,True,217
73,Peugeot,170550,309,petrol,grey,coupe,True,True,False,False,True,False,True,167
93,Peugeot,99283,309,petrol,silver,coupe,False,False,False,False,True,False,True,169
139,Peugeot,169970,309,petrol,grey,coupe,True,True,False,False,True,False,True,189
4370,Subaru,134154,300,hybrid_petrol,black,suv,True,True,False,True,True,False,True,148


In [18]:
# The ten rows with the lowest engine power
df.nsmallest(n=10, columns="engine_power")

Unnamed: 0,model_key,mileage,engine_power,fuel,paint_color,car_type,private_parking_available,has_gps,has_air_conditioning,automatic_car,has_getaround_connect,has_speed_regulator,winter_tires,rental_price_per_day
3765,Nissan,81770,0,diesel,white,suv,False,False,False,False,False,False,False,108
1796,Porsche,152328,25,hybrid_petrol,black,hatchback,False,True,False,False,False,False,True,142
1925,Porsche,152470,25,hybrid_petrol,black,hatchback,False,True,False,False,False,False,True,124
2390,Peugeot,170529,66,diesel,silver,hatchback,False,False,True,False,False,False,False,32
2771,Renault,146951,66,petrol,white,sedan,False,False,False,False,False,False,False,36
1804,Volkswagen,179307,70,diesel,blue,hatchback,False,True,False,False,False,False,True,91
1847,Volkswagen,100398,70,diesel,white,hatchback,False,True,False,False,False,True,True,103
1983,Volkswagen,57344,70,diesel,grey,hatchback,False,True,False,False,False,False,True,109
1988,Volkswagen,150373,70,diesel,brown,hatchback,False,True,False,False,False,False,True,91
2001,Volkswagen,72527,70,diesel,silver,hatchback,False,False,False,False,False,False,True,96


In [19]:
# Keep only the rows whose engine power is different from zero
has_engine_power = df["engine_power"] != 0
df = df.loc[has_engine_power]

# Checking the result
df.nsmallest(n=10, columns="engine_power")

Unnamed: 0,model_key,mileage,engine_power,fuel,paint_color,car_type,private_parking_available,has_gps,has_air_conditioning,automatic_car,has_getaround_connect,has_speed_regulator,winter_tires,rental_price_per_day
1796,Porsche,152328,25,hybrid_petrol,black,hatchback,False,True,False,False,False,False,True,142
1925,Porsche,152470,25,hybrid_petrol,black,hatchback,False,True,False,False,False,False,True,124
2390,Peugeot,170529,66,diesel,silver,hatchback,False,False,True,False,False,False,False,32
2771,Renault,146951,66,petrol,white,sedan,False,False,False,False,False,False,False,36
1804,Volkswagen,179307,70,diesel,blue,hatchback,False,True,False,False,False,False,True,91
1847,Volkswagen,100398,70,diesel,white,hatchback,False,True,False,False,False,True,True,103
1983,Volkswagen,57344,70,diesel,grey,hatchback,False,True,False,False,False,False,True,109
1988,Volkswagen,150373,70,diesel,brown,hatchback,False,True,False,False,False,False,True,91
2001,Volkswagen,72527,70,diesel,silver,hatchback,False,False,False,False,False,False,True,96
2273,Volkswagen,87768,70,diesel,brown,hatchback,False,False,False,False,False,False,True,79


In [20]:
# The ten rows with the lowest daily rental price
df.nsmallest(n=10, columns="rental_price_per_day")

Unnamed: 0,model_key,mileage,engine_power,fuel,paint_color,car_type,private_parking_available,has_gps,has_air_conditioning,automatic_car,has_getaround_connect,has_speed_regulator,winter_tires,rental_price_per_day
565,Citroën,179358,120,diesel,black,estate,False,True,False,False,False,False,True,10
630,Peugeot,147558,105,diesel,white,estate,False,True,False,False,False,False,False,10
879,Peugeot,134156,105,diesel,grey,estate,False,True,False,False,False,False,False,10
1255,Citroën,170381,135,diesel,silver,estate,True,True,False,False,False,False,True,10
1832,BMW,174524,85,diesel,blue,hatchback,False,True,False,False,False,False,True,10
2829,Audi,439060,105,diesel,silver,sedan,False,False,True,False,False,False,True,10
4356,BMW,79685,190,diesel,black,suv,False,False,False,False,False,False,False,10
2473,Audi,230578,85,diesel,black,sedan,False,False,False,False,False,False,True,14
2574,Audi,229880,85,diesel,black,sedan,False,False,False,False,False,False,True,14
2611,Audi,230264,85,diesel,black,sedan,False,False,False,False,False,False,True,14


In [21]:
# The ten rows with the highest daily rental price
df.nlargest(n=10, columns="rental_price_per_day")

Unnamed: 0,model_key,mileage,engine_power,fuel,paint_color,car_type,private_parking_available,has_gps,has_air_conditioning,automatic_car,has_getaround_connect,has_speed_regulator,winter_tires,rental_price_per_day
4753,BMW,72515,135,diesel,blue,suv,False,False,True,False,False,False,False,422
4684,SEAT,103222,140,diesel,grey,suv,True,True,False,False,True,False,True,378
90,Renault,12402,170,hybrid_petrol,grey,coupe,True,True,False,False,False,False,False,309
4146,Suzuki,2970,423,petrol,red,suv,True,True,True,False,False,False,True,287
4731,Toyota,4530,230,diesel,silver,suv,True,True,False,True,False,False,False,270
1,Citroën,13929,317,petrol,grey,convertible,True,True,False,False,False,True,True,264
3595,Maserati,34624,235,diesel,grey,sedan,True,True,True,True,False,True,True,262
3320,Maserati,25858,235,diesel,black,sedan,True,True,False,True,False,False,True,261
3345,Maserati,25571,235,diesel,black,sedan,True,True,False,True,False,False,True,261
4749,Subaru,29460,280,diesel,white,suv,True,True,True,True,False,True,True,258


In [22]:
# Resetting index after deleting rows.
# reset_index returns a new dataframe, so the result must be assigned back;
# drop=True discards the old labels instead of inserting them as an extra
# 'index' column (which would otherwise be picked up as a feature later on).
df = df.reset_index(drop=True)

# Checking the length of the dataframe after manipulations
len(df)

4836

Saving the cleaned dataset for future use in order not to repeat this preparation for every model we train...

In [23]:
# Set to True to (re)write the cleaned dataset to disk.
# Kept off by default to avoid unnecessary rewriting; an explicit flag replaces
# the former string-literal "comment", which was evaluated and echoed as output.
save_cleaned_dataset = False

if save_cleaned_dataset:
    df.to_csv("get_around_pricing_project_cleaned.csv", index=False)

'df.to_csv("get_around_pricing_project_cleaned.csv", index=False)'

Preparing to train the baseline model (multivariate linear regression)

In [24]:
# Split the dataframe into the features X and the target Y
# (the target is the last column, every other column is a feature)
print("Separating labels from features...")
features_list, target_variable = df.columns[:-1], df.columns[-1]

X = df[features_list]
Y = df[target_variable]

print("...Done.")
print()

Separating labels from features...
...Done.



In [25]:
# Sort the feature columns into numeric and categorical ones from their dtype:
# a column is numeric when its dtype name mentions 'float' or 'int'
numeric_features = [
    column
    for column, dtype in X.dtypes.items()
    if 'float' in str(dtype) or 'int' in str(dtype)
]
# Everything else (strings, booleans) is treated as categorical
categorical_features = [
    column for column in X.columns if column not in numeric_features
]

print('Found numeric features ', numeric_features)
print('Found categorical features ', categorical_features)

Found numeric features  ['mileage', 'engine_power']
Found categorical features  ['model_key', 'fuel', 'paint_color', 'car_type', 'private_parking_available', 'has_gps', 'has_air_conditioning', 'automatic_car', 'has_getaround_connect', 'has_speed_regulator', 'winter_tires']


In [26]:
# Hold out 20% of the rows as a test set (fixed seed for a reproducible split)
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    random_state=0,
)

In [27]:
# Numeric features go through a single standardisation step
numeric_transformer = Pipeline([('scaler', StandardScaler())])

In [28]:
# Creating pipeline for categorical features
categorical_transformer = Pipeline(
    steps=[
        # drop='first' removes one dummy column per feature to avoid perfectly
        # collinear (redundant) columns in the linear model.
        # handle_unknown='ignore' makes transform() encode a category that was
        # not seen during fit as all zeros instead of raising an error; this
        # matters because some car models have only a handful of rows and may
        # end up entirely in the test set with a different split.
        ('encoder', OneHotEncoder(drop='first', handle_unknown='ignore'))
    ])

In [29]:
# Describe which treatment applies to which group of columns, then bundle
# everything into a single preprocessor object
column_treatments = [
    ('num', numeric_transformer, numeric_features),
    ('cat', categorical_transformer, categorical_features),
]
preprocessor = ColumnTransformer(transformers=column_treatments)

In [30]:
# Preprocessings on train set: the preprocessor is fitted on the training
# features and immediately applied to them.
# NOTE(review): X_train is overwritten with the transformed output, so this cell
# is probably not safe to re-run without re-running the split first — confirm.
X_train = preprocessor.fit_transform(X_train)

# Preprocessings on test set: transform only, reusing what was fitted on the
# training set above (no refit on test data)
X_test = preprocessor.transform(X_test)

Training Linear Multivariate Model

In [31]:
# Fit a linear regression on the preprocessed training data
regressor = LinearRegression()
regressor.fit(X=X_train, y=Y_train)

In [32]:
# Predictions on the training set, with a peek at the first five values
Y_train_pred = regressor.predict(X=X_train)
print("...Done.")
print(Y_train_pred[:5])

...Done.
[ 91.65724158 108.90103987 146.2843791  109.05272806  93.42761454]


In [33]:
# Predictions on the test set, with a peek at the first five values
Y_test_pred = regressor.predict(X=X_test)
print("...Done.")
print(Y_test_pred[:5])

...Done.
[136.94368557 149.7755159  131.83800625 100.17692224 117.52243114]


In [34]:
# Compute the R^2 score on both sets first, then report them
r2_train = r2_score(Y_train, Y_train_pred)
r2_test = r2_score(Y_test, Y_test_pred)

print("R2 score on training set : ", r2_train)
print("R2 score on test set : ", r2_test)

R2 score on training set :  0.7148524507554002
R2 score on test set :  0.7092296278847003


In [35]:
# Retrieving column names from transformers, in the order of the model coefficients
column_names = []
# The third tuple item is named `columns` on purpose: calling it `features_list`
# would overwrite the notebook-level `features_list` defined when separating
# the features from the target.
for name, pipeline, columns in preprocessor.transformers_: # loop over pipelines
    if name == 'num': # if pipeline is for numeric variables
        features = columns # scaling keeps one output per input column, so reuse the input names
    else: # if pipeline is for categorical variables
        features = pipeline.named_steps['encoder'].get_feature_names_out() # get output columns names from OneHotEncoder
    column_names.extend(features) # concatenate features names

print("Names of columns corresponding to each coefficient: ", column_names)

Names of columns corresponding to each coefficient:  ['mileage', 'engine_power', 'model_key_Audi', 'model_key_BMW', 'model_key_Citroën', 'model_key_Ferrari', 'model_key_Fiat', 'model_key_Ford', 'model_key_KIA Motors', 'model_key_Lamborghini', 'model_key_Lexus', 'model_key_Maserati', 'model_key_Mercedes', 'model_key_Mitsubishi', 'model_key_Nissan', 'model_key_Opel', 'model_key_PGO', 'model_key_Peugeot', 'model_key_Porsche', 'model_key_Renault', 'model_key_SEAT', 'model_key_Subaru', 'model_key_Suzuki', 'model_key_Toyota', 'model_key_Volkswagen', 'fuel_electro', 'fuel_hybrid_petrol', 'fuel_petrol', 'paint_color_black', 'paint_color_blue', 'paint_color_brown', 'paint_color_green', 'paint_color_grey', 'paint_color_orange', 'paint_color_red', 'paint_color_silver', 'paint_color_white', 'car_type_coupe', 'car_type_estate', 'car_type_hatchback', 'car_type_sedan', 'car_type_subcompact', 'car_type_suv', 'car_type_van', 'private_parking_available_True', 'has_gps_True', 'has_air_conditioning_True',

In [36]:
# Store the model coefficients in a one-column dataframe indexed by feature name
coefs = pd.DataFrame(
    regressor.coef_.T,
    index=column_names,
    columns=["coefficients"],
)

coefs.head()

Unnamed: 0,coefficients
mileage,-13.507759
engine_power,14.009181
model_key_Audi,9.465917
model_key_BMW,5.936428
model_key_Citroën,3.2577


In [37]:
# Rank features by the magnitude of their coefficient, largest first
feature_importance = coefs.abs().sort_values('coefficients', ascending=False)

display(feature_importance)

Unnamed: 0,coefficients
model_key_Fiat,34.11976
model_key_Suzuki,34.104977
fuel_hybrid_petrol,33.094547
model_key_SEAT,32.935594
model_key_Maserati,29.219896
model_key_Toyota,29.115008
model_key_Porsche,26.507326
car_type_van,25.790789
model_key_Opel,25.691683
model_key_Volkswagen,23.71111


In [38]:
# Horizontal bar chart of the absolute coefficients
fig = px.bar(feature_importance, orientation='h')

# Hide the legend, widen the left margin for the feature labels and order the
# bars by their total value
layout_options = dict(
    showlegend=False,
    margin=dict(l=120),
    yaxis=dict(categoryorder='total ascending'),
)
fig.update_layout(**layout_options)
fig.show()
