# Preprocessing the model: Applying Feature Engineering to the Dataset

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import sys
sys.path.append('..')

## Loading the data

In [2]:
root = '../data/cleaned_cars.csv'

df = pd.read_csv(root)
df.head()

Unnamed: 0,title,brand,model,type,year,kms,city,gearbox,doors,seats,...,height,length,width,trunk_vol,max_speed,mixed_cons,weight,tank_vol,acc,price
0,Smart Fortwo Cabrio 52 Mhd Pure Aut.,SMART,FORTWO,sports,2014,37125,Alicante,Automatic,2,2,...,157,270,156,203,145,4.3,780,33,13.7,5500
1,Volkswagen Scirocco 2.0 Tsi,VOLKSWAGEN,SCIROCCO,sports,2009,0,Barcelona,Manual,3,4,...,140,426,181,292,235,7.6,1373,55,7.2,10900
2,Bmw Serie 2 218da Gran Coupé,BMW,SERIE 2,sports,2021,0,Cantabria,Automatic,4,5,...,142,453,180,430,222,4.2,1545,42,8.5,36100
3,Bmw X4 Xdrive 20d,BMW,X4,sports,2017,85000,Girona,Automatic,5,5,...,162,467,188,500,212,5.4,1740,67,8.0,28000
4,Dodge Viper Srt-10,DODGE,VIPER,sports,2005,95017,Unknown,Automatic,2,2,...,123,446,191,344,189,21.0,1546,70,3.9,27990


## Identifying categorical features

In [3]:
df.select_dtypes(exclude=["number"]).columns.value_counts().reset_index().drop(0, axis=1)

Unnamed: 0,index
0,model
1,title
2,chassis
3,warranty
4,city
5,brand
6,color
7,fuel_type
8,gearbox
9,dealer


In [4]:
df.select_dtypes(exclude=["number"])

Unnamed: 0,title,brand,model,type,city,gearbox,color,fuel_type,warranty,dealer,chassis
0,Smart Fortwo Cabrio 52 Mhd Pure Aut.,SMART,FORTWO,sports,Alicante,Automatic,WHITE,Gasoline,YES,Professional,Convertible
1,Volkswagen Scirocco 2.0 Tsi,VOLKSWAGEN,SCIROCCO,sports,Barcelona,Manual,WHITE,Gasoline,YES,Professional,Coupe
2,Bmw Serie 2 218da Gran Coupé,BMW,SERIE 2,sports,Cantabria,Automatic,OTHER,Diesel,YES,Professional,Coupe
3,Bmw X4 Xdrive 20d,BMW,X4,sports,Girona,Automatic,WHITE,Diesel,YES,Professional,Coupe
4,Dodge Viper Srt-10,DODGE,VIPER,sports,Unknown,Automatic,RED,Gasoline,YES,Professional,Coupe
...,...,...,...,...,...,...,...,...,...,...,...
55321,Opel Corsa 1.3cdti Selective 75,OPEL,CORSA,other,Madrid,Manual,WHITE,Diesel,YES,Professional,Sedan
55322,Mercedes Clase V 250d Largo Avantgarde 7g Tronic,MERCEDES,CLASE V,other,Unknown,Automatic,BLACK,Diesel,YES,Professional,Minivan
55323,Jeep Cherokee 2.0d Longitude 4x2 103kw,JEEP,CHEROKEE,other,Ciudad Real,Manual,GREY,Diesel,YES,Professional,Offroad
55324,Bmw X3 Xdrive 30da,BMW,X3,other,Barcelona,Automatic,WHITE,Diesel,YES,Professional,Offroad


It is needed to convert these categorical variables to numbers using a OneHotEncoder. However, title will be the exception since it is not valuable to include long text in a regression model.

In [5]:
cat_cols = df.select_dtypes(exclude=["number"]).columns.drop('title')

## Transforming Categorical into Numerical: One Hot Encoding

Using a dummifyer (`pd.get_dummies`) to transform categorical values into numerical is not the right approach for a machine learning tool. `LabelEncoder` will interpret a categorical feature i.e. `[a,b,b,c]` as `[0,1,1,2]`. This will establish a wrong numeric relationship among the categorical variable. Therefore, using a `OneHotEncoder` is the right procedure to represent a categorical variable since it will create a new category per value.  

In [6]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import LabelBinarizer

### When should we use each preprocessing encoding functions?

- **LabelEncoder** - for labels(response variable) coding 1,2,3… [implies order]

- **OrdinalEncoder** - for features coding 1,2,3 … [implies order]

- **Label Binarizer** - for response variable, coding 0 & 1 [ creating multiple dummy columns]

- **OneHotEncoder** - for feature variables, coding 0 & 1 [ creating multiple dummy columns]

However, `LabelBinarizer` should ideally be used for `target` and `OneHotEncoder` should be used for `features`.

Therefore, `OneHotEncoder` will be applied to these columns: `gearbox, fuel_type, warranty, dealer, doors`

In [7]:
ohe = OneHotEncoder(categories='auto')
onehot_encoder = LabelBinarizer()

In [8]:
df.select_dtypes(exclude=["number"])

Unnamed: 0,title,brand,model,type,city,gearbox,color,fuel_type,warranty,dealer,chassis
0,Smart Fortwo Cabrio 52 Mhd Pure Aut.,SMART,FORTWO,sports,Alicante,Automatic,WHITE,Gasoline,YES,Professional,Convertible
1,Volkswagen Scirocco 2.0 Tsi,VOLKSWAGEN,SCIROCCO,sports,Barcelona,Manual,WHITE,Gasoline,YES,Professional,Coupe
2,Bmw Serie 2 218da Gran Coupé,BMW,SERIE 2,sports,Cantabria,Automatic,OTHER,Diesel,YES,Professional,Coupe
3,Bmw X4 Xdrive 20d,BMW,X4,sports,Girona,Automatic,WHITE,Diesel,YES,Professional,Coupe
4,Dodge Viper Srt-10,DODGE,VIPER,sports,Unknown,Automatic,RED,Gasoline,YES,Professional,Coupe
...,...,...,...,...,...,...,...,...,...,...,...
55321,Opel Corsa 1.3cdti Selective 75,OPEL,CORSA,other,Madrid,Manual,WHITE,Diesel,YES,Professional,Sedan
55322,Mercedes Clase V 250d Largo Avantgarde 7g Tronic,MERCEDES,CLASE V,other,Unknown,Automatic,BLACK,Diesel,YES,Professional,Minivan
55323,Jeep Cherokee 2.0d Longitude 4x2 103kw,JEEP,CHEROKEE,other,Ciudad Real,Manual,GREY,Diesel,YES,Professional,Offroad
55324,Bmw X3 Xdrive 30da,BMW,X3,other,Barcelona,Automatic,WHITE,Diesel,YES,Professional,Offroad


In [9]:
np.unique(df.type, return_counts=True)

(array(['big', 'familiar', 'medium', 'minivan', 'offroad', 'other',
        'small', 'sports', 'van'], dtype=object),
 array([9655, 3017, 9805, 8221, 9539, 1649, 2729, 5258, 5453]))

In [10]:
cat_columns = ['gearbox', 'fuel_type', 'warranty', 'dealer', 'doors']
n_col = len(df[cat_columns].columns)

feature_arr = ohe.fit_transform(df[cat_columns]).toarray()
feature_labels = ohe.categories_

In [11]:
# Using a dictionary to produce all the new OHE columns
feature_cols = []
for k, v in dict(zip(cat_columns, feature_labels)).items():
    for i in v:
        el = k + '_' + str(i)
        feature_cols.append(el)

ohe_features = pd.DataFrame(feature_arr, columns=feature_cols)

ohe_features

Unnamed: 0,gearbox_Automatic,gearbox_Direct,gearbox_Manual,fuel_type_Diesel,fuel_type_Electric,fuel_type_Gasoline,fuel_type_Hybrid,warranty_NO,warranty_YES,dealer_Individual,dealer_Professional,doors_2,doors_3,doors_4,doors_5
0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0
1,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0
2,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0
3,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0
4,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
55321,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0
55322,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0
55323,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0
55324,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0


In [12]:
data = pd.concat([df, ohe_features], axis=1)
data.head()

Unnamed: 0,title,brand,model,type,year,kms,city,gearbox,doors,seats,...,fuel_type_Gasoline,fuel_type_Hybrid,warranty_NO,warranty_YES,dealer_Individual,dealer_Professional,doors_2,doors_3,doors_4,doors_5
0,Smart Fortwo Cabrio 52 Mhd Pure Aut.,SMART,FORTWO,sports,2014,37125,Alicante,Automatic,2,2,...,1.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0
1,Volkswagen Scirocco 2.0 Tsi,VOLKSWAGEN,SCIROCCO,sports,2009,0,Barcelona,Manual,3,4,...,1.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0
2,Bmw Serie 2 218da Gran Coupé,BMW,SERIE 2,sports,2021,0,Cantabria,Automatic,4,5,...,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0
3,Bmw X4 Xdrive 20d,BMW,X4,sports,2017,85000,Girona,Automatic,5,5,...,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0
4,Dodge Viper Srt-10,DODGE,VIPER,sports,2005,95017,Unknown,Automatic,2,2,...,1.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0


In [13]:
df.type.value_counts().reset_index().sort_values('index')

Unnamed: 0,index,type
1,big,9655
6,familiar,3017
0,medium,9805
3,minivan,8221
2,offroad,9539
8,other,1649
7,small,2729
5,sports,5258
4,van,5453


Let's verify if the OneHotEncoder was correctly applied:

In [14]:
np.unique(df.fuel_type)

array(['Diesel', 'Electric', 'Gasoline', 'Hybrid'], dtype=object)

In [15]:
ex_o_cols = 'fuel_type_' + np.unique(df.fuel_type)

for col in ex_o_cols:
    print('N {} = {:.0f}'.format(col, data[col].sum()))

N fuel_type_Diesel = 32934
N fuel_type_Electric = 916
N fuel_type_Gasoline = 19040
N fuel_type_Hybrid = 2436


Therefore, we can drop now the `title, gearbox, fuel_type, warranty, dealer, doors` columns

In [16]:
data.drop(cat_columns + ['title'], axis=1, inplace=True)
data.head()

Unnamed: 0,brand,model,type,year,kms,city,seats,power,color,co2_emiss,...,fuel_type_Gasoline,fuel_type_Hybrid,warranty_NO,warranty_YES,dealer_Individual,dealer_Professional,doors_2,doors_3,doors_4,doors_5
0,SMART,FORTWO,sports,2014,37125,Alicante,2,71,WHITE,99,...,1.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0
1,VOLKSWAGEN,SCIROCCO,sports,2009,0,Barcelona,4,200,WHITE,179,...,1.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0
2,BMW,SERIE 2,sports,2021,0,Cantabria,5,150,OTHER,109,...,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0
3,BMW,X4,sports,2017,85000,Girona,5,190,WHITE,142,...,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0
4,DODGE,VIPER,sports,2005,95017,Unknown,2,506,RED,488,...,1.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0


---

### Applying a more scalable Encoder to categorical variables with high cardinality

`Bayesian Target Encoding` technique is an improvement over the standard `Target Encoding`, because it is trying to extract information from intra-category distribution of the target variable, while Target Encoding ignores it. The method has been proven to be really helpful on data with high cardinality categorical variables. 

`Sampling Bayesian Encoder` provides a fresh look at the Bayesian Target Encoding and opens for more opportunities of research and engineering in the area of encoding of categorical features. Instead of using the moments of the posterior distribution as the new features, it suggests to sample the posterior distribution instead, and use the sampled values as the new features. The idea is based on the observation that target encoding in general, and the Bayesian target encoding in particular, can be represented as a hierarchical model that uses weak learners to discover new features. This technique is used in **ensemble models**, such as `Random Forest` and `Gradient Boosted Trees`.

`Target Encoding` will be applied to high cardinality categories: `brand, model, city, color, type, chassis`

In [17]:
from category_encoders import TargetEncoder

In [18]:
data.select_dtypes(exclude=["number"])

Unnamed: 0,brand,model,type,city,color,chassis
0,SMART,FORTWO,sports,Alicante,WHITE,Convertible
1,VOLKSWAGEN,SCIROCCO,sports,Barcelona,WHITE,Coupe
2,BMW,SERIE 2,sports,Cantabria,OTHER,Coupe
3,BMW,X4,sports,Girona,WHITE,Coupe
4,DODGE,VIPER,sports,Unknown,RED,Coupe
...,...,...,...,...,...,...
55321,OPEL,CORSA,other,Madrid,WHITE,Sedan
55322,MERCEDES,CLASE V,other,Unknown,BLACK,Minivan
55323,JEEP,CHEROKEE,other,Ciudad Real,GREY,Offroad
55324,BMW,X3,other,Barcelona,WHITE,Offroad


Applying Target Encoding using the detailed explanation of this [Medium Article](https://medium.com/analytics-vidhya/target-encoding-vs-one-hot-encoding-with-simple-examples-276a7e7b3e64) and the approach of this [Blog post](https://brendanhasz.github.io/2019/03/04/target-encoding)

In [19]:
encoder = TargetEncoder()

In [20]:
cols_to_encode = ['brand', 'model', 'city', 'color', 'type', 'chassis']
cols_encoded = list(map(lambda c: c + '_encoded', cols_to_encode))

data[cols_encoded] = encoder.fit_transform(data[cols_to_encode], data.price)

  elif pd.api.types.is_categorical(cols):


In [21]:
data

Unnamed: 0,brand,model,type,year,kms,city,seats,power,color,co2_emiss,...,doors_2,doors_3,doors_4,doors_5,brand_encoded,model_encoded,city_encoded,color_encoded,type_encoded,chassis_encoded
0,SMART,FORTWO,sports,2014,37125,Alicante,2,71,WHITE,99,...,1.0,0.0,0.0,0.0,11039.980916,10171.125628,15841.032810,17003.852965,28170.085203,20696.065144
1,VOLKSWAGEN,SCIROCCO,sports,2009,0,Barcelona,4,200,WHITE,179,...,0.0,1.0,0.0,0.0,17058.096993,13160.993789,18052.176748,17003.852965,28170.085203,31261.850521
2,BMW,SERIE 2,sports,2021,0,Cantabria,5,150,OTHER,109,...,0.0,0.0,1.0,0.0,23593.497861,23508.383973,18554.493671,21242.844857,28170.085203,31261.850521
3,BMW,X4,sports,2017,85000,Girona,5,190,WHITE,142,...,0.0,0.0,0.0,1.0,23593.497861,36412.589552,16569.441935,17003.852965,28170.085203,31261.850521
4,DODGE,VIPER,sports,2005,95017,Unknown,2,506,RED,488,...,1.0,0.0,0.0,0.0,12506.369072,17783.891805,17303.956434,17436.456262,28170.085203,31261.850521
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
55321,OPEL,CORSA,other,2016,60000,Madrid,5,75,WHITE,100,...,0.0,0.0,0.0,1.0,12850.655659,11980.345216,18965.842225,17003.852965,24125.953305,15131.850727
55322,MERCEDES,CLASE V,other,2016,70261,Unknown,6,190,BLACK,158,...,0.0,0.0,0.0,1.0,26831.477053,40573.610879,17303.956434,19080.509292,24125.953305,14177.381352
55323,JEEP,CHEROKEE,other,2015,182069,Ciudad Real,5,140,GREY,139,...,0.0,0.0,0.0,1.0,25865.686411,28652.349939,15706.924710,17975.461741,24125.953305,23421.348833
55324,BMW,X3,other,2011,159000,Barcelona,5,258,WHITE,149,...,0.0,0.0,0.0,1.0,23593.497861,29654.348315,18052.176748,17003.852965,24125.953305,23421.348833


After creating the new encoded features, it is possible to see that the target encoding was done based on the price:

In [22]:
data[cols_encoded]

Unnamed: 0,brand_encoded,model_encoded,city_encoded,color_encoded,type_encoded,chassis_encoded
0,11039.980916,10171.125628,15841.032810,17003.852965,28170.085203,20696.065144
1,17058.096993,13160.993789,18052.176748,17003.852965,28170.085203,31261.850521
2,23593.497861,23508.383973,18554.493671,21242.844857,28170.085203,31261.850521
3,23593.497861,36412.589552,16569.441935,17003.852965,28170.085203,31261.850521
4,12506.369072,17783.891805,17303.956434,17436.456262,28170.085203,31261.850521
...,...,...,...,...,...,...
55321,12850.655659,11980.345216,18965.842225,17003.852965,24125.953305,15131.850727
55322,26831.477053,40573.610879,17303.956434,19080.509292,24125.953305,14177.381352
55323,25865.686411,28652.349939,15706.924710,17975.461741,24125.953305,23421.348833
55324,23593.497861,29654.348315,18052.176748,17003.852965,24125.953305,23421.348833


In [23]:
data[['brand', 'brand_encoded', 'price']]

Unnamed: 0,brand,brand_encoded,price
0,SMART,11039.980916,5500
1,VOLKSWAGEN,17058.096993,10900
2,BMW,23593.497861,36100
3,BMW,23593.497861,28000
4,DODGE,12506.369072,27990
...,...,...,...
55321,OPEL,12850.655659,7500
55322,MERCEDES,26831.477053,48500
55323,JEEP,25865.686411,15999
55324,BMW,23593.497861,18300


---

### Pandas get_dummies method

It is needed to build a module to deal with all the categorical columns and transform them into numerical.

Therefore, to apply these changes to all the columns, `pd.get_dummies` can be also used. However, for this case, I will continue with the `Encoded Features` since it is a more stablished and professional method to obtain numerical variables from categorical features.

In [24]:
df.select_dtypes(include=["object"]).drop(['title'], axis=1).columns

Index(['brand', 'model', 'type', 'city', 'gearbox', 'color', 'fuel_type',
       'warranty', 'dealer', 'chassis'],
      dtype='object')

In [25]:
cat_cols = df.select_dtypes(include=["object"]).drop(['title'], axis=1).columns
df_dummy = pd.get_dummies(df, columns=cat_cols)
df_dummy.head()

Unnamed: 0,title,year,kms,doors,seats,power,co2_emiss,height,length,width,...,chassis_Convertible,chassis_Coupe,chassis_Minivan,chassis_Offroad,chassis_Pickup,chassis_Roadster,chassis_Sedan,chassis_Stationwagon,chassis_Targa,chassis_Van
0,Smart Fortwo Cabrio 52 Mhd Pure Aut.,2014,37125,2,2,71,99,157,270,156,...,1,0,0,0,0,0,0,0,0,0
1,Volkswagen Scirocco 2.0 Tsi,2009,0,3,4,200,179,140,426,181,...,0,1,0,0,0,0,0,0,0,0
2,Bmw Serie 2 218da Gran Coupé,2021,0,4,5,150,109,142,453,180,...,0,1,0,0,0,0,0,0,0,0
3,Bmw X4 Xdrive 20d,2017,85000,5,5,190,142,162,467,188,...,0,1,0,0,0,0,0,0,0,0
4,Dodge Viper Srt-10,2005,95017,2,2,506,488,123,446,191,...,0,1,0,0,0,0,0,0,0,0


In [26]:
print("Initial number of features:", df.shape[1])
print("Final number of features:", df_dummy.shape[1])

Initial number of features: 27
Final number of features: 872


The DataFrame was transformed to get `865 features` from the initial **21 features**

In [27]:
data.head()

Unnamed: 0,brand,model,type,year,kms,city,seats,power,color,co2_emiss,...,doors_2,doors_3,doors_4,doors_5,brand_encoded,model_encoded,city_encoded,color_encoded,type_encoded,chassis_encoded
0,SMART,FORTWO,sports,2014,37125,Alicante,2,71,WHITE,99,...,1.0,0.0,0.0,0.0,11039.980916,10171.125628,15841.03281,17003.852965,28170.085203,20696.065144
1,VOLKSWAGEN,SCIROCCO,sports,2009,0,Barcelona,4,200,WHITE,179,...,0.0,1.0,0.0,0.0,17058.096993,13160.993789,18052.176748,17003.852965,28170.085203,31261.850521
2,BMW,SERIE 2,sports,2021,0,Cantabria,5,150,OTHER,109,...,0.0,0.0,1.0,0.0,23593.497861,23508.383973,18554.493671,21242.844857,28170.085203,31261.850521
3,BMW,X4,sports,2017,85000,Girona,5,190,WHITE,142,...,0.0,0.0,0.0,1.0,23593.497861,36412.589552,16569.441935,17003.852965,28170.085203,31261.850521
4,DODGE,VIPER,sports,2005,95017,Unknown,2,506,RED,488,...,1.0,0.0,0.0,0.0,12506.369072,17783.891805,17303.956434,17436.456262,28170.085203,31261.850521


In [28]:
data.columns

Index(['brand', 'model', 'type', 'year', 'kms', 'city', 'seats', 'power',
       'color', 'co2_emiss', 'chassis', 'height', 'length', 'width',
       'trunk_vol', 'max_speed', 'mixed_cons', 'weight', 'tank_vol', 'acc',
       'price', 'gearbox_Automatic', 'gearbox_Direct', 'gearbox_Manual',
       'fuel_type_Diesel', 'fuel_type_Electric', 'fuel_type_Gasoline',
       'fuel_type_Hybrid', 'warranty_NO', 'warranty_YES', 'dealer_Individual',
       'dealer_Professional', 'doors_2', 'doors_3', 'doors_4', 'doors_5',
       'brand_encoded', 'model_encoded', 'city_encoded', 'color_encoded',
       'type_encoded', 'chassis_encoded'],
      dtype='object')

Dropping the remaining categorical variables

In [29]:
data.drop(cols_to_encode, axis=1, inplace=True)
data.head()

Unnamed: 0,year,kms,seats,power,co2_emiss,height,length,width,trunk_vol,max_speed,...,doors_2,doors_3,doors_4,doors_5,brand_encoded,model_encoded,city_encoded,color_encoded,type_encoded,chassis_encoded
0,2014,37125,2,71,99,157,270,156,203,145,...,1.0,0.0,0.0,0.0,11039.980916,10171.125628,15841.03281,17003.852965,28170.085203,20696.065144
1,2009,0,4,200,179,140,426,181,292,235,...,0.0,1.0,0.0,0.0,17058.096993,13160.993789,18052.176748,17003.852965,28170.085203,31261.850521
2,2021,0,5,150,109,142,453,180,430,222,...,0.0,0.0,1.0,0.0,23593.497861,23508.383973,18554.493671,21242.844857,28170.085203,31261.850521
3,2017,85000,5,190,142,162,467,188,500,212,...,0.0,0.0,0.0,1.0,23593.497861,36412.589552,16569.441935,17003.852965,28170.085203,31261.850521
4,2005,95017,2,506,488,123,446,191,344,189,...,1.0,0.0,0.0,0.0,12506.369072,17783.891805,17303.956434,17436.456262,28170.085203,31261.850521


In [30]:
data.dtypes

year                     int64
kms                      int64
seats                    int64
power                    int64
co2_emiss                int64
height                   int64
length                   int64
width                    int64
trunk_vol                int64
max_speed                int64
mixed_cons             float64
weight                   int64
tank_vol                 int64
acc                    float64
price                    int64
gearbox_Automatic      float64
gearbox_Direct         float64
gearbox_Manual         float64
fuel_type_Diesel       float64
fuel_type_Electric     float64
fuel_type_Gasoline     float64
fuel_type_Hybrid       float64
warranty_NO            float64
warranty_YES           float64
dealer_Individual      float64
dealer_Professional    float64
doors_2                float64
doors_3                float64
doors_4                float64
doors_5                float64
brand_encoded          float64
model_encoded          float64
city_enc

All the features are now set to numerical!

Now the dataset is fully ready to apply a Machine Learning Regression Algorithm.

In [31]:
data.shape

(55326, 36)

In [32]:
output = '../data/regression/cars_reg.csv'
data.to_csv(output, index=False)

Output generated to be used for the regression algorithms.

---

## Exploring Numerical features

On the other hand, there are more numerical columns that will be included in the regression model

In [33]:
df.select_dtypes(exclude=["object"]).columns.value_counts().reset_index().drop(0, axis=1)

Unnamed: 0,index
0,width
1,year
2,height
3,seats
4,co2_emiss
5,mixed_cons
6,weight
7,price
8,tank_vol
9,doors


In [34]:
df.select_dtypes(exclude=["object"])

Unnamed: 0,year,kms,doors,seats,power,co2_emiss,height,length,width,trunk_vol,max_speed,mixed_cons,weight,tank_vol,acc,price
0,2014,37125,2,2,71,99,157,270,156,203,145,4.3,780,33,13.7,5500
1,2009,0,3,4,200,179,140,426,181,292,235,7.6,1373,55,7.2,10900
2,2021,0,4,5,150,109,142,453,180,430,222,4.2,1545,42,8.5,36100
3,2017,85000,5,5,190,142,162,467,188,500,212,5.4,1740,67,8.0,28000
4,2005,95017,2,2,506,488,123,446,191,344,189,21.0,1546,70,3.9,27990
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
55321,2016,60000,5,5,75,100,148,402,175,285,164,3.8,1237,45,14.8,7500
55322,2016,70261,5,6,190,158,188,514,193,1030,206,6.0,2145,67,9.1,48500
55323,2015,182069,5,5,140,139,163,462,186,412,187,5.3,1828,60,10.9,15999
55324,2011,159000,5,5,258,149,166,465,188,550,230,6.0,1800,67,6.2,18300


In [35]:
df.isna().sum()

title         0
brand         0
model         0
type          0
year          0
kms           0
city          0
gearbox       0
doors         0
seats         0
power         0
color         0
co2_emiss     0
fuel_type     0
warranty      0
dealer        0
chassis       0
height        0
length        0
width         0
trunk_vol     0
max_speed     0
mixed_cons    0
weight        0
tank_vol      0
acc           0
price         0
dtype: int64