# Preprocessing the model: Applying Feature Engineering to the Dataset

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

## Loading the data

In [2]:
root = '../data/cleaned_cars.csv'

df = pd.read_csv(root)
df.head()

Unnamed: 0,title,brand,model,type,city,gearbox,doors,color,co2_emiss,fuel_type,...,dealer,chassis,height,length,max_speed,mixed_cons,weight,tank_vol,acc,price
0,Smart Fortwo Cabrio 52 Mhd Pure Aut.,SMART,FORTWO,sports,Alicante,Automatic,2,WHITE,99,Gasoline,...,Professional,Convertible,157,270,145,4.3,780,33,13.7,5500
1,Volkswagen Scirocco 2.0 Tsi,VOLKSWAGEN,SCIROCCO,sports,Barcelona,Manual,3,WHITE,179,Gasoline,...,Professional,Coupe,140,426,235,7.6,1373,55,7.2,10900
2,Bmw Serie 2 218da Gran Coupé,BMW,SERIE 2,sports,Cantabria,Automatic,4,OTHER,109,Diesel,...,Professional,Coupe,142,453,222,4.2,1545,42,8.5,36100
3,Bmw X4 Xdrive 20d,BMW,X4,sports,Girona,Automatic,5,WHITE,142,Diesel,...,Professional,Coupe,162,467,212,5.4,1740,67,8.0,28000
4,Dodge Viper Srt-10,DODGE,VIPER,sports,Unknown,Automatic,2,RED,488,Gasoline,...,Professional,Coupe,123,446,189,21.0,1546,70,3.9,27990


## Identifying categorical features

In [3]:
df.select_dtypes(exclude=["number"]).columns.value_counts().reset_index().drop(0, axis=1)

Unnamed: 0,index
0,dealer
1,city
2,type
3,title
4,gearbox
5,warranty
6,color
7,model
8,fuel_type
9,brand


In [4]:
df.select_dtypes(exclude=["number"])

Unnamed: 0,title,brand,model,type,city,gearbox,color,fuel_type,warranty,dealer,chassis
0,Smart Fortwo Cabrio 52 Mhd Pure Aut.,SMART,FORTWO,sports,Alicante,Automatic,WHITE,Gasoline,YES,Professional,Convertible
1,Volkswagen Scirocco 2.0 Tsi,VOLKSWAGEN,SCIROCCO,sports,Barcelona,Manual,WHITE,Gasoline,YES,Professional,Coupe
2,Bmw Serie 2 218da Gran Coupé,BMW,SERIE 2,sports,Cantabria,Automatic,OTHER,Diesel,YES,Professional,Coupe
3,Bmw X4 Xdrive 20d,BMW,X4,sports,Girona,Automatic,WHITE,Diesel,YES,Professional,Coupe
4,Dodge Viper Srt-10,DODGE,VIPER,sports,Unknown,Automatic,RED,Gasoline,YES,Professional,Coupe
...,...,...,...,...,...,...,...,...,...,...,...
55717,Opel Corsa 1.3cdti Selective 75,OPEL,CORSA,other,Madrid,Manual,WHITE,Diesel,YES,Professional,Sedan
55718,Mercedes Clase V 250d Largo Avantgarde 7g Tronic,MERCEDES,CLASE V,other,Unknown,Automatic,BLACK,Diesel,YES,Professional,Minivan
55719,Jeep Cherokee 2.0d Longitude 4x2 103kw,JEEP,CHEROKEE,other,Ciudad Real,Manual,GREY,Diesel,YES,Professional,Offroad
55720,Bmw X3 Xdrive 30da,BMW,X3,other,Barcelona,Automatic,WHITE,Diesel,YES,Professional,Offroad


It is needed to convert these categorical variables to numbers using a OneHotEncoder. However, title will be the exception since it is not valuable to include long text in a regression model.

In [5]:
cat_cols = df.select_dtypes(exclude=["number"]).columns.drop('title')

## Transforming Categorical into Numerical: One Hot Encoding

Using a dummifyer (`pd.get_dummies`) to transform categorical values into numerical is not the right approach for a machine learning tool. `LabelEncoder` will interpret a categorical feature i.e. `[a,b,b,c]` as `[0,1,1,2]`. This will establish a wrong numeric relationship among the categorical variable. Therefore, using a `OneHotEncoder` is the right procedure to represent a categorical variable since it will create a new category per value.  

In [6]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import LabelBinarizer

### When should we use each preprocessing encoding functions?

- **LabelEncoder** - for labels(response variable) coding 1,2,3… [implies order]

- **OrdinalEncoder** - for features coding 1,2,3 … [implies order]

- **Label Binarizer** - for response variable, coding 0 & 1 [ creating multiple dummy columns]

- **OneHotEncoder** - for feature variables, coding 0 & 1 [ creating multiple dummy columns]

However, `LabelBinarizer` should ideally be used for `target` and `OneHotEncoder` should be used for `features`.

Therefore, `OneHotEncoder` will be applied to these columns: `type, gearbox, color, fuel_type, warranty, dealer, chassis`

In [7]:
ohe = OneHotEncoder(categories='auto')
onehot_encoder = LabelBinarizer()

In [8]:
df.select_dtypes(exclude=["number"])

Unnamed: 0,title,brand,model,type,city,gearbox,color,fuel_type,warranty,dealer,chassis
0,Smart Fortwo Cabrio 52 Mhd Pure Aut.,SMART,FORTWO,sports,Alicante,Automatic,WHITE,Gasoline,YES,Professional,Convertible
1,Volkswagen Scirocco 2.0 Tsi,VOLKSWAGEN,SCIROCCO,sports,Barcelona,Manual,WHITE,Gasoline,YES,Professional,Coupe
2,Bmw Serie 2 218da Gran Coupé,BMW,SERIE 2,sports,Cantabria,Automatic,OTHER,Diesel,YES,Professional,Coupe
3,Bmw X4 Xdrive 20d,BMW,X4,sports,Girona,Automatic,WHITE,Diesel,YES,Professional,Coupe
4,Dodge Viper Srt-10,DODGE,VIPER,sports,Unknown,Automatic,RED,Gasoline,YES,Professional,Coupe
...,...,...,...,...,...,...,...,...,...,...,...
55717,Opel Corsa 1.3cdti Selective 75,OPEL,CORSA,other,Madrid,Manual,WHITE,Diesel,YES,Professional,Sedan
55718,Mercedes Clase V 250d Largo Avantgarde 7g Tronic,MERCEDES,CLASE V,other,Unknown,Automatic,BLACK,Diesel,YES,Professional,Minivan
55719,Jeep Cherokee 2.0d Longitude 4x2 103kw,JEEP,CHEROKEE,other,Ciudad Real,Manual,GREY,Diesel,YES,Professional,Offroad
55720,Bmw X3 Xdrive 30da,BMW,X3,other,Barcelona,Automatic,WHITE,Diesel,YES,Professional,Offroad


In [9]:
np.unique(df.type, return_counts=True)

(array(['big', 'familiar', 'medium', 'minivan', 'offroad', 'other',
        'small', 'sports', 'van'], dtype=object),
 array([9736, 3028, 9807, 8228, 9551, 1659, 2729, 5524, 5460]))

In [10]:
cat_columns = ['type', 'gearbox', 'color', 'fuel_type', 'warranty', 'dealer', 'chassis']
n_col = len(df[cat_columns].columns)

feature_arr = ohe.fit_transform(df[cat_columns]).toarray()
feature_labels = ohe.categories_

In [11]:
label_gen = [feature_labels[i] for i in range(0, n_col)] # Using a list_comprehension to hold columns' labels

feature_cols = np.concatenate(label_gen).tolist()
ohe_features = pd.DataFrame(feature_arr, columns=feature_cols)

ohe_features

Unnamed: 0,big,familiar,medium,minivan,offroad,other,small,sports,van,Automatic,...,Convertible,Coupe,Minivan,Offroad,Pickup,Roadster,Sedan,Stationwagon,Targa,Van
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
55717,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
55718,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
55719,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
55720,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0


In [12]:
data = pd.concat([df, ohe_features], axis=1)
data.head()

Unnamed: 0,title,brand,model,type,city,gearbox,doors,color,co2_emiss,fuel_type,...,Convertible,Coupe,Minivan,Offroad,Pickup,Roadster,Sedan,Stationwagon,Targa,Van
0,Smart Fortwo Cabrio 52 Mhd Pure Aut.,SMART,FORTWO,sports,Alicante,Automatic,2,WHITE,99,Gasoline,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,Volkswagen Scirocco 2.0 Tsi,VOLKSWAGEN,SCIROCCO,sports,Barcelona,Manual,3,WHITE,179,Gasoline,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,Bmw Serie 2 218da Gran Coupé,BMW,SERIE 2,sports,Cantabria,Automatic,4,OTHER,109,Diesel,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,Bmw X4 Xdrive 20d,BMW,X4,sports,Girona,Automatic,5,WHITE,142,Diesel,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,Dodge Viper Srt-10,DODGE,VIPER,sports,Unknown,Automatic,2,RED,488,Gasoline,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [13]:
df.type.value_counts().reset_index().sort_values('index')

Unnamed: 0,index,type
1,big,9736
6,familiar,3028
0,medium,9807
3,minivan,8228
2,offroad,9551
8,other,1659
7,small,2729
4,sports,5524
5,van,5460


Let's verify if the OneHotEncoder was correctly applied:

In [14]:
for col in np.unique(df.type):
    print('N {} = {:.0f}'.format(col, data[col].sum()))

N big = 9736
N familiar = 3028
N medium = 9807
N minivan = 8228
N offroad = 9551
N other = 1659
N small = 2729
N sports = 5524
N van = 5460


Therefore, we can drop now the `title, type, gearbox, color, fuel_type, warranty, dealer, chassis` columns

In [15]:
data.drop(cat_columns + ['title'], axis=1, inplace=True)
data.head()

Unnamed: 0,brand,model,city,doors,co2_emiss,height,length,max_speed,mixed_cons,weight,...,Convertible,Coupe,Minivan,Offroad,Pickup,Roadster,Sedan,Stationwagon,Targa,Van
0,SMART,FORTWO,Alicante,2,99,157,270,145,4.3,780,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,VOLKSWAGEN,SCIROCCO,Barcelona,3,179,140,426,235,7.6,1373,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,BMW,SERIE 2,Cantabria,4,109,142,453,222,4.2,1545,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,BMW,X4,Girona,5,142,162,467,212,5.4,1740,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,DODGE,VIPER,Unknown,2,488,123,446,189,21.0,1546,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


---

### Applying a more scalable Encoder to categorical variables with high cardinality

`Bayesian Target Encoding` technique is an improvement over the standard `Target Encoding`, because it is trying to extract information from intra-category distribution of the target variable, while Target Encoding ignores it. The method has been proven to be really helpful on data with high cardinality categorical variables. 

`Sampling Bayesian Encoder` provides a fresh look at the Bayesian Target Encoding and opens for more opportunities of research and engineering in the area of encoding of categorical features. Instead of using the moments of the posterior distribution as the new features, it suggests to sample the posterior distribution instead, and use the sampled values as the new features. The idea is based on the observation that target encoding in general, and the Bayesian target encoding in particular, can be represented as a hierarchical model that uses weak learners to discover new features. This technique is used in **ensemble models**, such as `Random Forest` and `Gradient Boosted Trees`.

`Target Encoding` will be applied to high cardinality categories: `brand, model, city`

In [16]:
from category_encoders import TargetEncoder

In [17]:
data.select_dtypes(exclude=["number"])

Unnamed: 0,brand,model,city
0,SMART,FORTWO,Alicante
1,VOLKSWAGEN,SCIROCCO,Barcelona
2,BMW,SERIE 2,Cantabria
3,BMW,X4,Girona
4,DODGE,VIPER,Unknown
...,...,...,...
55717,OPEL,CORSA,Madrid
55718,MERCEDES,CLASE V,Unknown
55719,JEEP,CHEROKEE,Ciudad Real
55720,BMW,X3,Barcelona


Applying Target Encoding using the detailed explanation of this [Medium Article](https://medium.com/analytics-vidhya/target-encoding-vs-one-hot-encoding-with-simple-examples-276a7e7b3e64) and the approach of this [Blog post](https://brendanhasz.github.io/2019/03/04/target-encoding)

In [18]:
encoder = TargetEncoder()

In [19]:
data[['brand_encoded', 'model_encoded', 'city_encoded']] = encoder.fit_transform(data[['brand', 'model', 'city']], data.price)

  elif pd.api.types.is_categorical(cols):


In [20]:
data

Unnamed: 0,brand,model,city,doors,co2_emiss,height,length,max_speed,mixed_cons,weight,...,Offroad,Pickup,Roadster,Sedan,Stationwagon,Targa,Van,brand_encoded,model_encoded,city_encoded
0,SMART,FORTWO,Alicante,2,99,157,270,145,4.3,780,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,11039.980916,10171.125628,16031.407829
1,VOLKSWAGEN,SCIROCCO,Barcelona,3,179,140,426,235,7.6,1373,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,17051.258362,13160.993789,20891.036546
2,BMW,SERIE 2,Cantabria,4,109,142,453,222,4.2,1545,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,23999.831028,23652.370000,18554.493671
3,BMW,X4,Girona,5,142,162,467,212,5.4,1740,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,23999.831028,36412.589552,20270.100471
4,DODGE,VIPER,Unknown,2,488,123,446,189,21.0,1546,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,12506.371324,18780.349934,17739.676186
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
55717,OPEL,CORSA,Madrid,5,100,148,402,164,3.8,1237,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,12845.662341,11980.345216,20536.291634
55718,MERCEDES,CLASE V,Unknown,5,158,188,514,206,6.0,2145,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,27181.752961,40412.887500,17739.676186
55719,JEEP,CHEROKEE,Ciudad Real,5,139,163,462,187,5.3,1828,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,25865.686411,28652.349945,15706.924710
55720,BMW,X3,Barcelona,5,149,166,465,230,6.0,1800,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,23999.831028,30510.911111,20891.036546


After creating the new encoded features, it is possible to see that the target encoding was done based on the price:

In [21]:
data[['brand_encoded', 'model_encoded', 'city_encoded']]

Unnamed: 0,brand_encoded,model_encoded,city_encoded
0,11039.980916,10171.125628,16031.407829
1,17051.258362,13160.993789,20891.036546
2,23999.831028,23652.370000,18554.493671
3,23999.831028,36412.589552,20270.100471
4,12506.371324,18780.349934,17739.676186
...,...,...,...
55717,12845.662341,11980.345216,20536.291634
55718,27181.752961,40412.887500,17739.676186
55719,25865.686411,28652.349945,15706.924710
55720,23999.831028,30510.911111,20891.036546


In [22]:
data[['brand', 'brand_encoded', 'price']]

Unnamed: 0,brand,brand_encoded,price
0,SMART,11039.980916,5500
1,VOLKSWAGEN,17051.258362,10900
2,BMW,23999.831028,36100
3,BMW,23999.831028,28000
4,DODGE,12506.371324,27990
...,...,...,...
55717,OPEL,12845.662341,7500
55718,MERCEDES,27181.752961,48500
55719,JEEP,25865.686411,15999
55720,BMW,23999.831028,18300


---

### Pandas get_dummies method

It is needed to build a module to deal with all the categorical columns and transform them into numerical.

Therefore, to apply these changes to all the columns, `pd.get_dummies` can be also used. However, for this case, I will continue with the `Encoded Features` since it is a more stablished and professional method to obtain numerical variables from categorical features.

In [23]:
df.select_dtypes(include=["object"]).drop(['title'], axis=1).columns

Index(['brand', 'model', 'type', 'city', 'gearbox', 'color', 'fuel_type',
       'warranty', 'dealer', 'chassis'],
      dtype='object')

In [24]:
cat_cols = df.select_dtypes(include=["object"]).drop(['title'], axis=1).columns
df_dummy = pd.get_dummies(df, columns=cat_cols)
df_dummy.head()

Unnamed: 0,title,doors,co2_emiss,height,length,max_speed,mixed_cons,weight,tank_vol,acc,...,chassis_Convertible,chassis_Coupe,chassis_Minivan,chassis_Offroad,chassis_Pickup,chassis_Roadster,chassis_Sedan,chassis_Stationwagon,chassis_Targa,chassis_Van
0,Smart Fortwo Cabrio 52 Mhd Pure Aut.,2,99,157,270,145,4.3,780,33,13.7,...,1,0,0,0,0,0,0,0,0,0
1,Volkswagen Scirocco 2.0 Tsi,3,179,140,426,235,7.6,1373,55,7.2,...,0,1,0,0,0,0,0,0,0,0
2,Bmw Serie 2 218da Gran Coupé,4,109,142,453,222,4.2,1545,42,8.5,...,0,1,0,0,0,0,0,0,0,0
3,Bmw X4 Xdrive 20d,5,142,162,467,212,5.4,1740,67,8.0,...,0,1,0,0,0,0,0,0,0,0
4,Dodge Viper Srt-10,2,488,123,446,189,21.0,1546,70,3.9,...,0,1,0,0,0,0,0,0,0,0


In [25]:
print("Initial number of features:", df.shape[1])
print("Final number of features:", df_dummy.shape[1])

Initial number of features: 21
Final number of features: 888


The DataFrame was transformed to get `888 features` from the initial **21 features**

In [26]:
data.head()

Unnamed: 0,brand,model,city,doors,co2_emiss,height,length,max_speed,mixed_cons,weight,...,Offroad,Pickup,Roadster,Sedan,Stationwagon,Targa,Van,brand_encoded,model_encoded,city_encoded
0,SMART,FORTWO,Alicante,2,99,157,270,145,4.3,780,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,11039.980916,10171.125628,16031.407829
1,VOLKSWAGEN,SCIROCCO,Barcelona,3,179,140,426,235,7.6,1373,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,17051.258362,13160.993789,20891.036546
2,BMW,SERIE 2,Cantabria,4,109,142,453,222,4.2,1545,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,23999.831028,23652.37,18554.493671
3,BMW,X4,Girona,5,142,162,467,212,5.4,1740,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,23999.831028,36412.589552,20270.100471
4,DODGE,VIPER,Unknown,2,488,123,446,189,21.0,1546,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,12506.371324,18780.349934,17739.676186


In [27]:
data.columns

Index(['brand', 'model', 'city', 'doors', 'co2_emiss', 'height', 'length',
       'max_speed', 'mixed_cons', 'weight', 'tank_vol', 'acc', 'price', 'big',
       'familiar', 'medium', 'minivan', 'offroad', 'other', 'small', 'sports',
       'van', 'Automatic', 'Direct', 'Manual', 'BEIGE', 'BLACK', 'BLUE',
       'BRONZE', 'BROWN', 'GREEN', 'GREY', 'ORANGE', 'OTHER', 'PINK', 'PURPLE',
       'RED', 'SILVER', 'WHITE', 'YELLOW', 'Diesel', 'Electric', 'Gasoline',
       'Hybrid', 'NO', 'YES', 'Individual', 'Professional', 'Bus', 'Combi',
       'Convertible', 'Coupe', 'Minivan', 'Offroad', 'Pickup', 'Roadster',
       'Sedan', 'Stationwagon', 'Targa', 'Van', 'brand_encoded',
       'model_encoded', 'city_encoded'],
      dtype='object')

Dropping the remaining categorical variables

In [28]:
data.drop(['brand', 'model', 'city'], axis=1, inplace=True)
data.head()

Unnamed: 0,doors,co2_emiss,height,length,max_speed,mixed_cons,weight,tank_vol,acc,price,...,Offroad,Pickup,Roadster,Sedan,Stationwagon,Targa,Van,brand_encoded,model_encoded,city_encoded
0,2,99,157,270,145,4.3,780,33,13.7,5500,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,11039.980916,10171.125628,16031.407829
1,3,179,140,426,235,7.6,1373,55,7.2,10900,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,17051.258362,13160.993789,20891.036546
2,4,109,142,453,222,4.2,1545,42,8.5,36100,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,23999.831028,23652.37,18554.493671
3,5,142,162,467,212,5.4,1740,67,8.0,28000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,23999.831028,36412.589552,20270.100471
4,2,488,123,446,189,21.0,1546,70,3.9,27990,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,12506.371324,18780.349934,17739.676186


In [29]:
data.dtypes

doors              int64
co2_emiss          int64
height             int64
length             int64
max_speed          int64
mixed_cons       float64
weight             int64
tank_vol           int64
acc              float64
price              int64
big              float64
familiar         float64
medium           float64
minivan          float64
offroad          float64
other            float64
small            float64
sports           float64
van              float64
Automatic        float64
Direct           float64
Manual           float64
BEIGE            float64
BLACK            float64
BLUE             float64
BRONZE           float64
BROWN            float64
GREEN            float64
GREY             float64
ORANGE           float64
OTHER            float64
PINK             float64
PURPLE           float64
RED              float64
SILVER           float64
WHITE            float64
YELLOW           float64
Diesel           float64
Electric         float64
Gasoline         float64


All the features are now set to numerical!

Now the dataset is fully ready to apply a Machine Learning Regression Algorithm.

In [30]:
data.shape

(55722, 60)

In [31]:
output = '../data/cars_reg.csv'
data.to_csv(output, index=False)

Output generated to be used for the regression algorithms.

---

## Exploring Numerical features

On the other hand, there are more numerical columns that will be included in the regression model

In [32]:
df.select_dtypes(exclude=["object"]).columns.value_counts().reset_index().drop(0, axis=1)

Unnamed: 0,index
0,doors
1,co2_emiss
2,acc
3,height
4,price
5,mixed_cons
6,weight
7,length
8,max_speed
9,tank_vol


In [33]:
df.select_dtypes(exclude=["object"])

Unnamed: 0,doors,co2_emiss,height,length,max_speed,mixed_cons,weight,tank_vol,acc,price
0,2,99,157,270,145,4.3,780,33,13.7,5500
1,3,179,140,426,235,7.6,1373,55,7.2,10900
2,4,109,142,453,222,4.2,1545,42,8.5,36100
3,5,142,162,467,212,5.4,1740,67,8.0,28000
4,2,488,123,446,189,21.0,1546,70,3.9,27990
...,...,...,...,...,...,...,...,...,...,...
55717,5,100,148,402,164,3.8,1237,45,14.8,7500
55718,5,158,188,514,206,6.0,2145,67,9.1,48500
55719,5,139,163,462,187,5.3,1828,60,10.9,15999
55720,5,149,166,465,230,6.0,1800,67,6.2,18300
