# Preprocessing the model: Applying Feature Engineering to the Dataset

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

## Loading the data

In [2]:
root = '../data/merged_cars.csv'

df = pd.read_csv(root)
df.head()

Unnamed: 0,title,brand,model,type,year,kms,city,gearbox,doors,seats,...,width,trunk_vol,max_speed,urban_cons,xtrurban_cons,mixed_cons,weight,tank_vol,acc,price
0,Smart Fortwo Cabrio 52 Mhd Pure Aut.,SMART,FORTWO,sports,2014,37125,Alicante,Automatic,2,2,...,156,0,145,4.6,4.0,4.3,780,33,13.7,5500
1,Volkswagen Scirocco 2.0 Tsi,VOLKSWAGEN,SCIROCCO,sports,2009,0,Barcelona,Manual,3,4,...,181,292,235,9.9,6.1,7.6,1373,55,7.2,10900
2,Bmw Serie 2 218da Gran Coupé,BMW,SERIE 2,sports,2021,0,Cantabria,Automatic,4,5,...,180,430,222,5.4,2.9,4.2,1545,42,8.5,36100
3,Bmw X4 Xdrive 20d,BMW,X4,sports,2017,85000,Girona,Automatic,5,5,...,188,500,212,6.1,5.0,5.4,1740,67,8.0,28000
4,Dodge Viper Srt-10,DODGE,VIPER,sports,2005,95017,Unknown,Automatic,2,2,...,191,0,0,27.3,14.7,21.0,1546,70,3.9,27990


## Identifying categorical features

In [3]:
df.select_dtypes(exclude=["number"]).columns.value_counts().reset_index().drop(0, axis=1)

Unnamed: 0,index
0,fuel_type
1,warranty
2,chassis
3,brand
4,type
5,model
6,dealer
7,color
8,city
9,title


In [4]:
df.select_dtypes(exclude=["number"])

Unnamed: 0,title,brand,model,type,city,gearbox,color,fuel_type,warranty,dealer,chassis
0,Smart Fortwo Cabrio 52 Mhd Pure Aut.,SMART,FORTWO,sports,Alicante,Automatic,WHITE,Gasoline,YES,Professional,Convertible
1,Volkswagen Scirocco 2.0 Tsi,VOLKSWAGEN,SCIROCCO,sports,Barcelona,Manual,WHITE,Gasoline,YES,Professional,Coupe
2,Bmw Serie 2 218da Gran Coupé,BMW,SERIE 2,sports,Cantabria,Automatic,OTHER,Diesel,YES,Professional,Coupe
3,Bmw X4 Xdrive 20d,BMW,X4,sports,Girona,Automatic,WHITE,Diesel,YES,Professional,Coupe
4,Dodge Viper Srt-10,DODGE,VIPER,sports,Unknown,Automatic,RED,Gasoline,YES,Professional,Coupe
...,...,...,...,...,...,...,...,...,...,...,...
55968,Opel Corsa 1.3cdti Selective 75,OPEL,CORSA,other,Madrid,Manual,WHITE,Diesel,YES,Professional,Sedan
55969,Mercedes Clase V 250d Largo Avantgarde 7g Tronic,MERCEDES,CLASE V,other,Unknown,Automatic,BLACK,Diesel,YES,Professional,Minivan
55970,Jeep Cherokee 2.0d Longitude 4x2 103kw,JEEP,CHEROKEE,other,Ciudad Real,Manual,GREY,Diesel,YES,Professional,Offroad
55971,Bmw X3 Xdrive 30da,BMW,X3,other,Barcelona,Automatic,WHITE,Diesel,YES,Professional,Offroad


It is needed to convert these categorical variables to numbers using a OneHotEncoder. However, title will be the exception since it is not valuable to include long text in a regression model.

In [5]:
cat_cols = df.select_dtypes(exclude=["number"]).columns.drop('title')

### Transforming Categorical into Numerical: One Hot Encoding

In [6]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import LabelBinarizer
from sklearn.feature_extraction.text import CountVectorizer

In [7]:
onehot_encoder = LabelBinarizer()

In [8]:
onehot_encoded = onehot_encoder.fit_transform(df.type)
df_ohe = pd.DataFrame(onehot_encoded, columns=np.unique(df.type.values))
df_ohe.head()

Unnamed: 0,big,familiar,medium,minivan,offroad,other,small,sports,van
0,0,0,0,0,0,0,0,1,0
1,0,0,0,0,0,0,0,1,0
2,0,0,0,0,0,0,0,1,0
3,0,0,0,0,0,0,0,1,0
4,0,0,0,0,0,0,0,1,0


In [9]:
data = pd.concat([df, df_ohe], axis=1)
data.head()

Unnamed: 0,title,brand,model,type,year,kms,city,gearbox,doors,seats,...,price,big,familiar,medium,minivan,offroad,other,small,sports,van
0,Smart Fortwo Cabrio 52 Mhd Pure Aut.,SMART,FORTWO,sports,2014,37125,Alicante,Automatic,2,2,...,5500,0,0,0,0,0,0,0,1,0
1,Volkswagen Scirocco 2.0 Tsi,VOLKSWAGEN,SCIROCCO,sports,2009,0,Barcelona,Manual,3,4,...,10900,0,0,0,0,0,0,0,1,0
2,Bmw Serie 2 218da Gran Coupé,BMW,SERIE 2,sports,2021,0,Cantabria,Automatic,4,5,...,36100,0,0,0,0,0,0,0,1,0
3,Bmw X4 Xdrive 20d,BMW,X4,sports,2017,85000,Girona,Automatic,5,5,...,28000,0,0,0,0,0,0,0,1,0
4,Dodge Viper Srt-10,DODGE,VIPER,sports,2005,95017,Unknown,Automatic,2,2,...,27990,0,0,0,0,0,0,0,1,0


In [10]:
df.type.value_counts().reset_index().sort_values('index')

Unnamed: 0,index,type
0,big,9810
6,familiar,3030
1,medium,9807
3,minivan,8229
2,offroad,9617
8,other,1661
7,small,2744
4,sports,5538
5,van,5537


Let's verify if the OneHotEncoder was correctly applied:

In [11]:
for col in np.unique(df.type):
    print('N {} = {}'.format(col, data[col].sum()))

N big = 9810
N familiar = 3030
N medium = 9807
N minivan = 8229
N offroad = 9617
N other = 1661
N small = 2744
N sports = 5538
N van = 5537


The One Hot Encoder was succesfully applied to the model:

In [12]:
for col in np.unique(df.type):
    print(data[data.type == col][['type', col]].head())

     type  big
5538  big    1
5539  big    1
5540  big    1
5541  big    1
5542  big    1
           type  familiar
51282  familiar         1
51283  familiar         1
51284  familiar         1
51285  familiar         1
51286  familiar         1
         type  medium
41475  medium       1
41476  medium       1
41477  medium       1
41478  medium       1
41479  medium       1
          type  minivan
15348  minivan        1
15349  minivan        1
15350  minivan        1
15351  minivan        1
15352  minivan        1
          type  offroad
31858  offroad        1
31859  offroad        1
31860  offroad        1
31861  offroad        1
31862  offroad        1
        type  other
54312  other      1
54313  other      1
54314  other      1
54315  other      1
54316  other      1
        type  small
29114  small      1
29115  small      1
29116  small      1
29117  small      1
29118  small      1
     type  sports
0  sports       1
1  sports       1
2  sports       1
3  sports       1
4  s

Therefore, we can drop now the `type` column

In [13]:
data = data.drop('type', axis=1)
data.head()

Unnamed: 0,title,brand,model,year,kms,city,gearbox,doors,seats,power,...,price,big,familiar,medium,minivan,offroad,other,small,sports,van
0,Smart Fortwo Cabrio 52 Mhd Pure Aut.,SMART,FORTWO,2014,37125,Alicante,Automatic,2,2,71,...,5500,0,0,0,0,0,0,0,1,0
1,Volkswagen Scirocco 2.0 Tsi,VOLKSWAGEN,SCIROCCO,2009,0,Barcelona,Manual,3,4,200,...,10900,0,0,0,0,0,0,0,1,0
2,Bmw Serie 2 218da Gran Coupé,BMW,SERIE 2,2021,0,Cantabria,Automatic,4,5,150,...,36100,0,0,0,0,0,0,0,1,0
3,Bmw X4 Xdrive 20d,BMW,X4,2017,85000,Girona,Automatic,5,5,190,...,28000,0,0,0,0,0,0,0,1,0
4,Dodge Viper Srt-10,DODGE,VIPER,2005,95017,Unknown,Automatic,2,2,506,...,27990,0,0,0,0,0,0,0,1,0


It is needed to build a module to deal with all the categorical columns and transform them into numerical

---

## Exploring Numerical features

On the other hand, there are more numerical columns that will be included in the regression model

In [14]:
df.select_dtypes(exclude=["object"]).columns.value_counts().reset_index().drop(0, axis=1)

Unnamed: 0,index
0,price
1,tank_vol
2,trunk_vol
3,xtrurban_cons
4,year
5,mixed_cons
6,width
7,max_speed
8,height
9,seats


In [15]:
df.select_dtypes(exclude=["object"])

Unnamed: 0,year,kms,doors,seats,power,co2_emiss,height,length,width,trunk_vol,max_speed,urban_cons,xtrurban_cons,mixed_cons,weight,tank_vol,acc,price
0,2014,37125,2,2,71,99,157,270,156,0,145,4.6,4.0,4.3,780,33,13.7,5500
1,2009,0,3,4,200,179,140,426,181,292,235,9.9,6.1,7.6,1373,55,7.2,10900
2,2021,0,4,5,150,109,142,453,180,430,222,5.4,2.9,4.2,1545,42,8.5,36100
3,2017,85000,5,5,190,142,162,467,188,500,212,6.1,5.0,5.4,1740,67,8.0,28000
4,2005,95017,2,2,506,488,123,446,191,0,0,27.3,14.7,21.0,1546,70,3.9,27990
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
55968,2016,60000,5,5,75,100,148,402,175,285,164,4.6,3.3,3.8,1237,45,14.8,7500
55969,2016,70261,5,6,190,158,188,514,193,1030,206,6.9,5.5,6.0,2145,67,9.1,48500
55970,2015,182069,5,5,140,139,163,462,186,412,187,6.4,4.6,5.3,1828,60,10.9,15999
55971,2011,159000,5,5,258,149,166,465,188,550,230,7.8,4.8,6.0,1800,67,6.2,18300
