In [1]:
import os
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

In [2]:
# Folder holding the project CSV files. It defaults to the original author's
# local folder; set the TFM_DATA_DIR environment variable to run the notebook
# on another machine without editing this cell.
DATA_DIR = Path(os.environ.get("TFM_DATA_DIR", "/users/alfon/Desktop/Master/TFM/CSV"))

# Load the cars dataset and preview its first rows.
df = pd.read_csv(DATA_DIR / "03.cars_final_def.csv")
df.head()

Unnamed: 0,Brand,Model,Version,Type,Year,Kms,Hp,Gear_type,Fuel_type,Fuel_cons,Doors,Colour,ZIP,Province,Seller,Price_EUR
0,Citroen,C3,1.4HDI Collection,small,2006,182510,68,Manual,Diesel,4.4,5,Beige,8950,Barcelona,Dealer,2400
1,Audi,A1,1.4 TFSI Attraction 92kW,small,2017,67478,125,Manual,Petrol,4.9,5,Beige,8097,Barcelona,Dealer,14990
2,Renault,Clio,1.4 Dynamique,small,2007,31510,98,Manual,Petrol,6.6,5,Beige,8227,Barcelona,Dealer,4950
3,Renault,ZOE,Intens 40 R90 68kW,small,2017,34985,92,Automatic,Electric,0.0,5,Beige,8241,Barcelona,Dealer,10830
4,MINI,Cooper S,,small,2006,83700,174,Manual,Petrol,6.2,5,Beige,8120,Barcelona,Dealer,9800


Removing columns that are unnecessary for the study

In [3]:
# `columns=` already names the axis, so no separate `axis` argument is needed.
unused_columns = ["Version", "ZIP"]
df = df.drop(columns=unused_columns)


## Features

In [4]:
# Display the dtype pandas holds for each column of df.
df.dtypes

Brand         object
Model         object
Type          object
Year           int64
Kms            int64
Hp             int64
Gear_type     object
Fuel_type     object
Fuel_cons    float64
Doors          int64
Colour        object
Province      object
Seller        object
Price_EUR      int64
dtype: object

There are two kinds of features in this dataset: numerical and categorical.

In [5]:
# Numerical features: every column whose dtype is not "object".
non_object_columns = df.select_dtypes(exclude=["object"]).columns
# value_counts() on the column Index lists each column name with its count.
non_object_columns.value_counts()

Year         1
Kms          1
Hp           1
Fuel_cons    1
Doors        1
Price_EUR    1
dtype: int64

In [6]:
# Categorical features: every column whose dtype is not numeric.
non_numeric_columns = df.select_dtypes(exclude=["number"]).columns
# value_counts() on the column Index lists each column name with its count.
non_numeric_columns.value_counts()

Brand        1
Model        1
Type         1
Gear_type    1
Fuel_type    1
Colour       1
Province     1
Seller       1
dtype: int64

To analyze the correlation between all the features of the dataset, both numerical and categorical, we need to transform the categorical features into numerical ones.

## Transforming categorical features into numerical features


Using one-hot encoding, we are going to add some of the categorical features to the DataFrame as dummy columns, transforming each category into its own column coded as 0 or 1.

### Option 1: Dummies

Transforming categorical columns into numerical ones for those features that make sense to convert into dummy columns.

In [7]:
# Cast the selected string columns to pandas' `category` dtype in one call,
# using a {column: dtype} mapping instead of an explicit loop.
categorical_columns = ['Brand', 'Gear_type', 'Fuel_type', 'Type', 'Seller']
df = df.astype({name: 'category' for name in categorical_columns})
print(df.dtypes)

Brand        category
Model          object
Type         category
Year            int64
Kms             int64
Hp              int64
Gear_type    category
Fuel_type    category
Fuel_cons     float64
Doors           int64
Colour         object
Province       object
Seller       category
Price_EUR       int64
dtype: object


In [8]:
# One-hot encode the listed columns; each is replaced by one indicator
# column per category, named "<column>_<category>".
dummy_columns = ['Gear_type', 'Fuel_type', 'Type', 'Seller']
df = pd.get_dummies(df, columns=dummy_columns)

In [9]:
# Preview the first rows of df now that the dummy columns have been added.
df.head()

Unnamed: 0,Brand,Model,Year,Kms,Hp,Fuel_cons,Doors,Colour,Province,Price_EUR,...,Type_coupe,Type_familiar,Type_minivan,Type_other,Type_sedan,Type_small,Type_suv,Type_van,Seller_Dealer,Seller_Private
0,Citroen,C3,2006,182510,68,4.4,5,Beige,Barcelona,2400,...,0,0,0,0,0,1,0,0,1,0
1,Audi,A1,2017,67478,125,4.9,5,Beige,Barcelona,14990,...,0,0,0,0,0,1,0,0,1,0
2,Renault,Clio,2007,31510,98,6.6,5,Beige,Barcelona,4950,...,0,0,0,0,0,1,0,0,1,0
3,Renault,ZOE,2017,34985,92,0.0,5,Beige,Barcelona,10830,...,0,0,0,0,0,1,0,0,1,0
4,MINI,Cooper S,2006,83700,174,6.2,5,Beige,Barcelona,9800,...,0,0,0,0,0,1,0,0,1,0


### Option 2 : Encoder for high cardinality features

Install the `category_encoders` package first (run in a terminal): `conda install -c conda-forge category_encoders`

In [10]:
from category_encoders import TargetEncoder

In [11]:
# Show the columns of df that are still non-numeric.
df.select_dtypes(exclude=["number"])

Unnamed: 0,Brand,Model,Colour,Province
0,Citroen,C3,Beige,Barcelona
1,Audi,A1,Beige,Barcelona
2,Renault,Clio,Beige,Barcelona
3,Renault,ZOE,Beige,Barcelona
4,MINI,Cooper S,Beige,Barcelona
...,...,...,...,...
28516,BMW,3 Series,Black,
28517,Volkswagen,Caddy,Silver,
28518,Land-Rover,Range Rover Sport,Silver,
28519,Volkswagen,Transporter,White,


In [12]:
# Create a category_encoders TargetEncoder, passing no arguments (library defaults).
encoder = TargetEncoder()

In [13]:
# Columns to target-encode, and the names of the new columns that will hold
# the encoded values ("<column>_encoded").
cols_to_encode = ['Brand', 'Model', 'Colour', 'Province']
cols_encoded = [name + '_encoded' for name in cols_to_encode]

# NOTE(review): the encoder is fitted on every row of df using Price_EUR, so
# the encoded columns carry information from the target. If a train/test
# split is made later, consider fitting on the training rows only - confirm.
encoded_values = encoder.fit_transform(df[cols_to_encode], df["Price_EUR"])
df[cols_encoded] = encoded_values

In [14]:
# Preview the first rows of df, now including the *_encoded columns.
df.head()

Unnamed: 0,Brand,Model,Year,Kms,Hp,Fuel_cons,Doors,Colour,Province,Price_EUR,...,Type_sedan,Type_small,Type_suv,Type_van,Seller_Dealer,Seller_Private,Brand_encoded,Model_encoded,Colour_encoded,Province_encoded
0,Citroen,C3,2006,182510,68,4.4,5,Beige,Barcelona,2400,...,0,1,0,0,1,0,10412.196983,8547.619048,15207.210252,22252.941248
1,Audi,A1,2017,67478,125,4.9,5,Beige,Barcelona,14990,...,0,1,0,0,1,0,25546.556757,16731.591241,15207.210252,22252.941248
2,Renault,Clio,2007,31510,98,6.6,5,Beige,Barcelona,4950,...,0,1,0,0,1,0,11099.399408,9136.68,15207.210252,22252.941248
3,Renault,ZOE,2017,34985,92,0.0,5,Beige,Barcelona,10830,...,0,1,0,0,1,0,11099.399408,10966.931096,15207.210252,22252.941248
4,MINI,Cooper S,2006,83700,174,6.2,5,Beige,Barcelona,9800,...,0,1,0,0,1,0,17251.958184,19240.75,15207.210252,22252.941248


In [15]:
# Drop the original text columns now that their *_encoded counterparts exist.
# Rebinding `df` replaces the in-place drop.
df = df.drop(columns=['Brand', 'Model', 'Colour', 'Province'])


Let's reorganize the DataFrame, moving the target (`Price_EUR`) to the last column.

In [16]:
# Boolean mask over the column labels: True for every column except the target.
is_feature = df.columns != "Price_EUR"
df_features = df.loc[:, is_feature]
df_features.head()

Unnamed: 0,Year,Kms,Hp,Fuel_cons,Doors,Gear_type_Automatic,Gear_type_Manual,Fuel_type_CNG,Fuel_type_Diesel,Fuel_type_Electric,...,Type_sedan,Type_small,Type_suv,Type_van,Seller_Dealer,Seller_Private,Brand_encoded,Model_encoded,Colour_encoded,Province_encoded
0,2006,182510,68,4.4,5,0,1,0,1,0,...,0,1,0,0,1,0,10412.196983,8547.619048,15207.210252,22252.941248
1,2017,67478,125,4.9,5,0,1,0,0,0,...,0,1,0,0,1,0,25546.556757,16731.591241,15207.210252,22252.941248
2,2007,31510,98,6.6,5,0,1,0,0,0,...,0,1,0,0,1,0,11099.399408,9136.68,15207.210252,22252.941248
3,2017,34985,92,0.0,5,1,0,0,0,1,...,0,1,0,0,1,0,11099.399408,10966.931096,15207.210252,22252.941248
4,2006,83700,174,6.2,5,0,1,0,0,0,...,0,1,0,0,1,0,17251.958184,19240.75,15207.210252,22252.941248


In [17]:
# Select the target with a list of labels so the result is a one-column
# DataFrame rather than a Series.
df_target = df.loc[:, ["Price_EUR"]]
df_target.head()

Unnamed: 0,Price_EUR
0,2400
1,14990
2,4950
3,10830
4,9800


In [18]:
# Re-attach the target on the shared index (left join, the default), which
# places Price_EUR after all the feature columns.
df = df_features.join(other=df_target, how="left")
df.head()

Unnamed: 0,Year,Kms,Hp,Fuel_cons,Doors,Gear_type_Automatic,Gear_type_Manual,Fuel_type_CNG,Fuel_type_Diesel,Fuel_type_Electric,...,Type_small,Type_suv,Type_van,Seller_Dealer,Seller_Private,Brand_encoded,Model_encoded,Colour_encoded,Province_encoded,Price_EUR
0,2006,182510,68,4.4,5,0,1,0,1,0,...,1,0,0,1,0,10412.196983,8547.619048,15207.210252,22252.941248,2400
1,2017,67478,125,4.9,5,0,1,0,0,0,...,1,0,0,1,0,25546.556757,16731.591241,15207.210252,22252.941248,14990
2,2007,31510,98,6.6,5,0,1,0,0,0,...,1,0,0,1,0,11099.399408,9136.68,15207.210252,22252.941248,4950
3,2017,34985,92,0.0,5,1,0,0,0,1,...,1,0,0,1,0,11099.399408,10966.931096,15207.210252,22252.941248,10830
4,2006,83700,174,6.2,5,0,1,0,0,0,...,1,0,0,1,0,17251.958184,19240.75,15207.210252,22252.941248,9800


In [19]:

# Display the dtype of every column in the reorganized df.
df.dtypes

Year                     int64
Kms                      int64
Hp                       int64
Fuel_cons              float64
Doors                    int64
Gear_type_Automatic      uint8
Gear_type_Manual         uint8
Fuel_type_CNG            uint8
Fuel_type_Diesel         uint8
Fuel_type_Electric       uint8
Fuel_type_Hybrid         uint8
Fuel_type_LPG            uint8
Fuel_type_Petrol         uint8
Type_cabrio              uint8
Type_coupe               uint8
Type_familiar            uint8
Type_minivan             uint8
Type_other               uint8
Type_sedan               uint8
Type_small               uint8
Type_suv                 uint8
Type_van                 uint8
Seller_Dealer            uint8
Seller_Private           uint8
Brand_encoded          float64
Model_encoded          float64
Colour_encoded         float64
Province_encoded       float64
Price_EUR                int64
dtype: object

All the features in the DataFrame have now been transformed into numerical types.

In [20]:
# Folder holding the project CSV files. It defaults to the original author's
# local folder; set the TFM_DATA_DIR environment variable to write the output
# elsewhere without editing this cell.
DATA_DIR = Path(os.environ.get("TFM_DATA_DIR", "/users/alfon/Desktop/Master/TFM/CSV"))

# Save the all-numeric feature table, with a header row and without the index.
df.to_csv(DATA_DIR / "04.cars_features_def.csv", header=True, index=False)