# Data Cleaning of renting cars
Notebook in which the translation and cleaning of renting cars is covered

In [1]:
import pandas as pd
import numpy as np
import sys
sys.path.append('..')

In [2]:
root = '../data/renting_cars.csv'
df = pd.read_csv(root)

df.head()

Unnamed: 0,title,price,contract_months,km_year,fuel_type,color,warranty,maintenance,tires,power,...,width,trunk_vol,max_speed,seats,urban_cons,xtrurban_cons,mixed_cons,weight,tank_vol,acceleration
0,TOYOTA Yaris,295,60,20.0,Híbrido,Consultar,Incluido,Incluido,Incluido,116cv(85Kw),...,"174,5 cm",270 l,175 km/h,5,"0,0 l","0,0 l","2,8 l",1.065 kg,36 l,"10,0 s"
1,TOYOTA Yaris,291,48,20.0,Híbrido,Consultar,Incluido,Incluido,Incluido,116cv(85Kw),...,"174,5 cm",270 l,175 km/h,5,"0,0 l","0,0 l","2,8 l",1.065 kg,36 l,"10,0 s"
2,TOYOTA Yaris,303,36,20.0,Híbrido,Consultar,Incluido,Incluido,Incluido,116cv(85Kw),...,"174,5 cm",270 l,175 km/h,5,"0,0 l","0,0 l","2,8 l",1.065 kg,36 l,"10,0 s"
3,TOYOTA Yaris,276,60,15.0,Híbrido,Consultar,Incluido,Incluido,Incluido,116cv(85Kw),...,"174,5 cm",270 l,175 km/h,5,"0,0 l","0,0 l","2,8 l",1.065 kg,36 l,"10,0 s"
4,TOYOTA Yaris,276,48,15.0,Híbrido,Consultar,Incluido,Incluido,Incluido,116cv(85Kw),...,"174,5 cm",270 l,175 km/h,5,"0,0 l","0,0 l","2,8 l",1.065 kg,36 l,"10,0 s"


In [3]:
df.columns

Index(['title', 'price', 'contract_months', 'km_year', 'fuel_type', 'color',
       'warranty', 'maintenance', 'tires', 'power', 'co2_emiss', 'doors',
       'gear', 'status', 'chassis', 'height', 'length', 'width', 'trunk_vol',
       'max_speed', 'seats', 'urban_cons', 'xtrurban_cons', 'mixed_cons',
       'weight', 'tank_vol', 'acceleration'],
      dtype='object')

In [4]:
df.title.str.split(' ').apply(len).value_counts()

2    1921
3     450
4       1
Name: title, dtype: int64

### Data Analysis to standardize the dataset

What are the most common contacts?

In [5]:
df.contract_months.value_counts()

36    760
48    680
60    445
24    438
1      49
Name: contract_months, dtype: int64

In [6]:
df.km_year.value_counts()

15.0    407
20.0    401
10.0    386
25.0    348
30.0    187
40.0    144
35.0    128
50.0    101
45.0    100
55.0     61
60.0     60
18.0     49
Name: km_year, dtype: int64

In [7]:
df.km_year.astype(int) * 1000

0       20000
1       20000
2       20000
3       15000
4       15000
        ...  
2367    15000
2368    20000
2369    25000
2370    10000
2371    35000
Name: km_year, Length: 2372, dtype: int64

In [8]:
df.fuel_type.value_counts()

Diesel       1248
Gasolina      551
Híbrido       405
Eléctrico     168
Name: fuel_type, dtype: int64

In [9]:
df.doors.value_counts()

5    1960
4     285
3     127
Name: doors, dtype: int64

In [10]:
df.gear.value_counts()

Manual                             1150
Automática secuencial               807
Automática continua                 235
Directo, sin caja de cambios        168
Automática continua, secuencial      12
Name: gear, dtype: int64

In [11]:
df.warranty.value_counts()

Incluido    2372
Name: warranty, dtype: int64

In [12]:
list(df.chassis.unique())

['Berlina',
 'Todo Terreno',
 'Furgon',
 'Coupe',
 'Stationwagon',
 'Monovolumen',
 'Combi']

In [13]:
df.status.value_counts()

Nuevo        2323
Seminuevo      49
Name: status, dtype: int64

In [14]:
df.status.str.replace('Nuevo', 'New').replace('Seminuevo', 'Preowned').value_counts()

New         2323
Preowned      49
Name: status, dtype: int64

In [15]:
round(df.height.str.replace(' cm', '').str.replace(',', '.').astype(float), 0).astype(int)

0       150
1       150
2       150
3       150
4       150
       ... 
2367    166
2368    166
2369    166
2370    166
2371    166
Name: height, Length: 2372, dtype: int64

In [16]:
df.maintenance.value_counts()

Incluido    2372
Name: maintenance, dtype: int64

In [17]:
df.tires.value_counts()

Incluido       2047
No incluido     325
Name: tires, dtype: int64

In [18]:
df.color.value_counts()

BLANCO       965
Consultar    943
GRIS         197
NEGRO        113
OTRO          95
ROJO          20
AZUL          16
GRANATE       12
PLATEADO      10
NARANJA        1
Name: color, dtype: int64

## Applying the created module

In [19]:
from modules.renting_cleanser import brand_renting, clean_renting, paint_renting, order_typify

In [20]:
df = brand_renting(df)

In [21]:
df = clean_renting(df)

In [22]:
df = paint_renting(df)
df.color.value_counts()

OTHER     1038
WHITE      965
GREY       197
BLACK      113
RED         20
BLUE        16
ROJO        12
SILVER      10
ORANGE       1
Name: color, dtype: int64

In [23]:
df.dtypes

title             object
price              int64
c_months           int64
km_year            int64
fuel_type         object
color             object
warranty          object
maintenance       object
tires             object
power             object
co2_emiss         object
doors              int64
gearbox           object
status            object
chassis           object
height           float64
length           float64
width            float64
trunk_vol         object
max_speed         object
seats              int64
urban_cons       float64
xtrurban_cons    float64
mixed_cons       float64
weight            object
tank_vol          object
acc              float64
brand             object
model             object
dtype: object

In [24]:
df.columns

Index(['title', 'price', 'c_months', 'km_year', 'fuel_type', 'color',
       'warranty', 'maintenance', 'tires', 'power', 'co2_emiss', 'doors',
       'gearbox', 'status', 'chassis', 'height', 'length', 'width',
       'trunk_vol', 'max_speed', 'seats', 'urban_cons', 'xtrurban_cons',
       'mixed_cons', 'weight', 'tank_vol', 'acc', 'brand', 'model'],
      dtype='object')

In [25]:
df = order_typify(df)
df.head()

Unnamed: 0,title,brand,model,c_months,km_year,fuel_type,color,gearbox,doors,seats,...,width,trunk_vol,max_speed,urban_cons,xtrurban_cons,mixed_cons,weight,tank_vol,acc,price
0,TOYOTA Yaris,TOYOTA,YARIS,60,20000,Hybrid,OTHER,Automatic,5,5,...,174.5,270,175,0.0,0.0,2.8,1065,36,10.0,295
1,TOYOTA Yaris,TOYOTA,YARIS,48,20000,Hybrid,OTHER,Automatic,5,5,...,174.5,270,175,0.0,0.0,2.8,1065,36,10.0,291
2,TOYOTA Yaris,TOYOTA,YARIS,36,20000,Hybrid,OTHER,Automatic,5,5,...,174.5,270,175,0.0,0.0,2.8,1065,36,10.0,303
3,TOYOTA Yaris,TOYOTA,YARIS,60,15000,Hybrid,OTHER,Automatic,5,5,...,174.5,270,175,0.0,0.0,2.8,1065,36,10.0,276
4,TOYOTA Yaris,TOYOTA,YARIS,48,15000,Hybrid,OTHER,Automatic,5,5,...,174.5,270,175,0.0,0.0,2.8,1065,36,10.0,276


In [26]:
df.dtypes

title             object
brand             object
model             object
c_months           int64
km_year            int64
fuel_type         object
color             object
gearbox           object
doors              int64
seats              int64
warranty          object
maintenance       object
tires             object
status            object
chassis           object
power              int64
co2_emiss          int64
height           float64
length           float64
width            float64
trunk_vol          int64
max_speed          int64
urban_cons       float64
xtrurban_cons    float64
mixed_cons       float64
weight             int64
tank_vol           int64
acc              float64
price              int64
dtype: object

### The renting dataset is ready to be used for modeling!

In [27]:
df.to_csv('../data/renting.csv', index=False)