# Tutorial: Técnicas para codificar las variables categóricas (III): codificación target

#### Importamos las librerías necesarias

In [1]:
import pandas as pd
from category_encoders import TargetEncoder

#### El dataset Automobile
Cargamos el dataset Automobile, le añadimos los nombres de las columnas, sustituimos por 10000 los valores perdidos de `price` (marcados con `?`) y convertimos esa columna a tipo numérico.

In [1]:
# Download the UCI Automobile data. The file has no header row, so the
# column names are supplied directly to the reader.
df = pd.read_csv(
    'https://archive.ics.uci.edu/ml/machine-learning-'
    'databases/autos/imports-85.data',
    encoding='utf-8',
    header=None,
    names=[
        "symboling", "normalized-losses", "make", "fuel-type",
        "aspiration", "num-of-doors", "body-style", "drive-wheels",
        "engine-location", "wheel-base", "length", "width",
        "height", "curb-weight", "engine-type", "num-of-cylinders",
        "engine-size", "fuel-system", "bore", "stroke",
        "compression-ratio", "horsepower", "peak-rpm", "city-mpg",
        "highway-mpg", "price",
    ],
)

# Missing prices appear as '?'; fill them with 10000 and then cast the
# whole column to a numeric dtype in a single step.
df["price"] = pd.to_numeric(df["price"].replace(['?'], 10000))

# Show the first rows as the cell output.
df.head()

Unnamed: 0,symboling,normalized-losses,make,fuel-type,aspiration,num-of-doors,body-style,drive-wheels,engine-location,wheel-base,...,engine-size,fuel-system,bore,stroke,compression-ratio,horsepower,peak-rpm,city-mpg,highway-mpg,price
0,3,?,alfa-romero,gas,std,two,convertible,rwd,front,88.6,...,130,mpfi,3.47,2.68,9.0,111,5000,21,27,13495
1,3,?,alfa-romero,gas,std,two,convertible,rwd,front,88.6,...,130,mpfi,3.47,2.68,9.0,111,5000,21,27,16500
2,1,?,alfa-romero,gas,std,two,hatchback,rwd,front,94.5,...,152,mpfi,2.68,3.47,9.0,154,5000,19,26,16500
3,2,164,audi,gas,std,four,sedan,fwd,front,99.8,...,109,mpfi,3.19,3.4,10.0,102,5500,24,30,13950
4,2,164,audi,gas,std,four,sedan,4wd,front,99.4,...,136,mpfi,3.19,3.4,8.0,115,5500,18,22,17450


### Método 5: Codificación target
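Mediante esta técnica sustituimos nuestra variable categórica por una única variable numérica que contiene información de la variable objetivo: cada categoría se reemplaza por un valor calculado a partir de la variable objetivo de las observaciones de esa categoría (por ejemplo, su media).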

In [92]:
# Number of distinct values in the 'make' column.
df["make"].nunique()

22

En primer lugar, usamos el codificador con smoothing igual a 0 (sin regularización).

In [2]:
# Build a target encoder for the 'make' column. smoothing = 0 is the
# setting this notebook uses for "no regularization".
# NOTE(review): the category_encoders docs appear to describe smoothing as
# strictly positive; confirm that 0 is accepted by the installed version.
encoder = TargetEncoder(cols = ['make'], smoothing = 0)

# Fit the encoder on the categorical column together with the target
# variable, then produce the encoded values.
encoder.fit(df["make"], df['price'])
new_make = encoder.transform(df["make"], df['price'])

# Attach the encoded values to the frame as 'new_make'. assign() overwrites
# an existing column with that name, so re-running this cell does not append
# a duplicate 'new_make' column the way pd.concat(..., axis=1) would.
df = df.assign(new_make=new_make['make'])

Seguidamente, mostramos la antigua variable make junto con la nueva codificada (new_make) y price.

In [3]:
# Show the original category, the target and the encoded column side by
# side for the first 8 rows.
df.loc[:, ["make", "price", "new_make"]].head(8)

Unnamed: 0,make,price,new_make
0,alfa-romero,13495,15498.333333
1,alfa-romero,16500,15498.333333
2,alfa-romero,16500,15498.333333
3,audi,13950,16736.428571
4,audi,17450,16736.428571
5,audi,15250,16736.428571
6,audi,17710,16736.428571
7,audi,18920,16736.428571


Ahora le indicamos el nivel de regularización que queremos mediante el argumento smoothing.

In [4]:
# Build a target encoder for the 'make' column, this time with
# smoothing = 3 to apply the regularization discussed in the notebook.
encoder = TargetEncoder(cols = ['make'], smoothing = 3)

# Fit the encoder on the categorical column together with the target
# variable, then produce the encoded values.
encoder.fit(df["make"], df['price'])
new_make = encoder.transform(df["make"], df['price'])

# Attach the encoded values to the frame as 'new_make_smooth'. assign()
# overwrites an existing column with that name, so re-running this cell
# does not append a duplicate column the way pd.concat(..., axis=1) would.
df = df.assign(new_make_smooth=new_make['make'])

Mostramos los resultados.

In [5]:
# Show the original category, the target and both encoded columns side by
# side for the first 8 rows.
df.loc[:, ["make", "price", "new_make", "new_make_smooth"]].head(8)

Unnamed: 0,make,price,new_make,new_make_smooth
0,alfa-romero,13495,15498.333333,14699.827742
1,alfa-romero,16500,15498.333333,14699.827742
2,alfa-romero,16500,15498.333333,14699.827742
3,audi,13950,16736.428571,16308.266296
4,audi,17450,16736.428571,16308.266296
5,audi,15250,16736.428571,16308.266296
6,audi,17710,16736.428571,16308.266296
7,audi,18920,16736.428571,16308.266296
