# Encoding Categorical Values

In [1]:
# https://pbpython.com/categorical-encoding.html

In [5]:
# Turning text attributes into numerical values.
# Turning categorical data into suitable numeric values.

In [6]:
import pandas as pd
import numpy as np

In [7]:
# Column labels supplied via `names=` when the headerless CSV is read.
# NOTE: the "feul_type" spelling is kept exactly as-is; it is the label that
# appears in the displayed outputs of this notebook.
headers = """
    symboling normalized_losses make feul_type aspiration
    num_doors body_style drive_wheels engine_location
    wheel_base length width height curb_weight
    engine_type num_cylinders engine_size fuel_system bore
    stroke compression_ratio horsepower peak_rpm city_mpg
    highway_mpg price
""".split()

In [11]:
# Load the raw data file. It has no header row, so the column labels come from
# `headers`, and every "?" entry is read as NaN.
df = pd.read_csv(
    "imports-85.data",
    names=headers,
    header=None,
    na_values="?",
)

In [14]:
# Preview the first rows of the freshly loaded frame.
df.head()

Unnamed: 0,symboling,normalized_losses,make,feul_type,aspiration,num_doors,body_style,drive_wheels,engine_location,wheel_base,...,engine_size,fuel_system,bore,stroke,compression_ratio,horsepower,peak_rpm,city_mpg,highway_mpg,price
0,3,,alfa-romero,gas,std,two,convertible,rwd,front,88.6,...,130,mpfi,3.47,2.68,9.0,111.0,5000.0,21,27,13495.0
1,3,,alfa-romero,gas,std,two,convertible,rwd,front,88.6,...,130,mpfi,3.47,2.68,9.0,111.0,5000.0,21,27,16500.0
2,1,,alfa-romero,gas,std,two,hatchback,rwd,front,94.5,...,152,mpfi,2.68,3.47,9.0,154.0,5000.0,19,26,16500.0
3,2,164.0,audi,gas,std,four,sedan,fwd,front,99.8,...,109,mpfi,3.19,3.4,10.0,102.0,5500.0,24,30,13950.0
4,2,164.0,audi,gas,std,four,sedan,4wd,front,99.4,...,136,mpfi,3.19,3.4,8.0,115.0,5500.0,18,22,17450.0


In [15]:
# List the dtype pandas inferred for each column; the text columns show up as object.
df.dtypes

symboling              int64
normalized_losses    float64
make                  object
feul_type             object
aspiration            object
num_doors             object
body_style            object
drive_wheels          object
engine_location       object
wheel_base           float64
length               float64
width                float64
height               float64
curb_weight            int64
engine_type           object
num_cylinders         object
engine_size            int64
fuel_system           object
bore                 float64
stroke               float64
compression_ratio    float64
horsepower           float64
peak_rpm             float64
city_mpg               int64
highway_mpg            int64
price                float64
dtype: object

In [19]:
# Here we select only the object columns, since we are focusing only on the categorical variables.
# Pandas has a helpful select_dtypes function which we can use to build a new dataframe containing only the object columns.

In [18]:
# Keep only the object-dtype (text) columns, as an independent copy so that
# later edits to obj_df do not touch df.
obj_df = (
    df
    .select_dtypes(include="object")
    .copy()
)
obj_df.head()

Unnamed: 0,make,feul_type,aspiration,num_doors,body_style,drive_wheels,engine_location,engine_type,num_cylinders,fuel_system
0,alfa-romero,gas,std,two,convertible,rwd,front,dohc,four,mpfi
1,alfa-romero,gas,std,two,convertible,rwd,front,dohc,four,mpfi
2,alfa-romero,gas,std,two,hatchback,rwd,front,ohcv,six,mpfi
3,audi,gas,std,four,sedan,fwd,front,ohc,four,mpfi
4,audi,gas,std,four,sedan,4wd,front,ohc,five,mpfi


In [21]:
# Before going any further, there are a couple of null values in the data that we need to clean up.

In [22]:
# Show every row that has a missing value in at least one column.
obj_df[obj_df.isnull().any(axis=1)]

Unnamed: 0,make,feul_type,aspiration,num_doors,body_style,drive_wheels,engine_location,engine_type,num_cylinders,fuel_system
27,dodge,gas,turbo,,sedan,fwd,front,ohc,four,mpfi
63,mazda,diesel,std,,sedan,fwd,front,ohc,four,idi


In [24]:
# Count how often each num_doors value occurs, to pick a fill value for the gaps.
obj_df["num_doors"].value_counts()

four    114
two      89
Name: num_doors, dtype: int64

In [27]:
# For the sake of simplicity, just fill in the missing values with "four" (since that is the most common value).

In [25]:
# Fill the missing num_doors entries with "four"; other columns are left alone.
obj_df = obj_df.fillna(value={"num_doors": "four"})

In [28]:
# No NULL values

In [26]:
# Repeat the null check; an empty result means no missing values remain.
obj_df[obj_df.isnull().any(axis=1)]

Unnamed: 0,make,feul_type,aspiration,num_doors,body_style,drive_wheels,engine_location,engine_type,num_cylinders,fuel_system


# Approach #1 - Find and Replace

In [29]:
# Tally the spelled-out cylinder counts to see which words need a numeric mapping.
obj_df["num_cylinders"].value_counts()

four      159
six        24
five       11
eight       5
two         4
three       1
twelve      1
Name: num_cylinders, dtype: int64

In [30]:
# Here is the complete dictionary for cleaning up the num_doors and num_cylinders columns:

In [31]:
# Word-to-number lookup tables, keyed by the column each table applies to.
cleanup_nums = dict(
    num_doors=dict(four=4, two=2),
    num_cylinders=dict(
        four=4, six=6, five=5, eight=8,
        two=2, twelve=12, three=3,
    ),
)

In [32]:
# converting the columns to numbers using replace:

In [33]:
# Swap the spelled-out numbers for integers using the per-column mappings in
# `cleanup_nums`, then cast the mapped columns to int64 explicitly.
# Why the cast: the int64 result used to come from an implicit downcast of the
# object columns inside `replace`; that silent downcast is deprecated in recent
# pandas, so relying on it would leave the columns as object. The cast also
# fails loudly if a value is missing from the mapping instead of silently
# leaving a string behind. Re-running this cell is safe (already-numeric
# columns are unchanged).
obj_df = obj_df.replace(cleanup_nums).astype({col: "int64" for col in cleanup_nums})
obj_df.head()

Unnamed: 0,make,feul_type,aspiration,num_doors,body_style,drive_wheels,engine_location,engine_type,num_cylinders,fuel_system
0,alfa-romero,gas,std,2,convertible,rwd,front,dohc,4,mpfi
1,alfa-romero,gas,std,2,convertible,rwd,front,dohc,4,mpfi
2,alfa-romero,gas,std,2,hatchback,rwd,front,ohcv,6,mpfi
3,audi,gas,std,4,sedan,fwd,front,ohc,4,mpfi
4,audi,gas,std,4,sedan,4wd,front,ohc,5,mpfi


In [34]:
# The nice benefit to this approach is that pandas "knows" the types of
# values in the columns, so the converted object columns are now int64.

In [35]:
# Inspect the column dtypes after the replacement.
obj_df.dtypes

make               object
feul_type          object
aspiration         object
num_doors           int64
body_style         object
drive_wheels       object
engine_location    object
engine_type        object
num_cylinders       int64
fuel_system        object
dtype: object

# Approach #2 - Label Encoding

In [36]:
# Label encoding is simply converting each value in a column to a number.

In [37]:
# For example, the body_style column contains 5 different values. We could choose to encode it like this:

# convertible -> 0
# hardtop -> 1
# hatchback -> 2
# sedan -> 3
# wagon -> 4

In [38]:
# One trick you can use in pandas is to convert a column to a category, then use those category values for your label encoding

In [40]:
# Convert body_style to a categorical column. A default CategoricalDtype()
# (categories inferred from the data, unordered) is what the "category"
# alias stands for.
obj_df["body_style"] = obj_df["body_style"].astype(pd.CategoricalDtype())
obj_df.dtypes

make                 object
feul_type            object
aspiration           object
num_doors             int64
body_style         category
drive_wheels         object
engine_location      object
engine_type          object
num_cylinders         int64
fuel_system          object
dtype: object

In [41]:
# Preview the frame again; the category conversion does not change the displayed values.
obj_df.head()

Unnamed: 0,make,feul_type,aspiration,num_doors,body_style,drive_wheels,engine_location,engine_type,num_cylinders,fuel_system
0,alfa-romero,gas,std,2,convertible,rwd,front,dohc,4,mpfi
1,alfa-romero,gas,std,2,convertible,rwd,front,dohc,4,mpfi
2,alfa-romero,gas,std,2,hatchback,rwd,front,ohcv,6,mpfi
3,audi,gas,std,4,sedan,fwd,front,ohc,4,mpfi
4,audi,gas,std,4,sedan,4wd,front,ohc,5,mpfi


In [42]:
# Then you can assign the encoded variable to a new column using the cat.codes accessor:

In [43]:
# Pull the integer code of each row's category and store it in a new column
# alongside the original body_style labels.
body_style_codes = obj_df["body_style"].cat.codes
obj_df["body_style_cat"] = body_style_codes
obj_df.head()

Unnamed: 0,make,feul_type,aspiration,num_doors,body_style,drive_wheels,engine_location,engine_type,num_cylinders,fuel_system,body_style_cat
0,alfa-romero,gas,std,2,convertible,rwd,front,dohc,4,mpfi,0
1,alfa-romero,gas,std,2,convertible,rwd,front,dohc,4,mpfi,0
2,alfa-romero,gas,std,2,hatchback,rwd,front,ohcv,6,mpfi,2
3,audi,gas,std,4,sedan,fwd,front,ohc,4,mpfi,3
4,audi,gas,std,4,sedan,4wd,front,ohc,5,mpfi,3


# Approach #3 - One Hot Encoding

In [46]:
# The basic strategy is to convert each category value into a new column and assign a 1 or 0 (True/False) value to the column.

In [47]:
#Pandas supports this feature using get_dummies. This function is named this way because it creates dummy/indicator variables (aka 1 or 0).

In [48]:
# Hopefully a simple example will make this more clear. 
# We can look at the column drive_wheels where we have values of 4wd , fwd or rwd. 
# By using get_dummies we can convert this to three columns with a 1 or 0 corresponding to the correct value:

In [49]:
# Expand drive_wheels into one indicator column per value (the original column
# is dropped from the result; obj_df itself is not modified).
# dtype=int pins the indicators to 1/0: newer pandas releases default to
# boolean True/False columns, which would not match the 1/0 described here.
pd.get_dummies(obj_df, columns=["drive_wheels"], dtype=int).head()

Unnamed: 0,make,feul_type,aspiration,num_doors,body_style,engine_location,engine_type,num_cylinders,fuel_system,body_style_cat,drive_wheels_4wd,drive_wheels_fwd,drive_wheels_rwd
0,alfa-romero,gas,std,2,convertible,front,dohc,4,mpfi,0,0,0,1
1,alfa-romero,gas,std,2,convertible,front,dohc,4,mpfi,0,0,0,1
2,alfa-romero,gas,std,2,hatchback,front,ohcv,6,mpfi,2,0,0,1
3,audi,gas,std,4,sedan,front,ohc,4,mpfi,3,0,1,0
4,audi,gas,std,4,sedan,front,ohc,5,mpfi,3,1,0,0


In [50]:
# The new data set contains three new columns:

# drive_wheels_4wd
# drive_wheels_rwd
# drive_wheels_fwd

In [51]:
# This function is powerful because you can pass as many category columns as you would like and choose how to label the columns using prefix.
# Proper naming will make the rest of the analysis just a little bit easier.

In [53]:
# Expand both body_style and drive_wheels into indicator columns, using the
# shorter "body" / "drive" prefixes (matched to `columns` by position) for the
# generated column names.
# dtype=int pins the indicators to 1/0: newer pandas releases default to
# boolean True/False columns, which would not match the 1/0 described here.
pd.get_dummies(
    obj_df,
    columns=["body_style", "drive_wheels"],
    prefix=["body", "drive"],
    dtype=int,
).head()

Unnamed: 0,make,feul_type,aspiration,num_doors,engine_location,engine_type,num_cylinders,fuel_system,body_style_cat,body_convertible,body_hardtop,body_hatchback,body_sedan,body_wagon,drive_4wd,drive_fwd,drive_rwd
0,alfa-romero,gas,std,2,front,dohc,4,mpfi,0,1,0,0,0,0,0,0,1
1,alfa-romero,gas,std,2,front,dohc,4,mpfi,0,1,0,0,0,0,0,0,1
2,alfa-romero,gas,std,2,front,ohcv,6,mpfi,2,0,0,1,0,0,0,0,1
3,audi,gas,std,4,front,ohc,4,mpfi,3,0,0,0,1,0,0,1,0
4,audi,gas,std,4,front,ohc,5,mpfi,3,0,0,0,1,0,1,0,0


In [54]:
# One hot encoding is very useful, but it can cause the number of columns to expand greatly
# if you have very many unique values in a column. For the number of values in this example,
# it is not a problem. However, you can see how this gets really challenging to manage when you have many more options.

# Approach #4 - Custom Binary Encoding

In [55]:
# In this particular data set, there is a column called engine_type that contains several different values:

In [60]:
# Tally the distinct engine_type values and how often each one occurs.
obj_df["engine_type"].value_counts()

ohc      148
ohcf      15
ohcv      13
l         12
dohc      12
rotor      4
dohcv      1
Name: engine_type, dtype: int64