In [1]:
import pandas as pd
import numpy as np

# Column names for the UCI "imports-85" file, which ships without a header row.
# They are applied positionally, so the order here must follow the raw file.
headers = [
    "symboling", "normalized_losses", "make",
    "fuel_type", "aspiration", "num_doors", "body_style",
    "drive_wheels", "engine_location",
    "wheel_base", "length", "width", "height", "curb_weight",
    "engine_type", "num_cylinders", "engine_size", "fuel_system",
    "bore", "stroke", "compression_ratio",
    "horsepower", "peak_rpm", "city_mpg", "highway_mpg",
    "price",
]

# Load the raw data; the file marks missing values with "?", so read those as NaN.
DATA_URL = "https://archive.ics.uci.edu/ml/machine-learning-databases/autos/imports-85.data"
df = pd.read_csv(DATA_URL, names=headers, header=None, na_values="?")
df.head()

Unnamed: 0,symboling,normalized_losses,make,fuel_type,aspiration,num_doors,body_style,drive_wheels,engine_location,wheel_base,...,engine_size,fuel_system,bore,stroke,compression_ratio,horsepower,peak_rpm,city_mpg,highway_mpg,price
0,3,,alfa-romero,gas,std,two,convertible,rwd,front,88.6,...,130,mpfi,3.47,2.68,9.0,111.0,5000.0,21,27,13495.0
1,3,,alfa-romero,gas,std,two,convertible,rwd,front,88.6,...,130,mpfi,3.47,2.68,9.0,111.0,5000.0,21,27,16500.0
2,1,,alfa-romero,gas,std,two,hatchback,rwd,front,94.5,...,152,mpfi,2.68,3.47,9.0,154.0,5000.0,19,26,16500.0
3,2,164.0,audi,gas,std,four,sedan,fwd,front,99.8,...,109,mpfi,3.19,3.4,10.0,102.0,5500.0,24,30,13950.0
4,2,164.0,audi,gas,std,four,sedan,4wd,front,99.4,...,136,mpfi,3.19,3.4,8.0,115.0,5500.0,18,22,17450.0


In [3]:
# List each column's dtype; the text columns are reported as `object`.
df.dtypes

symboling              int64
normalized_losses    float64
make                  object
fuel_type             object
aspiration            object
num_doors             object
body_style            object
drive_wheels          object
engine_location       object
wheel_base           float64
length               float64
width                float64
height               float64
curb_weight            int64
engine_type           object
num_cylinders         object
engine_size            int64
fuel_system           object
bore                 float64
stroke               float64
compression_ratio    float64
horsepower           float64
peak_rpm             float64
city_mpg               int64
highway_mpg            int64
price                float64
dtype: object

In [4]:
## We will focus only on the categorical variables, so we keep just the object-dtype columns

In [5]:
# Keep only the object-dtype (text) columns and work on an independent copy,
# so later edits to obj_df never touch df.
categorical_only = df.select_dtypes(include=['object'])
obj_df = categorical_only.copy()
obj_df.head()

Unnamed: 0,make,fuel_type,aspiration,num_doors,body_style,drive_wheels,engine_location,engine_type,num_cylinders,fuel_system
0,alfa-romero,gas,std,two,convertible,rwd,front,dohc,four,mpfi
1,alfa-romero,gas,std,two,convertible,rwd,front,dohc,four,mpfi
2,alfa-romero,gas,std,two,hatchback,rwd,front,ohcv,six,mpfi
3,audi,gas,std,four,sedan,fwd,front,ohc,four,mpfi
4,audi,gas,std,four,sedan,4wd,front,ohc,five,mpfi


In [7]:
# Show every row that has a missing value in at least one column.
has_missing = obj_df.isnull().any(axis=1)
obj_df[has_missing]

Unnamed: 0,make,fuel_type,aspiration,num_doors,body_style,drive_wheels,engine_location,engine_type,num_cylinders,fuel_system
27,dodge,gas,turbo,,sedan,fwd,front,ohc,four,mpfi
63,mazda,diesel,std,,sedan,fwd,front,ohc,four,idi


In [9]:
# Count how often each num_doors value occurs.
obj_df['num_doors'].value_counts()

num_doors
four    114
two      89
Name: count, dtype: int64

In [16]:
# Fill the missing num_doors entries with 'four' (the most frequent value) and
# assign the result back to the column. Calling fillna(..., inplace=True) on a
# column selection is chained assignment: it is deprecated in pandas 2.x and
# has no effect on obj_df once Copy-on-Write is enabled.
obj_df['num_doors'] = obj_df['num_doors'].fillna('four')
obj_df.head()

Unnamed: 0,make,fuel_type,aspiration,num_doors,body_style,drive_wheels,engine_location,engine_type,num_cylinders,fuel_system
0,alfa-romero,gas,std,two,convertible,rwd,front,dohc,four,mpfi
1,alfa-romero,gas,std,two,convertible,rwd,front,dohc,four,mpfi
2,alfa-romero,gas,std,two,hatchback,rwd,front,ohcv,six,mpfi
3,audi,gas,std,four,sedan,fwd,front,ohc,four,mpfi
4,audi,gas,std,four,sedan,4wd,front,ohc,five,mpfi


## Approach 1 - Find and Replace

In [15]:
# Pandas makes it easy to replace text values directly with their numeric equivalents by using the `replace` method

In [17]:
# Count how often each num_cylinders value occurs; the values are number words.
obj_df['num_cylinders'].value_counts()

num_cylinders
four      159
six        24
five       11
eight       5
two         4
three       1
twelve      1
Name: count, dtype: int64

In [19]:
# Re-check the num_doors counts now that the missing values have been filled.
obj_df['num_doors'].value_counts()

num_doors
four    116
two      89
Name: count, dtype: int64

In [20]:
# Number words that occur in the data, with their integer values.
_WORD_TO_INT = {"two": 2, "three": 3, "four": 4, "five": 5,
                "six": 6, "eight": 8, "twelve": 12}

# Per-column replacement tables, in the {column: {old: new}} form that
# DataFrame.replace accepts.
cleanup_nums = {
    'num_doors': {word: _WORD_TO_INT[word] for word in ("four", "two")},
    'num_cylinders': {
        word: _WORD_TO_INT[word]
        for word in ("four", "six", "five", "eight", "two", "twelve", "three")
    },
}

In [22]:
# Swap the number words for integers using the cleanup_nums mapping, then cast
# the affected columns to int64 explicitly. Relying on replace() to downcast
# object columns by itself is deprecated in pandas 2.2; without the cast the
# columns would stay `object` once that behaviour is removed.
obj_df = obj_df.replace(cleanup_nums).astype({col: 'int64' for col in cleanup_nums})
obj_df

Unnamed: 0,make,fuel_type,aspiration,num_doors,body_style,drive_wheels,engine_location,engine_type,num_cylinders,fuel_system
0,alfa-romero,gas,std,2,convertible,rwd,front,dohc,4,mpfi
1,alfa-romero,gas,std,2,convertible,rwd,front,dohc,4,mpfi
2,alfa-romero,gas,std,2,hatchback,rwd,front,ohcv,6,mpfi
3,audi,gas,std,4,sedan,fwd,front,ohc,4,mpfi
4,audi,gas,std,4,sedan,4wd,front,ohc,5,mpfi
...,...,...,...,...,...,...,...,...,...,...
200,volvo,gas,std,4,sedan,rwd,front,ohc,4,mpfi
201,volvo,gas,turbo,4,sedan,rwd,front,ohc,4,mpfi
202,volvo,gas,std,4,sedan,rwd,front,ohcv,6,mpfi
203,volvo,diesel,turbo,4,sedan,rwd,front,ohc,6,idi


In [23]:
# Confirm that num_doors and num_cylinders are now integer columns.
obj_df.dtypes

make               object
fuel_type          object
aspiration         object
num_doors           int64
body_style         object
drive_wheels       object
engine_location    object
engine_type        object
num_cylinders       int64
fuel_system        object
dtype: object

# Approach 2 Label Encoding

In [24]:
# Label encoding is simply converting each value in a column to a number. For example, the body_style column contains 5 different values. We could choose to encode it like this:

In [25]:
# convertible -> 0
#hardtop -> 1
#hatchback -> 2
#sedan -> 3
#wagon -> 4

# First we will convert the given column to the category dtype

In [26]:
# Store body_style as a pandas categorical so its integer codes are available.
obj_df = obj_df.astype({'body_style': 'category'})

In [27]:
# Confirm that body_style is now reported as `category`.
obj_df.dtypes

make                 object
fuel_type            object
aspiration           object
num_doors             int64
body_style         category
drive_wheels         object
engine_location      object
engine_type          object
num_cylinders         int64
fuel_system          object
dtype: object

In [28]:
# We will assign the encoded values to a new column using the cat.codes accessor

In [30]:
# Label-encode body_style: every category is represented by its integer code.
body_style_codes = obj_df['body_style'].cat.codes
obj_df['body_style_cat'] = body_style_codes
obj_df.head()

Unnamed: 0,make,fuel_type,aspiration,num_doors,body_style,drive_wheels,engine_location,engine_type,num_cylinders,fuel_system,body_style_cat
0,alfa-romero,gas,std,2,convertible,rwd,front,dohc,4,mpfi,0
1,alfa-romero,gas,std,2,convertible,rwd,front,dohc,4,mpfi,0
2,alfa-romero,gas,std,2,hatchback,rwd,front,ohcv,6,mpfi,2
3,audi,gas,std,4,sedan,fwd,front,ohc,4,mpfi,3
4,audi,gas,std,4,sedan,4wd,front,ohc,5,mpfi,3


# Approach 3 One Hot Encoding

In [31]:
# Label encoding has the advantage of being straightforward, but the numeric values can be misinterpreted by algorithms (for example, as having an order or magnitude)


In [32]:
# Here we convert each category value into a new column and assign it a value of 1 or 0 (True or False)
# We use pd.get_dummies for this

In [40]:
# One-hot encode drive_wheels: one indicator column per distinct value.
drive_wheels_encoded = pd.get_dummies(obj_df, columns=['drive_wheels'])
drive_wheels_encoded.head()

Unnamed: 0,make,fuel_type,aspiration,num_doors,body_style,engine_location,engine_type,num_cylinders,fuel_system,body_style_cat,drive_wheels_4wd,drive_wheels_fwd,drive_wheels_rwd
0,alfa-romero,gas,std,2,convertible,front,dohc,4,mpfi,0,False,False,True
1,alfa-romero,gas,std,2,convertible,front,dohc,4,mpfi,0,False,False,True
2,alfa-romero,gas,std,2,hatchback,front,ohcv,6,mpfi,2,False,False,True
3,audi,gas,std,4,sedan,front,ohc,4,mpfi,3,False,True,False
4,audi,gas,std,4,sedan,front,ohc,5,mpfi,3,True,False,False


In [41]:
# We can also pass a prefix to use for the new column names of each encoded column

In [43]:
# One-hot encode both columns with short prefixes. dtype=int makes get_dummies
# emit 0/1 integers directly, instead of producing booleans and then running a
# frame-wide replace({True: 1, False: 0}) over every column.
pd.get_dummies(obj_df, columns=['body_style', 'drive_wheels'],
               prefix=['body', 'drive'], dtype=int)

Unnamed: 0,make,fuel_type,aspiration,num_doors,engine_location,engine_type,num_cylinders,fuel_system,body_style_cat,body_convertible,body_hardtop,body_hatchback,body_sedan,body_wagon,drive_4wd,drive_fwd,drive_rwd
0,alfa-romero,gas,std,2,front,dohc,4,mpfi,0,1,0,0,0,0,0,0,1
1,alfa-romero,gas,std,2,front,dohc,4,mpfi,0,1,0,0,0,0,0,0,1
2,alfa-romero,gas,std,2,front,ohcv,6,mpfi,2,0,0,1,0,0,0,0,1
3,audi,gas,std,4,front,ohc,4,mpfi,3,0,0,0,1,0,0,1,0
4,audi,gas,std,4,front,ohc,5,mpfi,3,0,0,0,1,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
200,volvo,gas,std,4,front,ohc,4,mpfi,3,0,0,0,1,0,0,0,1
201,volvo,gas,turbo,4,front,ohc,4,mpfi,3,0,0,0,1,0,0,0,1
202,volvo,gas,std,4,front,ohcv,6,mpfi,3,0,0,0,1,0,0,0,1
203,volvo,diesel,turbo,4,front,ohc,6,idi,3,0,0,0,1,0,0,0,1


In [44]:
# Remember that get_dummies returns the full DataFrame, not just the new columns.
# One-hot encoding is very useful, but it can cause the number of columns to expand greatly if a column has many unique values.


## Approach 4 Custom binary Encoding

In [45]:
# We can use some combination of the label encoding and the one hot encoding

In [46]:
# Count how often each engine_type value occurs.
obj_df['engine_type'].value_counts()

engine_type
ohc      148
ohcf      15
ohcv      13
dohc      12
l         12
rotor      4
dohcv      1
Name: count, dtype: int64

In [47]:
# There are various versions of the OHC engine (ohc, ohcf, ohcv, dohc, dohcv)
# We can use the str accessor plus np.where to create a column that indicates whether or not the car has an OHC engine


In [48]:
# Code engines whose type contains "ohc" (ohc, ohcf, ohcv, dohc, dohcv) as 1
# and everything else as 0. na=False keeps a missing engine_type from being
# coded as 1 (np.where treats NaN as truthy); regex=False matches the text
# literally.
obj_df['OHC_code'] = np.where(
    obj_df['engine_type'].str.contains('ohc', na=False, regex=False), 1, 0)


In [49]:
# Spot-check the new flag next to the engine_type it was derived from.
obj_df[['make','engine_type','OHC_code']].head()

Unnamed: 0,make,engine_type,OHC_code
0,alfa-romero,dohc,1
1,alfa-romero,dohc,1
2,alfa-romero,ohcv,1
3,audi,ohc,1
4,audi,ohc,1


## Scikit Learn

In [50]:
# The approaches above encode feature columns using pandas alone.
# scikit-learn offers equivalent encoders for feature columns:
# OrdinalEncoder and OneHotEncoder (its LabelEncoder is meant for target values).
# If we want to do the equivalent of label encoding on the make of the car,
# we create an OrdinalEncoder and call fit_transform on that column.

In [51]:
from sklearn.preprocessing import OrdinalEncoder

# Ordinal-encode `make`: each distinct make is mapped to a float code.
# The encoder expects 2-D input, hence the double brackets around 'make'.
ord_enc = OrdinalEncoder()
make_codes = ord_enc.fit_transform(obj_df[['make']])
obj_df['make_code'] = make_codes
obj_df[['make', 'make_code']].head(11)

Unnamed: 0,make,make_code
0,alfa-romero,0.0
1,alfa-romero,0.0
2,alfa-romero,0.0
3,audi,1.0
4,audi,1.0
5,audi,1.0
6,audi,1.0
7,audi,1.0
8,audi,1.0
9,audi,1.0


In [52]:
from sklearn.preprocessing import OneHotEncoder

# One-hot encode body_style with scikit-learn.
oe_style = OneHotEncoder()
oe_results = oe_style.fit_transform(obj_df[['body_style']])
# categories_ is a list with one array per encoded column, so take the single
# body_style array for the column labels. Passing the whole list makes pandas
# build a MultiIndex whose labels are tuples such as ('convertible',).
pd.DataFrame(oe_results.toarray(), columns=oe_style.categories_[0]).head()

Unnamed: 0,convertible,hardtop,hatchback,sedan,wagon
0,1.0,0.0,0.0,0.0,0.0
1,1.0,0.0,0.0,0.0,0.0
2,0.0,0.0,1.0,0.0,0.0
3,0.0,0.0,0.0,1.0,0.0
4,0.0,0.0,0.0,1.0,0.0


In [53]:
# toarray() is used to convert the results to a format that can be converted into a dataframe


In [54]:
# Now we will join this data back to the original dataframe


In [55]:
# Build the one-hot frame with plain string column labels (categories_[0])
# and obj_df's own index so the join aligns row for row. Passing the whole
# categories_ list would add tuple labels such as ('convertible',) to obj_df.
body_style_onehot = pd.DataFrame(oe_results.toarray(),
                                 columns=oe_style.categories_[0],
                                 index=obj_df.index)
obj_df = obj_df.join(body_style_onehot)

# Advanced Approaches

In [56]:
# These are some other types of encoding:
# 1. BackwardDifferenceEncoder
# 2. PolynomialEncoder

In [57]:
# NOTE: category_encoders is a third-party package; it must be installed in the
# kernel's environment, otherwise this import raises ModuleNotFoundError.
import category_encoders as ce
# Get a new clean dataframe

obj_df  = df.select_dtypes(include=['object']).copy()

ModuleNotFoundError: No module named 'category_encoders'