# Handling Categorical Encodings

> ## 1. Replace Values

In [3]:
import pandas as pd
import numpy as np
import copy
%matplotlib inline

In [4]:
# Load the flights dataset from the CSV file in the working directory.
df_flights = pd.read_csv(filepath_or_buffer='data.csv')

# Preview the first five rows to check the columns that were read.
df_flights.head(n=5)

Unnamed: 0,year,month,day,dep_time,dep_delay,arr_time,arr_delay,carrier,tailnum,flight,origin,dest,air_time,distance,hour,minute
0,2014,1,1,1.0,96.0,235.0,70.0,AS,N508AS,145,PDX,ANC,194.0,1542,0.0,1.0
1,2014,1,1,4.0,-6.0,738.0,-23.0,US,N195UW,1830,SEA,CLT,252.0,2279,0.0,4.0
2,2014,1,1,8.0,13.0,548.0,-4.0,UA,N37422,1609,PDX,IAH,201.0,1825,0.0,8.0
3,2014,1,1,28.0,-2.0,800.0,-23.0,US,N547UW,466,PDX,CLT,251.0,2282,0.0,28.0
4,2014,1,1,34.0,44.0,325.0,43.0,AS,N762AS,121,SEA,ANC,201.0,1448,0.0,34.0


In [12]:
# Keep only the object-dtype columns; .copy() gives an independent frame so
# later edits to cat_df_flights do not touch df_flights.
cat_df_flights = df_flights.select_dtypes(include='object').copy()

# Show the last five rows of the categorical subset.
cat_df_flights.tail(n=5)

Unnamed: 0,carrier,tailnum,origin,dest
162044,UA,N477UA,PDX,IAH
162045,DL,N137DL,SEA,ATL
162046,AS,N530AS,SEA,SAN
162047,US,,PDX,PHX
162048,WN,N790SW,SEA,SMF


In [7]:
# Total number of missing cells across all categorical columns.
print(cat_df_flights.isna().to_numpy().sum())

248


In [13]:
# value_counts() sorts by descending frequency, so the first index label is
# the most frequent tail number.
(
    cat_df_flights.loc[:, 'tailnum']
    .value_counts()
    .index[0]
)

'N218AG'

In [14]:
# Impute missing tail numbers with the most frequent tail number.
# A {column: value} mapping restricts the fill to 'tailnum'; a bare scalar
# would also write the tail number into any missing carrier/origin/dest cell.
cat_df_flights = cat_df_flights.fillna(
    {'tailnum': cat_df_flights['tailnum'].value_counts().index[0]}
)
print(cat_df_flights)

       carrier tailnum origin dest
0           AS  N508AS    PDX  ANC
1           US  N195UW    SEA  CLT
2           UA  N37422    PDX  IAH
3           US  N547UW    PDX  CLT
4           AS  N762AS    SEA  ANC
...        ...     ...    ...  ...
162044      UA  N477UA    PDX  IAH
162045      DL  N137DL    SEA  ATL
162046      AS  N530AS    SEA  SAN
162047      US  N218AG    PDX  PHX
162048      WN  N790SW    SEA  SMF

[162049 rows x 4 columns]


In [21]:
# Convert 'carrier' and 'origin' to pandas' category dtype on a separate copy,
# leaving cat_df_flights itself with plain object columns.
cat_df_flights_lc = cat_df_flights.copy().astype(
    {'carrier': 'category', 'origin': 'category'}
)

# 'tailnum' and 'dest' are not converted and keep their object dtype.
print(cat_df_flights_lc.dtypes)

carrier    category
tailnum      object
origin     category
dest         object
dtype: object


> ## 2. Label Encoding

In [20]:
# Preview the first five rows of the categorical frame before encoding.
cat_df_flights.head(n=5)

Unnamed: 0,carrier,tailnum,origin,dest
0,AS,N508AS,PDX,ANC
1,US,N195UW,SEA,CLT
2,UA,N37422,PDX,IAH
3,US,N547UW,PDX,CLT
4,AS,N762AS,SEA,ANC


In [18]:
from sklearn.preprocessing import LabelEncoder

# Encode on a copy so cat_df_flights keeps only its original columns.
cat_df_flights_sklearn = cat_df_flights.copy()

# Learn the distinct carrier labels, then map every row to its integer code.
lb_make = LabelEncoder()
lb_make.fit(cat_df_flights['carrier'])
cat_df_flights_sklearn['carrier_code'] = lb_make.transform(cat_df_flights['carrier'])

# The encoded values are appended as a new 'carrier_code' column.
cat_df_flights_sklearn.head(n=5)

Unnamed: 0,carrier,tailnum,origin,dest,carrier_code
0,AS,N508AS,PDX,ANC,1
1,US,N195UW,SEA,CLT,8
2,UA,N37422,PDX,IAH,7
3,US,N547UW,PDX,CLT,8
4,AS,N762AS,SEA,ANC,1


In [19]:
# The source frame is unchanged: 'carrier_code' was added only to the
# cat_df_flights_sklearn copy.
cat_df_flights.head(n=5)

Unnamed: 0,carrier,tailnum,origin,dest
0,AS,N508AS,PDX,ANC
1,US,N195UW,SEA,CLT
2,UA,N37422,PDX,IAH
3,US,N547UW,PDX,CLT
4,AS,N762AS,SEA,ANC


> ## 3. Dummy Encoding

In [20]:
# One-hot (dummy) encode 'carrier': the column is replaced by one
# 'carrier_<value>' indicator column per distinct carrier.
# pd.get_dummies returns a new frame and leaves cat_df_flights untouched, so
# no defensive copy is needed first.
# dtype=int pins the indicators to 0/1 integers; recent pandas releases
# default to booleans (True/False) instead.
cat_df_flights_onehot = pd.get_dummies(
    cat_df_flights,
    columns=['carrier'],
    prefix=['carrier'],
    dtype=int,
)

cat_df_flights_onehot.head()

Unnamed: 0,tailnum,origin,dest,carrier_AA,carrier_AS,carrier_B6,carrier_DL,carrier_F9,carrier_HA,carrier_OO,carrier_UA,carrier_US,carrier_VX,carrier_WN
0,N508AS,PDX,ANC,0,1,0,0,0,0,0,0,0,0,0
1,N195UW,SEA,CLT,0,0,0,0,0,0,0,0,1,0,0
2,N37422,PDX,IAH,0,0,0,0,0,0,0,1,0,0,0
3,N547UW,PDX,CLT,0,0,0,0,0,0,0,0,1,0,0
4,N762AS,SEA,ANC,0,1,0,0,0,0,0,0,0,0,0


> ## 4. Binary Encoding

In [23]:
import category_encoders as ce

# Hand the encoder its own copy of the categorical frame.
cat_df_flights_ce = cat_df_flights.copy()

# Binary-encode only the 'carrier' column; the other columns pass through.
encoder = ce.BinaryEncoder(cols=['carrier'])
df_binary = encoder.fit_transform(cat_df_flights_ce)

# Preview the first five rows of the encoded frame.
df_binary.head(n=5)

Unnamed: 0,carrier_0,carrier_1,carrier_2,carrier_3,carrier_4,tailnum,origin,dest
0,0,0,0,0,1,N508AS,PDX,ANC
1,0,0,0,1,0,N195UW,SEA,CLT
2,0,0,0,1,1,N37422,PDX,IAH
3,0,0,0,1,0,N547UW,PDX,CLT
4,0,0,0,0,1,N762AS,SEA,ANC


# You can explore the normalization/standardization of numerical features