# One Hot Encoding - Features with many Categories

In [2]:
import pandas as pd
import numpy as np

## Load the dataset, keeping only the six categorical columns used below
categorical_columns = ['X1','X2','X3','X4','X5','X6']
data = pd.read_csv(
    '/home/ashish/projects/One_Hot_Encoding-for-MultiCategorical_Feature/mercedes.csv',
    usecols=categorical_columns,
)
data.head(10)

Unnamed: 0,X1,X2,X3,X4,X5,X6
0,v,at,a,d,u,j
1,t,av,e,d,y,l
2,w,n,c,d,x,j
3,t,n,f,d,x,l
4,v,n,f,d,h,d
5,b,e,c,d,g,h
6,r,e,f,d,f,h
7,l,as,f,d,f,j
8,s,as,e,d,f,i
9,b,aq,c,d,f,a


In [3]:
## Number of distinct labels in each feature
for feature in data.columns:
    distinct_labels = data[feature].unique()
    n_labels = len(distinct_labels)
    print(feature, ': ', n_labels, ' labels')

X1 :  27  labels
X2 :  44  labels
X3 :  7  labels
X4 :  4  labels
X5 :  29  labels
X6 :  12  labels


## Performing simple one-hot encoding

In [4]:
# Shape of the raw frame, reported before any encoding is applied
shape_before = data.shape
print("Before One Hot Encoding we have the shape as : ", shape_before)

Before One Hot Encoding we have the shape as :  (4209, 6)


In [5]:
# One-hot encode every column; drop_first=True drops the first level of each
# feature, so a feature with k labels contributes k-1 indicator columns.
data_dummies = pd.get_dummies(data,drop_first=True)
# This is the shape AFTER encoding (the message previously said "Before").
print("After One Hot Encoding we have the shape as : ",data_dummies.shape)

Before One Hot Encoding we have the shape as :  (4209, 117)


So we now have 111 more columns/features than before (117 - 6). Such a large increase in the number of features leads to the curse of dimensionality, which degrades the performance of many ML algorithms.

# So we will use another method, shown below:

In [6]:
## Let's look at the 20 most frequent categories of the 'X2' feature

x2_counts = data['X2'].value_counts()
x2_counts.sort_values(ascending=False).head(20)

as    1659
ae     496
ai     415
m      367
ak     265
r      153
n      137
s       94
f       87
e       81
aq      63
ay      54
a       47
t       29
i       25
k       25
b       21
ao      20
ag      19
z       19
Name: X2, dtype: int64

In [7]:
## The 10 most frequent categories of every feature
## (a feature with fewer than 10 categories simply yields all of them)

def _most_frequent_labels(column, how_many=10):
    """Return the labels of data[column], most frequent first, capped at `how_many`."""
    counts = data[column].value_counts().sort_values(ascending=False)
    return list(counts.head(how_many).index)

top_ten_x1 = _most_frequent_labels('X1')
top_ten_x2 = _most_frequent_labels('X2')
top_ten_x3 = _most_frequent_labels('X3')
top_ten_x4 = _most_frequent_labels('X4')
top_ten_x5 = _most_frequent_labels('X5')
top_ten_x6 = _most_frequent_labels('X6')

# One list per line, in feature order X1..X6
for frequent_labels in (top_ten_x1, top_ten_x2, top_ten_x3,
                        top_ten_x4, top_ten_x5, top_ten_x6):
    print(frequent_labels)

['aa', 's', 'b', 'l', 'v', 'r', 'i', 'a', 'c', 'o']
['as', 'ae', 'ai', 'm', 'ak', 'r', 'n', 's', 'f', 'e']
['c', 'f', 'a', 'd', 'g', 'e', 'b']
['d', 'a', 'c', 'b']
['w', 'v', 'q', 'r', 'd', 's', 'n', 'm', 'p', 'i']
['g', 'j', 'd', 'i', 'l', 'a', 'h', 'k', 'c', 'b']


In [8]:
## Add 0/1 dummy columns for the selected labels of one categorical variable
def one_hot_top_ten(df,variable,ten_labels):
    """Append one indicator column per label in ``ten_labels`` to ``df``.

    For every ``label`` a new column named ``<variable>_<label>`` is written
    into ``df``; it holds 1 where ``df[variable] == label`` and 0 elsewhere.
    Rows whose value is not in ``ten_labels`` therefore get 0 in every new
    column.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to read ``variable`` from and to add the new columns to.
        It is modified in place.
    variable : str
        Name of the categorical column in ``df`` to encode.
    ten_labels : iterable
        Labels to create indicator columns for.

    Returns
    -------
    None
    """
    for label in ten_labels:
        # Read from the frame that was passed in. The previous version read the
        # notebook-level `data` here, so any other frame was encoded wrongly.
        df[variable+'_'+str(label)] = np.where(df[variable]==label,1,0)

In [9]:
## Reload the raw categorical columns, then append the indicator columns for
## the most frequent labels of each feature
data = pd.read_csv(
    '/home/ashish/projects/One_Hot_Encoding-for-MultiCategorical_Feature/mercedes.csv',
    usecols=['X1','X2','X3','X4','X5','X6'],
)
labels_per_feature = [
    ('X1', top_ten_x1),
    ('X2', top_ten_x2),
    ('X3', top_ten_x3),
    ('X4', top_ten_x4),
    ('X5', top_ten_x5),
    ('X6', top_ten_x6),
]
for feature_name, frequent_labels in labels_per_feature:
    one_hot_top_ten(data, feature_name, frequent_labels)
data.head(20)

Unnamed: 0,X1,X2,X3,X4,X5,X6,X1_aa,X1_s,X1_b,X1_l,...,X6_g,X6_j,X6_d,X6_i,X6_l,X6_a,X6_h,X6_k,X6_c,X6_b
0,v,at,a,d,u,j,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0
1,t,av,e,d,y,l,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
2,w,n,c,d,x,j,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0
3,t,n,f,d,x,l,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
4,v,n,f,d,h,d,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
5,b,e,c,d,g,h,0,0,1,0,...,0,0,0,0,0,0,1,0,0,0
6,r,e,f,d,f,h,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
7,l,as,f,d,f,j,0,0,0,1,...,0,1,0,0,0,0,0,0,0,0
8,s,as,e,d,f,i,0,1,0,0,...,0,0,0,1,0,0,0,0,0,0
9,b,aq,c,d,f,a,0,0,1,0,...,0,0,0,0,0,1,0,0,0,0


In [10]:
# The original categorical columns are no longer needed once their indicator
# columns exist; remove them from `data` (in place).
encoded_source_columns = ['X1','X2','X3','X4','X5','X6']
data.drop(columns=encoded_source_columns, inplace=True)
data.head()

Unnamed: 0,X1_aa,X1_s,X1_b,X1_l,X1_v,X1_r,X1_i,X1_a,X1_c,X1_o,...,X6_g,X6_j,X6_d,X6_i,X6_l,X6_a,X6_h,X6_k,X6_c,X6_b
0,0,0,0,0,1,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
4,0,0,0,0,1,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0


In [11]:
## Display all the columns of the DataFrame fully (without hiding/skipping)

# Use the documented top-level entry point; `pd.pandas.set_option` only worked
# because the pandas package namespace happens to expose itself as `pandas`.
pd.set_option('display.max_columns',None)
data.head(20)

Unnamed: 0,X1_aa,X1_s,X1_b,X1_l,X1_v,X1_r,X1_i,X1_a,X1_c,X1_o,X2_as,X2_ae,X2_ai,X2_m,X2_ak,X2_r,X2_n,X2_s,X2_f,X2_e,X3_c,X3_f,X3_a,X3_d,X3_g,X3_e,X3_b,X4_d,X4_a,X4_c,X4_b,X5_w,X5_v,X5_q,X5_r,X5_d,X5_s,X5_n,X5_m,X5_p,X5_i,X6_g,X6_j,X6_d,X6_i,X6_l,X6_a,X6_h,X6_k,X6_c,X6_b
0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0
4,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0
5,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0
6,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0
7,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0
8,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0
9,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0


## Advantages of one-hot encoding only the most frequent categories
    1. Straightforward to implement
    2. Does not require hours of variable exploration
    3. Does not massively expand the feature space

## Disadvantages of one-hot encoding only the most frequent categories
    1. Does not add any information that may make the variables more predictive
    2. Does not keep the information of the ignored labels (which may be important)