# ONE-HOT ENCODING WITH MANY CATEGORICAL VARIABLES

In [56]:
import pandas as pd
import numpy as np

In [57]:
df=pd.read_csv('train.csv',usecols=['X1','X2'])
df

Unnamed: 0,X1,X2
0,v,at
1,t,av
2,w,n
3,t,n
4,v,n
...,...,...
4204,s,as
4205,o,t
4206,v,r
4207,r,e


In [58]:
df.shape

(4209, 2)

In [59]:
for col in df:
    print(df[col].unique())

['v' 't' 'w' 'b' 'r' 'l' 's' 'aa' 'c' 'a' 'e' 'h' 'z' 'j' 'o' 'u' 'p' 'n'
 'i' 'y' 'd' 'f' 'm' 'k' 'g' 'q' 'ab']
['at' 'av' 'n' 'e' 'as' 'aq' 'r' 'ai' 'ak' 'm' 'a' 'k' 'ae' 's' 'f' 'd'
 'ag' 'ay' 'ac' 'ap' 'g' 'i' 'aw' 'y' 'b' 'ao' 'al' 'h' 'x' 'au' 't' 'an'
 'z' 'ah' 'p' 'am' 'j' 'q' 'af' 'l' 'aa' 'c' 'o' 'ar']


In [60]:
df['X1'].unique()

array(['v', 't', 'w', 'b', 'r', 'l', 's', 'aa', 'c', 'a', 'e', 'h', 'z',
       'j', 'o', 'u', 'p', 'n', 'i', 'y', 'd', 'f', 'm', 'k', 'g', 'q',
       'ab'], dtype=object)

In [61]:
df['X2'].unique()

array(['at', 'av', 'n', 'e', 'as', 'aq', 'r', 'ai', 'ak', 'm', 'a', 'k',
       'ae', 's', 'f', 'd', 'ag', 'ay', 'ac', 'ap', 'g', 'i', 'aw', 'y',
       'b', 'ao', 'al', 'h', 'x', 'au', 't', 'an', 'z', 'ah', 'p', 'am',
       'j', 'q', 'af', 'l', 'aa', 'c', 'o', 'ar'], dtype=object)

In [62]:
len(df['X1'].unique())

27

In [63]:
len(df['X2'].unique())

44

In [64]:
# Let's look at how many labels each variable has

for col in df.columns:
    print(col,' : ',len(df[col].unique()),'labels')

X1  :  27 labels
X2  :  44 labels


In [65]:
df.shape

(4209, 2)

In [66]:
# let's examine how many columns we will obtain after one hot encoding these variables
pd.get_dummies(df,drop_first=True).shape

(4209, 69)

We can observe that from with just 2 categorical features we are getting 69 features with the help of one hot encoding.It is unnecessarily columns which can lead to curve of dimensionality.

To solve this problem,we can only include most frequent labels and skip the remaining labels

In [67]:
#Let see the most frequent label for  the column X2
df.X2.value_counts().sort_values(ascending=False).head(20)

as    1659
ae     496
ai     415
m      367
ak     265
r      153
n      137
s       94
f       87
e       81
aq      63
ay      54
a       47
t       29
i       25
k       25
b       21
ao      20
ag      19
z       19
Name: X2, dtype: int64

In [68]:
# Let's make a list with the most frequent categories of the  variables
top_10_labels=[y for y in df.X2.value_counts().sort_values(ascending=False).head(10).index]
top_10_labels

['as', 'ae', 'ai', 'm', 'ak', 'r', 'n', 's', 'f', 'e']

In [69]:
# get whole set of dummmy variables, for all the categorical variables

def one_hot_encoding(df,variables,top_x_labels):
    # function to create the dummy variables for the most frequent labels
    # we can vary the number of most frequent labels that we encode
    for labels in top_x_labels:
        df[variables+'_'+labels]=np.where(df[variables]==labels,1,0)

In [70]:
# Read the data again

df=pd.read_csv('train.csv',usecols=['X1','X2'])

# Encode X2 into the most frequent variables
one_hot_encoding(df,'X1',top_10_labels)
df.head()

Unnamed: 0,X1,X2,X1_as,X1_ae,X1_ai,X1_m,X1_ak,X1_r,X1_n,X1_s,X1_f,X1_e
0,v,at,0,0,0,0,0,0,0,0,0,0
1,t,av,0,0,0,0,0,0,0,0,0,0
2,w,n,0,0,0,0,0,0,0,0,0,0
3,t,n,0,0,0,0,0,0,0,0,0,0
4,v,n,0,0,0,0,0,0,0,0,0,0


In [71]:
# Encode X1 into the most frequent variables
one_hot_encoding(df,'X2',top_10_labels)
df.head(20)

Unnamed: 0,X1,X2,X1_as,X1_ae,X1_ai,X1_m,X1_ak,X1_r,X1_n,X1_s,...,X2_as,X2_ae,X2_ai,X2_m,X2_ak,X2_r,X2_n,X2_s,X2_f,X2_e
0,v,at,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,t,av,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,w,n,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
3,t,n,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
4,v,n,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
5,b,e,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
6,r,e,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,1
7,l,as,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
8,s,as,0,0,0,0,0,0,0,1,...,1,0,0,0,0,0,0,0,0,0
9,b,aq,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [72]:
df.drop(columns=['X1','X2'],axis=1)

Unnamed: 0,X1_as,X1_ae,X1_ai,X1_m,X1_ak,X1_r,X1_n,X1_s,X1_f,X1_e,X2_as,X2_ae,X2_ai,X2_m,X2_ak,X2_r,X2_n,X2_s,X2_f,X2_e
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0
3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0
4,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4204,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0
4205,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4206,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0
4207,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1


In this way we can reduce the unnecessary feature dimensional space 