### ONE HOT ENCODING - Variables With Many Categories

In [1]:
import pandas as pd
import numpy as np

In [2]:
# Load only the six categorical feature columns used in this walkthrough
feature_cols = ['X1', 'X2', 'X3', 'X4', 'X5', 'X6']
data = pd.read_csv('train.csv/train.csv', usecols=feature_cols)
data.head()

Unnamed: 0,X1,X2,X3,X4,X5,X6
0,v,at,a,d,u,j
1,t,av,e,d,y,l
2,w,n,c,d,x,j
3,t,n,f,d,x,l
4,v,n,f,d,h,d


In [3]:
# Confirm that only the six columns requested via `usecols` were loaded
data.columns

Index(['X1', 'X2', 'X3', 'X4', 'X5', 'X6'], dtype='object')

In [4]:
# Report the number of distinct labels (cardinality) of every variable
for col_name in data.columns:
    n_labels = data[col_name].unique().size
    print(col_name, ': ', f'{n_labels} labels')

X1 :  27 labels
X2 :  44 labels
X3 :  7 labels
X4 :  4 labels
X5 :  29 labels
X6 :  12 labels


In [5]:
# Let's see how many columns we would obtain by one-hot encoding every label of every variable
full_dummies = pd.get_dummies(data, drop_first=True)
full_dummies.shape

(4209, 117)

To handle categorical variables with this many labels, we one-hot encode only the most frequent categories of each variable and lump all the remaining ones together into a single "other" group. This works only when the categories within a feature are imbalanced, i.e. a few labels account for most of the rows.
When the categories of a feature all occur with roughly the same frequency, we cannot use this approach.

In [6]:
# Frequency of each X2 label, most common first (top 20 shown)
x2_counts = data['X2'].value_counts()
x2_counts.sort_values(ascending=False).head(20)

as    1659
ae     496
ai     415
m      367
ak     265
r      153
n      137
s       94
f       87
e       81
aq      63
ay      54
a       47
t       29
k       25
i       25
b       21
ao      20
ag      19
z       19
Name: X2, dtype: int64

In [7]:
# Collect the 15 most frequent labels of the X2 column.
# value_counts() already sorts by frequency in descending order, so an
# extra sort_values(ascending=False) is not needed here.
x2_label_counts = data['X2'].value_counts()
top_15 = list(x2_label_counts.index[:15])
top_15

['as',
 'ae',
 'ai',
 'm',
 'ak',
 'r',
 'n',
 's',
 'f',
 'e',
 'aq',
 'ay',
 'a',
 't',
 'k']

In [8]:
# Reference example for a single column (X2): for each of its top-15 labels,
# add a column holding 1 where X2 equals that label and 0 otherwise.
# The snippet is wrapped in a string literal, so it is NOT executed; the cell
# only echoes the string. The same idea is applied to all of X1...X6 further down.
'''for label in top_15:
    data[label] = np.where(data['X2']== label,1,0)

data[['X2'] + top_15].head(10)'''

"for label in top_15:\n    data[label] = np.where(data['X2']== label,1,0)\n\ndata[['X2'] + top_15].head(10)"

# now we need to do it for X1...X6

In [9]:
def one_hot_topX(df, v, top_X_labels):
    """One-hot encode column ``v`` of ``df`` for the given labels only, in place.

    For every label in ``top_X_labels`` a new integer column named
    ``<v>_<label>`` is added to ``df``. It holds 1 where ``df[v]`` equals the
    label and 0 elsewhere, so rows whose value is not among ``top_X_labels``
    get 0 in all of the new columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to encode; it is modified in place.
    v : str
        Name of the categorical column to encode.
    top_X_labels : iterable
        Labels for which indicator columns are created.

    Returns
    -------
    None
    """
    for label in top_X_labels:
        # Compare against the frame that was passed in, not the notebook-level
        # `data`, so the function works for any DataFrame. The f-string also
        # lets non-string labels (e.g. integers) form a valid column name.
        df[f'{v}_{label}'] = np.where(df[v] == label, 1, 0)

def topX(colname, df=None, n=15):
    """Return the ``n`` most frequent labels of a column, most frequent first.

    Parameters
    ----------
    colname : str
        Name of the column whose labels are ranked.
    df : pandas.DataFrame, optional
        Frame to read the column from. When omitted, the notebook-level
        ``data`` frame is used, which keeps ``topX(colname)`` calls working.
    n : int, default 15
        Maximum number of labels to return.

    Returns
    -------
    list
        Up to ``n`` labels ordered by descending frequency; fewer if the
        column has fewer distinct labels.
    """
    frame = data if df is None else df
    # value_counts() already sorts by descending frequency, so the first n
    # index entries are the n most common labels.
    return frame[colname].value_counts().head(n).index.tolist()

# Encode only the original categorical (non-numeric) columns. Selecting them by
# dtype keeps this cell safe to re-run: the 0/1 dummy columns created below are
# numeric, so on a later run they are skipped instead of being encoded again
# (which would fail when building the new column names).
categorical_cols = list(data.select_dtypes(exclude='number').columns)

for col in categorical_cols:
    one_hot_topX(data,col,topX(col))

data.head()

Unnamed: 0,X1,X2,X3,X4,X5,X6,X1_aa,X1_s,X1_b,X1_l,...,X6_d,X6_i,X6_l,X6_a,X6_h,X6_k,X6_c,X6_b,X6_f,X6_e
0,v,at,a,d,u,j,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,t,av,e,d,y,l,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
2,w,n,c,d,x,j,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,t,n,f,d,x,l,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
4,v,n,f,d,h,d,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0


In [10]:
# List all columns after encoding: the original features followed by the new
# <feature>_<label> indicator columns
data.columns

Index(['X1', 'X2', 'X3', 'X4', 'X5', 'X6', 'X1_aa', 'X1_s', 'X1_b', 'X1_l',
       'X1_v', 'X1_r', 'X1_i', 'X1_a', 'X1_c', 'X1_o', 'X1_w', 'X1_z', 'X1_u',
       'X1_e', 'X1_m', 'X2_as', 'X2_ae', 'X2_ai', 'X2_m', 'X2_ak', 'X2_r',
       'X2_n', 'X2_s', 'X2_f', 'X2_e', 'X2_aq', 'X2_ay', 'X2_a', 'X2_t',
       'X2_k', 'X3_c', 'X3_f', 'X3_a', 'X3_d', 'X3_g', 'X3_e', 'X3_b', 'X4_d',
       'X4_a', 'X4_b', 'X4_c', 'X5_w', 'X5_v', 'X5_q', 'X5_r', 'X5_s', 'X5_d',
       'X5_n', 'X5_p', 'X5_m', 'X5_i', 'X5_ae', 'X5_ag', 'X5_ac', 'X5_ab',
       'X5_l', 'X6_g', 'X6_j', 'X6_d', 'X6_i', 'X6_l', 'X6_a', 'X6_h', 'X6_k',
       'X6_c', 'X6_b', 'X6_f', 'X6_e'],
      dtype='object')