In [32]:
import pandas as pd
import numpy as np

# Show up to 100 columns when a wide DataFrame is rendered.
pd.options.display.max_columns = 100

In [33]:
# Load only the six categorical feature columns X1..X6 from the CSV.
df = pd.read_csv(
    "mercedesbenz.csv",
    usecols=["X1", "X2", "X3", "X4", "X5", "X6"],
)
df.head()

Unnamed: 0,X1,X2,X3,X4,X5,X6
0,v,at,a,d,u,j
1,t,av,e,d,y,l
2,w,n,c,d,x,j
3,t,n,f,d,x,l
4,v,n,f,d,h,d


In [34]:
# Report how many distinct labels each column holds.
for name, values in df.items():
    print(name, ': ', len(values.unique()), ' labels')

X1 :  27  labels
X2 :  44  labels
X3 :  7  labels
X4 :  4  labels
X5 :  29  labels
X6 :  12  labels


In [35]:
# Plain one-hot encoding is impractical here: every label of every column
# becomes its own feature, so the column count explodes.
pd.get_dummies(data=df, drop_first=True).shape

(4209, 117)

In [36]:
# To limit the feature count we will keep only the 10 most frequently
# occurring labels; first inspect the 20 most frequent X1 labels.
(
    df["X1"]
    .value_counts()
    .sort_values(ascending=False)
    .head(20)
)

aa    833
s     598
b     592
l     590
v     408
r     251
i     203
a     143
c     121
o      82
w      52
z      46
u      37
e      33
m      32
t      31
h      29
f      23
y      23
j      22
Name: X1, dtype: int64

In [37]:
# Collect the 10 most frequent X1 labels, most frequent first.
top10 = list(
    df["X1"].value_counts().sort_values(ascending=False).head(10).index
)

print(top10)

['aa', 's', 'b', 'l', 'v', 'r', 'i', 'a', 'c', 'o']


In [38]:
# Add one 0/1 indicator column per top-10 label, named after the label itself.
# Rows whose X1 value is not in the top 10 get 0 in every indicator column.
for label in top10:
    df[label] = np.where(df["X1"] == label, 1, 0)
df.head()

Unnamed: 0,X1,X2,X3,X4,X5,X6,aa,s,b,l,v,r,i,a,c,o
0,v,at,a,d,u,j,0,0,0,0,1,0,0,0,0,0
1,t,av,e,d,y,l,0,0,0,0,0,0,0,0,0,0
2,w,n,c,d,x,j,0,0,0,0,0,0,0,0,0,0
3,t,n,f,d,x,l,0,0,0,0,0,0,0,0,0,0
4,v,n,f,d,h,d,0,0,0,0,1,0,0,0,0,0


In [39]:
# Shape after adding one indicator column per top-10 X1 label to the loaded columns.
df.shape

(4209, 16)

In [40]:
def get_dummies_with_top10(df, var, top_x_labels):
    """Add one 0/1 indicator column per label in ``top_x_labels``, in place.

    For each label a column named ``<var>_<label>`` is written to ``df``.
    It holds 1 where ``df[var]`` equals the label and 0 elsewhere, so rows
    whose value is not among the given labels are 0 in every new column.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to modify; the indicator columns are added to this object.
    var : str
        Name of the column to encode.
    top_x_labels : iterable
        Labels to encode (e.g. the most frequent values of ``df[var]``).
        Labels do not have to be strings.

    Returns
    -------
    None
        The frame is modified in place.
    """
    for label in top_x_labels:
        # Build the name with format() instead of `+` so that non-string
        # labels (e.g. integer categories) do not raise TypeError.
        df['{}_{}'.format(var, label)] = np.where(df[var] == label, 1, 0)

In [41]:
# Collect the 10 most frequent X2 labels, most frequent first.
top10_X2 = list(
    df["X2"].value_counts().sort_values(ascending=False).head(10).index
)
top10_X2

['as', 'ae', 'ai', 'm', 'ak', 'r', 'n', 's', 'f', 'e']

In [42]:
# Reload the six raw columns into a fresh frame, then add the X2 indicator columns to it.
final_df = pd.read_csv(
    'mercedesbenz.csv',
    usecols=['X1', 'X2', 'X3', 'X4', 'X5', 'X6'],
)
get_dummies_with_top10(final_df, var='X2', top_x_labels=top10_X2)

In [43]:
# Preview the frame, including the X2_<label> columns added by get_dummies_with_top10.
final_df.head()

Unnamed: 0,X1,X2,X3,X4,X5,X6,X2_as,X2_ae,X2_ai,X2_m,X2_ak,X2_r,X2_n,X2_s,X2_f,X2_e
0,v,at,a,d,u,j,0,0,0,0,0,0,0,0,0,0
1,t,av,e,d,y,l,0,0,0,0,0,0,0,0,0,0
2,w,n,c,d,x,j,0,0,0,0,0,0,1,0,0,0
3,t,n,f,d,x,l,0,0,0,0,0,0,1,0,0,0
4,v,n,f,d,h,d,0,0,0,0,0,0,1,0,0,0
