## Discretization of categorical variables

In [15]:
import pandas as pd
cat_retained = pd.read_csv("/home/data/kaggle/csv_cat_cut1.csv", nrows=100)

#### Filling the missing values to NA

In [16]:
#cat_retained.drop('Id', axis=1, inplace=True)
cat_retained.drop('Unnamed: 0', axis=1, inplace=True)

cat_data_to_vectorize = cat_retained.fillna('Na')

In [17]:
cat_retained.head()

Unnamed: 0,response,id,l3_s29_f3317,l3_s29_f3320,l3_s29_f3323,l3_s29_f3326,l3_s29_f3329,l3_s29_f3332,l3_s29_f3335,l3_s29_f3338,...,l3_s29_f3466,l3_s29_f3469,l3_s29_f3472,l3_s29_f3484,l3_s29_f3487,l3_s29_f3490,l3_s29_f3493,l3_s29_f3475,l3_s29_f3478,l3_s29_f3481
0,0,4,,,,,,,,,...,,,,,,,,,,
1,0,6,,,,,,,,,...,,,,,,,,,,
2,0,7,T1,T1,T1,T1,T1,T1,T1,T1,...,T1,T1,T1,T1,T1,T1,T1,T1,T1,T1
3,0,9,T1,T1,T1,T1,T1,T1,T1,T1,...,T1,T1,T1,T1,T1,T1,T1,T1,T1,T1
4,0,11,T1,T1,T1,T1,T1,T1,T1,T1,...,T1,T1,T1,T1,T1,T1,T1,T1,T1,T1


#### Preparing input for vectorizer. Using the DictVectorizer package from scikit to get a vectorized array.



In [18]:

cat_data_to_vectorize = cat_data_to_vectorize.T.to_dict().values()

In [19]:
from sklearn.feature_extraction import DictVectorizer as DV

vectorizer = DV( sparse = False )
vec_x_cat_train = vectorizer.fit_transform( cat_data_to_vectorize )
#vec_x_cat_test = vectorizer.transform( x_cat_test ) 

In [20]:
vec_x_cat_train[0:10]

array([[  4.,   1.,   0., ...,   1.,   0.,   0.],
       [  6.,   1.,   0., ...,   1.,   0.,   0.],
       [  7.,   0.,   1., ...,   0.,   1.,   0.],
       ..., 
       [ 16.,   0.,   1., ...,   0.,   1.,   0.],
       [ 18.,   0.,   1., ...,   0.,   1.,   0.],
       [ 23.,   0.,   1., ...,   0.,   1.,   0.]])

#### convert encoded output array to pd Dataframe

In [24]:

encoded_cat = pd.DataFrame(vec_x_cat_train,columns=['l3_s29_f3317_t1','l3_s29_f3317_na',
'l3_s29_f3320_t1','l3_s29_f3320_na',
'l3_s29_f3323_t1','l3_s29_f3323_na',
'l3_s29_f3326_t1','l3_s29_f3326_na',
'l3_s29_f3329_t1','l3_s29_f3329_na',
'l3_s29_f3332_t1','l3_s29_f3332_na',
'l3_s29_f3335_t1','l3_s29_f3335_na',
'l3_s29_f3338_t1','l3_s29_f3338_na',
'l3_s29_f3341_t1','l3_s29_f3341_na',
'l3_s29_f3344_t1','l3_s29_f3344_na',
'l3_s29_f3347_t1','l3_s29_f3347_na',
'l3_s29_f3350_t1','l3_s29_f3350_na',
'l3_s29_f3353_t1','l3_s29_f3353_na',
'l3_s29_f3356_t1','l3_s29_f3356_na',
'l3_s29_f3359_t1','l3_s29_f3359_na',
'l3_s29_f3362_t1','l3_s29_f3362_na',
'l3_s29_f3364_t1','l3_s29_f3364_na',
'l3_s29_f3366_t1','l3_s29_f3366_na',
'l3_s29_f3369_t1','l3_s29_f3369_na',
'l3_s29_f3372_t1','l3_s29_f3372_na',
'l3_s29_f3375_t1','l3_s29_f3375_na',
'l3_s29_f3378_t1','l3_s29_f3378_na',
'l3_s29_f3381_t1','l3_s29_f3381_na',
'l3_s29_f3384_t1','l3_s29_f3384_na',
'l3_s29_f3387_t1','l3_s29_f3387_na',
'l3_s29_f3390_t1','l3_s29_f3390_na',
'l3_s29_f3392_t1','l3_s29_f3392_na',
'l3_s29_f3394_t1','l3_s29_f3394_na',
'l3_s29_f3397_t1','l3_s29_f3397_na',
'l3_s29_f3400_t1','l3_s29_f3400_na',
'l3_s29_f3403_t1','l3_s29_f3403_na',
'l3_s29_f3406_t1','l3_s29_f3406_na',
'l3_s29_f3409_t1','l3_s29_f3409_na',
'l3_s29_f3411_t1','l3_s29_f3411_na',
'l3_s29_f3414_t1','l3_s29_f3414_na',
'l3_s29_f3416_t1','l3_s29_f3416_na',
'l3_s29_f3418_t1','l3_s29_f3418_na',
'l3_s29_f3420_t1','l3_s29_f3420_na',
'l3_s29_f3423_t1','l3_s29_f3423_na',
'l3_s29_f3426_t1','l3_s29_f3426_na',
'l3_s29_f3429_t1','l3_s29_f3429_na',
'l3_s29_f3432_t1','l3_s29_f3432_na',
'l3_s29_f3435_t1','l3_s29_f3435_na',
'l3_s29_f3438_t1','l3_s29_f3438_na',
'l3_s29_f3441_t1','l3_s29_f3441_na',
'l3_s29_f3444_t1','l3_s29_f3444_na',
'l3_s29_f3446_t1','l3_s29_f3446_na',
'l3_s29_f3448_t1','l3_s29_f3448_na',
'l3_s29_f3451_t1','l3_s29_f3451_na',
'l3_s29_f3454_t1','l3_s29_f3454_na',
'l3_s29_f3457_t1','l3_s29_f3457_na',
'l3_s29_f3460_t1','l3_s29_f3460_na',
'l3_s29_f3463_t1','l3_s29_f3463_na',
'l3_s29_f3466_t1','l3_s29_f3466_na',
'l3_s29_f3469_t1','l3_s29_f3469_na',
'l3_s29_f3472_t1','l3_s29_f3472_na',
'l3_s29_f3484_t1','l3_s29_f3484_na',
'l3_s29_f3487_t1','l3_s29_f3487_na',
'l3_s29_f3490_t1','l3_s29_f3490_na',
'l3_s29_f3493_t1','l3_s29_f3493_na',
'l3_s29_f3475_t1','l3_s29_f3475_na',
'l3_s29_f3478_t1','l3_s29_f3478_na',
'l3_s29_f3481_t1','l3_s29_f3481_na',
'l3_s29_f3481_t1','l3_s29_f3457_na'])

In [25]:
encoded_cat.head(5)

Unnamed: 0,l3_s29_f3317_t1,l3_s29_f3317_na,l3_s29_f3320_t1,l3_s29_f3320_na,l3_s29_f3323_t1,l3_s29_f3323_na,l3_s29_f3326_t1,l3_s29_f3326_na,l3_s29_f3329_t1,l3_s29_f3329_na,...,l3_s29_f3493_t1,l3_s29_f3493_na,l3_s29_f3475_t1,l3_s29_f3475_na,l3_s29_f3478_t1,l3_s29_f3478_na,l3_s29_f3481_t1,l3_s29_f3481_na,l3_s29_f3481_t1.1,l3_s29_f3481_na.1
0,4.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,...,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0
1,6.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,...,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0
2,7.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,...,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0
3,9.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,...,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0
4,11.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,...,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0


In [28]:
import os
path_d = '/home/data/kaggle'
encoded_cat.to_csv(os.path.join(path_d, 'encoded_cat_no_id_no_response_testk.csv'))