# Redefining the features' data types.  
  
After some exploration of the dataset I noticed the following:  
- All values are stored as floats, even the categorical and boolean ones.
- Some of the categorical features are really boolean features (two categories, 0 and 1).
- Some of the continuous features could be treated as categorical, as they have only a small number of distinct values.

----

So what I'm going to do is define a new dataset with some changes:  
- analyse each feature and classify it as one of [ continuous, boolean or categorical ].
- change feature names to the format `ps_(cat|bin|cont)_<rest of the old name>`, so that a feature's category can be found with a simple `str.startswith()` check on its name.
- I'll consider any feature with at most 30 distinct values to be categorical; otherwise it's continuous.
- Also I'll change their data types to be [ float for continuous, bool for boolean and integer for categorical ].
- change the values of categorical features to be in the range [ 0, number_of_distinct_values_for_this_feature - 1 ].


In [1]:
import pandas as pd
import numpy as np
import re

In [2]:
# Load the train and test CSV files into DataFrames (pandas default parsing).
train, test = (
    pd.read_csv(csv_path)
    for csv_path in ('./data/train.csv', './data/test.csv')
)

In [3]:
# Lists of feature names grouped by inferred kind:
# [cont]inuous and [cat]egorical.
datatypes = {
    'cont': [],
    'cat': [],
}


def setDatatypes(d, max_categories=30):
    '''
    File the column `d` under 'cat' or 'cont' in the global `datatypes` dict.

    d              : pandas Series holding one column; the column title is
                     taken from `d.name`.
    max_categories : a column with at most this many distinct non-null values
                     is recorded as categorical, any other column as continuous.

    Columns named 'target' or 'id' are skipped. Returns None; the result is
    the side effect on `datatypes`.
    '''
    title = d.name
    if title in ('target', 'id'):
        return

    # nunique() leaves NaN out of the count, as value_counts() does.
    distinct_values = d.nunique()
    group = 'cat' if distinct_values <= max_categories else 'cont'
    datatypes[group].append(title)


# setDatatypes mutates `datatypes`, so call it exactly once per column, in
# column order, instead of depending on how often DataFrame.apply decides to
# invoke its callback.
for column in train.columns:
    setDatatypes(train[column])

In [4]:


# 'ps_' followed by a 3-4 letter group tag, an underscore and a 1-2 digit
# number, e.g. 'ps_car_01_cat' -> ('ps_', 'car_01'). Whatever follows the
# number is not captured and therefore does not appear in the new title.
_TITLE_PATTERN = re.compile(r'^(ps_)([a-z]{3,4}_[0-9]{1,2})')


def change_title(old_title, dtype):
    '''
    Rename column `old_title` in both `train` and `test` (in place) to the
    format ps_<dtype>_<group>_<number>, e.g. ps_cat_car_01.

    old_title : current column name; must start with 'ps_<group>_<number>'.
    dtype     : tag inserted after 'ps_' (the callers pass 'cat' or 'cont').

    Raises ValueError if `old_title` does not match the expected pattern.
    '''
    match = _TITLE_PATTERN.match(old_title)
    if match is None:
        raise ValueError(
            "column title %r does not match 'ps_<group>_<number>'" % (old_title,)
        )

    prefix, rest = match.groups()
    new_title = prefix + dtype + '_' + rest

    # Rename by the parameter, not by whatever global happens to be called
    # `title` at the call site.
    renaming = {old_title: new_title}
    train.rename(columns=renaming, inplace=True)
    test.rename(columns=renaming, inplace=True)

#-----------------------------------------------------------------------------------
for title in datatypes['cat']:
    # Build one sorted category list from train and test together, and encode
    # both frames against it, so a given raw value gets the same integer code
    # in train and in test even if it occurs in only one of them.
    # NaN is not a category; pd.Categorical encodes it as -1.
    categories = np.sort(pd.concat([train[title], test[title]]).dropna().unique())
    train[title] = pd.Categorical(train[title], categories=categories).codes
    test[title] = pd.Categorical(test[title], categories=categories).codes
    change_title(title, 'cat')
#--------------------------------------------------------------------------------------
# Continuous features keep their values; only the name gets the 'cont' tag.
for title in datatypes['cont']:
    change_title(title, 'cont')

In [5]:
def _print_distinct_counts(prefix):
    '''
    Print one line per column of `train` whose name starts with `prefix`:
    the column name and the number of distinct non-null values the column
    takes in train and test combined.
    '''
    for col in train.columns:
        if not col.startswith(prefix):
            continue
        combined = pd.concat([train[col], test[col]])
        print("%s  :\t%s" % (col, combined.nunique()))


# Single-argument print(...) produces the same output on Python 2 and 3.
print("___________________________\n")
print("categorical columns with number of categories :\n")
_print_distinct_counts('ps_cat')
print("___________________________\n")
print("continues columns with number of distinct values :\n")
_print_distinct_counts('ps_cont')

___________________________

categorical columns with number of categories :

ps_cat_ind_01  :	8
ps_cat_ind_02  :	5
ps_cat_ind_03  :	12
ps_cat_ind_04  :	3
ps_cat_ind_05  :	8
ps_cat_ind_06  :	2
ps_cat_ind_07  :	2
ps_cat_ind_08  :	2
ps_cat_ind_09  :	2
ps_cat_ind_10  :	2
ps_cat_ind_11  :	2
ps_cat_ind_12  :	2
ps_cat_ind_13  :	2
ps_cat_ind_14  :	5
ps_cat_ind_15  :	14
ps_cat_ind_16  :	2
ps_cat_ind_17  :	2
ps_cat_ind_18  :	2
ps_cat_reg_01  :	10
ps_cat_reg_02  :	19
ps_cat_car_01  :	13
ps_cat_car_02  :	3
ps_cat_car_03  :	3
ps_cat_car_04  :	10
ps_cat_car_05  :	3
ps_cat_car_06  :	18
ps_cat_car_07  :	3
ps_cat_car_08  :	2
ps_cat_car_09  :	6
ps_cat_car_10  :	3
ps_cat_car_11  :	5
ps_cat_car_15  :	15
ps_cat_calc_01  :	10
ps_cat_calc_02  :	10
ps_cat_calc_03  :	10
ps_cat_calc_04  :	6
ps_cat_calc_05  :	7
ps_cat_calc_06  :	11
ps_cat_calc_07  :	10
ps_cat_calc_08  :	12
ps_cat_calc_09  :	8
ps_cat_calc_10  :	26
ps_cat_calc_11  :	21
ps_cat_calc_12  :	11
ps_cat_calc_13  :	16
ps_cat_calc_14  :	25
ps_cat_calc_15 

In [6]:
# save data to disk
#train.to_csv('./data/new_train.csv',index=False)
#test.to_csv('./data/new_test.csv',index=False)

In [19]:
# Fit one OneHotEncoder per categorical feature and pickle the list to disk.
from sklearn.preprocessing import OneHotEncoder
import pickle


def _dense_one_hot_encoder():
    '''
    Return an unfitted OneHotEncoder configured to produce dense arrays.

    Newer scikit-learn releases name the option `sparse_output`; older ones
    only know `sparse` and raise TypeError on the new keyword, so try the new
    name first and fall back to the old one.
    '''
    try:
        return OneHotEncoder(sparse_output=False)
    except TypeError:
        return OneHotEncoder(sparse=False)


# Every column whose name starts with 'ps_cat', in column order.
cat = [feature for feature in train.columns if feature.startswith('ps_cat')]

encoders = []
for feature in cat:
    # Fit on train and test together so the encoder knows every value that
    # appears in either frame; the encoder expects a 2-D (n_samples, 1) array.
    values = pd.concat([train[feature], test[feature]]).values.reshape(-1, 1)
    enc = _dense_one_hot_encoder()
    enc.fit(values)
    encoders.append(enc)

# encoders[i] is the encoder fitted for cat[i]; only the list is stored.
with open('./data/OneHotEncoder.clf', 'wb') as f:
    pickle.dump(encoders, f)



-----------