# Redefining features' data types.  
  
After some exploration of the dataset I noticed the following:  
- All values are stored as floats, even the categorical and boolean ones.
- Some of the categorical features are really boolean features (two categories, 0 and 1).
- Some of the continuous features could be treated as categorical, as they have only a small number of distinct values.

----

So what I'm going to do is define a new dataset with some changes:  
- Analyse each feature and classify it as continuous, boolean or categorical.
- Rename the features to the format `ps_(cat|bin|cont)_<rest of the old name>`, so that a feature's category can be found from its name with `str.startswith()`.
- Consider any feature with exactly 2 distinct values to be boolean, any other feature with at most 30 distinct values to be categorical, and everything else to be continuous.
- Change the data types accordingly: float for continuous, bool for boolean and integer for categorical.
- Re-encode the values of each categorical feature as integers in the range [0, number_of_distinct_values_for_this_feature - 1].


In [1]:
import pandas as pd
import numpy as np
import re

In [2]:
# Treat the literal -1 as a missing value (NaN) while parsing both files.
train = pd.read_csv('./data/train.csv', na_values='-1')
test = pd.read_csv('./data/test.csv', na_values='-1')
# Stack test on top of train (minus its 'target' column) so per-feature statistics
# cover both sets. pd.concat is used because DataFrame.append no longer exists in
# pandas 2.0+; like append, it keeps each frame's original row index.
data = pd.concat([test, train.drop('target', axis=1)])

In [4]:
# Feature names grouped by inferred kind: [cont]inuous, [bin]ary, [cat]egorical.
counter = 0  # number of columns setDatatypes has been called on so far
datatypes = {
    'cont': [],
    'bin': [],
    'cat': [],
}

# A column with more distinct (non-NaN) values than this is treated as continuous.
_MAX_CATEGORY_LEVELS = 30


def setDatatypes(d):
    """Classify one column and record its name in ``datatypes``.

    d is a pandas Series whose ``name`` is the column title (this is what
    ``DataFrame.apply`` passes for each column).  The title is read from the
    Series itself rather than from a positional counter, so the result does
    not depend on how many times, or in which order, the function is called.

    Columns named 'target' or 'id' are skipped.  Otherwise the number of
    distinct non-NaN values decides the bucket: exactly 2 -> 'bin',
    at most 30 -> 'cat', anything more -> 'cont'.  Returns None.
    """
    global counter
    counter += 1
    title = d.name
    if title in ('target', 'id'):
        return
    # nunique() ignores NaN, so a missing-value marker does not count as a level.
    distinct_values = d.nunique()
    if distinct_values == 2:
        bucket = 'bin'
    elif distinct_values <= _MAX_CATEGORY_LEVELS:
        bucket = 'cat'
    else:
        bucket = 'cont'
    datatypes[bucket].append(title)
        
# Run the classifier on each column of `data`; only its side effect on
# `datatypes` matters, so the result of apply is discarded into `_`.
_ = data.apply(setDatatypes)

In [5]:
# Build the output frames as copies of the loaded ones, then drop the
# original names so only the copies remain referenced from here on.
new_train, new_test = train.copy(), test.copy()
del train, test

# Splits an original feature name into the 'ps_' prefix and a stem made of
# 3-4 lowercase letters, an underscore and 1-2 digits (e.g. 'ind_06').
_TITLE_PATTERN = re.compile(r'^(ps_)([a-z]{3,4}_[0-9]{1,2})')


def change_title(old_title, dtype):
    """Rename a feature to the format ps_(cat|bin|cont)_<stem>.

    old_title: current column name.
    dtype: tag to insert after the 'ps_' prefix ('cat', 'bin' or 'cont').

    The column is renamed in place in new_train, new_test and data; a frame
    that does not contain old_title is left unchanged.  Anything in the old
    name after the matched stem is dropped.  A name that does not match the
    expected pattern is skipped.  Returns None.
    """
    match = _TITLE_PATTERN.match(old_title)
    if match is None:
        return
    prefix, stem = match.groups()
    new_title = prefix + dtype + '_' + stem
    # Key the rename on the old_title argument, not on whatever global loop
    # variable the caller happens to be using.
    for frame in (new_train, new_test, data):
        frame.rename(columns={old_title: new_title}, inplace=True)

#---------------------------------------------------------------------------------
for title in datatypes['bin']:
    # Cast two-valued columns to bool. The builtin `bool` is used because
    # `np.bool` was only an alias for it and is unavailable in some NumPy releases.
    # NOTE(review): NaN casts to True here, so any missing value in a
    # two-valued column silently becomes True -- confirm this is intended.
    new_train[title] = new_train[title].astype(bool)
    new_test[title] = new_test[title].astype(bool)
    change_title(title, 'bin')
#-----------------------------------------------------------------------------------
for title in datatypes['cat']:
    # NaN cannot be stored in an integer column, so encode it as the extra level -1.
    # Plain assignment is used instead of a chained fillna(inplace=True), which
    # may act on a temporary copy and leave the frame unchanged.
    data[title] = data[title].fillna(-1)
    new_train[title] = new_train[title].fillna(-1)
    new_test[title] = new_test[title].fillna(-1)
    # Sorted distinct levels over train + test, so both sets share one encoding.
    levels = np.sort(data[title].unique())
    if not np.array_equal(levels, np.arange(levels.shape[0])):
        # Re-encode the levels as 0 .. n-1 in sorted order.
        codes = {level: code for code, level in enumerate(levels)}
        new_train[title] = new_train[title].map(codes)
        new_test[title] = new_test[title].map(codes)

    new_train[title] = new_train[title].astype(np.int16)
    # Convert the test column in place; it must not be written into new_train.
    new_test[title] = new_test[title].astype(np.int16)

    # change title to format ps_cat_.....
    change_title(title, 'cat')
#--------------------------------------------------------------------------------------
for title in datatypes['cont']:
    # Continuous columns keep their float values; only the name changes.
    change_title(title, 'cont')

In [11]:
# Report the columns of `data` grouped by their ps_bin / ps_cat / ps_cont prefix.
# Every print call receives one pre-formatted string, so the output is the same
# under the Python 2 print statement and the Python 3 print function.
print("boolean columns :\n")
for col in data.columns:
    if col.startswith('ps_bin'):
        print(col)
print("___________________________\n")
print("categorical columns with number of categories :\n")
for col in data.columns:
    if col.startswith('ps_cat'):
        # nunique() counts distinct non-NaN values.
        print("%s  :\t%s" % (col, data[col].nunique()))
print("___________________________\n")
print("continues columns with number of distinct values :\n")
for col in data.columns:
    if col.startswith('ps_cont'):
        print("%s  :\t%s" % (col, data[col].nunique()))

boolean columns :

ps_bin_ind_04
ps_bin_ind_06
ps_bin_ind_07
ps_bin_ind_08
ps_bin_ind_09
ps_bin_ind_10
ps_bin_ind_11
ps_bin_ind_12
ps_bin_ind_13
ps_bin_ind_16
ps_bin_ind_17
ps_bin_ind_18
ps_bin_car_02
ps_bin_car_03
ps_bin_car_05
ps_bin_car_07
ps_bin_car_08
ps_bin_calc_15
ps_bin_calc_16
ps_bin_calc_17
ps_bin_calc_18
ps_bin_calc_19
ps_bin_calc_20
___________________________

categorical columns with number of categories :

ps_cat_ind_01  :	8
ps_cat_ind_02  :	5
ps_cat_ind_03  :	12
ps_cat_ind_05  :	8
ps_cat_ind_14  :	5
ps_cat_ind_15  :	14
ps_cat_reg_01  :	10
ps_cat_reg_02  :	19
ps_cat_car_01  :	13
ps_cat_car_04  :	10
ps_cat_car_06  :	18
ps_cat_car_09  :	6
ps_cat_car_10  :	3
ps_cat_car_11  :	5
ps_cat_car_15  :	15
ps_cat_calc_01  :	10
ps_cat_calc_02  :	10
ps_cat_calc_03  :	10
ps_cat_calc_04  :	6
ps_cat_calc_05  :	7
ps_cat_calc_06  :	11
ps_cat_calc_07  :	10
ps_cat_calc_08  :	12
ps_cat_calc_09  :	8
ps_cat_calc_10  :	26
ps_cat_calc_11  :	21
ps_cat_calc_12  :	12
ps_cat_calc_13  :	16
ps_cat_calc_

In [7]:
# Write both converted frames to CSV, leaving out the row index.
for frame, path in ((new_train, './data/new_train.csv'),
                    (new_test, './data/new_test.csv')):
    frame.to_csv(path, index=False)

-----------