# Redefining features' data-types.  
  
After some exploration of the dataset I noticed the following:  
- All values are stored as float, even the categorical and boolean ones.
- Some of the categorical features are actually boolean features (two categories, 0 and 1).
- Some of the continuous features could be treated as categorical, as they have only a small number of distinct values.

----

So what I'm going to do is define a new dataset with some changes:  
- Analyse each feature and categorize it as one of [continuous, boolean or categorical].
- Change the feature names to the format ps_(cat, bin or cont)_...rest of the old name...; the reason is to make it easier to find a feature's category using `str.startswith()`.
- I'll consider any feature with at most 30 distinct values to be categorical; otherwise it's continuous.
- Also, I'll change their data types to [float for continuous, bool for boolean and integer for categorical].
- Change the values of categorical items to be in the range [0, number_of_distinct_values_for_this_feature).


In [7]:
import pandas as pd
import numpy as np
import re
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from xgboost import DMatrix
import pickle

In [None]:
# Load the raw dataset from disk into a DataFrame and print its (rows, columns) shape.
data = pd.read_csv('./data/data')
print('Data:\t', data.shape)

In [None]:
# Get lists of feature names grouped by inferred kind:
# [cont]inuous and [cat]egorical.

# A feature with at most this many distinct values is treated as categorical.
MAX_CATEGORICAL_LEVELS = 30

datatypes = {
    'cont': [],
    'cat': [],
}


def setDatatypes(d):
    """Classify one column of ``data`` and record its name in ``datatypes``.

    The column name is taken from ``d.name`` rather than from a running
    counter, so the result does not depend on how many times (or in which
    order) ``DataFrame.apply`` invokes this function.

    Parameters
    ----------
    d : pandas.Series
        A single column, as passed by ``DataFrame.apply``.

    Returns
    -------
    None
        The name is appended to ``datatypes['cat']`` when the column has at
        most ``MAX_CATEGORICAL_LEVELS`` distinct non-null values, otherwise
        to ``datatypes['cont']``. The 'target' and 'id' columns are skipped.
    """
    title = d.name
    if title in ('target', 'id'):
        return

    # nunique() ignores missing values, like value_counts() does by default.
    distinct_values = d.nunique()
    kind = 'cat' if distinct_values <= MAX_CATEGORICAL_LEVELS else 'cont'

    # Guard against double registration in case apply() visits a column twice.
    if title not in datatypes[kind]:
        datatypes[kind].append(title)


_ = data.apply(setDatatypes)

In [None]:
# Display the names of the features that were classified as continuous.
datatypes['cont']

In [None]:


# Compiled once: 'ps_' followed by a 3-4 letter group name, '_' and a 1-2 digit number.
_TITLE_PATTERN = re.compile(r'^(ps_)([a-z]{3,4}_[0-9]{1,2})')


def change_title(old_title, dtype) :
    '''
    Rename a column of the global ``data`` frame, in place, to the format
    ps_(cat|cont)_<group>_<number>.

    Only the leading ``ps_<group>_<number>`` part of the old name is kept;
    anything after it is dropped from the new name.

    old_title : current column name.
    dtype     : tag inserted after the ``ps_`` prefix (e.g. 'cat' or 'cont').

    Returns None. A title that does not match the expected pattern (for
    example one that has already been renamed) is left untouched instead of
    raising an IndexError.
    '''
    match = _TITLE_PATTERN.match(old_title)
    if match is None:
        return
    prefix, rest = match.groups()
    new_title = prefix + dtype + '_' + rest
    data.rename(columns={old_title: new_title}, inplace=True)

#-----------------------------------------------------------------------------------
# Categorical features: replace the raw values with integer category codes,
# then tag the column name with 'cat'.
for feature in datatypes['cat'] :
    # pd.Categorical assigns one integer code per distinct value; the unused
    # per-feature value_counts()/sort computation has been removed.
    data[feature] = pd.Categorical(data[feature]).codes
    change_title(feature, 'cat')
#--------------------------------------------------------------------------------------
# Continuous features keep their values; only the column name is tagged with 'cont'.
for feature in datatypes['cont'] :
    change_title(feature, 'cont')

In [None]:
# Report 1: every categorical column with its number of categories,
# plus the total number of categories over all such columns.
print("___________________________\n")
print("categorical columns with number of categories :\n")
total_categos = 0
for col in data.columns:
    if not col.startswith('ps_cat'):
        continue
    n_categories = len(data[col].value_counts())
    print(col, " :\t", n_categories)
    total_categos += n_categories

print('Total number of categories = ', total_categos)

# Report 2: every continuous column with its number of distinct values
# and basic statistics rounded to two decimals.
print("___________________________\n")
print("continues columns with number of distinct values :\n")
for col in data.columns:
    if not col.startswith('ps_cont'):
        continue
    series = data[col]
    print(col, ": \tvalue_counts:", len(series.value_counts()),
          '\tMin:', series.min().round(2), '\tMax', series.max().round(2),
          '\tMean:', series.mean().round(2), '\tMedian:', series.median().round(2))

In [None]:
# save data to disk
# (the export is left commented out; the cell instead reloads a previously saved copy)
#data.to_csv('./data/ready_data.csv', index=False)
# NOTE(review): a later cell writes the one-hot encoded frame to this same path --
# confirm which version of the file this read is expected to pick up.
data = pd.read_csv('./data/ready_data.csv')

In [None]:
from sklearn.preprocessing import MinMaxScaler

# Names of all columns tagged as continuous.
cont = [ feature for feature in data.columns if feature.startswith('ps_cont') ]

# `enc_data` is otherwise only created by the one-hot encoding cell further
# down, so running the notebook top to bottom used to fail here with a
# NameError. Create it only when it does not exist yet, so an `enc_data`
# built by an earlier (out-of-order) run is left untouched.
if 'enc_data' not in globals():
    enc_data = data[cont].copy()

fig, axs = plt.subplots(nrows=5, ncols=1, figsize=(8,6))

# zip() stops at the shorter input, so at most the first five continuous
# features are scaled to [0, 1] and plotted, one per subplot row.
for feature, ax in zip(cont, axs) :
    norm = MinMaxScaler()
    # fit_transform returns an (n, 1) array; flatten it to a plain column.
    enc_data[feature] = norm.fit_transform(data[feature].values.reshape((-1,1))).ravel()
    sns.distplot(enc_data[feature], rug=True, ax=ax)
fig.show()

In [None]:
# One-hot encode every categorical column and append the result to the
# continuous columns, producing `enc_data`.
from sklearn.preprocessing import OneHotEncoder
import pickle


def _make_dense_encoder():
    """Return a OneHotEncoder that produces dense arrays.

    Newer scikit-learn releases call the option ``sparse_output``; older ones
    only know ``sparse``. Try the new spelling first and fall back to the
    old one so the cell runs on both.
    """
    try:
        return OneHotEncoder(sparse_output=False)
    except TypeError:
        return OneHotEncoder(sparse=False)


#encoders = []
cat = [ feature for feature in data.columns if feature.startswith('ps_cat') ]

# Collect all pieces first and concatenate once: calling pd.concat inside
# the loop re-copies the growing frame on every iteration.
encoded_frames = [data[cont].copy()]
for feature in cat :
    enc = _make_dense_encoder()
    encoded = enc.fit_transform(data[feature].values.reshape((-1,1)))
    #encoders.append(enc)
    encoded_frames.append(pd.DataFrame(encoded))
enc_data = pd.concat(encoded_frames, axis=1)

#with open('./data/OneHotEncoder.clf', 'wb') as f:
#    pickle.dump(file=f, obj=encoders)

# Replace the mixed column labels with consecutive integers 0..n_columns-1.
enc_data.columns = range(enc_data.columns.shape[0])


In [None]:
# Display the contents of `arr_data`.
# NOTE(review): `arr_data` is only assigned in a later cell, so this cell
# presumably relies on out-of-order execution -- confirm.
arr_data

In [None]:
# Write the encoded frame to CSV without the index column.
# NOTE(review): this overwrites './data/ready_data.csv', the same path an earlier
# cell reads its input from -- confirm the overwrite is intended.
enc_data.to_csv('./data/ready_data.csv', index=False)

In [None]:
# Convert the encoded frame to a plain NumPy array and save it in .npy format.
arr_data = enc_data.values
np.save('./data/ready_data.npy', arr_data)

In [None]:
# Pairwise correlations between all encoded columns.
corrT = enc_data.corr()
# Zero out only the diagonal (each column against itself). Comparing against
# the value 1 would also wipe genuine off-diagonal perfect correlations
# (exact duplicate columns) and could miss diagonal entries that are not
# exactly 1.0 because of floating-point rounding.
corrT = corrT.mask(np.eye(corrT.shape[0], dtype=bool), 0)
# Keep only entries with |r| >= 0.8 (everything else becomes NaN), then drop
# the rows and columns that have no such entry left.
corrT = corrT[corrT.abs() >= 0.8].dropna(how='all').dropna(how='all',axis=1)

In [None]:
# Walk the upper triangle (diagonal included) of the filtered correlation
# table and print every pair that still carries a value.
n_kept = corrT.shape[0]
labels = corrT.columns
for row in range(n_kept):
    for col in range(row, n_kept):
        value = corrT.iloc[row, col]
        if np.isnan(value):
            continue
        print(labels[row], '\t&\t ', labels[col], '\t=\t', value)

 Duplicates:
 31 32
41 	&	  42
43 	&	  44 	
45 	&	  46 	
47 	&	  48 	
49 	&	  50 	
51 	&	  52 	
53 	&	  54 	
53 	&	  57 
54 	&	  57
55 	&	  56 	
57 	&	  58 	
76 	&	  77 	
78 	&	  79 	
80 	&	  81 	
125 	&	  126
164 	&	  165
172 	&	  173 
379 	&	  380 
381 	&	  382 
383 	&	  384 
385 	&	  386 
387 	&	  388 
389 	&	  390



42,44,46,48,50,52,54,57,56,58,77,79,81,126,165,173,380,382,384,386,388,390

In [None]:
# drop cols with more than 93% correlation
# (the labels are hard-coded; presumably taken from the pair listing above)
cols_to_drop = [
    42, 44, 46, 48, 50, 52, 54, 57, 56, 58, 77,
    79, 81, 126, 165, 173, 380, 382, 384, 386, 388, 390,
]
# Remove the columns from `enc_data` in place.
enc_data.drop(cols_to_drop, axis=1, inplace=True)

In [None]:
# Re-export the encoded frame as a NumPy array, overwriting the earlier .npy file.
arr_data = enc_data.values
np.save('./data/ready_data.npy', arr_data)

In [None]:
# Display the (rows, columns) shape of the encoded frame.
enc_data.shape

In [3]:
# Load the saved feature array and the labels.
# Note that `data` is now a NumPy array, not a DataFrame.
data = np.load('./data/ready_data.npy')
target = pd.read_csv('./data/target')

In [4]:
# The first len(target) rows have a label and form the training part;
# all remaining rows form the test part.
train, test = data[:len(target)], data[len(target):]

print('Train: ', train.shape, '\tTarget: ', target.shape)
print('Test:', test.shape)

Train:  (595211, 369) 	Target:  (595211, 1)
Test: (892817, 369)


In [5]:
# Check class imbalance, then carve out a validation set that keeps the same
# positive/negative ratio as the full set of labels.

# Fraction of rows whose label (column '0' of `target`) equals 1.
pos = (target['0'].value_counts()[1]/target.shape[0])
print(pos.round(4)*100,'% of the labels are positive and ',(1-pos.round(4))*100,'% is negative')
# Report the measured rate instead of a hard-coded figure, so the message
# stays correct if the labels change.
print('Accordingly I am going to take N validation set with {:.2%} postive labels'.format(pos.round(4)))

# Size of the validation set and how many positives/negatives it must contain.
valid_size = int(1e4)
v_pos = int(valid_size * pos)
v_neg = valid_size - v_pos
print('My validation set of size = ',valid_size, 'will have', v_pos, 'positives and',v_neg, 'negatives')

# Row indices of each class.
pos_ind = target[target['0']==1].index.tolist()
neg_ind = target[target['0']==0].index.tolist()

# Sample the validation rows per class without replacement; every remaining
# row goes to the training set.
# NOTE(review): no random seed is set, so the split differs between runs.
v_pos_ind = np.random.choice(pos_ind, v_pos, replace=False)
v_neg_ind = np.random.choice(neg_ind, v_neg, replace=False)
valid_ind = v_pos_ind.tolist() + v_neg_ind.tolist()
train_ind = list(set(target.index.tolist()) - set(valid_ind))
# Shuffle both index lists so the classes are not grouped together.
np.random.shuffle(valid_ind)
np.random.shuffle(train_ind)

# From here on `target` is a NumPy array; re-running this cell requires
# reloading `target` as a DataFrame first.
target = target.values
x_train = train[train_ind]
y_train = target[train_ind]
x_valid = train[valid_ind]
y_valid = target[valid_ind]

print('x_train:\t',x_train.shape,'\ty_train:\t',y_train.shape)
print('x_valid:\t',x_valid.shape,'\ty_valid:\t',y_valid.shape)

# Persist the four splits; exporting the test array is left disabled.
np.save('./data/x_train.npy',x_train)
np.save('./data/y_train.npy',y_train)
np.save('./data/x_valid.npy',x_valid)
np.save('./data/y_valid.npy',y_valid)
#np.save('./data/test.npy', test)

3.64 % of the labels are positive and  96.36 % is negative
Accordingly I am going to take N validation set with 3.64% postive labels
My validation set of size =  10000 will have 364 positives and 9636 negatives
x_train:	 (585211, 369) 	y_train:	 (585211, 1)
x_valid:	 (10000, 369) 	y_valid:	 (10000, 1)


In [6]:
# Wrap the three splits in xgboost DMatrix objects; only the training and
# validation matrices carry labels.
dtrain = DMatrix(x_train, label=y_train, nthread=-1)
dvalid = DMatrix(x_valid, label=y_valid, nthread=-1)
dtest = DMatrix(test, nthread=-1)

# Drop the names of the NumPy arrays that are no longer needed.
del x_train, y_train
del x_valid, y_valid
del train, test, data

# Write each DMatrix to disk in xgboost's binary buffer format.
dtrain.save_binary('./data/train.buffer')
dvalid.save_binary('./data/valid.buffer')
dtest.save_binary('./data/test.buffer')

In [12]:

# Write the three DMatrix objects to disk again (same calls and paths as the previous cell).
dtrain.save_binary('./data/train.buffer')
dvalid.save_binary('./data/valid.buffer')
dtest.save_binary('./data/test.buffer')

In [None]:
# Note saving as csv takes way less memory than Dmatrix or npy format.

-----------