## Processing Steps
- Separte variables by Object, encode as categorical variables.
- Separate DateTime variables from Object Type and split into Year, Month, Day, Hour, Min - **LabelEncode**
- Separate Float variables and treat them as continuous.
- Separate Integer variables and verify if any of those are categorical, apply binning if categorical.

In [1]:
import warnings
warnings.filterwarnings('ignore')
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pickle
sDir = '/home/pabhijit/data/springleaf/'

In [2]:
from sklearn.impute import SimpleImputer

def imputeMissingFeatures(df_in, impStrgy):
    imp = SimpleImputer(missing_values=np.nan, strategy=impStrgy)
    arrImputedVal = imp.fit_transform(df_in)

    df_out = pd.DataFrame(arrImputedVal)
    df_out.columns = df_in.columns
    #df_out.shape
    return(df_out)

In [3]:
# Load Train Data
df_train = pd.read_pickle(sDir + 'step01_train.pkl')
df_train.shape

(145231, 1934)

In [4]:
# Load Test Data
df_test = pd.read_pickle(sDir + 'step01_test.pkl')
df_test.shape

(145232, 1933)

In [5]:
X = df_train.drop('target', axis=1)
y = df_train[['target']]
X_test = df_test.copy()
print(X.shape)
print(y.shape)
print(X_test.shape)

(145231, 1933)
(145231, 1)
(145232, 1933)


In [6]:
from sklearn.model_selection import train_test_split

# Break off validation set from training data
X_train, X_valid, y_train, y_valid = train_test_split(X, y, train_size=0.8, test_size=0.2, random_state=0)

print('X_train', X_train.shape)
print('X_valid', X_valid.shape)
print('y_train', y_train.shape)
print('y_valid', y_valid.shape)

X_train (116184, 1933)
X_valid (29047, 1933)
y_train (116184, 1)
y_valid (29047, 1)


In [7]:
X_train.dtypes.value_counts()

int8       568
float16    434
int32      429
int16      408
object      51
float32     40
float64      3
dtype: int64

In [8]:
X_train.head()

Unnamed: 0,ID,VAR_0001,VAR_0002,VAR_0003,VAR_0004,VAR_0005,VAR_0006,VAR_0007,VAR_0008,VAR_0009,...,VAR_1925,VAR_1926,VAR_1927,VAR_1928,VAR_1929,VAR_1930,VAR_1931,VAR_1932,VAR_1933,VAR_1934
1438,2794,R,139,62,2156,B,0.0,0.0,False,False,...,0,98,98,998,999999998,998,998,9998,9998,BRANCH
70125,140193,R,116,152,1300,B,0.0,0.0,False,False,...,0,98,98,998,999999998,998,998,9998,9998,BRANCH
13800,27366,H,53,163,2984,B,0.0,0.0,False,False,...,0,98,98,998,999999998,998,998,9998,9998,BRANCH
7676,15140,R,104,41,1657,C,0.0,0.0,False,False,...,0,98,98,998,999999998,998,998,9998,9998,BRANCH
23219,46216,H,60,72,5975,C,0.0,0.0,False,False,...,0,98,98,998,999999998,998,998,9998,9998,IAPS


### Missing Values

In [9]:
null_cols = X_train.columns[X_train.isnull().any()]
s_miss_cols = X_train[null_cols].isnull().sum()/len(X_train) * 100

In [13]:
type(X_train)

pandas.core.frame.DataFrame

In [10]:
# Variables with 85% and more missing values, these can be dropped.
s_drop_mis_cols = s_miss_cols[s_miss_cols>=85]
s_drop_mis_cols

VAR_0156     95.951250
VAR_0157     99.366522
VAR_0158     98.549714
VAR_0159     95.951250
VAR_0166     90.206913
VAR_0167     98.240722
VAR_0168     92.621187
VAR_0169     90.206913
VAR_0176     87.912277
VAR_0177     97.691593
VAR_0178     91.673552
VAR_0179     87.912277
VAR_0205     98.418887
VAR_0206     98.381877
VAR_0207    100.000000
VAR_0208     86.639296
VAR_0209     93.488776
VAR_0210     86.639296
VAR_0211     86.639296
VAR_0213    100.000000
VAR_0214     99.993114
VAR_0840    100.000000
dtype: float64

#### Drop Missing Variables

In [14]:
X_train.drop(s_drop_mis_cols.index, axis=1, inplace=True)
print('Number of variables reduced to : ', X_train.shape[1])

Number of variables reduced to :  1911


#### Impute Missing Variables - Object Type
Impute Missing variables with respective most frequent value.

In [41]:
X_train_obj = X_train.select_dtypes(include='object')
X_train_obj.shape

(116184, 38)

In [42]:
X_train_obj.head()

Unnamed: 0,VAR_0001,VAR_0005,VAR_0008,VAR_0009,VAR_0010,VAR_0011,VAR_0012,VAR_0043,VAR_0044,VAR_0073,...,VAR_0325,VAR_0342,VAR_0352,VAR_0353,VAR_0354,VAR_0404,VAR_0466,VAR_0467,VAR_0493,VAR_1934
1438,R,B,False,False,False,False,False,False,[],,...,R,CE,U,R,U,-1,-1,-1,-1,BRANCH
70125,R,B,False,False,False,False,False,False,[],,...,H,FF,R,R,U,-1,-1,-1,-1,BRANCH
13800,H,B,False,False,False,False,False,False,[],,...,S,-1,O,O,O,-1,-1,-1,-1,BRANCH
7676,R,C,False,False,False,False,False,False,[],,...,S,ED,U,U,U,-1,-1,-1,-1,BRANCH
23219,H,C,False,False,False,False,False,False,[],,...,S,-1,U,O,U,CONTACT,-1,-1,-1,IAPS


In [43]:
print('Number of object features with missing values :', len(X_train_obj.columns[X_train_obj.isnull().any()]))

Number of object features with missing values : 35


In [44]:
X_train_obj_prcd = imputeMissingFeatures(X_train_obj, 'most_frequent')
X_train_obj_prcd.shape

(116184, 38)

In [47]:
print('Number of object features with missing values :', len(X_train_obj_prcd.columns[X_train_obj_prcd.isnull().any()]))

Number of object features with missing values : 0


In [48]:
del(X_train_obj)

#### Variables with No Variance

In [49]:
# Variables with No Variance (number of unique values = 1), these variables can be dropped
idx_no_var = df_descr[df_descr['unique']==1].index
idx_no_var

Index(['VAR_0008', 'VAR_0009', 'VAR_0010', 'VAR_0011', 'VAR_0012', 'VAR_0043',
       'VAR_0044', 'VAR_0196', 'VAR_0202', 'VAR_0216', 'VAR_0222', 'VAR_0229',
       'VAR_0239'],
      dtype='object')

In [51]:
# Drop variables with no variance
X_train_obj_prcd.drop(idx_no_var, axis=1, inplace=True)
X_train_obj_prcd.shape

(116184, 25)

#### Variables with majority default values as "-1"
- There are variables which are defaulted to **-1**. 
- Below variables have more than 80% of the values defaulted.
- Since information not available for majority of these variables, we can drop these.

Variable and respective -1 count:  
VAR_0466 - 121461  
VAR_0467 - 121675  
VAR_0493 - 135182  
VAR_0404 - 130443  

In [52]:
cols_to_drop = ['VAR_0466', 'VAR_0467', 'VAR_0493', 'VAR_0404']
X_train_obj_prcd.drop(cols_to_drop, axis=1, inplace=True)
X_train_obj_prcd.shape

(116184, 21)

### Numeric Variables

In [54]:
X_train_num = X_train.select_dtypes(exclude='object')
X_train_num.shape

(116184, 1873)

In [55]:
df_descr = X_train_num.describe().transpose()
df_descr

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
ID,116184.0,1.449622e+05,8.383098e+04,2.0,72334.5,145053.0,217354.0,290461.0
VAR_0002,116184.0,1.055033e+02,1.438247e+02,0.0,24.0,60.0,132.0,999.0
VAR_0003,116184.0,8.833412e+01,1.255082e+02,0.0,13.0,55.0,120.0,999.0
VAR_0004,116184.0,3.722627e+03,1.240425e+04,0.0,1785.0,2500.0,3600.0,2200000.0
VAR_0006,116143.0,,0.000000e+00,0.0,0.0,0.0,1.0,52.0
...,...,...,...,...,...,...,...,...
VAR_1929,116184.0,9.903945e+08,9.753596e+07,0.0,999999998.0,999999998.0,999999998.0,999999999.0
VAR_1930,116184.0,9.231061e+02,2.119181e+02,1.0,998.0,998.0,998.0,999.0
VAR_1931,116184.0,9.366189e+02,2.101698e+02,0.0,998.0,998.0,998.0,999.0
VAR_1932,116184.0,9.941405e+03,7.481829e+02,0.0,9998.0,9998.0,9998.0,9999.0


In [102]:
df_data_num['VAR_0223'].isnull().any()

True

#### Impute Missing Variables
Impute missing variables with missing values with respective median.

In [57]:
print('Number of object features with missing values :', len(X_train_num.columns[X_train_num.isnull().any()]))

Number of object features with missing values : 468


In [58]:
X_train_num_prcd = imputeMissingFeatures(X_train_num, 'median')
X_train_num_prcd.shape

(116184, 1873)

In [59]:
print('Number of object features with missing values :', len(X_train_num_prcd.columns[X_train_num_prcd.isnull().any()]))

Number of object features with missing values : 0


#### Variables with no variance

In [60]:
# Variables with no variance
idx_no_var = df_descr[df_descr['std'] == 0].index
idx_no_var

Index(['VAR_0006', 'VAR_0007', 'VAR_0013', 'VAR_0014', 'VAR_0015', 'VAR_0016',
       'VAR_0017', 'VAR_0018', 'VAR_0019', 'VAR_0020',
       ...
       'VAR_0495', 'VAR_0514', 'VAR_0515', 'VAR_0517', 'VAR_0520', 'VAR_0527',
       'VAR_0528', 'VAR_0530', 'VAR_0847', 'VAR_1428'],
      dtype='object', length=159)

In [61]:
X_train_num_prcd.drop(idx_no_var, axis=1, inplace=True)
X_train_num_prcd.shape

(116184, 1714)

In [62]:
# This step was to pick few variables which were exluded from previous step because std=nan. 
# E.g 'VAR_0223', Not sure why df.describe() would return std=nan for this variable
# TO take care off such instances below is another way to find out variables with no variance
s = X_train_num_prcd.nunique()==1
idx_no_var = s[s==True].index

In [63]:
X_train_num_prcd.drop(idx_no_var, axis=1, inplace=True)

In [65]:
X_train_num_prcd = X_train_num_prcd.drop('ID', axis=1)

In [66]:
X_train_num_prcd.shape

(116184, 1712)

### Correlated Features

In [111]:
# Merge Obect and Numeric dataframes
#df_dataX = pd.concat([df_train_obj, df_train_num], axis=1)
#df_dataX.shape

In [67]:
%%time
df_corr = X_train_num_prcd.corr().abs()
df_corr

CPU times: user 7min 44s, sys: 406 ms, total: 7min 44s
Wall time: 7min 50s


Unnamed: 0,VAR_0002,VAR_0003,VAR_0004,VAR_0045,VAR_0046,VAR_0047,VAR_0048,VAR_0049,VAR_0063,VAR_0064,...,VAR_1924,VAR_1925,VAR_1926,VAR_1927,VAR_1928,VAR_1929,VAR_1930,VAR_1931,VAR_1932,VAR_1933
VAR_0002,1.000000,0.145695,0.004533,0.001593,0.001698,0.003411,0.003477,0.015801,0.019512,0.018874,...,0.015492,0.011876,0.033671,0.020788,0.020666,0.010286,0.032799,0.019017,0.005748,0.034130
VAR_0003,0.145695,1.000000,0.023100,0.003004,0.003448,0.002412,0.001881,0.001420,0.013014,0.018356,...,0.026105,0.022231,0.006897,0.008575,0.008635,0.000310,0.005767,0.007685,0.001255,0.007108
VAR_0004,0.004533,0.023100,1.000000,0.001587,0.000465,0.001210,0.002137,0.001415,0.003550,0.004120,...,0.026927,0.007379,0.003787,0.001005,0.000945,0.004504,0.005049,0.001427,0.003037,0.003630
VAR_0045,0.001593,0.003004,0.001587,1.000000,0.687301,0.611969,0.560744,0.323021,0.079003,0.075235,...,0.000080,0.002561,0.004553,0.004291,0.004314,0.006562,0.004831,0.004333,0.008278,0.004581
VAR_0046,0.001698,0.003448,0.000465,0.687301,1.000000,0.882097,0.799962,0.445980,0.097089,0.095707,...,0.001045,0.002951,0.009311,0.009890,0.009933,0.005971,0.008602,0.009243,0.007595,0.009356
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
VAR_1929,0.010286,0.000310,0.004504,0.006562,0.005971,0.007260,0.006141,0.009332,0.017037,0.019052,...,0.004990,0.014095,0.272185,0.327110,0.325254,1.000000,0.383100,0.404177,0.645470,0.272567
VAR_1930,0.032799,0.005767,0.005049,0.004831,0.008602,0.014985,0.019529,0.041745,0.050944,0.059995,...,0.018068,0.036659,0.978300,0.814077,0.810425,0.383100,1.000000,0.803869,0.277466,0.977559
VAR_1931,0.019017,0.007685,0.001427,0.004333,0.009243,0.013751,0.017773,0.036706,0.046441,0.055990,...,0.025728,0.031310,0.808431,0.968016,0.966625,0.404177,0.803869,1.000000,0.276069,0.808440
VAR_1932,0.005748,0.001255,0.003037,0.008278,0.007595,0.007006,0.005801,0.011539,0.021890,0.025288,...,0.007296,0.011599,0.209264,0.233961,0.232859,0.645470,0.277466,0.276069,1.000000,0.209243


In [68]:
# Create a True/False mask and apply it (we will eliminate one half of the correlation matrix)
mask = np.triu(np.ones_like(df_corr, dtype=bool))
mask

array([[ True,  True,  True, ...,  True,  True,  True],
       [False,  True,  True, ...,  True,  True,  True],
       [False, False,  True, ...,  True,  True,  True],
       ...,
       [False, False, False, ...,  True,  True,  True],
       [False, False, False, ..., False,  True,  True],
       [False, False, False, ..., False, False,  True]])

In [69]:
# Get only True values
df_corr_mask = df_corr.mask(mask)
df_corr_mask

Unnamed: 0,VAR_0002,VAR_0003,VAR_0004,VAR_0045,VAR_0046,VAR_0047,VAR_0048,VAR_0049,VAR_0063,VAR_0064,...,VAR_1924,VAR_1925,VAR_1926,VAR_1927,VAR_1928,VAR_1929,VAR_1930,VAR_1931,VAR_1932,VAR_1933
VAR_0002,,,,,,,,,,,...,,,,,,,,,,
VAR_0003,0.145695,,,,,,,,,,...,,,,,,,,,,
VAR_0004,0.004533,0.023100,,,,,,,,,...,,,,,,,,,,
VAR_0045,0.001593,0.003004,0.001587,,,,,,,,...,,,,,,,,,,
VAR_0046,0.001698,0.003448,0.000465,0.687301,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
VAR_1929,0.010286,0.000310,0.004504,0.006562,0.005971,0.007260,0.006141,0.009332,0.017037,0.019052,...,0.004990,0.014095,0.272185,0.327110,0.325254,,,,,
VAR_1930,0.032799,0.005767,0.005049,0.004831,0.008602,0.014985,0.019529,0.041745,0.050944,0.059995,...,0.018068,0.036659,0.978300,0.814077,0.810425,0.383100,,,,
VAR_1931,0.019017,0.007685,0.001427,0.004333,0.009243,0.013751,0.017773,0.036706,0.046441,0.055990,...,0.025728,0.031310,0.808431,0.968016,0.966625,0.404177,0.803869,,,
VAR_1932,0.005748,0.001255,0.003037,0.008278,0.007595,0.007006,0.005801,0.011539,0.021890,0.025288,...,0.007296,0.011599,0.209264,0.233961,0.232859,0.645470,0.277466,0.276069,,


In [70]:
# List column names of highly correlated features (r > 0.95)
lst_to_drop = [c for c in df_corr_mask.columns if any(df_corr_mask[c] > 0.90)]
len(lst_to_drop)

1058

In [72]:
# Drop the features in the to_drop list
X_train_num_prcd = X_train_num_prcd.drop(lst_to_drop, axis=1)

print("The X_train_num_prcd dataframe has {} columns".format(X_train_num_prcd.shape[1]))

The X_train_num_prcd dataframe has 654 columns


In [74]:
#df_dataX.head()
X_train_num_prcd['target'] = y_train['target']

KeyError: 'target'

In [None]:
df_processed = pd.concat([X_train_num_prcd_obj, X_train_num_prcd], axis=1)
df_processed.shape

In [None]:
df_processed.to_pickle(sDir + 'step02_data.pkl')

In [76]:
#del(df_dataX)

Unnamed: 0,target
1438,0
70125,0
13800,0
7676,0
23219,0


### Misc