# Read Data

In [15]:
import itertools
import pandas as pd
import numpy as np
from sklearn.feature_selection import VarianceThreshold

In [6]:
# load the train dataset from the current working directory
X_train = pd.read_csv(filepath_or_buffer='X_train.csv')

The next steps come with a certain caveat. 

We modify and delete features that have the same value for all samples (so the variance of these individual features = 0) or that contain 'special' values. These findings hold for one dataset (e.g. train), but might look completely different for the valid or even the test dataset. As a consequence, the outcome of these operations has to be applied to the valid and test sets too, even though the variance might NOT be 0 there or the 'special' values might be distributed differently. 

All this might impact the final model. It might be worth it to double check what the model would look like without these preparations. 

# Feature Preparation

## Imputing 'special' values or missing values

In [8]:
# Count missing values per attribute and report how many attributes are affected.
res = X_train.isnull().sum()
attrs_with_missing = res[res > 0]
if attrs_with_missing.empty:
    print('...no attributes with missing values found')
else:
    # report the number of affected attributes, not the whole per-column Series
    print('...found {0} attributes with missing values'.format(attrs_with_missing.size))

# analyze attributes manually that have 'fillers' such as -999999 and replace fillers with new value
# in the examples below the 75% percentile has been taken as a new value to replace fillers
# (the replacement values 2 and 0 come from that manual analysis; they are not recomputed here)
delta_filler_cols = [
    'delta_imp_amort_var18_1y3',
    'delta_imp_amort_var34_1y3',
    'delta_imp_aport_var13_1y3',
    'delta_imp_aport_var17_1y3',
    'delta_imp_compra_var44_1y3',
    'delta_imp_reemb_var13_1y3',
    'delta_imp_reemb_var17_1y3',
    'delta_imp_reemb_var33_1y3',
    'delta_imp_trasp_var17_in_1y3',
    'delta_imp_trasp_var17_out_1y3',
    'delta_imp_trasp_var33_in_1y3',
    'delta_imp_trasp_var33_out_1y3',
    'delta_imp_venta_var44_1y3',
    'delta_num_aport_var13_1y3',
    'delta_num_aport_var17_1y3',
    'delta_num_aport_var33_1y3',
    'delta_num_compra_var44_1y3',
    'delta_num_reemb_var13_1y3',
    'delta_num_reemb_var17_1y3',
    'delta_num_reemb_var33_1y3',
    'delta_num_trasp_var17_in_1y3',
    'delta_num_trasp_var17_out_1y3',
    'delta_num_trasp_var33_in_1y3',
    'delta_num_trasp_var33_out_1y3',
    'delta_num_venta_var44_1y3',
]

# Assign the result back to the column instead of calling replace(inplace=True)
# on X_train[col]: the chained in-place form operates on a temporary Series and
# leaves the frame untouched when pandas Copy-on-Write is active.
X_train['var3'] = X_train['var3'].replace(to_replace=-999999, value=2)
for col in delta_filler_cols:
    X_train[col] = X_train[col].replace(to_replace=9999999999, value=0)

...no attributes with missing values found


## Delete features with zero variance

In [13]:
n_features_originally = X_train.shape[1]

# `data` is the full-width train frame. It is bound here because no other cell
# defines it, and the valid-set preparation later indexes `data.columns` with
# `feat_ix_delete`, so it must keep every original column.
data = X_train

# default threshold keeps only features with non-zero variance
selector = VarianceThreshold()
selector.fit(data)

# positions of all features, of the kept ones, and (by difference) of the removed ones
orig_feat_ix = np.arange(data.columns.size)
feat_ix_keep = selector.get_support(indices=True)
feat_ix_delete = np.delete(orig_feat_ix, feat_ix_keep)

# Rebind X_train to a reduced copy instead of dropping in place; an in-place drop
# would also shrink `data` and make `data.columns[feat_ix_delete]` point at the
# wrong columns afterwards.
X_train = data.drop(labels=data.columns[feat_ix_delete], axis=1)

n_features_deleted = feat_ix_delete.size

# built-in float replaces the removed `np.float` alias
print(' - deleted %s / %s features (~= %.1f %%)' % (n_features_deleted, n_features_originally,
      100.0 * (float(n_features_deleted) / n_features_originally)))

 - deleted 44 / 370 features (~= 11.9 %)


## Delete identical features

In [17]:
n_features_originally = len(X_train.columns)

# For every unordered pair of columns, remember the later one whenever both
# columns hold exactly the same values; the earlier one of each pair is kept.
duplicate_hits = [
    later
    for earlier, later in itertools.combinations(X_train.columns, 2)
    if np.array_equal(X_train[earlier], X_train[later])
]

# a column equal to several earlier columns is collected more than once -> de-duplicate
feat_names_delete = np.unique(duplicate_hits)

X_train.drop(columns=feat_names_delete, inplace=True)

n_features_deleted = len(feat_names_delete)

print(' - Deleted %s / %s features' % (n_features_deleted, n_features_originally))

 - Deleted 50 / 326 features


# Prepare valid set accordingly

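Apply exactly the same steps as above to the valid data set, but this time without analyzing it: an analysis of the valid set might yield different results, yet the valid set has to receive exactly the transformations that were derived from the train set. 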

In [23]:
# load the valid dataset from the current working directory
X_valid = pd.read_csv(filepath_or_buffer='X_valid.csv')

In [24]:
# impute missing values and replace 'special' values
# Same filler values and replacements as applied to the train set; nothing is
# re-derived from the valid data itself.
delta_filler_cols = [
    'delta_imp_amort_var18_1y3',
    'delta_imp_amort_var34_1y3',
    'delta_imp_aport_var13_1y3',
    'delta_imp_aport_var17_1y3',
    'delta_imp_compra_var44_1y3',
    'delta_imp_reemb_var13_1y3',
    'delta_imp_reemb_var17_1y3',
    'delta_imp_reemb_var33_1y3',
    'delta_imp_trasp_var17_in_1y3',
    'delta_imp_trasp_var17_out_1y3',
    'delta_imp_trasp_var33_in_1y3',
    'delta_imp_trasp_var33_out_1y3',
    'delta_imp_venta_var44_1y3',
    'delta_num_aport_var13_1y3',
    'delta_num_aport_var17_1y3',
    'delta_num_aport_var33_1y3',
    'delta_num_compra_var44_1y3',
    'delta_num_reemb_var13_1y3',
    'delta_num_reemb_var17_1y3',
    'delta_num_reemb_var33_1y3',
    'delta_num_trasp_var17_in_1y3',
    'delta_num_trasp_var17_out_1y3',
    'delta_num_trasp_var33_in_1y3',
    'delta_num_trasp_var33_out_1y3',
    'delta_num_venta_var44_1y3',
]

# Assign the result back to the column instead of calling replace(inplace=True)
# on X_valid[col]: the chained in-place form operates on a temporary Series and
# leaves the frame untouched when pandas Copy-on-Write is active.
X_valid['var3'] = X_valid['var3'].replace(to_replace=-999999, value=2)
for col in delta_filler_cols:
    X_valid[col] = X_valid[col].replace(to_replace=9999999999, value=0)

In [25]:
# drop zero variance features according to X_train outcome
# The labels are derived by name instead of via `data.columns[feat_ix_delete]`,
# because `data` is not defined by any cell of this notebook. X_train has lost the
# zero-variance features and the identical features by now, so what X_valid still
# has beyond X_train, minus the identical ones (dropped in the next cell), are the
# zero-variance features.
# NOTE(review): assumes X_valid.csv carries the same columns as X_train.csv did
# before reduction; any column that only exists in X_valid is dropped here too.
zero_var_names = X_valid.columns.difference(X_train.columns).difference(feat_names_delete)
X_valid.drop(labels=zero_var_names, axis=1, inplace=True)

In [26]:
# remove the columns that were found to be duplicates of other columns in X_train
X_valid.drop(columns=feat_names_delete, inplace=True)

# Write data 

In [28]:
# write reduced train and valid data into separate .csv files for further processing
# NOTE(review): to_csv writes the row index as the first column by default;
# confirm that downstream readers expect it.
for frame, target in ((X_train, "X_train_reduced.csv"), (X_valid, "X_valid_reduced.csv")):
    frame.to_csv(target, sep=',', header=True)