In [1]:
# Load the pycodestyle_magic extension and switch on its flake8 check,
# so the cells executed after this one are linted as they run.
%load_ext pycodestyle_magic
%flake8_on

In [2]:
# Reload imported modules automatically before each cell runs, so edits
# to local helper modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [3]:
import os

import numpy as np
import pandas as pd
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
import xgboost as xgb
from reduce_mem_usage import reduce_mem_usage
import matplotlib.pyplot as plt

# Widen the notebook display limits for wide / long frames.
# The fully qualified option names are required: the short forms
# 'max_columns' / 'max_rows' are treated as patterns and become ambiguous
# (OptionError) on pandas versions that also define styler.render.max_*.
pd.set_option('display.max_columns', 100)
pd.set_option('display.max_rows', 1000)

In [4]:
# Every raw table is read with TransactionID as its index.
by_id = {'index_col': 'TransactionID'}

train_transaction = pd.read_csv('input/train_transaction.csv', **by_id)
test_transaction = pd.read_csv('input/test_transaction.csv', **by_id)
train_identity = pd.read_csv('input/train_identity.csv', **by_id)
test_identity = pd.read_csv('input/test_identity.csv', **by_id)
sample_submission = pd.read_csv('input/sample_submission.csv', **by_id)

# Left-join the identity table onto the transactions by index, so every
# transaction row is kept even when it has no identity record.
on_index = {'how': 'left', 'left_index': True, 'right_index': True}
train = pd.merge(train_transaction, train_identity, **on_index)
test = pd.merge(test_transaction, test_identity, **on_index)

# Report (rows, columns) of both merged frames.
for merged in (train, test):
    print(merged.shape)

(590540, 433)
(506691, 432)


In [5]:
# Keep the target aside, then release the raw tables that were merged.
y_train = train['isFraud'].copy()
del train_transaction, train_identity, test_transaction, test_identity

# Feature frames: train without the target column, test copied as-is.
# No missing-value imputation is performed here.
X_train = train.drop('isFraud', axis=1)
X_test = test.copy()

del train, test

# Label-encode each column that has object dtype in either frame.
# The encoder is fitted on train and test values together so both frames
# share one mapping.
for col in X_train.columns:
    if not any(df[col].dtype == 'object' for df in (X_train, X_test)):
        continue
    train_values = list(X_train[col].values)
    test_values = list(X_test[col].values)
    encoder = preprocessing.LabelEncoder()
    encoder.fit(train_values + test_values)
    X_train[col] = encoder.transform(train_values)
    X_test[col] = encoder.transform(test_values)

In [6]:
# Add a column of standard-normal noise to the training features
# (presumably a reference point for judging feature importance - confirm).
# A dedicated, seeded generator makes the column - and the CSVs written
# later - identical on every Restart & Run All, without touching NumPy's
# global random state.
NOISE_SEED = 42
noise_rng = np.random.RandomState(NOISE_SEED)
X_train['RANDOM_NOISE'] = noise_rng.normal(size=len(X_train))

In [7]:
# Pass each feature frame through the project's reduce_mem_usage helper
# and keep the returned frame; the helper reports the memory saving.
# The frames are processed one after the other, train first.
X_train = reduce_mem_usage(X_train)
X_test = reduce_mem_usage(X_test)

Mem. usage decreased to 548.27 Mb (72.2% reduction)
Mem. usage decreased to 460.02 Mb (72.5% reduction)


In [8]:
# Keep an independent copy of the test features under its own name:
# the split cells further down rebind X_test to a slice of the train data.
X_PL = X_test.copy()

In [9]:
# Display (rows, columns) of the training features as the cell output.
X_train.shape

(590540, 433)

In [10]:
# Put the target back next to the features as an 'isFraud' column, so the
# row-wise splits below carry the labels along with the features.
# DataFrame.assign returns a new frame in one step, which avoids the two
# full copies and the temporary variable the previous version needed.
# .values is used so the labels are attached by position, not by index.
X_train = X_train.assign(isFraud=y_train.values)

# Train / Valid / Test split

In [11]:
# Snapshot of the full training frame (features plus 'isFraud') taken
# before the split cells below rebind X_train.
X_train_initial_copy = X_train.copy()

In [17]:
# First hold out the last 30 percent of the rows; shuffle=False keeps the
# original row order, so the first 70 percent becomes the training set.
# The split reads from the untouched snapshot rather than from X_train
# itself, so re-running this cell gives the same result instead of
# shrinking the training set a second time.
X_train, X_test = train_test_split(X_train_initial_copy,
                                   shuffle=False,
                                   test_size=0.30)

In [18]:
# Cut the 30 percent hold-out in half, again in row order: the first half
# is the validation set and the second half stays as the test set.
# Overall: train 70%, valid 15%, test 15%.
X_valid, X_test = train_test_split(X_test, shuffle=False, test_size=0.5)

In [19]:
# Display the (rows, columns) of the three splits to check their sizes.
X_train.shape, X_valid.shape, X_test.shape

((413378, 434), (88581, 434), (88581, 434))

In [20]:
# Write the three splits, plus the copy of the original test features
# (X_PL), to CSV files in the input directory.
outputs = (
    (X_train, 'input/X_train.csv'),
    (X_valid, 'input/X_valid.csv'),
    (X_test, 'input/X_test.csv'),
    (X_PL, 'input/X_PL.csv'),
)
for frame, path in outputs:
    frame.to_csv(path)