In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load:

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
input_paths = (
    os.path.join(root, name)
    for root, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)

# Any results you write to the current directory are saved as output.

/kaggle/input/ieee-fraud-detection/test_identity.csv
/kaggle/input/ieee-fraud-detection/sample_submission.csv
/kaggle/input/ieee-fraud-detection/train_identity.csv
/kaggle/input/ieee-fraud-detection/train_transaction.csv
/kaggle/input/ieee-fraud-detection/test_transaction.csv


In [12]:
import seaborn as sns
import matplotlib.pyplot as plt
import datetime
import sys
from sklearn.preprocessing import MinMaxScaler
from sklearn.decomposition import PCA
# Select seaborn's 'darkgrid' style for the figures drawn in this notebook.
sns.set_style('darkgrid')

In [3]:
def reduce_mem_usage(df, verbose=True):
    """Downcast the numeric columns of ``df`` to the narrowest dtype covering their range.

    Integer columns are converted to the smallest of int8/int16/int32/int64
    whose limits contain the column's min and max (limits inclusive). Float
    columns are converted to float16 or float32 when the range fits, otherwise
    float64. Only the value *range* is checked: narrowing a float column can
    lose precision.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to shrink. It is modified in place.
    verbose : bool, default True
        When true, print the final memory footprint and the relative saving.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, after conversion.
    """
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    int_candidates = (np.int8, np.int16, np.int32, np.int64)
    float_candidates = (np.float16, np.float32)

    start_mem = df.memory_usage().sum() / 1024**2
    for col in df.columns:
        col_type = df[col].dtypes
        if col_type not in numerics:
            continue

        c_min = df[col].min()
        c_max = df[col].max()
        if str(col_type)[:3] == 'int':
            # Inclusive bounds: a column whose extreme value equals the dtype
            # limit (e.g. 127 for int8) still fits in that dtype.
            for candidate in int_candidates:
                limits = np.iinfo(candidate)
                if limits.min <= c_min and c_max <= limits.max:
                    df[col] = df[col].astype(candidate)
                    break
            # If nothing matched (e.g. min/max are NaN for an empty column)
            # the column is left untouched.
        else:
            # Columns with no comparable range (all-NaN) fall through to float64.
            target = np.float64
            for candidate in float_candidates:
                limits = np.finfo(candidate)
                if limits.min <= c_min and c_max <= limits.max:
                    target = candidate
                    break
            df[col] = df[col].astype(target)

    end_mem = df.memory_usage().sum() / 1024**2
    if verbose:
        # Avoid dividing by zero when the frame had no measurable footprint.
        reduction = 100 * (start_mem - end_mem) / start_mem if start_mem else 0.0
        print('Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction)'.format(end_mem, reduction))
    return df

In [4]:
# Read the identity and transaction tables of the training split ...
train_identity, train_transaction = (
    pd.read_csv(csv_path)
    for csv_path in (
        '/kaggle/input/ieee-fraud-detection/train_identity.csv',
        '/kaggle/input/ieee-fraud-detection/train_transaction.csv',
    )
)

# ... and the same two tables for the test split.
test_identity, test_transaction = (
    pd.read_csv(csv_path)
    for csv_path in (
        '/kaggle/input/ieee-fraud-detection/test_identity.csv',
        '/kaggle/input/ieee-fraud-detection/test_transaction.csv',
    )
)


In [5]:
# Left-join the identity columns onto each transaction row, keyed by TransactionID,
# so every transaction is kept even when it has no identity record.
train = train_transaction.merge(train_identity, how='left', on='TransactionID')
test = test_transaction.merge(test_identity, how='left', on='TransactionID')

# The four source tables are no longer needed once merged; release them before downcasting.
del train_identity, train_transaction, test_identity, test_transaction

train, test = reduce_mem_usage(train), reduce_mem_usage(test)

Mem. usage decreased to 650.48 Mb (66.8% reduction)
Mem. usage decreased to 565.37 Mb (66.3% reduction)


In [None]:
# The identity/transaction source tables were removed with `del` right after
# the merge, so referencing them here raises NameError on a fresh run. Only the
# merged frames still exist and can be inspected.
print(train.shape)
print(test.shape)

**Exploratory Data Analysis**

In [None]:
# Plain-text summary statistics of the train frame followed by the test frame.
print(train.describe(), test.describe(), sep='\n')

In [None]:
# First rows of each split.
display(train.head(), test.head())

# Class balance of the target: show the counts, then draw them as a bar chart.
fraud_counts = train['isFraud'].value_counts()
display(fraud_counts)
fraud_counts.plot.bar();

In [None]:
# Two stacked panels: number of fraud cases per day (top) and fraud rate per day (bottom).
fig, ax = plt.subplots(2, 1, figsize=(16, 10))
ax_total, ax_rate = ax

# TransactionDT is divided by the number of seconds in a day to get a day index.
train['TransactionDay'] = train['TransactionDT'].floordiv(24 * 3600)

fraud_by_day = train.groupby('TransactionDay')['isFraud']
fraud_by_day.sum().plot.line(ax=ax_total)
fraud_by_day.mean().plot.line(ax=ax_rate)

**Transforming TransactionDT (a time offset in seconds) to Date, Day, Month, Hour and Weekday**

In [None]:
# Assuming Dec 2017 as the start date for the data
startdate = datetime.datetime.strptime('2017-12-01', '%Y-%m-%d')

# TransactionDT is interpreted as a number of seconds elapsed since `startdate`.
# The whole column is converted in one vectorised step instead of calling a
# Python lambda once per row; the resulting timestamps are the same.
train['Date'] = startdate + pd.to_timedelta(train['TransactionDT'], unit='s')

# Calendar features derived from the reconstructed timestamp.
# 'Month/Year' is a string label such as '12/2017' (month is not zero-padded).
train['Month/Year'] = train['Date'].dt.month.astype(str)+'/'+train['Date'].dt.year.astype(str)
train['Weekday'] = train['Date'].dt.dayofweek
train['Hour'] = train['Date'].dt.hour
train['Day'] = train['Date'].dt.day

In [None]:

# Mean fraud rate by calendar month, weekday, day of month and hour (one panel each).
fig, ax = plt.subplots(4, 1, figsize = (16,24))

# 'Month/Year' holds string labels ('12/2017', '1/2018', ...). groupby sorts
# them lexicographically, which would put '1/2018' before '12/2017' and draw
# the line out of time order. Re-order the labels by (year, month) first.
monthly_rate = train.groupby('Month/Year')['isFraud'].mean()
chronological = sorted(
    monthly_rate.index,
    key=lambda label: tuple(int(part) for part in reversed(label.split('/'))),
)
monthly_rate.loc[chronological].plot.line(ax = ax[0],ylim=(0,0.05))

train.groupby('Weekday')['isFraud'].mean().plot.line(ax = ax[1],ylim=(0,0.04))
train.groupby('Day')['isFraud'].mean().plot.line(ax = ax[2],ylim=(0,0.05))
train.groupby('Hour')['isFraud'].mean().plot.line(ax = ax[3],ylim=(0,0.12))

In [6]:
# Stack train on top of test into one frame so features can be built on both at once.
# DataFrame.append was deprecated in pandas 1.4 and removed in 2.0; pd.concat is the
# equivalent call (row index is preserved, columns missing from one frame become NaN).
data = pd.concat([train, test], sort = False)

In [7]:
# Number of missing values per row, excluding the target column 'isFraud'.
# After stacking train and test, any row that came from a frame without the
# label has a null target, so counting it would make this feature encode the
# split instead of data missingness.
# NOTE(review): assumes `test` has no 'isFraud' column — confirm. The membership
# check below keeps the cell working either way.
null_counts = data.isnull().sum(axis=1)
if 'isFraud' in data.columns:
    # Subtract positionally (NumPy array) to avoid index alignment on the stacked frame.
    null_counts = null_counts - data['isFraud'].isnull().to_numpy(dtype=int)
data['NullCount'] = null_counts

In [15]:
# Names of the V feature columns: V1 ... V339.
vcols = [f'V{i}' for i in range(1,340)]
scaler = MinMaxScaler()
# random_state is fixed so the projection is reproducible between runs:
# scikit-learn's PCA may select a randomized SVD solver depending on the input
# size, in which case the components would otherwise vary from run to run.
pca = PCA(n_components = 2, random_state = 0)

# Missing values are replaced by -1, every column is rescaled with MinMaxScaler,
# and the result is projected onto the first two principal components.
vcol_pca = pca.fit_transform(scaler.fit_transform(data[vcols].fillna(-1)))

data['vcol_pca0'] = vcol_pca[:, 0]
data['vcol_pca1'] = vcol_pca[:, 1]
# Per-row count of missing V columns, computed on the original (unfilled) values.
data['vcol_nulls'] = data[vcols].isnull().sum(axis = 1)

In [18]:
# Remove the raw V columns now that they are summarised by the PCA and null-count features.
# errors='ignore' makes the cell safe to re-run: a second execution finds the columns
# already gone and does nothing instead of raising KeyError.
data = data.drop(columns=vcols, errors='ignore')