# Cleaning

## Loading data

In [1]:
import os
import kaggle
import numpy as np
import pandas as pd
import plotly.express as px
import matplotlib.pyplot as plt

In [2]:
# Notebook-wide rendering options: never truncate columns, show at most 25 rows.
pd.options.display.max_columns = None
pd.options.display.max_rows = 25

In [3]:
# Define the raw / processed data directories and make sure the raw CSVs exist.
# If either fraudTrain.csv or fraudTest.csv is missing from RAW_DATA_PATH, the
# project's make_dataset.py script is run through the IPython shell escape
# (presumably it downloads the dataset with the Kaggle API key -- TODO confirm,
# the script is not part of this notebook).

# NOTE: despite its name, ABS_PATH is a *relative* path; it is resolved against
# the kernel's current working directory.
ABS_PATH = "../"
RAW_DATA_PATH = os.path.join( ABS_PATH, "data/raw/" )
PRO_DATA_PATH = os.path.join( ABS_PATH, "data/processed/" )
if not os.path.isfile( os.path.join(RAW_DATA_PATH, "fraudTrain.csv") ) or not os.path.isfile( os.path.join(RAW_DATA_PATH, "fraudTest.csv") ):
    !python ../src/data/make_dataset.py

In [4]:
# Read the raw train and test splits; the first CSV column becomes the index.
train_csv_path = os.path.join(RAW_DATA_PATH, "fraudTrain.csv")
test_csv_path = os.path.join(RAW_DATA_PATH, "fraudTest.csv")

df_train = pd.read_csv(train_csv_path, index_col=0)
df_test = pd.read_csv(test_csv_path, index_col=0)

## Data cleaning

Casting the time-related values to `Datetime`:

In [5]:
# Parse the transaction timestamp and date-of-birth columns of both splits into
# datetime64 values.
#
# errors='ignore' is deliberately NOT passed: it is deprecated in recent pandas
# releases and, when a value cannot be parsed, it silently hands back the
# original strings, so the failure would only show up later as an obscure error
# in date arithmetic. With the default errors='raise', malformed data fails
# right here, where the cause is obvious.
datetime_columns = ['trans_date_trans_time', 'dob']

for frame in (df_train, df_test):
    for column in datetime_columns:
        frame[column] = pd.to_datetime(frame[column])

Dropping irrelevant or redundant columns: the geolocation and time information they carry is already contained in other columns, so removing them reduces the dimensionality of the data:

In [6]:
# Columns removed from both the train and the test split.
drop_columns = [
    'first', 'last',
    'street', 'city', 'state', 'zip',
    'trans_num', 'unix_time',
]

# Same in-place drop applied to each split.
for frame in (df_train, df_test):
    frame.drop(columns=drop_columns, inplace=True)

We compute the age of each cardholder at the time of the transaction by subtracting the date of birth from the transaction timestamp:

In [7]:
# Age of the cardholder, in whole years, at the moment of each transaction:
# (transaction timestamp - date of birth) expressed in years, rounded to the
# nearest integer and stored as int64.
#
# One year is spelled out as 365.2425 days (the mean Gregorian year, i.e. the
# length NumPy's 'Y' unit stands for) instead of np.timedelta64(1, 'Y'):
# calendar units such as 'Y' are ambiguous and are rejected by recent pandas
# releases in timedelta arithmetic.
one_year = pd.Timedelta(days=365.2425)

df_train['age'] = np.round((df_train['trans_date_trans_time'] - df_train['dob']) / one_year)
df_train = df_train.astype({'age' : 'int64'})

# The test ages must come from the TEST frame's own columns. Using df_train
# here would align on the index and give every test row the age of an unrelated
# train transaction.
df_test['age'] = np.round((df_test['trans_date_trans_time'] - df_test['dob']) / one_year)
df_test = df_test.astype({'age' : 'int64'})

# The date of birth is no longer needed once the age has been derived.
drop_columns = ['dob']

df_train.drop(columns=drop_columns, inplace=True)
df_test.drop(columns=drop_columns, inplace=True)

Renaming a few columns for displaying purposes:

In [8]:
# Mapping from the raw column names to more readable ones, applied in place to
# both splits.
trans_dict = {'trans_date_trans_time':'timestamp', 'cc_num':'credit_card_num', 'merchant':'shop', 'amt':'amount'}

df_train.rename(columns=trans_dict, inplace=True)
df_test.rename(columns=trans_dict, inplace=True)

# Preview the result. A notebook only renders the LAST expression of a cell, so
# the preview has to come after the rename (it was previously the first
# statement, where its output was discarded).
df_train.head()

## Saving the results

Two main formats are used: .csv files, which are easier for humans to inspect, and .pkl files, which load faster in other Python scripts.

In [None]:
# Persist the cleaned splits as CSV (without the index) and as pickle.

# Make sure the output directory exists so the writes below do not fail on a
# fresh checkout.
os.makedirs(PRO_DATA_PATH, exist_ok=True)

df_train.to_csv(os.path.join(PRO_DATA_PATH, "clean_fraudTrain.csv"), index=False)
df_test.to_csv(os.path.join(PRO_DATA_PATH, "clean_fraudTest.csv"), index=False)

df_train.to_pickle(os.path.join(PRO_DATA_PATH, "clean_fraudTrain.pkl"))
# File name fixed from "clean_fraudText.pkl" (typo) so it matches the CSV
# counterpart. NOTE(review): update any downstream loader that still reads the
# misspelled name.
df_test.to_pickle(os.path.join(PRO_DATA_PATH, "clean_fraudTest.pkl"))