In [None]:
# importing the libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import random
from collections import Counter
from prettytable import PrettyTable
from time import time
import datetime
from pandas.api.types import is_string_dtype
from pandas.api.types import is_numeric_dtype
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import Normalizer
from sklearn.preprocessing import LabelEncoder
from prettytable import PrettyTable
import sklearn
from sklearn import metrics
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier
from xgboost import XGBClassifier
from sklearn.model_selection import TimeSeriesSplit, train_test_split, GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import roc_auc_score
from sklearn.calibration import CalibratedClassifierCV
from statsmodels.stats.outliers_influence import variance_inflation_factor

%matplotlib inline
import warnings
warnings.filterwarnings("ignore")

In [None]:
# Load the Colab Drive helper and mount Google Drive at /content/drive so that
# files stored on Drive can be read through ordinary file paths.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Reading the datasets.
# All four CSV files live in the same Drive folder; keeping that folder in a
# single constant means relocating the data is a one-line change instead of
# four edits that can drift apart.
DATA_DIR = "/content/drive/MyDrive/Self_Case_Study_1/Datasets/ieee-fraud-detection"

df_train_transaction = pd.read_csv(f"{DATA_DIR}/train_transaction.csv")
df_train_identity = pd.read_csv(f"{DATA_DIR}/train_identity.csv")
df_test_transaction = pd.read_csv(f"{DATA_DIR}/test_transaction.csv")
df_test_identity = pd.read_csv(f"{DATA_DIR}/test_identity.csv")

In [None]:
# Report (rows, columns) for each of the four raw tables in one print call,
# one table per output line.
print(
    f"shape of train_transaction dataset: {df_train_transaction.shape}",
    f"shape of train_identity dataset: {df_train_identity.shape}",
    f"shape of test_transaction dataset: {df_test_transaction.shape}",
    f"shape of test_identity dataset: {df_test_identity.shape}",
    sep="\n",
)

shape of train_transaction dataset: (590540, 394)
shape of train_identity dataset: (144233, 41)
shape of test_transaction dataset: (506691, 393)
shape of test_identity dataset: (141907, 41)


In [None]:
# Left-join the identity table onto the transaction table on TransactionID, so
# every transaction row is kept even when it has no identity record.
df_train = pd.merge(df_train_transaction, df_train_identity, how='left', on='TransactionID')
df_test = pd.merge(df_test_transaction, df_test_identity, how='left', on='TransactionID')

# Keep the class label in its own variable; 'isFraud' also remains in df_train.
target = df_train['isFraud']

In [None]:
# Shapes of the merged train and test frames, one per output line.
print(
    f"shape of training dataset: {df_train.shape}",
    f"shape of test dataset: {df_test.shape}",
    sep="\n",
)

shape of training dataset: (590540, 434)
shape of test dataset: (506691, 433)


In [None]:
# First four rows of each merged frame, separated by a ruler line.
print(df_train.head(4), '=' * 80, df_test.head(4), sep='\n')

   TransactionID  isFraud  TransactionDT  TransactionAmt ProductCD  card1  \
0        2987000        0          86400            68.5         W  13926   
1        2987001        0          86401            29.0         W   2755   
2        2987002        0          86469            59.0         W   4663   
3        2987003        0          86499            50.0         W  18132   

   card2  card3       card4  card5  ... id_31  id_32  id_33  id_34  id_35  \
0    NaN  150.0    discover  142.0  ...   NaN    NaN    NaN    NaN    NaN   
1  404.0  150.0  mastercard  102.0  ...   NaN    NaN    NaN    NaN    NaN   
2  490.0  150.0        visa  166.0  ...   NaN    NaN    NaN    NaN    NaN   
3  567.0  150.0  mastercard  117.0  ...   NaN    NaN    NaN    NaN    NaN   

  id_36 id_37  id_38  DeviceType  DeviceInfo  
0   NaN   NaN    NaN         NaN         NaN  
1   NaN   NaN    NaN         NaN         NaN  
2   NaN   NaN    NaN         NaN         NaN  
3   NaN   NaN    NaN         NaN        

In [None]:
# Column labels of both frames, separated by a ruler line, to compare naming.
print(df_train.columns, '=' * 80, df_test.columns, sep='\n')

Index(['TransactionID', 'isFraud', 'TransactionDT', 'TransactionAmt',
       'ProductCD', 'card1', 'card2', 'card3', 'card4', 'card5',
       ...
       'id_31', 'id_32', 'id_33', 'id_34', 'id_35', 'id_36', 'id_37', 'id_38',
       'DeviceType', 'DeviceInfo'],
      dtype='object', length=434)
Index(['TransactionID', 'TransactionDT', 'TransactionAmt', 'ProductCD',
       'card1', 'card2', 'card3', 'card4', 'card5', 'card6',
       ...
       'id-31', 'id-32', 'id-33', 'id-34', 'id-35', 'id-36', 'id-37', 'id-38',
       'DeviceType', 'DeviceInfo'],
      dtype='object', length=433)


We observed that the names of the id features in the test and train datasets do not match: the test dataset uses the form id-x, whereas the train dataset uses id_x, where x is a number between 01 and 38. We therefore rename the test dataset's id features from id-x to id_x.

In [None]:
# Converting the test dataset's id feature names from id-x to id_x.
# Every hyphen in every column label is replaced by an underscore. Relabelling
# all columns in one assignment avoids issuing a separate in-place rename per
# column and leaves no loop variables behind in the notebook namespace.
df_test.columns = [name.replace('-', '_') for name in df_test.columns]

In [None]:
# Column labels of both frames again, to confirm the names now agree.
print(df_train.columns, '=' * 50, df_test.columns, sep='\n')

Index(['TransactionID', 'isFraud', 'TransactionDT', 'TransactionAmt',
       'ProductCD', 'card1', 'card2', 'card3', 'card4', 'card5',
       ...
       'id_31', 'id_32', 'id_33', 'id_34', 'id_35', 'id_36', 'id_37', 'id_38',
       'DeviceType', 'DeviceInfo'],
      dtype='object', length=434)
Index(['TransactionID', 'TransactionDT', 'TransactionAmt', 'ProductCD',
       'card1', 'card2', 'card3', 'card4', 'card5', 'card6',
       ...
       'id_31', 'id_32', 'id_33', 'id_34', 'id_35', 'id_36', 'id_37', 'id_38',
       'DeviceType', 'DeviceInfo'],
      dtype='object', length=433)


### Missing data and duplicates

In [None]:
# Getting the missing data from the train dataset.
# The per-column null counts are computed once and reused for both the absolute
# count and the fraction, instead of rebuilding the null mask of the whole frame
# three times. Both series are sorted in descending order, as before.
total = df_train.isnull().sum()
percent = (total / len(df_train)).sort_values(ascending=False)   # fraction of rows missing per column
total = total.sort_values(ascending=False)                       # number of rows missing per column
missing_data_train = pd.concat([total, percent], axis=1, keys=['Total', 'Percent'])
missing_data_train.head(10)

Unnamed: 0,Total,Percent
id_24,585793,0.991962
id_25,585408,0.99131
id_07,585385,0.991271
id_08,585385,0.991271
id_21,585381,0.991264
id_26,585377,0.991257
id_27,585371,0.991247
id_23,585371,0.991247
id_22,585371,0.991247
dist2,552913,0.936284


In [None]:
# Getting the missing data from the test dataset.
# The per-column null counts are computed once and reused for both the absolute
# count and the fraction, instead of rebuilding the null mask of the whole frame
# three times. Both series are sorted in descending order, as before.
total = df_test.isnull().sum()
percent = (total / len(df_test)).sort_values(ascending=False)   # fraction of rows missing per column
total = total.sort_values(ascending=False)                      # number of rows missing per column
missing_data_test = pd.concat([total, percent], axis=1, keys=['Total', 'Percent'])
missing_data_test.head(10)

Unnamed: 0,Total,Percent
id_24,501951,0.990645
id_25,501652,0.990055
id_26,501644,0.990039
id_21,501632,0.990016
id_08,501632,0.990016
id_07,501632,0.990016
id_27,501629,0.99001
id_23,501629,0.99001
id_22,501629,0.99001
dist2,470255,0.92809


In [None]:
## duplicates in dataset
# Count fully duplicated rows in each merged frame and report both counts.
print(
    f"number of duplicate rows in train dataset: {df_train.duplicated().sum()}",
    f"number of duplicate rows in test dataset: {df_test.duplicated().sum()}",
    sep="\n",
)

# No duplicates found

number of duplicate rows in train dataset: 0
number of duplicate rows in test dataset: 0


In [None]:
# Available categorical features, assembled from the naming pattern of each
# feature family (order matches the hand-written list this replaces).
cat_fea = (
    ['ProductCD']
    + [f'card{n}' for n in range(1, 7)]        # card1 .. card6
    + ['addr1', 'addr2', 'P_emaildomain', 'R_emaildomain']
    + [f'M{n}' for n in range(1, 10)]          # M1 .. M9
    + ['DeviceType', 'DeviceInfo']
    + [f'id_{n}' for n in range(12, 39)]       # id_12 .. id_38
)

In [None]:
# Available numerical features.
# The list is generated from the naming pattern of each feature family rather
# than typed out by hand: the hand-written version was missing a comma between
# 'id_08' and 'id_09', so Python silently concatenated the two adjacent string
# literals into one bogus entry 'id_08id_09' and neither real column was listed.
num_fea = (
    ['TransactionDT', 'TransactionAmt', 'dist1', 'dist2']
    + [f'C{n}' for n in range(1, 15)]          # C1 .. C14
    + [f'D{n}' for n in range(1, 16)]          # D1 .. D15
    + [f'V{n}' for n in range(1, 340)]         # V1 .. V339
    + [f'id_{n:02d}' for n in range(1, 12)]    # id_01 .. id_11 (zero-padded)
)

In [None]:
# data preprocessing
def preprocessing(data, feature):
    '''
    Replace spaces, hyphens and forward slashes in a string column with
    underscores.

    Parameters
    ----------
    data : DataFrame holding the column; the column is overwritten in place.
    feature : label of the column to clean. The column must support the
        pandas ``.str`` accessor (i.e. hold strings).

    Returns
    -------
    The cleaned column, read back from ``data``.
    '''
    # One pass with an explicit character class. Stating regex=True keeps the
    # behaviour independent of the pandas version, whose default for `regex`
    # in Series.str.replace has changed between releases. The trailing '-' in
    # the class is a literal hyphen, not a range.
    data[feature] = data[feature].str.replace(r'[ /-]', '_', regex=True)

    return data[feature]

# Clean the separator characters in a few categorical columns, giving the train
# and test frames the same treatment column by column.
for column in ('DeviceInfo', 'card4', 'card6', 'id_30', 'id_31'):
    df_train[column] = preprocessing(df_train, column)
    df_test[column] = preprocessing(df_test, column)

### Splitting the dataset into train and test

In [None]:
# Training side: the class label goes to y_train; the label and the
# TransactionID column are removed from the feature matrix.
y_train = df_train['isFraud']
x_train = df_train.drop(columns=['isFraud', 'TransactionID'])

# Test side: keep the TransactionID values aside and drop the column from the
# feature matrix.
test_ids = df_test['TransactionID'].values
x_test = df_test.drop(columns=['TransactionID'])

# The merged frames are no longer needed once the pieces above exist.
del df_train, df_test

### Encoding & scaling the data

In [None]:
def label_encoding(X_train, X_test, cat_features):

    '''
    Utility Function to Encode Categorical Features.

    For every feature in ``cat_features`` both columns are cast to ``str``, a
    LabelEncoder is fitted on the training column only, and the training
    column is replaced by its integer codes. The test column is encoded with
    the same category -> code mapping; any category that was not present in
    the training column is encoded as -1.

    Parameters
    ----------
    X_train, X_test : DataFrames containing every column named in
        ``cat_features``. Both are modified in place.
    cat_features : iterable of column labels to encode.

    Returns
    -------
    tuple ``(X_train, X_test)`` -- the same two objects that were passed in.
    '''

    for fea in cat_features:
        X_train[fea] = X_train[fea].astype(str)
        X_test[fea] = X_test[fea].astype(str)

        label_enc = LabelEncoder()
        X_train[fea] = label_enc.fit_transform(X_train[fea])

        # classes_ is stored in code order, so a category's position is its code.
        mapping = {category: code for code, category in enumerate(label_enc.classes_)}

        # Encode the test column with the training mapping. Series.map yields
        # NaN for categories absent from the mapping; those become -1. This is
        # a vectorised replacement for a per-row Python lookup, which matters
        # on large test frames. The codes fitted above start at 0, so -1 can
        # never collide with a real category.
        X_test[fea] = X_test[fea].map(mapping).fillna(-1).astype('int64')

    return (X_train, X_test)

In [None]:
# normalization using minmaxscaler

def min_max_scaler(X_train, X_test, cat_features=None):
    '''
    Min-max scale every non-categorical column, fitting on the training data.

    Works on copies, so the frames passed in are left untouched. For each
    column of ``X_train`` that is not listed as categorical, a separate
    MinMaxScaler is fitted on the training column and applied to both the
    training and the test column; values that are still missing after scaling
    are then replaced with -1.

    Parameters
    ----------
    X_train, X_test : DataFrames; ``X_test`` must contain every column of
        ``X_train`` that gets scaled.
    cat_features : optional iterable of column labels to leave unscaled.
        Defaults to the notebook-level ``cat_fea`` list.

    Returns
    -------
    tuple ``(X_train_norm, X_test_norm)`` of the scaled copies.
    '''
    if cat_features is None:
        cat_features = cat_fea
    skip = set(cat_features)

    X_train_norm = X_train.copy()
    X_test_norm = X_test.copy()

    for fea in X_train.columns:
        if fea in skip:
            continue

        scale = MinMaxScaler()
        # The scaler wants a 2-D (n_samples, 1) array; flatten the result back
        # to 1-D before storing it as a column.
        X_train_norm[fea] = scale.fit_transform(X_train_norm[fea].values.reshape(-1, 1)).ravel()
        X_test_norm[fea] = scale.transform(X_test_norm[fea].values.reshape(-1, 1)).ravel()

        # Assign the filled column back explicitly. Calling
        # fillna(..., inplace=True) on a column selection is a chained in-place
        # operation that does not update the parent frame under pandas
        # Copy-on-Write, which would silently leave the NaNs in place.
        X_train_norm[fea] = X_train_norm[fea].fillna(-1)
        X_test_norm[fea] = X_test_norm[fea].fillna(-1)

    return (X_train_norm, X_test_norm)

In [None]:
# function to save test predictions in a file

def predict_and_save(prediction, name):

    '''
        Save the test-set predictions to a CSV file.

        Pairs the notebook-level ``test_ids`` array with ``prediction`` (both
        flattened to 1-D), orders the rows by TransactionID and writes the
        columns 'TransactionID' and 'isFraud' to the path ``name`` without
        the index.
    '''

    submission = pd.DataFrame(
        {
            'TransactionID': test_ids.reshape(-1),
            'isFraud': prediction.reshape(-1),
        }
    )
    submission.sort_values('TransactionID').to_csv(name, index=False)

In [None]:
# Label-encode the categorical features listed in cat_fea; the frames returned
# by label_encoding replace x_train and x_test.
x_train, x_test = label_encoding(x_train, x_test, cat_fea)

Hyper-Parameter Tuning using RandomizedSearchCV

In [None]:
# Hyperparameters: candidate values for each setting explored by the search.
param_grid = dict(
    learning_rate=[0.002, 0.02, 0.2],
    max_depth=[12, 16, 20],
    subsample=[0.4, 0.6, 0.8],
    colsample_bytree=[0.4, 0.6, 0.8],
    n_estimators=[1000, 2000, 3000, 5000],
    tree_method=['gpu_hist'],
)

In [None]:
import xgboost as xgb

# Randomised search over param_grid: 6 sampled candidates, 3-fold
# cross-validation, scored by ROC AUC, with train scores recorded as well.
clf = xgb.XGBClassifier()
search = RandomizedSearchCV(
    estimator=clf,
    param_distributions=param_grid,
    n_iter=6,
    cv=3,
    scoring='roc_auc',
    return_train_score=True,
    random_state=10,
    verbose=20,
)

search.fit(x_train, y_train)

Fitting 3 folds for each of 6 candidates, totalling 18 fits
[CV 1/3; 1/6] START colsample_bytree=0.8, learning_rate=0.002, max_depth=12, n_estimators=2000, subsample=0.4, tree_method=gpu_hist
[CV 1/3; 1/6] END colsample_bytree=0.8, learning_rate=0.002, max_depth=12, n_estimators=2000, subsample=0.4, tree_method=gpu_hist;, score=(train=0.960, test=0.873) total time= 3.2min
[CV 2/3; 1/6] START colsample_bytree=0.8, learning_rate=0.002, max_depth=12, n_estimators=2000, subsample=0.4, tree_method=gpu_hist
[CV 2/3; 1/6] END colsample_bytree=0.8, learning_rate=0.002, max_depth=12, n_estimators=2000, subsample=0.4, tree_method=gpu_hist;, score=(train=0.962, test=0.884) total time= 3.4min
[CV 3/3; 1/6] START colsample_bytree=0.8, learning_rate=0.002, max_depth=12, n_estimators=2000, subsample=0.4, tree_method=gpu_hist
[CV 3/3; 1/6] END colsample_bytree=0.8, learning_rate=0.002, max_depth=12, n_estimators=2000, subsample=0.4, tree_method=gpu_hist;, score=(train=0.958, test=0.893) total time= 3.

In [None]:
# Tabulate the search results and pull out the mean / standard deviation of the
# train and cross-validation scores for each sampled candidate.
results = pd.DataFrame(search.cv_results_)
train_auc, train_auc_std = results['mean_train_score'], results['std_train_score']
cv_auc, cv_auc_std = results['mean_test_score'], results['std_test_score']

In [None]:
# Best parameter combination found by the search and its mean CV score.
print(
    'Best Parameters: ' + str(search.best_params_),
    'Best Score: ' + str(search.best_score_),
    sep='\n',
)

Best Parameters: {'tree_method': 'gpu_hist', 'subsample': 0.8, 'n_estimators': 1000, 'max_depth': 20, 'learning_rate': 0.2, 'colsample_bytree': 0.4}
Best Score: 0.8954609389974263


In [None]:
# Unpack the winning value of each tuned hyperparameter into its own variable.
(
    best_learning_rate,
    best_estimator,
    best_depth,
    best_subsample,
    best_colsample_bytree,
) = (
    search.best_params_[key]
    for key in ('learning_rate', 'n_estimators', 'max_depth', 'subsample', 'colsample_bytree')
)