In [75]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
pd.options.mode.use_inf_as_na = True
import torch as th

from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn.compose import make_column_transformer
from sklearn.preprocessing import FunctionTransformer

from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score, accuracy_score, f1_score, precision_score, recall_score

from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import AdaBoostClassifier
from xgboost import XGBClassifier
from catboost import CatBoostClassifier

In [6]:
#Load train and test data
train_transaction=pd.read_csv('dataset/train_transaction.csv')
train_identity=pd.read_csv('dataset/train_identity.csv')
test_transaction=pd.read_csv('dataset/test_transaction.csv')
test_identity=pd.read_csv('dataset/test_identity.csv')

In [7]:
train_transaction.head()

Unnamed: 0,TransactionID,isFraud,TransactionDT,TransactionAmt,ProductCD,card1,card2,card3,card4,card5,...,V330,V331,V332,V333,V334,V335,V336,V337,V338,V339
0,2987000,0,86400,68.5,W,13926,,150.0,discover,142.0,...,,,,,,,,,,
1,2987001,0,86401,29.0,W,2755,404.0,150.0,mastercard,102.0,...,,,,,,,,,,
2,2987002,0,86469,59.0,W,4663,490.0,150.0,visa,166.0,...,,,,,,,,,,
3,2987003,0,86499,50.0,W,18132,567.0,150.0,mastercard,117.0,...,,,,,,,,,,
4,2987004,0,86506,50.0,H,4497,514.0,150.0,mastercard,102.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [8]:
train_identity.head()

Unnamed: 0,TransactionID,id_01,id_02,id_03,id_04,id_05,id_06,id_07,id_08,id_09,...,id_31,id_32,id_33,id_34,id_35,id_36,id_37,id_38,DeviceType,DeviceInfo
0,2987004,0.0,70787.0,,,,,,,,...,samsung browser 6.2,32.0,2220x1080,match_status:2,T,F,T,T,mobile,SAMSUNG SM-G892A Build/NRD90M
1,2987008,-5.0,98945.0,,,0.0,-5.0,,,,...,mobile safari 11.0,32.0,1334x750,match_status:1,T,F,F,T,mobile,iOS Device
2,2987010,-5.0,191631.0,0.0,0.0,0.0,0.0,,,0.0,...,chrome 62.0,,,,F,F,T,T,desktop,Windows
3,2987011,-5.0,221832.0,,,0.0,-6.0,,,,...,chrome 62.0,,,,F,F,T,T,desktop,
4,2987016,0.0,7460.0,0.0,0.0,1.0,0.0,,,0.0,...,chrome 62.0,24.0,1280x800,match_status:2,T,F,T,T,desktop,MacOS


In [9]:
#merge transaction and identity data for both train and test
train_df = train_transaction.merge(train_identity, how="left", on="TransactionID")
del train_transaction, train_identity
test_df = test_transaction.merge(test_identity, how="left", on="TransactionID")
del test_transaction, test_identity

In [10]:
print(train_df.shape)

(590540, 434)


In [11]:
print(test_df.shape)

(506691, 433)


In [12]:
#function to inspect different data features(cols) and discard trivial cols with more than 60% NaN values
def discard_trivial_cols(df):
    trivial_columns = []
    for col in df.columns:
        nan_count = df[col].isnull().sum()
        nan_percentage = nan_count / len(df[col])
        if nan_percentage > 0.6:
            trivial_columns.append(col)
    filtered_df = df.drop(trivial_columns, axis=1)
    return filtered_df

In [13]:
train_df = discard_trivial_cols(train_df)
test_df = discard_trivial_cols(test_df)
train_df = train_df.drop('TransactionDT', axis=1)
test_df = test_df.drop('TransactionDT', axis=1)

In [14]:
train_df.head()

Unnamed: 0,TransactionID,isFraud,TransactionAmt,ProductCD,card1,card2,card3,card4,card5,card6,...,V312,V313,V314,V315,V316,V317,V318,V319,V320,V321
0,2987000,0,68.5,W,13926,,150.0,discover,142.0,credit,...,0.0,0.0,0.0,0.0,0.0,117.0,0.0,0.0,0.0,0.0
1,2987001,0,29.0,W,2755,404.0,150.0,mastercard,102.0,credit,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,2987002,0,59.0,W,4663,490.0,150.0,visa,166.0,debit,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,2987003,0,50.0,W,18132,567.0,150.0,mastercard,117.0,debit,...,135.0,0.0,0.0,0.0,50.0,1404.0,790.0,0.0,0.0,0.0
4,2987004,0,50.0,H,4497,514.0,150.0,mastercard,102.0,credit,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [15]:
#This function was suggested by Kaggle community in the competition's discussion to minimize the memory used by the data
def reduce_mem_usage(df, verbose=True):
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    start_mem = df.memory_usage().sum() / 1024**2
    for col in df.columns:
        col_type = df[col].dtypes
        if col_type in numerics:
            c_min = df[col].min()
            c_max = df[col].max()
            if str(col_type)[:3] == 'int':
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int64)
            else:
                if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                    df[col] = df[col].astype(np.float16)
                elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float64)
    end_mem = df.memory_usage().sum() / 1024**2
    if verbose: print('Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction)'.format(end_mem, 100 * (start_mem - end_mem) / start_mem))
    return df

In [16]:
train_df = reduce_mem_usage(train_df)
test_df = reduce_mem_usage(test_df)

Mem. usage decreased to 318.20 Mb (68.6% reduction)
Mem. usage decreased to 272.54 Mb (68.4% reduction)


In [17]:
test_df= test_df.rename(columns=lambda x:"_".join(x.split("-")))
test_df.head()

Unnamed: 0,TransactionID,TransactionAmt,ProductCD,card1,card2,card3,card4,card5,card6,addr1,...,V312,V313,V314,V315,V316,V317,V318,V319,V320,V321
0,3663549,31.953125,W,10409,111.0,150.0,visa,226.0,debit,170.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,3663550,49.0,W,4272,111.0,150.0,visa,226.0,debit,299.0,...,77.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,3663551,171.0,W,4476,574.0,150.0,visa,226.0,debit,472.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,263.0,0.0
3,3663552,285.0,W,10989,360.0,150.0,visa,166.0,debit,205.0,...,0.0,282.5,282.5,282.5,0.0,0.0,0.0,0.0,0.0,0.0
4,3663553,67.9375,W,18018,452.0,150.0,mastercard,117.0,debit,264.0,...,67.949997,67.9375,183.875,67.9375,0.0,0.0,0.0,0.0,0.0,0.0


In [18]:
# set TransactionID as index
train_df.set_index('TransactionID', inplace=True)
test_df.set_index('TransactionID', inplace=True)

In [19]:
train_df.head()

Unnamed: 0_level_0,isFraud,TransactionAmt,ProductCD,card1,card2,card3,card4,card5,card6,addr1,...,V312,V313,V314,V315,V316,V317,V318,V319,V320,V321
TransactionID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2987000,0,68.5,W,13926,,150.0,discover,142.0,credit,315.0,...,0.0,0.0,0.0,0.0,0.0,117.0,0.0,0.0,0.0,0.0
2987001,0,29.0,W,2755,404.0,150.0,mastercard,102.0,credit,325.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2987002,0,59.0,W,4663,490.0,150.0,visa,166.0,debit,330.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2987003,0,50.0,W,18132,567.0,150.0,mastercard,117.0,debit,476.0,...,135.0,0.0,0.0,0.0,50.0,1404.0,790.0,0.0,0.0,0.0
2987004,0,50.0,H,4497,514.0,150.0,mastercard,102.0,credit,420.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [20]:
#Function to map numerical features to log scale to decrease their range and the huge differences between them (feature smoothing)
def scale_numerical_features(data):
    return np.log10(data + 1e-9)

In [21]:
#define numerical and categorical cols of the data
cat_cols_dict = {'cat_cols': 'ProductCD,card1,card2,card3,card4,card5,card6,addr1,addr2,P_emaildomain,R_emaildomain,M1,M2,M3,M4,M5,M6,M7,M8,M9,DeviceType,DeviceInfo,id_12,id_13,id_14,id_15,id_16,id_17,id_18,id_19,id_20,id_21,id_22,id_23,id_24,id_25,id_26,id_27,id_28,id_29,id_30,id_31,id_32,id_33,id_34,id_35,id_36,id_37,id_38'}
num_cols_dict = {'num_cols': 'TransactionAmt,dist1,dist2,C1,C2,C3,C4,C5,C6,C7,C8,C9,C10,C11,C12,C13,C14,D1,D2,D3,D4,D5,D6,D7,D8,D9,D10,D11,D12,D13,D14,D15,id_01,id_02,id_03,id_04,id_05,id_06,id_07,id_08,id_09,id_10,id_11,V1,V2,V3,V4,V5,V6,V7,V8,V9,V10,V11,V12,V13,V14,V15,V16,V17,V18,V19,V20,V21,V22,V23,V24,V25,V26,V27,V28,V29,V30,V31,V32,V33,V34,V35,V36,V37,V38,V39,V40,V41,V42,V43,V44,V45,V46,V47,V48,V49,V50,V51,V52,V53,V54,V55,V56,V57,V58,V59,V60,V61,V62,V63,V64,V65,V66,V67,V68,V69,V70,V71,V72,V73,V74,V75,V76,V77,V78,V79,V80,V81,V82,V83,V84,V85,V86,V87,V88,V89,V90,V91,V92,V93,V94,V95,V96,V97,V98,V99,V100,V101,V102,V103,V104,V105,V106,V107,V108,V109,V110,V111,V112,V113,V114,V115,V116,V117,V118,V119,V120,V121,V122,V123,V124,V125,V126,V127,V128,V129,V130,V131,V132,V133,V134,V135,V136,V137,V138,V139,V140,V141,V142,V143,V144,V145,V146,V147,V148,V149,V150,V151,V152,V153,V154,V155,V156,V157,V158,V159,V160,V161,V162,V163,V164,V165,V166,V167,V168,V169,V170,V171,V172,V173,V174,V175,V176,V177,V178,V179,V180,V181,V182,V183,V184,V185,V186,V187,V188,V189,V190,V191,V192,V193,V194,V195,V196,V197,V198,V199,V200,V201,V202,V203,V204,V205,V206,V207,V208,V209,V210,V211,V212,V213,V214,V215,V216,V217,V218,V219,V220,V221,V222,V223,V224,V225,V226,V227,V228,V229,V230,V231,V232,V233,V234,V235,V236,V237,V238,V239,V240,V241,V242,V243,V244,V245,V246,V247,V248,V249,V250,V251,V252,V253,V254,V255,V256,V257,V258,V259,V260,V261,V262,V263,V264,V265,V266,V267,V268,V269,V270,V271,V272,V273,V274,V275,V276,V277,V278,V279,V280,V281,V282,V283,V284,V285,V286,V287,V288,V289,V290,V291,V292,V293,V294,V295,V296,V297,V298,V299,V300,V301,V302,V303,V304,V305,V306,V307,V308,V309,V310,V311,V312,V313,V314,V315,V316,V317,V318,V319,V320,V321,V322,V323,V324,V325,V326,V327,V328,V329,V330,V331,V332,V333,V334,V335,V336,V337,V338,V339'}

In [22]:
#Encode categorical features of the data using label encoding
for col in cat_cols_dict['cat_cols'].split(','):
    if col in train_df:
        lbl = LabelEncoder()
        lbl.fit(list(train_df[col].values))
        train_df[col] = lbl.transform(list(train_df[col].values))

In [23]:
train_df.head()

Unnamed: 0_level_0,isFraud,TransactionAmt,ProductCD,card1,card2,card3,card4,card5,card6,addr1,...,V312,V313,V314,V315,V316,V317,V318,V319,V320,V321
TransactionID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2987000,0,68.5,4,10095,500,42,1,38,1,166,...,0.0,0.0,0.0,0.0,0.0,117.0,0.0,0.0,0.0,0.0
2987001,0,29.0,4,1372,303,42,2,2,1,173,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2987002,0,59.0,4,2833,389,42,4,58,2,178,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2987003,0,50.0,4,13341,466,42,2,14,2,282,...,135.0,0.0,0.0,0.0,50.0,1404.0,790.0,0.0,0.0,0.0
2987004,0,50.0,1,2712,413,42,2,2,1,241,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [43]:
#Define features (x) and labels (y) to train the model
x = train_df.drop('isFraud', axis=1)
y = train_df['isFraud'].astype('uint8')

In [44]:
num_cols_filtered = []
for col in num_cols_dict['num_cols'].split(','):
    if col in x:
        num_cols_filtered.append(col)

In [45]:
#scale all numerical features to log scale to smooth the feature set
feature_processor = make_column_transformer((
                FunctionTransformer(scale_numerical_features),
                num_cols_filtered
            ),
            remainder='passthrough'
        )

In [46]:
feature_processor.fit(x)
#replace NaN values with 0
features = np.nan_to_num(feature_processor.transform(x), nan=0.)
features_corrected = th.nan_to_num(th.from_numpy(features.astype('float32')), nan=0)
x = features_corrected

  result = func(self.values, **kwargs)
  result = func(self.values, **kwargs)
  result = func(self.values, **kwargs)
  result = func(self.values, **kwargs)
  features_corrected = th.nan_to_num(th.from_numpy(features.astype('float32')), nan=0)


In [47]:
#Split train data into train and test splits with ratios [80%, 20%]
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=.2, random_state=42, stratify=y)
X_train.shape

torch.Size([472432, 223])

In [65]:
#Decision tree classifier
dt_clf = DecisionTreeClassifier(random_state=0)
dt_clf.fit(X_train, y_train)
pred_dt = dt_clf.predict_proba(X_test)[:,1]
print(f'AUC score: {roc_auc_score(y_test, pred_dt):.5f}')
print(f'Accuracy score: {accuracy_score(y_test, np.round(pred_dt)):.5f}')

AUC score: 0.78150
Accuracy score: 0.96728


In [69]:
#Naive Bayes classifier
gnb_clf = GaussianNB()
gnb_clf.fit(X_train, y_train)
pred_gnb = gnb_clf.predict(X_test)
print(f'AUC score: {roc_auc_score(y_test, pred_gnb):.5f}')
print(f'Accuracy score: {accuracy_score(y_test, np.round(pred_gnb)):.5f}')

  arrmean = umr_sum(arr, axis, dtype, keepdims=True, where=where)
  ret = umr_sum(arr, axis, dtype, out, keepdims, where=where)
  x = um.multiply(x, x, out=x)
  n_ij -= 0.5 * np.sum(((X - self.theta_[i, :]) ** 2) / (self.var_[i, :]), 1)


AUC score: 0.50000
Accuracy score: 0.96501


In [70]:
#Random forest classifier
rf_clf = RandomForestClassifier(random_state=30)
rf_clf.fit(X_train, y_train)
pred_rf = model_rf_down.predict_proba(X_test)[:, 1]
print(f'ROC-AUC score: {roc_auc_score(y_test, pred_rf):.5f}')
print(f'Accuracy score: {accuracy_score(y_test, np.round(pred_rf)):.5f}')

ROC-AUC score: 0.92312
Accuracy score: 0.97891


In [78]:
#Gradient boosting classifier
gb_clf = GradientBoostingClassifier(n_estimators=15, learning_rate=0.2, max_depth=10, random_state=0)
gb_clf.fit(X_train, y_train)
pred_gb = gb_clf.predict_proba(X_test)[:, 1]
print(f'ROC-AUC score: {roc_auc_score(y_test, pred_gb):.5f}')
print(f'Accuracy score: {accuracy_score(y_test, np.round(pred_gb)):.5f}')

ROC-AUC score: 0.91497
Accuracy score: 0.97805


In [77]:
#AdaBoost classifier
ab_clf = AdaBoostClassifier(n_estimators=30, learning_rate=0.2, random_state=0)
ab_clf.fit(X_train, y_train)
pred_ab = ab_clf.predict_proba(X_test)[:, 1]
print(f'ROC-AUC score: {roc_auc_score(y_test, pred_ab):.5f}')
print(f'Accuracy score: {accuracy_score(y_test, np.round(pred_ab)):.5f}')

ROC-AUC score: 0.84271
Accuracy score: 0.96623


In [79]:
#XGBoost classifier
xgb_clf = XGBClassifier(seed=42, n_estimators=500, max_depth=10, learning_rate=0.2)
xgb_clf.fit(X_train, y_train)
pred_xgb = xgb_clf.predict_proba(X_test)[:, 1]
print(f'ROC-AUC score: {roc_auc_score(y_test, pred_xgb):.5f}')
print(f'Accuracy score: {accuracy_score(y_test, np.round(pred_xgb)):.5f}')

ROC-AUC score: 0.96917
Accuracy score: 0.98583


In [80]:
#Catboost classifier
cb_clf = CatBoostClassifier(n_estimators=500, learning_rate=0.2, max_depth=10, random_state=30)
cb_clf.fit(X_train.tolist(), y_train.tolist(), logging_level='Silent')
pred_cb = cb_clf.predict_proba(X_test.tolist())[:, 1]
print(f'ROC-AUC score: {roc_auc_score(y_test.tolist(), pred_cb):.5f}')
print(f'Accuracy score: {accuracy_score(y_test, np.round(pred_cb)):.5f}')

ROC-AUC score: 0.95857
Accuracy score: 0.98379


In [85]:
# The final predictions could be a weighted average of the predictions of the the best two performing models(xgboost, catboost)
#A higher weight will be given to xgboost as it slightly outperformed catboost

pred_final = (0.9*pred_xgb + 0.1*pred_cb)
print(f'ROC-AUC score: {roc_auc_score(y_test, pred_final):.5f}')
print(f'Accuracy score: {accuracy_score(y_test, np.round(pred_final)):.5f}')

ROC-AUC score: 0.97010
Accuracy score: 0.98578
