In [1]:
import pandas as pd
import numpy as np

In [2]:
folder_path = 'data/'
train_identity = pd.read_csv(f'{folder_path}train_identity.csv')
train_transaction = pd.read_csv(f'{folder_path}train_transaction.csv')
test_identity = pd.read_csv(f'{folder_path}test_identity.csv')
test_transaction = pd.read_csv(f'{folder_path}test_transaction.csv')
sub = pd.read_csv(f'{folder_path}sample_submission.csv')
# let's combine the data and work with the whole dataset
train = pd.merge(train_transaction, train_identity, on='TransactionID', how='left')
test = pd.merge(test_transaction, test_identity, on='TransactionID', how='left')

In [3]:
train.head()


Unnamed: 0,TransactionID,isFraud,TransactionDT,TransactionAmt,ProductCD,card1,card2,card3,card4,card5,...,id_31,id_32,id_33,id_34,id_35,id_36,id_37,id_38,DeviceType,DeviceInfo
0,2987000,0,86400,68.5,W,13926,,150.0,discover,142.0,...,,,,,,,,,,
1,2987001,0,86401,29.0,W,2755,404.0,150.0,mastercard,102.0,...,,,,,,,,,,
2,2987002,0,86469,59.0,W,4663,490.0,150.0,visa,166.0,...,,,,,,,,,,
3,2987003,0,86499,50.0,W,18132,567.0,150.0,mastercard,117.0,...,,,,,,,,,,
4,2987004,0,86506,50.0,H,4497,514.0,150.0,mastercard,102.0,...,samsung browser 6.2,32.0,2220x1080,match_status:2,T,F,T,T,mobile,SAMSUNG SM-G892A Build/NRD90M


In [4]:
# drop columns 

one_value_cols = [col for col in train.columns if train[col].nunique() <= 1]
one_value_cols_test = [col for col in test.columns if test[col].nunique() <= 1]
one_value_cols == one_value_cols_test

many_null_cols = [col for col in train.columns if train[col].isnull().sum() / train.shape[0] > 0.9]
many_null_cols_test = [col for col in test.columns if test[col].isnull().sum() / test.shape[0] > 0.9]

big_top_value_cols = [col for col in train.columns if train[col].value_counts(dropna=False, normalize=True).values[0] > 0.9]
big_top_value_cols_test = [col for col in test.columns if test[col].value_counts(dropna=False, normalize=True).values[0] > 0.9]

cols_to_drop = list(set(many_null_cols + many_null_cols_test + big_top_value_cols + big_top_value_cols_test + one_value_cols+ one_value_cols_test))
cols_to_drop.remove('isFraud')
len(cols_to_drop)

82

In [5]:
train = train.drop(cols_to_drop, axis=1)
test = test.drop(cols_to_drop, axis=1)

In [6]:
X = train.sort_values('TransactionDT').drop(['isFraud', 'TransactionDT', 'TransactionID'], axis=1)
y = train.sort_values('TransactionDT')['isFraud']
#X_test = test.sort_values('TransactionDT').drop(['TransactionDT', 'TransactionID'], axis=1)
X_test = test.drop(['TransactionDT', 'TransactionID'], axis=1)
del train
test = test[["TransactionDT", 'TransactionID']]

In [7]:
# clean

# by https://www.kaggle.com/dimartinot
def clean_inf_nan(df):
    return df.replace([np.inf, -np.inf], np.nan)   

# Cleaning infinite values to NaN
X = clean_inf_nan(X)
X_test = clean_inf_nan(X_test )

In [8]:
cols_to_keep = [
   'card1',
   'card2',
    'card5',
    'addr1',
    'dist1',
    'D2',
    'D4',
    'C13',
    'D10',
    'D1',
    'id_20',
    'D8',
    'C1',    
]

FILTERED_X = X[cols_to_keep]
FILTERED_X_test = X_test[cols_to_keep]
del X
del X_test
list(FILTERED_X.columns.values) 

['card1',
 'card2',
 'card5',
 'addr1',
 'dist1',
 'D2',
 'D4',
 'C13',
 'D10',
 'D1',
 'id_20',
 'D8',
 'C1']

In [9]:
# supervised neural network
# https://scikit-learn.org/stable/modules/neural_networks_supervised.html

from sklearn.neural_network import MLPClassifier


In [23]:
params = {'solver':'adam', 
          'learning_rate':'adaptive'
         }
FILTERED_X = clean_inf_nan(FILTERED_X)


In [14]:
FILTERED_X = FILTERED_X.fillna(0)
FILTERED_X_test = FILTERED_X_test.fillna(0)

FILTERED_X.head()

Unnamed: 0,card1,card2,card5,addr1,dist1,D2,D4,C13,D10,D1,id_20,D8,C1
0,13926,0.0,142.0,315.0,19.0,0.0,0.0,1.0,13.0,14.0,0.0,0.0,1.0
1,2755,404.0,102.0,325.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0
2,4663,490.0,166.0,330.0,287.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0
3,18132,567.0,117.0,476.0,0.0,112.0,94.0,25.0,84.0,112.0,0.0,0.0,2.0
4,4497,514.0,102.0,420.0,0.0,0.0,0.0,1.0,0.0,0.0,144.0,0.0,1.0


In [24]:
clf = MLPClassifier(**params)
clf.fit(FILTERED_X, y)  

MLPClassifier(activation='relu', alpha=0.0001, batch_size='auto', beta_1=0.9,
              beta_2=0.999, early_stopping=False, epsilon=1e-08,
              hidden_layer_sizes=(100,), learning_rate='adaptive',
              learning_rate_init=0.001, max_iter=200, momentum=0.9,
              n_iter_no_change=10, nesterovs_momentum=True, power_t=0.5,
              random_state=None, shuffle=True, solver='adam', tol=0.0001,
              validation_fraction=0.1, verbose=False, warm_start=False)

In [25]:
results = clf.predict_proba(FILTERED_X_test)

In [40]:
test["isFraud"] = [row[1] for row in results]

In [41]:
test.head()

Unnamed: 0,TransactionDT,TransactionID,isFraud
0,18403224,3663549,0.006278
1,18403263,3663550,0.01459
2,18403310,3663551,0.041751
3,18403310,3663552,0.024716
4,18403317,3663553,0.02798


In [43]:
test.to_csv('output.csv', index=False, columns = ["TransactionID","isFraud"])