## Fraud detection with Tensorflow

### import modules

In [1]:
#  Libraries
import numpy as np 
import pandas as pd 

# Suppress warning
import warnings
warnings.filterwarnings("ignore")

# Data processing, metrics and modeling
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split, StratifiedKFold,KFold
from datetime import datetime
from sklearn.metrics import precision_score, recall_score, confusion_matrix, accuracy_score, roc_auc_score, f1_score, roc_curve, auc,precision_recall_curve
from sklearn import metrics
from sklearn import preprocessing

import itertools
from scipy import interp

# Plots
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
from matplotlib import rcParams

### read from files

In [2]:
data = pd.read_csv('newtrain_transaction_200000.csv', index_col='TransactionID')
test = pd.read_csv('newtest_transaction_withoutlabel.csv', index_col='TransactionID')
train_identity = pd.read_csv('newtrain_identity.csv', index_col='TransactionID')
test_identity = pd.read_csv('newtest_identity.csv', index_col='TransactionID')
pred_df = pd.read_csv('newsample_submission.csv', index_col='TransactionID')


### Separate data with labels

In [3]:
labels=data['isFraud'].copy()
data=data.drop(['isFraud'],axis=1)

In [4]:
print(data.shape)
print(test.shape)

(80397, 392)
(10000, 392)


In [5]:
#preserve Transaction AMT as weight
# import torch
# weight=data['TransactionAmt'].copy()
# weight=weight/100
# print(weight.max())
# print(weight.min())
# weight=weight.to_list()
# weight=torch.Tensor(weight)
# print(type(weight))

### Drop columns

In [6]:
#Drop all the V***
test = test.iloc[:, :53]
print(test.shape)
data = data.iloc[:, :53]
print(data.shape)

(10000, 53)
(80397, 53)


In [7]:
#Drop D1 to D15
drop_list=['D1','D2','D3','D4','D5','D6','D7','D8','D9','D10','D11','D12','D13','D14','D15']
for col in drop_list:
    data=data.drop([col],axis=1)
    test=test.drop([col],axis=1)
print(data.shape)
print(test.shape)

(80397, 38)
(10000, 38)


In [8]:
#Drop identity columns with high level NANs
train_NA = train_identity.isna().sum(axis=0)
test_NA = test_identity.isna().sum(axis=0)
for indexs in train_NA.index:
    if train_NA[indexs]>40000: 
        train_identity=train_identity.drop([indexs],axis=1)
        test_identity=test_identity.drop([indexs],axis=1)
# print(train_NA)
data = data.merge(train_identity, how='left', left_index=True, right_index=True)
test = test.merge(test_identity, how='left', left_index=True, right_index=True)
print(data.shape)
print(test.shape)

(80397, 59)
(10000, 59)


### Numerical and Categorical

In [9]:
categorical = [
    'ProductCD',
    'card1', 'card2', 'card3', 'card4', 'card5', 'card6',
    'addr1', 'addr2',
    'P_emaildomain',
    'R_emaildomain',
    'M1', 'M2', 'M3', 'M4', 'M5', 'M6', 'M7', 'M8', 'M9',
    'DeviceType','DeviceInfo',
    'id_12','id_13','id_14','id_15','id_16','id_17','id_18','id_19',
    'id_20','id_21','id_22','id_23','id_24','id_25','id_26','id_27','id_28','id_29',
    'id_30','id_31','id_32','id_33','id_34','id_35','id_36','id_37','id_38'
]

categorical = [col for col in categorical if col in data.columns]
numerical=[]
for col in data.columns:
    if not(col in categorical): numerical.append(col)
# print(numerical)
# print(categorical)

## Data Processing

### Numerical scaling
For continuous right-skewed features we wil apply log-transform, so that will make them look more like normal distributed.

In [10]:
# class ContinuousFeatureConverter:
#     def __init__(self, name, feature, log_transform):
#         self.name = name
#         self.skew = feature.skew()
#         self.log_transform = log_transform
        
#     def transform(self, feature):
#         if self.skew > 1:
#             feature = self.log_transform(feature)
        
#         mean = feature.mean()
#         std = feature.std()
#         return (feature - mean)/(std + 1e-6)    

In [11]:
from sklearn.preprocessing import StandardScaler
for column in numerical:
#     print(data[column])
    scaler = StandardScaler()
    if data[column].max() > 100 and data[column].min() >= 0:
        data[column] = np.log1p(data[column])
        test[column] = np.log1p(test[column])
    scaler.fit(np.concatenate([data[column].values.reshape(-1,1), test[column].values.reshape(-1,1)]))
    data[column] = scaler.transform(data[column].values.reshape(-1,1))
    test[column] = scaler.transform(test[column].values.reshape(-1,1))

### Fill nan (with 0: not the optimal)

In [12]:
def nan2mean(df):
    for x in list(df.columns.values):
        if x in numerical:
            df[x] = df[x].fillna(0)
    return df

data=nan2mean(data)
test=nan2mean(test)
# print(data)

### Label encoding for categorical data

In [13]:
# Label Encoding
category_counts = {}
for f in categorical:
    data[f] = data[f].replace("nan", "other")
    data[f] = data[f].replace(np.nan, "other")
    test[f] = test[f].replace("nan", "other")
    test[f] = test[f].replace(np.nan, "other")
    lbl = preprocessing.LabelEncoder()
    lbl.fit(list(data[f].values) + list(test[f].values))
    data[f] = lbl.transform(list(data[f].values))
    test[f] = lbl.transform(list(test[f].values))
    category_counts[f] = len(list(lbl.classes_)) + 1

## Splitting the training and validation set

In [14]:
X_train,X_test,y_train,y_test=train_test_split(data,labels,test_size=0.2, random_state=13)
# print(X_train)

## Building a neural network

In [15]:
#Import
from keras import models
from keras import layers
import numpy as np
from sklearn.utils import class_weight
from keras import regularizers

network=models.Sequential()
network.add(layers.Dense(200, activation='relu', input_shape=(59,)))
# network.add(layers.Dropout(0.1))
network.add(layers.Dense(100, activation='relu'))
network.add(layers.Dense(60, activation='relu'))
# network.add(layers.Dropout(0.1))
network.add(layers.Dense(1, activation='sigmoid'))

In [24]:
# # Use binary crossentropy loss
# network.compile(optimizer='rmsprop',
#               loss='binary_crossentropy',
#               metrics=['accuracy'])

#define a new loss function
from keras import backend as K
import tensorflow as tf

# def new_loss(y_true,y_pred):
#     loss=weight*K.binary_crossentropy(y_pred, y_true)
#     return K.mean(loss, axis=-1)

loss=[]
# for i in range(10,101,10):
network.compile(optimizer='Adagrad',loss='binary_crossentropy',
            metrics=['accuracy'])
network.fit(X_train, y_train, epochs=63, batch_size=128)
test_loss, test_acc = network.evaluate(X_test, y_test)
loss.append(test_loss)
    
# x=range(10,101,10)
# plt.figure(figsize=(10, 6))
# plt.plot(x, loss)
# plt.show();


Epoch 1/63
Epoch 2/63
Epoch 3/63
Epoch 4/63
Epoch 5/63
Epoch 6/63
Epoch 7/63
Epoch 8/63
Epoch 9/63
Epoch 10/63
Epoch 11/63
Epoch 12/63
Epoch 13/63
Epoch 14/63
Epoch 15/63
Epoch 16/63
Epoch 17/63
Epoch 18/63
Epoch 19/63
Epoch 20/63
Epoch 21/63
Epoch 22/63
Epoch 23/63
Epoch 24/63
Epoch 25/63
Epoch 26/63
Epoch 27/63
Epoch 28/63
Epoch 29/63
Epoch 30/63
Epoch 31/63
Epoch 32/63
Epoch 33/63
Epoch 34/63
Epoch 35/63
Epoch 36/63
Epoch 37/63
Epoch 38/63
Epoch 39/63
Epoch 40/63
Epoch 41/63
Epoch 42/63
Epoch 43/63
Epoch 44/63
Epoch 45/63
Epoch 46/63
Epoch 47/63
Epoch 48/63
Epoch 49/63
Epoch 50/63
Epoch 51/63
Epoch 52/63
Epoch 53/63
Epoch 54/63
Epoch 55/63
Epoch 56/63
Epoch 57/63
Epoch 58/63
Epoch 59/63
Epoch 60/63
Epoch 61/63
Epoch 62/63
Epoch 63/63
Epoch 1/63
Epoch 2/63
Epoch 3/63
Epoch 4/63
Epoch 5/63
Epoch 6/63
Epoch 7/63
Epoch 8/63
Epoch 9/63
Epoch 10/63
Epoch 11/63
Epoch 12/63
Epoch 13/63


Epoch 14/63
Epoch 15/63
Epoch 16/63
Epoch 17/63
Epoch 18/63
Epoch 19/63
Epoch 20/63
Epoch 21/63
Epoch 22/63
Epoch 23/63
Epoch 24/63
Epoch 25/63
Epoch 26/63
Epoch 27/63
Epoch 28/63
Epoch 29/63
 632/8040 [=>............................] - ETA: 21s - loss: 0.0571 - accuracy: 0.9836

KeyboardInterrupt: 

In [21]:
test_loss, test_acc = network.evaluate(X_test, y_test)
print('test_acc:', test_acc)

test_acc: 0.9779228568077087


## The loss calculation for homework2

In [23]:
def compute_loss(df_pred, df_true):
    fraud_set = set(df_true[df_true['isFraud']==1]['TransactionID'])
    print('total fraud',len(fraud_set))
    cutoff = df_pred['isFraud'].nlargest(1000).iloc[-1]
    print('cutoff',cutoff)
    pass_set = set(df_pred[df_pred['isFraud']<cutoff]['TransactionID'])
    fraud_miss = fraud_set & pass_set
    df_fraud_miss = df_true[df_true['TransactionID'].isin(fraud_miss)]
    print('missed_fraud',df_fraud_miss.shape[0])
    loss = df_fraud_miss['TransactionAmt'].sum()
    print('mean missed fraud amt',loss/df_fraud_miss.shape[0])
    return loss   

answer=pd.read_csv('newtest_transaction.csv')
test_answer=answer['isFraud'].copy()
test_loss, test_acc = network.evaluate(test, test_answer)
print(test_acc)

pred=network.predict(test,batch_size = 2000, verbose = True)
pred_df['isFraud']=pred
pred_df=pred_df.reset_index()
pred_df.to_csv('newsample_submission.csv')

print(pred_df)
# print(answer)
compute_loss(pred_df,answer)

0.9337999820709229
      TransactionID  Unnamed: 0  Unnamed: 0.1  Unnamed: 0.1.1  \
0           2987022           0             0               0   
1           2987100           1             1               1   
2           2987104           2             2               2   
3           2987148           3             3               3   
4           2987288           4             4               4   
...             ...         ...           ...             ...   
9995        3577009        9995          9995            9995   
9996        3577027        9996          9996            9996   
9997        3577070        9997          9997            9997   
9998        3577183        9998          9998            9998   
9999        3577465        9999          9999            9999   

      Unnamed: 0.1.1.1  level_0  Unnamed: 0.1.1.1.1  index   isFraud  
0                    0        0                   0      0  0.000001  
1                    1        1                   1      1

35369.235