# CS4035 - Cyber Data Analytics
## Lab 1 - Fraud data

## Group Number : 25

## Student 1 
### Name : Aditya Kunar
### ID : 5074274

## Student 2
### Name : Anwesh Marwade
### ID : 5052068

### Importing necessary libraries

In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder
from pandas.plotting import register_matplotlib_converters
register_matplotlib_converters()
import re
from random import randint
import random
from sklearn.preprocessing import LabelEncoder
from sklearn.neighbors import NearestNeighbors
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score
from sklearn import metrics
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix

### Read the Data

In [2]:
# Location of the transactions CSV, relative to the working directory.
data = "data_for_student_case.csv" #modify the path here for wherever the data is.
# Load the raw transactions into a DataFrame.
df1 = pd.read_csv(data)

### A look at the data

In [3]:
# Preview the first rows of the raw data.
df1.head()

Unnamed: 0,txid,bookingdate,issuercountrycode,txvariantcode,bin,amount,currencycode,shoppercountrycode,shopperinteraction,simple_journal,cardverificationcodesupplied,cvcresponsecode,creationdate,accountcode,mail_id,ip_id,card_id
0,1,2015-11-09 14:26:51,MX,mccredit,530056.0,64800.0,MXN,MX,Ecommerce,Chargeback,True,0,2015-07-01 23:03:11,MexicoAccount,email68370,ip111778,card184798
1,2,2015-11-09 14:27:38,MX,mccredit,547046.0,44900.0,MXN,MX,Ecommerce,Chargeback,True,0,2015-07-02 04:50:55,MexicoAccount,email101299,ip78749,card151595
2,3,2015-11-23 16:34:16,MX,mccredit,528843.0,149900.0,MXN,MX,Ecommerce,Chargeback,True,0,2015-07-02 14:30:28,MexicoAccount,email278604,ip70594,card242142
3,4,2015-11-23 16:34:51,MX,mccredit,547146.0,109900.0,MXN,MX,Ecommerce,Chargeback,True,0,2015-07-03 07:53:37,MexicoAccount,email47409,ip113648,card181744
4,5,2015-11-09 14:26:08,MX,visaclassic,477291.0,89900.0,MXN,MX,Ecommerce,Chargeback,True,0,2015-07-08 18:35:35,MexicoAccount,email205501,ip83553,card97271


### Data Preprocessing

In [4]:
# Currency conversion, so that amounts in different currencies are comparable.
def conv(row):
    """Return the row's amount divided by (conversion rate * 100).

    ``row`` must support ``row['amount']`` (numeric) and
    ``row['currencycode']`` (an integer code 0-4). A code without a rate
    raises ``KeyError``.
    """
    # Rates are keyed by the integer code stored in 'currencycode'.
    # NOTE(review): if these codes come from a label encoder over
    # alphabetically sorted currencies (e.g. AUD, GBP, ...), the rates for
    # codes 0 and 1 look swapped -- confirm against the encoder's classes.
    rate_by_code = {
        0: 0.86248,
        1: 1.5911,
        2: 21.2829,
        3: 1.6805,
        4: 10.635,
    }
    rate = rate_by_code[row['currencycode']]
    # The factor 100 presumably turns minor units (cents) into major units
    # -- TODO confirm against the data description.
    return row['amount'] / (rate * 100)

def preprocess(df):
    """Clean, encode and feature-engineer the raw transaction frame.

    Steps, in order:
      1. drop rows labelled 'Refused' and relabel 'Chargeback' -> 1,
         'Settled' -> 0 in ``simple_journal``;
      2. replace missing values with sentinel defaults;
      3. collapse ``cvcresponsecode`` values above 2 into 3;
      4. label-encode the categorical columns (issuer and shopper country
         share one encoder so equal countries get equal codes);
      5. parse the timestamps and derive ``date``, ``countries_equal``,
         ``day_of_week``, ``hour`` and ``creationdate_Unix``;
      6. add ``amount_eur`` via ``conv`` and drop label-0 rows whose
         ``amount_eur`` exceeds 320.

    Parameters:
        df: raw transactions as a pandas DataFrame; it is not modified.

    Returns:
        A new DataFrame with the encoded and derived columns.
    """
    # Filter into an explicit copy: the caller's frame stays untouched and the
    # in-place .loc assignments below never write into a slice of another frame.
    df1 = df.loc[df['simple_journal'] != 'Refused'].copy() #removing the unknown class.
    df1.loc[df1['simple_journal'] == 'Chargeback', 'simple_journal'] = 1 #fraud
    df1.loc[df1['simple_journal'] == 'Settled', 'simple_journal'] = 0 #Normal

    #Dealing with na values by filling them with default values.
    df1.loc[df1['cardverificationcodesupplied'].isna(),'cardverificationcodesupplied'] = False
    df1.loc[df1['issuercountrycode'].isna(),'issuercountrycode'] = 'ZZ'
    df1.loc[df1['shoppercountrycode'].isna(),'shoppercountrycode'] = 'ZZ'
    df1.loc[df1['mail_id'].str.contains('na',case=False),'mail_id'] = 'email99999'
    #3-6 is just 3 for cvcresponsecode.
    df1.loc[df1['cvcresponsecode'] > 2,'cvcresponsecode'] = 3

    #Encoding the categorical variables.
    # 'bin' is fitted on its unique values, exactly as before.
    bin_enc = LabelEncoder().fit(df1['bin'].unique())
    df1['bin'] = bin_enc.transform(df1['bin'])

    # Both country columns share a single encoder fitted on the union of their
    # values, so the same country maps to the same integer in either column.
    country_enc = LabelEncoder().fit(
        np.append(df1['issuercountrycode'].unique(),
                  df1['shoppercountrycode'].unique()))
    df1['issuercountrycode'] = country_enc.transform(df1['issuercountrycode'])
    df1['shoppercountrycode'] = country_enc.transform(df1['shoppercountrycode'])

    # The remaining categoricals each get their own independent encoder.
    # 'currencycode' must be encoded before conv() runs below, because conv
    # looks the rate up by the encoded integer.
    for column in ('card_id', 'ip_id', 'txvariantcode', 'currencycode',
                   'shopperinteraction', 'accountcode',
                   'cardverificationcodesupplied', 'mail_id'):
        df1[column] = LabelEncoder().fit_transform(df1[column])

    df1['creationdate'] = pd.to_datetime(df1['creationdate'])
    df1['date'] = df1['creationdate'].dt.date
    df1['bookingdate'] = pd.to_datetime(df1['bookingdate'])

    # 1 when shopper and issuer country match, else 0. Casting the boolean
    # comparison directly avoids writing ints into a bool column through .loc.
    df1['countries_equal'] = (
        df1['shoppercountrycode'] == df1['issuercountrycode']).astype(int)

    df1['day_of_week'] = df1['creationdate'].dt.dayofweek
    df1['hour'] = df1['creationdate'].dt.hour

    # Creation time as whole seconds since the epoch (integer-divides the
    # int64 representation of the timestamps by 10**9).
    dates = pd.DatetimeIndex(df1['creationdate'])
    df1['creationdate_Unix'] = dates.astype(np.int64) // 10**9

    df1['amount_eur'] = df1.apply(conv, axis=1)
    # Drop non-fraud rows above 320 in converted amount.
    # NOTE(review): the rationale for the 320 cut-off is not visible here.
    df1 = df1.loc[~((df1['simple_journal'] == 0)&(df1['amount_eur']>320))]

    return df1

In [5]:
def smote(n,k,df,y):
    """Oversample the fraud class by interpolation, then prune the result.

    For every fraud row, ``n`` of its nearest fraud neighbours (positions
    1..k-1 of a k-neighbour query; position 0 is skipped) are drawn at random
    and one synthetic row is placed at a random point on the segment between
    the row and each drawn neighbour. Synthetic rows whose nearest *other*
    point, among synthetic and non-fraud rows, is a non-fraud row are then
    discarded.

    Parameters:
        n: synthetic rows to create per fraud row; must satisfy n <= k - 1.
        k: number of neighbours queried per fraud row.
        df: feature DataFrame; it is not modified.
        y: labels aligned row by row with ``df`` (1 = fraud, 0 = non-fraud).

    Returns:
        DataFrame holding the kept synthetic rows, the original fraud rows and
        the non-fraud rows (in that order), including a ``simple_journal``
        label column. Every column except ``amount_eur`` is cast to int.

    Raises:
        ValueError: from ``random.sample`` when ``n`` exceeds ``k - 1``.
    """
    labelled = df.copy(deep=True)
    labelled["simple_journal"] = y

    df_fraud = labelled.loc[labelled["simple_journal"] == 1]
    df_notfraud = labelled.loc[labelled["simple_journal"] == 0]
    fraud_mat = df_fraud.values

    nbrs = NearestNeighbors(n_neighbors=k, algorithm='brute').fit(fraud_mat)
    _, indices = nbrs.kneighbors(fraud_mat)

    # Interpolate between each fraud row and n randomly chosen neighbours.
    synthetic_rows = []
    for i, base in enumerate(fraud_mat):
        for j in random.sample(range(1, k), n):
            neighbour = fraud_mat[indices[i][j]]
            synthetic_rows.append(base + random.uniform(0, 1) * (neighbour - base))
    synthetic = pd.DataFrame(synthetic_rows, columns=df_fraud.columns)
    n_syn = len(synthetic)

    # Pruning: in the stacked matrix, rows [0, n_syn) are synthetic and rows
    # [n_syn, ...) are non-fraud.
    pool = pd.concat([synthetic, df_notfraud], sort=False).values
    pruner = NearestNeighbors(n_neighbors=2, algorithm='auto').fit(pool)
    _, pool_idx = pruner.kneighbors(pool[:n_syn])

    # Each query row is normally its own first hit; when an exact duplicate
    # is returned first instead, that duplicate is the nearest other point.
    own_position = np.arange(n_syn)
    nearest_other = np.where(pool_idx[:, 0] == own_position,
                             pool_idx[:, 1], pool_idx[:, 0])
    # ">= n_syn" marks a non-fraud row; the earlier "> n_syn" test let the
    # first non-fraud row pass as synthetic.
    keep = nearest_other < n_syn
    needed_df = synthetic[keep]

    DF = pd.concat([needed_df, df_fraud, df_notfraud], sort=False)

    # Interpolation produced floats; everything but the amount is cast to int.
    DF = DF.astype({c: int for c in DF.columns if c != "amount_eur"})

    return DF

In [6]:
#Subsampling the non-positive class: all positives are kept and (k-1) other
#rows are drawn per positive, so positives make up 1/k of the result
#(k=10 gives 10% fraud and 90% non fraud).
def subsample(k,x,y):
    """Return a shuffled subsample with every positive and k-1 others each.

    Parameters:
        k: inverse of the desired positive share (total rows = k * positives).
        x: pandas DataFrame of features, indexed positionally via ``iloc``.
        y: array-like of labels aligned with ``x``; rows with label 1 are the
           positives. Lists, Series and arrays are all accepted.

    Returns:
        (X_train, y_train): the selected rows of ``x`` and the matching
        labels as a 1-D numpy array, in the same shuffled order.

    Raises:
        ValueError: from ``random.sample`` when fewer than ``(k-1) * positives``
        non-positive rows are available.
    """
    # Normalise once so indexing works for any array-like input; previously
    # the converted copy was discarded and the raw argument was indexed.
    labels = np.asarray(y).ravel()
    s = int(labels.sum())

    pos_idx = np.flatnonzero(labels == 1)
    neg_idx = np.flatnonzero(labels != 1)

    # Draw the non-positive rows without replacement.
    picked = random.sample(range(len(neg_idx)), int(s*k - s))
    order = list(neg_idx[np.asarray(picked, dtype=int)])
    order.extend(pos_idx)
    random.shuffle(order)

    X_train = x.iloc[order]
    y_train = labels[order]
    return X_train,y_train

In [7]:
# Run the full cleaning / encoding / feature pipeline on the raw frame.
df = preprocess(df1)

In [8]:
# Preview the first rows of the preprocessed data.
df.head()

Unnamed: 0,txid,bookingdate,issuercountrycode,txvariantcode,bin,amount,currencycode,shoppercountrycode,shopperinteraction,simple_journal,...,accountcode,mail_id,ip_id,card_id,date,countries_equal,day_of_week,hour,creationdate_Unix,amount_eur
0,1,2015-11-09 14:26:51,104,2,1998,64800.0,2,104,1,1,...,1,187702,7386,73358,2015-07-01,1,2,23,1435791791,30.446979
1,2,2015-11-09 14:27:38,104,2,2364,44900.0,2,104,1,1,...,1,834,194109,44739,2015-07-02,1,3,4,1435812655,21.096749
2,3,2015-11-23 16:34:16,104,2,1965,149900.0,2,104,1,1,...,1,119287,189655,122802,2015-07-02,1,3,14,1435847428,70.432131
3,4,2015-11-23 16:34:51,104,2,2374,109900.0,2,104,1,1,...,1,173561,8254,70712,2015-07-03,1,4,7,1435910017,51.6377
4,5,2015-11-09 14:26:08,104,6,1301,89900.0,2,104,1,1,...,1,70647,196247,208481,2015-07-08,1,2,18,1436380535,42.240484


In [9]:
# Drop the columns that are not used as model inputs; the label column
# 'simple_journal' is kept and split off in the next step.
df2 = df.drop(columns=[
    'txid',
    'creationdate',
    'bookingdate',
    'amount',
    'date',
    'shopperinteraction',
    'cardverificationcodesupplied',
    'mail_id',
    'creationdate_Unix',
    'card_id',
    'countries_equal',
    'day_of_week',
    'currencycode',
])

In [10]:
# Feature matrix: every remaining column except the label.
X = df2.drop(columns="simple_journal").values
# Label vector taken from the 'simple_journal' column.
y = df2["simple_journal"].values

In [11]:
from sklearn.model_selection import StratifiedKFold

In [12]:
# Split the data into 10 stratified folds. Each stored fold is one 2-D array
# with the features in the leading columns and the label as the last column.
skf = StratifiedKFold(n_splits=10)
train_folds = []
test_folds = []
for train_index, test_index in skf.split(X, y):
    print("TRAIN:", train_index.shape, "TEST:", test_index.shape)
    # column_stack appends the 1-D label vector as an extra trailing column.
    train_folds.append(np.column_stack((X[train_index], y[train_index])))
    test_folds.append(np.column_stack((X[test_index], y[test_index])))

TRAIN: (211952,) TEST: (23551,)
TRAIN: (211952,) TEST: (23551,)
TRAIN: (211952,) TEST: (23551,)
TRAIN: (211953,) TEST: (23550,)
TRAIN: (211953,) TEST: (23550,)
TRAIN: (211953,) TEST: (23550,)
TRAIN: (211953,) TEST: (23550,)
TRAIN: (211953,) TEST: (23550,)
TRAIN: (211953,) TEST: (23550,)
TRAIN: (211953,) TEST: (23550,)


In [14]:
# Feature column names (df2 without the label), in the same order as the
# columns of X; used to rebuild DataFrames from the fold arrays.
df_temp = df2.drop(columns=["simple_journal"])
col = df_temp.columns
col

Index(['issuercountrycode', 'txvariantcode', 'bin', 'shoppercountrycode',
       'cvcresponsecode', 'accountcode', 'ip_id', 'hour', 'amount_eur'],
      dtype='object')

In [15]:
def _print_fold_stats(fold_train_X, fold_train_y, fold_test_X, fold_test_y):
    """Print train/test shapes followed by the train/test label sums."""
    print(fold_train_X.shape)
    print(fold_test_X.shape)
    print(fold_train_y.sum())
    print(fold_test_y.sum())


# For every fold: oversample the training part with smote(), then subsample
# it, printing the sizes after each stage. The test part is left untouched.
# NOTE(review): the resampled training data is only printed here; it is not
# stored per fold or passed to a model in this cell.
for k, l in zip(train_folds, test_folds):
    # The last column of each fold array is the label.
    train_X, train_y = k[:, :-1], np.ravel(k[:, -1:])
    test_X, test_y = l[:, :-1], np.ravel(l[:, -1:])
    _print_fold_stats(train_X, train_y, test_X, test_y)

    train_X = pd.DataFrame(train_X, columns=col)
    df_smote = smote(5, 6, train_X, train_y)
    train_X = df_smote.drop(columns="simple_journal")
    train_y = df_smote["simple_journal"]
    _print_fold_stats(train_X, train_y, test_X, test_y)

    train_X, train_y = subsample(5, train_X, np.array(train_y))
    _print_fold_stats(train_X, train_y, test_X, test_y)

    
    

(211952, 9)
(23551, 9)
310.0
35.0
(213057, 9)
(23551, 9)
1415
35.0
(7075, 9)
(23551, 9)
1415
35.0
(211952, 9)
(23551, 9)
310.0
35.0
(212870, 9)
(23551, 9)
1228
35.0
(6140, 9)
(23551, 9)
1228
35.0
(211952, 9)
(23551, 9)
310.0
35.0
(212845, 9)
(23551, 9)
1203
35.0
(6015, 9)
(23551, 9)
1203
35.0
(211953, 9)
(23550, 9)
310.0
35.0
(212850, 9)
(23550, 9)
1207
35.0
(6035, 9)
(23550, 9)
1207
35.0
(211953, 9)
(23550, 9)
310.0
35.0
(212873, 9)
(23550, 9)
1230
35.0
(6150, 9)
(23550, 9)
1230
35.0
(211953, 9)
(23550, 9)
311.0
34.0
(212859, 9)
(23550, 9)
1217
34.0
(6085, 9)
(23550, 9)
1217
34.0
(211953, 9)
(23550, 9)
311.0
34.0
(212854, 9)
(23550, 9)
1212
34.0
(6060, 9)
(23550, 9)
1212
34.0
(211953, 9)
(23550, 9)
311.0
34.0
(212851, 9)
(23550, 9)
1209
34.0
(6045, 9)
(23550, 9)
1209
34.0
(211953, 9)
(23550, 9)
311.0
34.0
(212893, 9)
(23550, 9)
1251
34.0
(6255, 9)
(23550, 9)
1251
34.0
(211953, 9)
(23550, 9)
311.0
34.0
(212851, 9)
(23550, 9)
1209
34.0
(6045, 9)
(23550, 9)
1209
34.0


In [None]:
# Standardise the features: the scaler learns its statistics from the
# training split only and applies the same transform to the test split.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [None]:
#from sklearn import decomposition
#pca = decomposition.PCA(n_components=3)
#pca.fit(X_train)
#X_train =pca.transform(X_train)
#pca.fit(X_test)
#X_test = pca.transform(X_test)

In [None]:
# Quick check: shapes of both splits and the label sum in each.
X_train.shape,y_train.sum(),X_test.shape,y_test.sum()

In [None]:
# Elastic-net logistic regression with balanced class weights.
clf = LogisticRegression(
    penalty="elasticnet",
    class_weight="balanced",
    max_iter=1000,
    solver="saga",
    l1_ratio=0.9,
    C=.1,
)

# Fit on the training split, then predict hard labels for the test split.
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
n_mislabeled = (y_test != y_pred).sum()
print("Number of mislabeled points out of a total %d points : %d" %(X_test.shape[0], n_mislabeled))
print("accuracy: ", metrics.accuracy_score(y_test, y_pred))

# ROC curve and AUC from the predicted probability of class 1.
y_pred_proba = clf.predict_proba(X_test)[:, 1]
fpr, tpr, thresholds = metrics.roc_curve(y_test, y_pred_proba)
auc = metrics.roc_auc_score(y_test, y_pred_proba)
roc_label = "AUC for not rank-swapped data= " + str(round(auc*100,2))
plt.plot(fpr, tpr, label=roc_label)
plt.legend(loc=4)
plt.show()

# Confusion-matrix counts, displayed as the cell result.
tn, fp, fn, tp = confusion_matrix(y_test, y_pred).ravel()
(tn, fp, fn, tp)



In [None]:
# Elastic-net logistic regression without class re-weighting and with
# max_iter=100.
clf = LogisticRegression(
    penalty="elasticnet",
    max_iter=100,
    solver="saga",
    l1_ratio=0.9,
    C=.1,
)

# Fit on the training split, then predict hard labels for the test split.
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
n_mislabeled = (y_test != y_pred).sum()
print("Number of mislabeled points out of a total %d points : %d" %(X_test.shape[0], n_mislabeled))
print("accuracy: ", metrics.accuracy_score(y_test, y_pred))

# ROC curve and AUC from the predicted probability of class 1.
y_pred_proba = clf.predict_proba(X_test)[:, 1]
fpr, tpr, thresholds = metrics.roc_curve(y_test, y_pred_proba)
auc = metrics.roc_auc_score(y_test, y_pred_proba)
roc_label = "AUC for not rank-swapped data= " + str(round(auc*100,2))
plt.plot(fpr, tpr, label=roc_label)
plt.legend(loc=4)
plt.show()

# Confusion-matrix counts, displayed as the cell result.
tn, fp, fn, tp = confusion_matrix(y_test, y_pred).ravel()
(tn, fp, fn, tp)

In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
from torch import Tensor
from torch.utils.data.sampler import WeightedRandomSampler

# Training data as tensors: features via Tensor(...), labels converted to
# LongTensor.
dataset = TensorDataset( Tensor(X_train), Tensor(y_train).type(torch.LongTensor) )

use_gpu = torch.cuda.is_available()

batch_size_train = 200

# Per-class sample counts, ordered like np.unique(y_train).
class_sample_count = np.unique(y_train, return_counts=True)[1]
# Inverse-frequency weight per class.
weight = 1. / class_sample_count
# One weight per training sample: label 0 gets weight[0], anything else
# gets weight[1].
samples_weight = np.array(
    [weight[0] if label == 0 else weight[1] for label in y_train])
# Convert to a double tensor; the earlier code assigned the .double() result
# to a misspelled name, so it was never used.
samples_weight = torch.from_numpy(samples_weight).double()
sampler = WeightedRandomSampler(samples_weight, len(samples_weight))

# The sampler decides the draw order, so shuffle stays False.
trainloader = torch.utils.data.DataLoader(dataset,batch_size=batch_size_train, shuffle=False,num_workers=4,sampler=sampler)


class Net(nn.Module):
    """Fully connected classifier mapping 9 input features to 2 class scores.

    Layer widths are 9-16-32-64-32-16-8-4-2, with a ReLU after every layer
    except the last. All weights are Xavier-uniform initialised and all
    biases start at zero.
    """

    def __init__(self):
        super(Net, self).__init__()
        self.fc1 = nn.Linear(9, 16)
        self.fc2 = nn.Linear(16, 32)
        self.fc3 = nn.Linear(32, 64)
        self.fc4 = nn.Linear(64,32)
        self.fc5 = nn.Linear(32,16)
        self.fc6 = nn.Linear(16,8)
        self.fc7 = nn.Linear(8,4)
        self.fc8 = nn.Linear(4,2)

        # Initialise every layer the same way. The hand-written list used to
        # initialise fc3.weight twice and skip fc4.weight, which therefore
        # kept PyTorch's default initialisation.
        for layer in (self.fc1, self.fc2, self.fc3, self.fc4,
                      self.fc5, self.fc6, self.fc7, self.fc8):
            nn.init.xavier_uniform_(layer.weight)  # glorot
            nn.init.zeros_(layer.bias)

    def forward(self, x):
        """Return the raw class scores for ``x``.

        ``x`` is a tensor whose last dimension is 9; the output has 2 in that
        dimension. No softmax is applied here.
        """
        hidden_layers = (self.fc1, self.fc2, self.fc3, self.fc4,
                         self.fc5, self.fc6, self.fc7)
        for layer in hidden_layers:
            x = F.relu(layer(x))
        # The last layer is left linear.
        return self.fc8(x)

net = Net()

if use_gpu:
    net = net.cuda()


import torch.optim as optim
# Cross-entropy on the two output scores, optimised with SGD + momentum.
criterion = nn.CrossEntropyLoss()
optimizer = optim.SGD(net.parameters(), lr=0.001, momentum=0.5)

net.train()

for epoch in range(100):  # loop over the dataset multiple times
    running_loss = 0.0
    # Unpack each batch directly; the loop no longer rebinds the
    # module-level name `data`, which holds the CSV path.
    for i, (inputs, labels) in enumerate(trainloader):
        if use_gpu:
            inputs = inputs.cuda()
            labels = labels.cuda()

        # zero the parameter gradients
        optimizer.zero_grad()

        # forward + backward + optimize
        outputs = net(inputs)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

        # print statistics
        running_loss += loss.item()
        if i % 10 == 9:    # print the mean loss of every 10 mini-batches
            print('[%d, %5d] loss: %.3f' %
                  (epoch + 1, i + 1, running_loss / 10))
            running_loss = 0.0

print('Finished Training')
    

In [None]:
# Test data as tensors: features via Tensor(...), labels converted to
# LongTensor.
testset = TensorDataset( Tensor(X_test), Tensor(np.array(y_test)).type(torch.LongTensor) )

In [None]:
# Evaluate in the original row order. The confusion matrix computed later
# compares `predicted` with `y_test` position by position, so the loader
# must not shuffle.
testLoader = torch.utils.data.DataLoader(
    testset,
    batch_size=len(X_test),
    shuffle=False,
    num_workers=4
)

total = 0
correct = 0
batch_predictions = []
net.eval()
with torch.no_grad():
    # Unpack each batch directly so the module-level `data` (the CSV path)
    # is not overwritten.
    for inputs, labels in testLoader:
        if use_gpu:
            inputs = inputs.cuda()
            labels = labels.cuda()
        outputs = net(inputs)
        # Predicted class = index of the larger of the two scores.
        _, batch_predicted = torch.max(outputs, 1)
        batch_predictions.append(batch_predicted)
        total += labels.size(0)
        correct += (batch_predicted == labels).sum().item()

# Predictions for the whole test set, in test-set order (also correct if the
# loader ever yields more than one batch).
predicted = torch.cat(batch_predictions)

print('Accuracy of the network on the test set: ' + str(
    100 * correct / total) + "%")

In [None]:
# Move the predictions to the CPU and convert them to a numpy array.
predicted=predicted.cpu().numpy()

In [None]:
# Confusion-matrix counts for the network; this compares y_test and predicted
# position by position, so both must be in the same row order.
tn, fp, fn, tp = confusion_matrix(y_test,predicted).ravel()
(tn, fp, fn, tp)