In [5]:
import numpy as np
import pandas as pd
import csv
import sys
import os
  
import matplotlib.pyplot as plt

from sklearn.tree import DecisionTreeClassifier

from sklearn import preprocessing
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_recall_fscore_support as score
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.svm import SVC
from sklearn.ensemble import BaggingClassifier
import warnings
import pickle

# Exclusive upper bound of the class-weight sweep in logreg(): the weight given
# to class 1 takes every integer value from 1 up to max_iters - 1.
max_iters = 501
# Disabled setting; nothing in this cell reads it.
#n_estimators = 10

def logreg(x,y,filename):
    """Sweep the class-1 weight of a decision tree and record validation scores.

    Despite its name, this function trains ``DecisionTreeClassifier`` models
    (``max_depth=5``); the logistic-regression variant is not used.

    The data is split with stratification on ``y`` into 70% training, 15%
    validation and 15% test. For every integer weight ``w`` in
    ``[1, max_iters)`` a tree is fitted on the training split with
    ``class_weight={0: 1, 1: w}``, pickled to disk, and evaluated on the
    validation split. The test split is created but not used here.

    Args:
        x: Feature matrix, passed to ``train_test_split`` and ``fit``.
        y: Labels used for stratification. Scores are recorded for the class
            at index 1 of the per-class score arrays.
        filename: Name of the input data file. Its extension-less form is used
            to build the output paths.

    Side effects:
        * Writes one model per weight to ``./models/dt_<name>/<w>.sav``.
        * Writes ``./prf/dt_<name>_prf.txt`` with a header line followed by one
          ``precision,recall,f-score`` line per weight (class 1 only).
        * Prints the confusion matrix and per-class scores for every weight.
        * Creates the output directories when they do not exist yet.
    """
    # Output locations are derived from the data file name without extension
    file = (os.path.splitext(filename))[0]
    fname = './models/dt_' + file + '/'
    prf_path = './prf/dt_' + file + '_prf' + '.txt'

    # Make sure both output directories exist so the first write cannot fail
    # with FileNotFoundError on a fresh checkout.
    os.makedirs(fname, exist_ok=True)
    os.makedirs(os.path.dirname(prf_path), exist_ok=True)

    # Stratified sampling based on Y: 70% train, 30% held out
    X_train, X_held, y_train, y_held = train_test_split(
        x, y, stratify=y, test_size=0.30, random_state=42)

    # Split the held-out 30% evenly into a 15% validation and a 15% test set.
    # Only the validation half is used below.
    X_val, _X_test, y_val, _y_test = train_test_split(
        X_held, y_held, stratify=y_held, test_size=0.50, random_state=42)

    # The context manager closes the score file even if an iteration raises.
    with open(prf_path, 'w') as f:
        f.write('precision,recall, f-score \n')

        # Run training algorithm for multiple class weights
        for it in range(1, max_iters):
            cw = {0: 1, 1: it}

            # Train
            print('**************************************')
            print("Iteration number  " , it)
            dt = DecisionTreeClassifier(max_depth=5, random_state=1, class_weight=cw)
            print('Class weights ', cw)
            dt.fit(X_train, y_train)

            # Save trained model to disk; close the handle right away instead
            # of leaking one open file per iteration.
            name = fname + str(cw[1]) + '.sav'
            with open(name, 'wb') as model_file:
                pickle.dump(dt, model_file)

            # Predict on validation data
            y_val_pred = dt.predict(X_val)
            print('Performance on validation data - Confusion matrix')
            print(confusion_matrix(y_val, y_val_pred))

            # average=None returns one value per class
            precision, recall, fscore, support = score(y_val, y_val_pred, average=None)
            print('Precision, Recall, F-score, Support  on validation data' )
            print("Precision" , precision)
            print("Recall" , recall)
            print("F-score" , fscore)
            print("Support" , support)

            # Only the scores of class 1 are written to the file
            p1 = precision[1]
            r1 = recall[1]
            f1 = fscore[1]

            f.write(str(p1) + ',' + str(r1) + ',' + str(f1) + '\n')

def run():
    """Load ``m3.csv``, standardise the expense features and start the sweep.

    Reads columns 9-14 of the CSV (first row is the header), prints the sum of
    the ``Anomaly`` column, scales the five expense columns with
    ``StandardScaler`` and hands the scaled features plus the flattened
    ``Anomaly`` labels to ``logreg``.

    Note: the scaler is fitted on the complete data set, i.e. before
    ``logreg`` splits it into training/validation/test parts.
    """
    filename = 'm3.csv'
    feature_cols = ['Airfare', 'Lodging', 'Meals', 'Other_Transportation', 'Other_Expenses']
    target_cols = ['Anomaly']

    frame = pd.read_csv(filename, usecols=list(range(9, 15)), header=0)

    # Summing the integer-cast flags gives the reported transaction count
    fraud_flags = [int(flag) for flag in frame['Anomaly']]
    print('Number of fraudulent transactions ' , sum(fraud_flags))

    # Separate the feature matrix from the target column and standardise it
    raw_features = frame.loc[:, feature_cols].values
    label_column = frame.loc[:, target_cols].values
    x = preprocessing.StandardScaler().fit_transform(raw_features)

    # Flatten the (n, 1) target array into a plain list of labels
    y = list(label_column.ravel())

    # Ignore warnings
    warnings.filterwarnings("ignore", category=FutureWarning)

    print("***********Logistic Regression**********")
    logreg(x, y, filename)
  
# Execute the whole pipeline when this cell/script is run.
run()

Number of fraudulent transactions  4097
***********Logistic Regression**********
**************************************
Iteration number   1
Class weights  {0: 1, 1: 1}
Performance on validation data - Confusion matrix
[[79312    10]
 [    3   611]]
Precision, Recall, F-score, Support  on validation data
Precision [0.99996218 0.98389694]
Recall [0.99987393 0.99511401]
F-score [0.99991805 0.98947368]
Support [79322   614]
**************************************
Iteration number   2
Class weights  {0: 1, 1: 2}
Performance on validation data - Confusion matrix
[[79311    11]
 [    3   611]]
Precision, Recall, F-score, Support  on validation data
Precision [0.99996218 0.98231511]
Recall [0.99986132 0.99511401]
F-score [0.99991175 0.98867314]
Support [79322   614]
**************************************
Iteration number   3
Class weights  {0: 1, 1: 3}
Performance on validation data - Confusion matrix
[[79311    11]
 [    3   611]]
Precision, Recall, F-score, Support  on validation data
Precis

Performance on validation data - Confusion matrix
[[79311    11]
 [    2   612]]
Precision, Recall, F-score, Support  on validation data
Precision [0.99997478 0.9823435 ]
Recall [0.99986132 0.99674267]
F-score [0.99991805 0.9894907 ]
Support [79322   614]
**************************************
Iteration number   26
Class weights  {0: 1, 1: 26}
Performance on validation data - Confusion matrix
[[79311    11]
 [    2   612]]
Precision, Recall, F-score, Support  on validation data
Precision [0.99997478 0.9823435 ]
Recall [0.99986132 0.99674267]
F-score [0.99991805 0.9894907 ]
Support [79322   614]
**************************************
Iteration number   27
Class weights  {0: 1, 1: 27}
Performance on validation data - Confusion matrix
[[79311    11]
 [    2   612]]
Precision, Recall, F-score, Support  on validation data
Precision [0.99997478 0.9823435 ]
Recall [0.99986132 0.99674267]
F-score [0.99991805 0.9894907 ]
Support [79322   614]
**************************************
Iteration num

Performance on validation data - Confusion matrix
[[79306    16]
 [    1   613]]
Precision, Recall, F-score, Support  on validation data
Precision [0.99998739 0.9745628 ]
Recall [0.99979829 0.99837134]
F-score [0.99989283 0.98632341]
Support [79322   614]
**************************************
Iteration number   50
Class weights  {0: 1, 1: 50}
Performance on validation data - Confusion matrix
[[79306    16]
 [    1   613]]
Precision, Recall, F-score, Support  on validation data
Precision [0.99998739 0.9745628 ]
Recall [0.99979829 0.99837134]
F-score [0.99989283 0.98632341]
Support [79322   614]
**************************************
Iteration number   51
Class weights  {0: 1, 1: 51}
Performance on validation data - Confusion matrix
[[79306    16]
 [    1   613]]
Precision, Recall, F-score, Support  on validation data
Precision [0.99998739 0.9745628 ]
Recall [0.99979829 0.99837134]
F-score [0.99989283 0.98632341]
Support [79322   614]
**************************************
Iteration num

Support [79322   614]
**************************************
Iteration number   73
Class weights  {0: 1, 1: 73}
Performance on validation data - Confusion matrix
[[79306    16]
 [    1   613]]
Precision, Recall, F-score, Support  on validation data
Precision [0.99998739 0.9745628 ]
Recall [0.99979829 0.99837134]
F-score [0.99989283 0.98632341]
Support [79322   614]
**************************************
Iteration number   74
Class weights  {0: 1, 1: 74}
Performance on validation data - Confusion matrix
[[79306    16]
 [    1   613]]
Precision, Recall, F-score, Support  on validation data
Precision [0.99998739 0.9745628 ]
Recall [0.99979829 0.99837134]
F-score [0.99989283 0.98632341]
Support [79322   614]
**************************************
Iteration number   75
Class weights  {0: 1, 1: 75}
Performance on validation data - Confusion matrix
[[79306    16]
 [    1   613]]
Precision, Recall, F-score, Support  on validation data
Precision [0.99998739 0.9745628 ]
Recall [0.99979829 0.9983

Performance on validation data - Confusion matrix
[[79306    16]
 [    1   613]]
Precision, Recall, F-score, Support  on validation data
Precision [0.99998739 0.9745628 ]
Recall [0.99979829 0.99837134]
F-score [0.99989283 0.98632341]
Support [79322   614]
**************************************
Iteration number   98
Class weights  {0: 1, 1: 98}
Performance on validation data - Confusion matrix
[[79306    16]
 [    1   613]]
Precision, Recall, F-score, Support  on validation data
Precision [0.99998739 0.9745628 ]
Recall [0.99979829 0.99837134]
F-score [0.99989283 0.98632341]
Support [79322   614]
**************************************
Iteration number   99
Class weights  {0: 1, 1: 99}
Performance on validation data - Confusion matrix
[[79306    16]
 [    1   613]]
Precision, Recall, F-score, Support  on validation data
Precision [0.99998739 0.9745628 ]
Recall [0.99979829 0.99837134]
F-score [0.99989283 0.98632341]
Support [79322   614]
**************************************
Iteration num

Performance on validation data - Confusion matrix
[[79306    16]
 [    1   613]]
Precision, Recall, F-score, Support  on validation data
Precision [0.99998739 0.9745628 ]
Recall [0.99979829 0.99837134]
F-score [0.99989283 0.98632341]
Support [79322   614]
**************************************
Iteration number   122
Class weights  {0: 1, 1: 122}
Performance on validation data - Confusion matrix
[[79306    16]
 [    1   613]]
Precision, Recall, F-score, Support  on validation data
Precision [0.99998739 0.9745628 ]
Recall [0.99979829 0.99837134]
F-score [0.99989283 0.98632341]
Support [79322   614]
**************************************
Iteration number   123
Class weights  {0: 1, 1: 123}
Performance on validation data - Confusion matrix
[[79306    16]
 [    1   613]]
Precision, Recall, F-score, Support  on validation data
Precision [0.99998739 0.9745628 ]
Recall [0.99979829 0.99837134]
F-score [0.99989283 0.98632341]
Support [79322   614]
**************************************
Iteration

**************************************
Iteration number   145
Class weights  {0: 1, 1: 145}
Performance on validation data - Confusion matrix
[[79311    11]
 [    1   613]]
Precision, Recall, F-score, Support  on validation data
Precision [0.99998739 0.98237179]
Recall [0.99986132 0.99837134]
F-score [0.99992435 0.99030695]
Support [79322   614]
**************************************
Iteration number   146
Class weights  {0: 1, 1: 146}
Performance on validation data - Confusion matrix
[[79311    11]
 [    1   613]]
Precision, Recall, F-score, Support  on validation data
Precision [0.99998739 0.98237179]
Recall [0.99986132 0.99837134]
F-score [0.99992435 0.99030695]
Support [79322   614]
**************************************
Iteration number   147
Class weights  {0: 1, 1: 147}
Performance on validation data - Confusion matrix
[[79311    11]
 [    1   613]]
Precision, Recall, F-score, Support  on validation data
Precision [0.99998739 0.98237179]
Recall [0.99986132 0.99837134]
F-score [0

F-score [0.99992435 0.99030695]
Support [79322   614]
**************************************
Iteration number   169
Class weights  {0: 1, 1: 169}
Performance on validation data - Confusion matrix
[[79311    11]
 [    1   613]]
Precision, Recall, F-score, Support  on validation data
Precision [0.99998739 0.98237179]
Recall [0.99986132 0.99837134]
F-score [0.99992435 0.99030695]
Support [79322   614]
**************************************
Iteration number   170
Class weights  {0: 1, 1: 170}
Performance on validation data - Confusion matrix
[[79311    11]
 [    1   613]]
Precision, Recall, F-score, Support  on validation data
Precision [0.99998739 0.98237179]
Recall [0.99986132 0.99837134]
F-score [0.99992435 0.99030695]
Support [79322   614]
**************************************
Iteration number   171
Class weights  {0: 1, 1: 171}
Performance on validation data - Confusion matrix
[[79311    11]
 [    1   613]]
Precision, Recall, F-score, Support  on validation data
Precision [0.99998739

Precision, Recall, F-score, Support  on validation data
Precision [0.99998739 0.98237179]
Recall [0.99986132 0.99837134]
F-score [0.99992435 0.99030695]
Support [79322   614]
**************************************
Iteration number   193
Class weights  {0: 1, 1: 193}
Performance on validation data - Confusion matrix
[[79311    11]
 [    1   613]]
Precision, Recall, F-score, Support  on validation data
Precision [0.99998739 0.98237179]
Recall [0.99986132 0.99837134]
F-score [0.99992435 0.99030695]
Support [79322   614]
**************************************
Iteration number   194
Class weights  {0: 1, 1: 194}
Performance on validation data - Confusion matrix
[[79311    11]
 [    1   613]]
Precision, Recall, F-score, Support  on validation data
Precision [0.99998739 0.98237179]
Recall [0.99986132 0.99837134]
F-score [0.99992435 0.99030695]
Support [79322   614]
**************************************
Iteration number   195
Class weights  {0: 1, 1: 195}
Performance on validation data - Conf

Performance on validation data - Confusion matrix
[[79311    11]
 [    1   613]]
Precision, Recall, F-score, Support  on validation data
Precision [0.99998739 0.98237179]
Recall [0.99986132 0.99837134]
F-score [0.99992435 0.99030695]
Support [79322   614]
**************************************
Iteration number   217
Class weights  {0: 1, 1: 217}
Performance on validation data - Confusion matrix
[[79311    11]
 [    1   613]]
Precision, Recall, F-score, Support  on validation data
Precision [0.99998739 0.98237179]
Recall [0.99986132 0.99837134]
F-score [0.99992435 0.99030695]
Support [79322   614]
**************************************
Iteration number   218
Class weights  {0: 1, 1: 218}
Performance on validation data - Confusion matrix
[[79311    11]
 [    1   613]]
Precision, Recall, F-score, Support  on validation data
Precision [0.99998739 0.98237179]
Recall [0.99986132 0.99837134]
F-score [0.99992435 0.99030695]
Support [79322   614]
**************************************
Iteration

Performance on validation data - Confusion matrix
[[79311    11]
 [    1   613]]
Precision, Recall, F-score, Support  on validation data
Precision [0.99998739 0.98237179]
Recall [0.99986132 0.99837134]
F-score [0.99992435 0.99030695]
Support [79322   614]
**************************************
Iteration number   241
Class weights  {0: 1, 1: 241}
Performance on validation data - Confusion matrix
[[79311    11]
 [    1   613]]
Precision, Recall, F-score, Support  on validation data
Precision [0.99998739 0.98237179]
Recall [0.99986132 0.99837134]
F-score [0.99992435 0.99030695]
Support [79322   614]
**************************************
Iteration number   242
Class weights  {0: 1, 1: 242}
Performance on validation data - Confusion matrix
[[79311    11]
 [    1   613]]
Precision, Recall, F-score, Support  on validation data
Precision [0.99998739 0.98237179]
Recall [0.99986132 0.99837134]
F-score [0.99992435 0.99030695]
Support [79322   614]
**************************************
Iteration

Performance on validation data - Confusion matrix
[[79142   180]
 [    1   613]]
Precision, Recall, F-score, Support  on validation data
Precision [0.99998736 0.77301387]
Recall [0.99773077 0.99837134]
F-score [0.99885779 0.8713575 ]
Support [79322   614]
**************************************
Iteration number   265
Class weights  {0: 1, 1: 265}
Performance on validation data - Confusion matrix
[[79142   180]
 [    1   613]]
Precision, Recall, F-score, Support  on validation data
Precision [0.99998736 0.77301387]
Recall [0.99773077 0.99837134]
F-score [0.99885779 0.8713575 ]
Support [79322   614]
**************************************
Iteration number   266
Class weights  {0: 1, 1: 266}
Performance on validation data - Confusion matrix
[[79142   180]
 [    1   613]]
Precision, Recall, F-score, Support  on validation data
Precision [0.99998736 0.77301387]
Recall [0.99773077 0.99837134]
F-score [0.99885779 0.8713575 ]
Support [79322   614]
**************************************
Iteration

Performance on validation data - Confusion matrix
[[79142   180]
 [    1   613]]
Precision, Recall, F-score, Support  on validation data
Precision [0.99998736 0.77301387]
Recall [0.99773077 0.99837134]
F-score [0.99885779 0.8713575 ]
Support [79322   614]
**************************************
Iteration number   289
Class weights  {0: 1, 1: 289}
Performance on validation data - Confusion matrix
[[79142   180]
 [    1   613]]
Precision, Recall, F-score, Support  on validation data
Precision [0.99998736 0.77301387]
Recall [0.99773077 0.99837134]
F-score [0.99885779 0.8713575 ]
Support [79322   614]
**************************************
Iteration number   290
Class weights  {0: 1, 1: 290}
Performance on validation data - Confusion matrix
[[79142   180]
 [    1   613]]
Precision, Recall, F-score, Support  on validation data
Precision [0.99998736 0.77301387]
Recall [0.99773077 0.99837134]
F-score [0.99885779 0.8713575 ]
Support [79322   614]
**************************************
Iteration

Iteration number   312
Class weights  {0: 1, 1: 312}
Performance on validation data - Confusion matrix
[[79142   180]
 [    1   613]]
Precision, Recall, F-score, Support  on validation data
Precision [0.99998736 0.77301387]
Recall [0.99773077 0.99837134]
F-score [0.99885779 0.8713575 ]
Support [79322   614]
**************************************
Iteration number   313
Class weights  {0: 1, 1: 313}
Performance on validation data - Confusion matrix
[[79142   180]
 [    1   613]]
Precision, Recall, F-score, Support  on validation data
Precision [0.99998736 0.77301387]
Recall [0.99773077 0.99837134]
F-score [0.99885779 0.8713575 ]
Support [79322   614]
**************************************
Iteration number   314
Class weights  {0: 1, 1: 314}
Performance on validation data - Confusion matrix
[[79142   180]
 [    1   613]]
Precision, Recall, F-score, Support  on validation data
Precision [0.99998736 0.77301387]
Recall [0.99773077 0.99837134]
F-score [0.99885779 0.8713575 ]
Support [79322   

Iteration number   336
Class weights  {0: 1, 1: 336}
Performance on validation data - Confusion matrix
[[79142   180]
 [    1   613]]
Precision, Recall, F-score, Support  on validation data
Precision [0.99998736 0.77301387]
Recall [0.99773077 0.99837134]
F-score [0.99885779 0.8713575 ]
Support [79322   614]
**************************************
Iteration number   337
Class weights  {0: 1, 1: 337}
Performance on validation data - Confusion matrix
[[79142   180]
 [    1   613]]
Precision, Recall, F-score, Support  on validation data
Precision [0.99998736 0.77301387]
Recall [0.99773077 0.99837134]
F-score [0.99885779 0.8713575 ]
Support [79322   614]
**************************************
Iteration number   338
Class weights  {0: 1, 1: 338}
Performance on validation data - Confusion matrix
[[79142   180]
 [    1   613]]
Precision, Recall, F-score, Support  on validation data
Precision [0.99998736 0.77301387]
Recall [0.99773077 0.99837134]
F-score [0.99885779 0.8713575 ]
Support [79322   

Support [79322   614]
**************************************
Iteration number   360
Class weights  {0: 1, 1: 360}
Performance on validation data - Confusion matrix
[[79142   180]
 [    1   613]]
Precision, Recall, F-score, Support  on validation data
Precision [0.99998736 0.77301387]
Recall [0.99773077 0.99837134]
F-score [0.99885779 0.8713575 ]
Support [79322   614]
**************************************
Iteration number   361
Class weights  {0: 1, 1: 361}
Performance on validation data - Confusion matrix
[[79142   180]
 [    1   613]]
Precision, Recall, F-score, Support  on validation data
Precision [0.99998736 0.77301387]
Recall [0.99773077 0.99837134]
F-score [0.99885779 0.8713575 ]
Support [79322   614]
**************************************
Iteration number   362
Class weights  {0: 1, 1: 362}
Performance on validation data - Confusion matrix
[[79142   180]
 [    1   613]]
Precision, Recall, F-score, Support  on validation data
Precision [0.99998736 0.77301387]
Recall [0.99773077 

Support [79322   614]
**************************************
Iteration number   384
Class weights  {0: 1, 1: 384}
Performance on validation data - Confusion matrix
[[79142   180]
 [    1   613]]
Precision, Recall, F-score, Support  on validation data
Precision [0.99998736 0.77301387]
Recall [0.99773077 0.99837134]
F-score [0.99885779 0.8713575 ]
Support [79322   614]
**************************************
Iteration number   385
Class weights  {0: 1, 1: 385}
Performance on validation data - Confusion matrix
[[79142   180]
 [    1   613]]
Precision, Recall, F-score, Support  on validation data
Precision [0.99998736 0.77301387]
Recall [0.99773077 0.99837134]
F-score [0.99885779 0.8713575 ]
Support [79322   614]
**************************************
Iteration number   386
Class weights  {0: 1, 1: 386}
Performance on validation data - Confusion matrix
[[79142   180]
 [    1   613]]
Precision, Recall, F-score, Support  on validation data
Precision [0.99998736 0.77301387]
Recall [0.99773077 

Class weights  {0: 1, 1: 408}
Performance on validation data - Confusion matrix
[[79142   180]
 [    1   613]]
Precision, Recall, F-score, Support  on validation data
Precision [0.99998736 0.77301387]
Recall [0.99773077 0.99837134]
F-score [0.99885779 0.8713575 ]
Support [79322   614]
**************************************
Iteration number   409
Class weights  {0: 1, 1: 409}
Performance on validation data - Confusion matrix
[[79142   180]
 [    1   613]]
Precision, Recall, F-score, Support  on validation data
Precision [0.99998736 0.77301387]
Recall [0.99773077 0.99837134]
F-score [0.99885779 0.8713575 ]
Support [79322   614]
**************************************
Iteration number   410
Class weights  {0: 1, 1: 410}
Performance on validation data - Confusion matrix
[[79142   180]
 [    1   613]]
Precision, Recall, F-score, Support  on validation data
Precision [0.99998736 0.77301387]
Recall [0.99773077 0.99837134]
F-score [0.99885779 0.8713575 ]
Support [79322   614]
******************

Performance on validation data - Confusion matrix
[[79142   180]
 [    1   613]]
Precision, Recall, F-score, Support  on validation data
Precision [0.99998736 0.77301387]
Recall [0.99773077 0.99837134]
F-score [0.99885779 0.8713575 ]
Support [79322   614]
**************************************
Iteration number   433
Class weights  {0: 1, 1: 433}
Performance on validation data - Confusion matrix
[[79142   180]
 [    1   613]]
Precision, Recall, F-score, Support  on validation data
Precision [0.99998736 0.77301387]
Recall [0.99773077 0.99837134]
F-score [0.99885779 0.8713575 ]
Support [79322   614]
**************************************
Iteration number   434
Class weights  {0: 1, 1: 434}
Performance on validation data - Confusion matrix
[[79311    11]
 [    1   613]]
Precision, Recall, F-score, Support  on validation data
Precision [0.99998739 0.98237179]
Recall [0.99986132 0.99837134]
F-score [0.99992435 0.99030695]
Support [79322   614]
**************************************
Iteration

Performance on validation data - Confusion matrix
[[79311    11]
 [    1   613]]
Precision, Recall, F-score, Support  on validation data
Precision [0.99998739 0.98237179]
Recall [0.99986132 0.99837134]
F-score [0.99992435 0.99030695]
Support [79322   614]
**************************************
Iteration number   457
Class weights  {0: 1, 1: 457}
Performance on validation data - Confusion matrix
[[79311    11]
 [    1   613]]
Precision, Recall, F-score, Support  on validation data
Precision [0.99998739 0.98237179]
Recall [0.99986132 0.99837134]
F-score [0.99992435 0.99030695]
Support [79322   614]
**************************************
Iteration number   458
Class weights  {0: 1, 1: 458}
Performance on validation data - Confusion matrix
[[79111   211]
 [    1   613]]
Precision, Recall, F-score, Support  on validation data
Precision [0.99998736 0.74393204]
Recall [0.99733996 0.99837134]
F-score [0.9986619  0.85257302]
Support [79322   614]
**************************************
Iteration

Performance on validation data - Confusion matrix
[[79111   211]
 [    1   613]]
Precision, Recall, F-score, Support  on validation data
Precision [0.99998736 0.74393204]
Recall [0.99733996 0.99837134]
F-score [0.9986619  0.85257302]
Support [79322   614]
**************************************
Iteration number   481
Class weights  {0: 1, 1: 481}
Performance on validation data - Confusion matrix
[[79111   211]
 [    1   613]]
Precision, Recall, F-score, Support  on validation data
Precision [0.99998736 0.74393204]
Recall [0.99733996 0.99837134]
F-score [0.9986619  0.85257302]
Support [79322   614]
**************************************
Iteration number   482
Class weights  {0: 1, 1: 482}
Performance on validation data - Confusion matrix
[[79111   211]
 [    1   613]]
Precision, Recall, F-score, Support  on validation data
Precision [0.99998736 0.74393204]
Recall [0.99733996 0.99837134]
F-score [0.9986619  0.85257302]
Support [79322   614]
**************************************
Iteration