In [3]:
import numpy as np
import pandas as pd
import csv
import sys
import os
  
import matplotlib.pyplot as plt

from sklearn import preprocessing
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_recall_fscore_support as score
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.svm import SVC
from sklearn.ensemble import BaggingClassifier
import warnings
import pickle

# Exclusive upper bound for the class-1 weight sweep in logreg(): the loop
# starts at 1 and runs while the weight is < max_iters, i.e. weights 1..100.
max_iters = 101
#n_estimators = 10

def logreg(x,y,filename):
    """Sweep class weights for logistic regression and log class-1 validation scores.

    The data is split 70/15/15 into train/validation/test sets, stratified on
    ``y`` (the test part is produced by the split but not used here).  For each
    integer weight ``w`` in ``1 .. max_iters - 1`` a
    ``LogisticRegression(class_weight={0: 1, 1: w})`` is fitted on the training
    split, pickled to ``./models/lr_<stem>/<w>.sav`` and evaluated on the
    validation split.  The confusion matrix and per-class scores are printed,
    and precision/recall/F-score of class 1 are written, one CSV line per
    weight, to ``./prf/lr_<stem>_prf.txt``.

    Parameters
    ----------
    x : array-like
        Feature matrix, passed straight to ``train_test_split``.
    y : array-like
        Class labels.  Scores are read at index 1 of the per-class results,
        so both classes 0 and 1 must be present.
    filename : str
        Name of the source data file; only its base name without extension
        (``<stem>``) is used, to name the output directory and score file.

    Returns
    -------
    None
    """
    # Model output file name. Directory components of `filename` are dropped
    # so the outputs always land directly under ./models and ./prf.
    file = os.path.splitext(os.path.basename(filename))[0]
    fname = './models/lr_' + file + '/'
    prf_path = './prf/lr_' + file + '_prf' + '.txt'

    # Make sure the output locations exist before any model is trained.
    os.makedirs(fname, exist_ok=True)
    os.makedirs('./prf', exist_ok=True)

    # Stratified sampling based on Y: 70% train, 30% held out
    X_train, X_rest, y_train, y_rest = train_test_split(
        x, y, stratify=y, test_size=0.30, random_state=42)

    # Split the held-out 30% into 15% validation and 15% test; the test half
    # is not used in this function, so it is discarded.
    X_val, _, y_val, _ = train_test_split(
        X_rest, y_rest, stratify=y_rest, test_size=0.50, random_state=42)

    # File for writing precision, recall, f-measure scores for class 1
    # (the `with` block guarantees it is closed even if training fails).
    with open(prf_path, 'w') as f:
        f.write('precision,recall,f-score \n')

        # Run training algorithm for multiple class weights
        for it in range(1, max_iters):
            cw = {0: 1, 1: it}

            # Train
            print('**************************************')
            print("Iteration number  " , it)
            lr = LogisticRegression(class_weight = cw)
            print('Class weights ', cw)
            lr.fit(X_train,y_train)

            # Save trained model to disk, closing the handle right away
            name = fname + str(cw[1]) + '.sav'
            with open(name, 'wb') as model_file:
                pickle.dump(lr, model_file)

            # Predict on validation data
            y_val_pred = lr.predict(X_val)
            print('Performance on validation data - Confusion matrix')
            print(confusion_matrix(y_val,y_val_pred))

            # average=None returns one value per class, ordered by label
            precision,recall,fscore,support=score(y_val,y_val_pred,average=None)
            print('Precision, Recall, F-score, Support  on validation data' )
            print("Precision" , precision)
            print("Recall" , recall)
            print("F-score" , fscore)
            print("Support" , support)

            # Only the class-1 scores are persisted
            f.write(str(precision[1]) +','+ str(recall[1]) + ',' + str(fscore[1]) + '\n')

def run(filename='m3.csv'):
    """Load the expense data set and run the logistic-regression weight sweep.

    Reads columns 9-14 of ``filename``, prints the sum of the ``Anomaly``
    column as the number of fraudulent transactions, silences
    ``FutureWarning`` messages and passes the features and labels to
    ``logreg``.

    Parameters
    ----------
    filename : str, optional
        CSV file to load.  Defaults to ``'m3.csv'``, the file the original
        notebook used.  The same name is passed to ``logreg``, which uses it
        to name its output files.

    Returns
    -------
    None
    """
    features = ['Airfare', 'Lodging', 'Meals', 'Other_Transportation', 'Other_Expenses']
    targets = ['Anomaly']

    # Columns are picked by position, then addressed by name below, so
    # positions 9-14 must hold the five feature columns plus 'Anomaly'.
    df = pd.read_csv(filename, usecols = [9,10,11,12,13,14] , header = 0)

    results = list(map(int, df['Anomaly']))
    print('Number of fraudulent transactions ' , sum(results))

    # Separating out the features and target variables
    x = df.loc[:, features].values
    # .values on a one-column frame is (n, 1); flatten it to a 1-D label vector
    y = df.loc[:, targets].values.ravel()

    #Ignore warnings
    warnings.filterwarnings("ignore", category=FutureWarning)

    print("***********Logistic Regression**********")
    logreg(x,y,filename)
  
# Execute the whole pipeline (data load + logistic-regression sweep) when this cell runs.
run()

Number of fraudulent transactions  4097
***********Logistic Regression**********
**************************************
Iteration number   1
Class weights  {0: 1, 1: 1}
Performance on validation data - Confusion matrix
[[79277    45]
 [    1   613]]
Precision, Recall, F-score, Support  on validation data
Precision [0.99998739 0.93161094]
Recall [0.99943269 0.99837134]
F-score [0.99970996 0.96383648]
Support [79322   614]
**************************************
Iteration number   2
Class weights  {0: 1, 1: 2}
Performance on validation data - Confusion matrix
[[79264    58]
 [    1   613]]
Precision, Recall, F-score, Support  on validation data
Precision [0.99998738 0.91356185]
Recall [0.9992688  0.99837134]
F-score [0.99962796 0.9540856 ]
Support [79322   614]
**************************************
Iteration number   3
Class weights  {0: 1, 1: 3}
Performance on validation data - Confusion matrix
[[79247    75]
 [    1   613]]
Precision, Recall, F-score, Support  on validation data
Precis

Performance on validation data - Confusion matrix
[[79153   169]
 [    1   613]]
Precision, Recall, F-score, Support  on validation data
Precision [0.99998737 0.78388747]
Recall [0.99786944 0.99837134]
F-score [0.99892728 0.8782235 ]
Support [79322   614]
**************************************
Iteration number   26
Class weights  {0: 1, 1: 26}
Performance on validation data - Confusion matrix
[[79152   170]
 [    1   613]]
Precision, Recall, F-score, Support  on validation data
Precision [0.99998737 0.78288633]
Recall [0.99785684 0.99837134]
F-score [0.99892097 0.87759485]
Support [79322   614]
**************************************
Iteration number   27
Class weights  {0: 1, 1: 27}
Performance on validation data - Confusion matrix
[[79150   172]
 [    1   613]]
Precision, Recall, F-score, Support  on validation data
Precision [0.99998737 0.78089172]
Recall [0.99783162 0.99837134]
F-score [0.99890833 0.87634024]
Support [79322   614]
**************************************
Iteration num

Performance on validation data - Confusion matrix
[[78881   441]
 [    1   613]]
Precision, Recall, F-score, Support  on validation data
Precision [0.99998732 0.58159393]
Recall [0.99444038 0.99837134]
F-score [0.99720614 0.73501199]
Support [79322   614]
**************************************
Iteration number   50
Class weights  {0: 1, 1: 50}
Performance on validation data - Confusion matrix
[[78847   475]
 [    1   613]]
Precision, Recall, F-score, Support  on validation data
Precision [0.99998732 0.56341912]
Recall [0.99401175 0.99837134]
F-score [0.99699058 0.72032902]
Support [79322   614]
**************************************
Iteration number   51
Class weights  {0: 1, 1: 51}
Performance on validation data - Confusion matrix
[[78805   517]
 [    1   613]]
Precision, Recall, F-score, Support  on validation data
Precision [0.99998731 0.54247788]
Recall [0.99348226 0.99837134]
F-score [0.99672417 0.70298165]
Support [79322   614]
**************************************
Iteration num

Performance on validation data - Confusion matrix
[[78536   786]
 [    1   613]]
Precision, Recall, F-score, Support  on validation data
Precision [0.99998727 0.43817012]
Recall [0.99009102 0.99837134]
F-score [0.99501454 0.60904123]
Support [79322   614]
**************************************
Iteration number   74
Class weights  {0: 1, 1: 74}
Performance on validation data - Confusion matrix
[[78531   791]
 [    1   613]]
Precision, Recall, F-score, Support  on validation data
Precision [0.99998727 0.43660969]
Recall [0.99002799 0.99837134]
F-score [0.99498271 0.60753221]
Support [79322   614]
**************************************
Iteration number   75
Class weights  {0: 1, 1: 75}
Performance on validation data - Confusion matrix
[[78525   797]
 [    1   613]]
Precision, Recall, F-score, Support  on validation data
Precision [0.99998727 0.43475177]
Recall [0.98995235 0.99837134]
F-score [0.9949445  0.60573123]
Support [79322   614]
**************************************
Iteration num

Performance on validation data - Confusion matrix
[[78367   955]
 [    1   613]]
Precision, Recall, F-score, Support  on validation data
Precision [0.99998724 0.39094388]
Recall [0.98796046 0.99837134]
F-score [0.99393747 0.56186984]
Support [79322   614]
**************************************
Iteration number   98
Class weights  {0: 1, 1: 98}
Performance on validation data - Confusion matrix
[[78361   961]
 [    1   613]]
Precision, Recall, F-score, Support  on validation data
Precision [0.99998724 0.38945362]
Recall [0.98788482 0.99837134]
F-score [0.99389919 0.56032907]
Support [79322   614]
**************************************
Iteration number   99
Class weights  {0: 1, 1: 99}
Performance on validation data - Confusion matrix
[[78361   961]
 [    1   613]]
Precision, Recall, F-score, Support  on validation data
Precision [0.99998724 0.38945362]
Recall [0.98788482 0.99837134]
F-score [0.99389919 0.56032907]
Support [79322   614]
**************************************
Iteration num

In [4]:
# NOTE(review): `file` is only assigned as a local variable inside logreg(),
# so it is not defined at notebook scope and this cell raises NameError.
file

NameError: name 'file' is not defined