In [2]:
import numpy as np
import pandas as pd
import csv
import sys
import os

import matplotlib.pyplot as plt    
    
from sklearn import preprocessing
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_recall_fscore_support as score
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.svm import SVC
from sklearn.ensemble import BaggingClassifier
import warnings
import pickle

# Exclusive upper bound of the class-weight sweep: svm() loops while it < max_iters,
# so the class-1 weights tried are 1 .. max_iters - 1.
max_iters = 101


def svm(x,y,filename):
    """Sweep the class-1 weight of a LinearSVC and log validation scores.

    The data is split 70/15/15 into train/validation/test with stratification
    on ``y``. For every integer weight ``w`` in ``1 .. max_iters - 1`` a
    ``LinearSVC`` with ``class_weight={0: 1, 1: w}`` is fitted on the training
    split, pickled to ``./models/svm_<stem>/<w>.sav`` and evaluated on the
    validation split. Precision, recall and F-score of class 1 are appended as
    one CSV row per weight to ``./prf/svm_<stem>_prf.txt``.

    Parameters
    ----------
    x : array-like
        Feature matrix passed to ``train_test_split`` / ``LinearSVC.fit``.
    y : array-like
        Labels; the class weights assume the labels are 0 and 1.
    filename : str
        Name of the data file; its stem (extension stripped) is used to build
        the output paths.

    Returns
    -------
    None
        Results are printed and written to disk.
    """
    # Output locations derived from the data file name without its extension
    file = (os.path.splitext(filename))[0]
    fname = './models/svm_' + file +'/'
    prf_path = './prf/svm_'+ file + '_prf' +'.txt'

    # Create the output directories up front so the sweep does not die on the
    # first open() when it is run from a fresh checkout.
    os.makedirs(fname, exist_ok=True)
    os.makedirs(os.path.dirname(prf_path), exist_ok=True)

    # Stratified sampling based on Y: 70% train, 30% held out
    X_train, X_rest, y_train, y_rest = train_test_split(x, y,stratify=y , test_size=0.30, random_state=42)

    # Split the held-out 30% into 15% validation and 15% test.
    # The test split is produced but not used in this function.
    X_val, X_test, y_val, y_test = train_test_split(X_rest, y_rest,stratify=y_rest , test_size=0.50, random_state=42)

    # The context manager guarantees the report is flushed and closed even if
    # the sweep is interrupted (e.g. KeyboardInterrupt) or a fit raises.
    with open(prf_path, 'w') as f:
        # File for writing precision,recall, f-measure scores for fraud transactions
        f.write('precision,recall,f-score \n')

        # Run training algorithm for multiple class weights
        for it in range(1, max_iters):
            cw = {0: 1, 1: it}

            # Train
            print('**************************************')
            print("Iteration number  " , it)
            # Named clf (not svm) so the estimator does not shadow this function
            clf = LinearSVC(class_weight = cw, dual = False ,tol=1e-05,max_iter = 1000)
            print('Class weights ', cw)
            clf.fit(X_train,y_train)

            # Save trained model to disk, closing the handle right away
            name = fname + str(cw[1]) + '.sav'
            with open(name, 'wb') as model_file:
                pickle.dump(clf, model_file)

            # Predict on validation data
            y_val_pred = clf.predict(X_val)
            print('Performance on validation data - Confusion matrix')
            print(confusion_matrix(y_val,y_val_pred))

            # average=None yields one value per class; index 1 is class 1
            precision,recall,fscore,support=score(y_val,y_val_pred,average=None)
            print('Precision, Recall, F-score, Support on validation data' )
            print("Precision" , precision)
            print("Recall" , recall)
            print("F-score" , fscore)
            print("Support" , support)

            p1 = precision[1]
            r1 = recall[1]
            f1 = fscore[1]

            f.write(str(p1) +','+ str(r1) + ',' + str(f1) + '\n')

def run():
    """Load the data from 'm2.csv' and start the SVM class-weight sweep.

    Reads CSV columns 9-14, prints the sum of the 'Anomaly' column as the
    number of fraudulent transactions, silences FutureWarning messages and
    hands the feature matrix and flattened labels to ``svm``.
    """
    filename = 'm2.csv'
    features = ['Airfare', 'Lodging', 'Meals', 'Other_Transportation', 'Other_Expenses']
    targets = ['Anomaly']

    # Only columns 9..14 are loaded; the first row of the file is the header
    df = pd.read_csv(filename, usecols = [9,10,11,12,13,14] , header = 0)

    # Each 'Anomaly' value is cast to int before summing
    fraud_total = sum(int(flag) for flag in df['Anomaly'])
    print('Number of fraudulent transactions ' , fraud_total)

    # Feature matrix, and the single-column target flattened to a plain list
    x = df.loc[:, features].values
    y = list(df.loc[:, targets].values.ravel())

    # Ignore warnings
    warnings.filterwarnings("ignore", category=FutureWarning)

    print("**************** SVM *******************")
    svm(x,y,filename)
  
# Start the experiment when this cell is executed.
run()

Number of fraudulent transactions  4116
**************** SVM *******************
**************************************
Iteration number   1
Class weights  {0: 1, 1: 1}
Performance on validation data - Confusion matrix
[[334937     70]
 [   132    486]]
Precision, Recall, F-score, Support on validation data
Precision [0.99960605 0.87410072]
Recall [0.99979105 0.78640777]
F-score [0.99969854 0.82793867]
Support [335007    618]
**************************************
Iteration number   2
Class weights  {0: 1, 1: 2}
Performance on validation data - Confusion matrix
[[334853    154]
 [   101    517]]
Precision, Recall, F-score, Support on validation data
Precision [0.99969847 0.7704918 ]
Recall [0.99954031 0.83656958]
F-score [0.99961938 0.80217223]
Support [335007    618]
**************************************
Iteration number   3
Class weights  {0: 1, 1: 3}
Performance on validation data - Confusion matrix
[[334768    239]
 [    82    536]]
Precision, Recall, F-score, Support on validatio

Performance on validation data - Confusion matrix
[[333849   1158]
 [    60    558]]
Precision, Recall, F-score, Support on validation data
Precision [0.99982031 0.32517483]
Recall [0.99654336 0.90291262]
F-score [0.99817914 0.4781491 ]
Support [335007    618]
**************************************
Iteration number   25
Class weights  {0: 1, 1: 25}
Performance on validation data - Confusion matrix
[[334340    667]
 [   205    413]]
Precision, Recall, F-score, Support on validation data
Precision [0.99938723 0.38240741]
Recall [0.998009   0.66828479]
F-score [0.99869764 0.48645465]
Support [335007    618]
**************************************
Iteration number   26
Class weights  {0: 1, 1: 26}
Performance on validation data - Confusion matrix
[[333592   1415]
 [    43    575]]
Precision, Recall, F-score, Support on validation data
Precision [0.99987112 0.28894472]
Recall [0.99577621 0.93042071]
F-score [0.99781946 0.44095092]
Support [335007    618]
*************************************

Performance on validation data - Confusion matrix
[[333706   1301]
 [   139    479]]
Precision, Recall, F-score, Support on validation data
Precision [0.99958364 0.26910112]
Recall [0.9961165  0.77508091]
F-score [0.99784706 0.39949958]
Support [335007    618]
**************************************
Iteration number   49
Class weights  {0: 1, 1: 49}
Performance on validation data - Confusion matrix
[[333125   1882]
 [    55    563]]
Precision, Recall, F-score, Support on validation data
Precision [0.99983492 0.23026585]
Recall [0.99438221 0.91100324]
F-score [0.99710111 0.36761345]
Support [335007    618]
**************************************
Iteration number   50
Class weights  {0: 1, 1: 50}
Performance on validation data - Confusion matrix
[[333680   1327]
 [   143    475]]
Precision, Recall, F-score, Support on validation data
Precision [0.99957163 0.263596  ]
Recall [0.99603889 0.76860841]
F-score [0.99780213 0.39256198]
Support [335007    618]
*************************************

KeyboardInterrupt: 