# Importing libraries

In [1]:
import pandas as pd
import numpy as np

import seaborn as sns
import matplotlib.pyplot as plt
import json

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

# Loading data from the .txt file (one JSON record per line)

In [2]:
# Parse the transactions file into a list of plain dicts. Each non-blank line
# of the file is decoded as one JSON document.
list_dict = []

# `with` guarantees the handle is closed even if a line fails to parse.
with open("Transactions.txt", "r") as file:
    # Iterate over the handle lazily instead of materialising every line first.
    for line in file:
        # Tolerate blank lines (e.g. a trailing newline at the end of the file),
        # which json.loads would otherwise reject.
        if not line.strip():
            continue
        # Bind to `record`, not `dict`, so the built-in `dict` type is not
        # shadowed for the rest of the kernel session.
        record = json.loads(line)
        list_dict.append(record)

In [3]:
# Peek at the first two parsed records only; echoing the whole list would put
# every record into the cell output.
list_dict[:2]

[{'accountNumber': '737265056',
  'customerId': '737265056',
  'creditLimit': 5000.0,
  'availableMoney': 5000.0,
  'transactionDateTime': '2016-08-13T14:27:32',
  'transactionAmount': 98.55,
  'merchantName': 'Uber',
  'acqCountry': 'US',
  'merchantCountryCode': 'US',
  'posEntryMode': '02',
  'posConditionCode': '01',
  'merchantCategoryCode': 'rideshare',
  'currentExpDate': '06/2023',
  'accountOpenDate': '2015-03-14',
  'dateOfLastAddressChange': '2015-03-14',
  'cardCVV': '414',
  'enteredCVV': '414',
  'cardLast4Digits': '1803',
  'transactionType': 'PURCHASE',
  'echoBuffer': '',
  'currentBalance': 0.0,
  'merchantCity': '',
  'merchantState': '',
  'merchantZip': '',
  'cardPresent': False,
  'posOnPremises': '',
  'recurringAuthInd': '',
  'expirationDateKeyInMatch': False,
  'isFraud': False},
 {'accountNumber': '737265056',
  'customerId': '737265056',
  'creditLimit': 5000.0,
  'availableMoney': 5000.0,
  'transactionDateTime': '2016-10-11T05:05:54',
  'transactionAmount

In [4]:
# Build a DataFrame from the parsed records: one row per dict, one column per key.
data = pd.DataFrame(list_dict)

In [5]:
# Preview the first 30 rows of the freshly loaded frame.
data.head(30)

Unnamed: 0,accountNumber,customerId,creditLimit,availableMoney,transactionDateTime,transactionAmount,merchantName,acqCountry,merchantCountryCode,posEntryMode,...,echoBuffer,currentBalance,merchantCity,merchantState,merchantZip,cardPresent,posOnPremises,recurringAuthInd,expirationDateKeyInMatch,isFraud
0,737265056,737265056,5000.0,5000.0,2016-08-13T14:27:32,98.55,Uber,US,US,2,...,,0.0,,,,False,,,False,False
1,737265056,737265056,5000.0,5000.0,2016-10-11T05:05:54,74.51,AMC #191138,US,US,9,...,,0.0,,,,True,,,False,False
2,737265056,737265056,5000.0,5000.0,2016-11-08T09:18:39,7.47,Play Store,US,US,9,...,,0.0,,,,False,,,False,False
3,737265056,737265056,5000.0,5000.0,2016-12-10T02:14:50,7.47,Play Store,US,US,9,...,,0.0,,,,False,,,False,False
4,830329091,830329091,5000.0,5000.0,2016-03-24T21:04:46,71.18,Tim Hortons #947751,US,US,2,...,,0.0,,,,True,,,False,False
5,830329091,830329091,5000.0,5000.0,2016-04-19T16:24:27,30.76,In-N-Out #422833,US,US,2,...,,0.0,,,,True,,,False,False
6,830329091,830329091,5000.0,5000.0,2016-05-21T14:50:35,57.28,Krispy Kreme #685312,US,US,2,...,,0.0,,,,True,,,False,False
7,830329091,830329091,5000.0,5000.0,2016-06-03T00:31:21,9.37,Shake Shack #968081,US,US,5,...,,0.0,,,,True,,,False,False
8,830329091,830329091,5000.0,4990.63,2016-06-10T01:21:46,523.67,Burger King #486122,,US,2,...,,9.37,,,,True,,,False,False
9,830329091,830329091,5000.0,5000.0,2016-07-11T10:47:16,164.37,Five Guys #510989,US,US,5,...,,0.0,,,,True,,,False,False


# Storing converted data to .csv file

In [6]:
# Persist the unmodified frame as CSV in the working directory.
# With the default arguments the row index is written as an unnamed first
# column, so a plain pd.read_csv of this file will show an extra "Unnamed: 0"
# column unless index_col=0 is passed.
data.to_csv("transaction_data.csv")

# Data Cleaning

In [7]:
# Treat empty or whitespace-only strings as missing values so that the
# null counts computed later include them.
blank_pattern = r'^\s*$'
data = data.replace(to_replace=blank_pattern, value=np.nan, regex=True)

In [8]:
# Preview the first 30 rows again after blank strings were replaced with NaN.
data.head(30)

Unnamed: 0,accountNumber,customerId,creditLimit,availableMoney,transactionDateTime,transactionAmount,merchantName,acqCountry,merchantCountryCode,posEntryMode,...,echoBuffer,currentBalance,merchantCity,merchantState,merchantZip,cardPresent,posOnPremises,recurringAuthInd,expirationDateKeyInMatch,isFraud
0,737265056,737265056,5000.0,5000.0,2016-08-13T14:27:32,98.55,Uber,US,US,2,...,,0.0,,,,False,,,False,False
1,737265056,737265056,5000.0,5000.0,2016-10-11T05:05:54,74.51,AMC #191138,US,US,9,...,,0.0,,,,True,,,False,False
2,737265056,737265056,5000.0,5000.0,2016-11-08T09:18:39,7.47,Play Store,US,US,9,...,,0.0,,,,False,,,False,False
3,737265056,737265056,5000.0,5000.0,2016-12-10T02:14:50,7.47,Play Store,US,US,9,...,,0.0,,,,False,,,False,False
4,830329091,830329091,5000.0,5000.0,2016-03-24T21:04:46,71.18,Tim Hortons #947751,US,US,2,...,,0.0,,,,True,,,False,False
5,830329091,830329091,5000.0,5000.0,2016-04-19T16:24:27,30.76,In-N-Out #422833,US,US,2,...,,0.0,,,,True,,,False,False
6,830329091,830329091,5000.0,5000.0,2016-05-21T14:50:35,57.28,Krispy Kreme #685312,US,US,2,...,,0.0,,,,True,,,False,False
7,830329091,830329091,5000.0,5000.0,2016-06-03T00:31:21,9.37,Shake Shack #968081,US,US,5,...,,0.0,,,,True,,,False,False
8,830329091,830329091,5000.0,4990.63,2016-06-10T01:21:46,523.67,Burger King #486122,,US,2,...,,9.37,,,,True,,,False,False
9,830329091,830329091,5000.0,5000.0,2016-07-11T10:47:16,164.37,Five Guys #510989,US,US,5,...,,0.0,,,,True,,,False,False


In [9]:
# Count missing values per column.
data.isnull().sum()

accountNumber                    0
customerId                       0
creditLimit                      0
availableMoney                   0
transactionDateTime              0
transactionAmount                0
merchantName                     0
acqCountry                    4562
merchantCountryCode            724
posEntryMode                  4054
posConditionCode               409
merchantCategoryCode             0
currentExpDate                   0
accountOpenDate                  0
dateOfLastAddressChange          0
cardCVV                          0
enteredCVV                       0
cardLast4Digits                  0
transactionType                698
echoBuffer                  786363
currentBalance                   0
merchantCity                786363
merchantState               786363
merchantZip                 786363
cardPresent                      0
posOnPremises               786363
recurringAuthInd            786363
expirationDateKeyInMatch         0
isFraud             

In [10]:
# (rows, columns) of the frame before any columns or rows are dropped.
data.shape

(786363, 29)

In [11]:
# Remove the six columns that the null-count summary shows as missing in every
# row; they carry no information.
# Rebinding `data` (instead of `inplace=True`) together with `errors="ignore"`
# keeps this cell re-runnable: executing it a second time is a no-op rather
# than a KeyError for already-removed columns.
EMPTY_COLUMNS = [
    'echoBuffer',
    'merchantCity',
    'merchantState',
    'merchantZip',
    'posOnPremises',
    'recurringAuthInd',
]
data = data.drop(columns=EMPTY_COLUMNS, errors="ignore")

In [12]:
# Preview the first 10 rows after the empty columns were removed.
data.head(10)

Unnamed: 0,accountNumber,customerId,creditLimit,availableMoney,transactionDateTime,transactionAmount,merchantName,acqCountry,merchantCountryCode,posEntryMode,...,accountOpenDate,dateOfLastAddressChange,cardCVV,enteredCVV,cardLast4Digits,transactionType,currentBalance,cardPresent,expirationDateKeyInMatch,isFraud
0,737265056,737265056,5000.0,5000.0,2016-08-13T14:27:32,98.55,Uber,US,US,2,...,2015-03-14,2015-03-14,414,414,1803,PURCHASE,0.0,False,False,False
1,737265056,737265056,5000.0,5000.0,2016-10-11T05:05:54,74.51,AMC #191138,US,US,9,...,2015-03-14,2015-03-14,486,486,767,PURCHASE,0.0,True,False,False
2,737265056,737265056,5000.0,5000.0,2016-11-08T09:18:39,7.47,Play Store,US,US,9,...,2015-03-14,2015-03-14,486,486,767,PURCHASE,0.0,False,False,False
3,737265056,737265056,5000.0,5000.0,2016-12-10T02:14:50,7.47,Play Store,US,US,9,...,2015-03-14,2015-03-14,486,486,767,PURCHASE,0.0,False,False,False
4,830329091,830329091,5000.0,5000.0,2016-03-24T21:04:46,71.18,Tim Hortons #947751,US,US,2,...,2015-08-06,2015-08-06,885,885,3143,PURCHASE,0.0,True,False,False
5,830329091,830329091,5000.0,5000.0,2016-04-19T16:24:27,30.76,In-N-Out #422833,US,US,2,...,2015-08-06,2015-08-06,885,885,3143,PURCHASE,0.0,True,False,False
6,830329091,830329091,5000.0,5000.0,2016-05-21T14:50:35,57.28,Krispy Kreme #685312,US,US,2,...,2015-08-06,2015-08-06,885,885,3143,PURCHASE,0.0,True,False,False
7,830329091,830329091,5000.0,5000.0,2016-06-03T00:31:21,9.37,Shake Shack #968081,US,US,5,...,2015-08-06,2015-08-06,885,885,3143,PURCHASE,0.0,True,False,False
8,830329091,830329091,5000.0,4990.63,2016-06-10T01:21:46,523.67,Burger King #486122,,US,2,...,2015-08-06,2015-08-06,885,885,3143,PURCHASE,9.37,True,False,False
9,830329091,830329091,5000.0,5000.0,2016-07-11T10:47:16,164.37,Five Guys #510989,US,US,5,...,2015-08-06,2015-08-06,885,885,3143,PURCHASE,0.0,True,False,False


In [13]:
# Re-count missing values per column to see what still needs handling.
data.isnull().sum()

accountNumber                  0
customerId                     0
creditLimit                    0
availableMoney                 0
transactionDateTime            0
transactionAmount              0
merchantName                   0
acqCountry                  4562
merchantCountryCode          724
posEntryMode                4054
posConditionCode             409
merchantCategoryCode           0
currentExpDate                 0
accountOpenDate                0
dateOfLastAddressChange        0
cardCVV                        0
enteredCVV                     0
cardLast4Digits                0
transactionType              698
currentBalance                 0
cardPresent                    0
expirationDateKeyInMatch       0
isFraud                        0
dtype: int64

In [14]:
# Impute missing merchant country codes with the literal 'US'.
# NOTE(review): 'US' is presumably the dominant value in this column - confirm
# against the value counts before relying on this imputation.
merchant_country = data['merchantCountryCode']
data['merchantCountryCode'] = merchant_country.fillna('US')

In [15]:
# Where the acquirer country is missing, fall back to the merchant country
# code of the same row.
fallback_country = data['merchantCountryCode']
data['acqCountry'] = data['acqCountry'].fillna(fallback_country)

In [16]:
# Drop every row that is missing any of these three fields. A single dropna
# over the column subset removes the same rows as three consecutive per-column
# calls, in one pass over the frame.
# Rebinding `data` instead of using `inplace=True` avoids mutating a frame that
# earlier cells have already displayed. The row index is deliberately left
# as-is (not reset), so the surviving rows keep their original labels.
REQUIRED_COLUMNS = ["posEntryMode", "posConditionCode", "transactionType"]
data = data.dropna(subset=REQUIRED_COLUMNS)

In [17]:
# Verify that no missing values remain in any column after cleaning.
data.isnull().sum()

accountNumber               0
customerId                  0
creditLimit                 0
availableMoney              0
transactionDateTime         0
transactionAmount           0
merchantName                0
acqCountry                  0
merchantCountryCode         0
posEntryMode                0
posConditionCode            0
merchantCategoryCode        0
currentExpDate              0
accountOpenDate             0
dateOfLastAddressChange     0
cardCVV                     0
enteredCVV                  0
cardLast4Digits             0
transactionType             0
currentBalance              0
cardPresent                 0
expirationDateKeyInMatch    0
isFraud                     0
dtype: int64

In [18]:
# (rows, columns) of the cleaned frame.
data.shape

(781207, 23)

# Storing cleaned data into a new .csv file

In [19]:
# Persist the cleaned frame as CSV in the working directory.
# With the default arguments the row index is written as an unnamed first
# column; because rows were dropped without resetting the index, those labels
# are the original row positions and are no longer contiguous.
data.to_csv("transaction_data_updated.csv")