In [1]:
from platform import python_version

# Record which interpreter version this notebook was executed with.
interpreter_version = python_version()
print(interpreter_version)

3.11.5


# Import necessary libraries:

In [1]:
import json
import itertools
import numpy as np
import pandas as pd

In [2]:
# Location of the raw transactions file (one JSON object per line).
# NOTE(review): this is an absolute, machine-specific path, and because it is a
# raw string the doubled backslashes are kept literally. Point it at your local
# copy of the file - TODO move to a configurable data directory.
directory = r'C:\\Gopi\\Study material\\GL\\Revision for final exam\\Capstone project\\Credit card default\\Review 2\\transactions.txt'

# Parse the file line by line instead of materialising every raw line first,
# so only the parsed records are held in memory. Blank lines (for example a
# trailing newline at the end of the file) are skipped because json.loads
# raises on empty input. The encoding is stated explicitly so the result does
# not depend on the platform default.
with open(directory, encoding='utf-8') as txt_file:
    trxn_data = [json.loads(line) for line in txt_file if line.strip()]

# One row per transaction; empty-string fields are treated as missing values.
# np.nan is used because the np.NaN alias was removed in NumPy 2.0.
df = pd.DataFrame(trxn_data).replace('', np.nan)
df.head()

Unnamed: 0,accountNumber,customerId,creditLimit,availableMoney,transactionDateTime,transactionAmount,merchantName,acqCountry,merchantCountryCode,posEntryMode,...,echoBuffer,currentBalance,merchantCity,merchantState,merchantZip,cardPresent,posOnPremises,recurringAuthInd,expirationDateKeyInMatch,isFraud
0,737265056,737265056,5000.0,5000.0,2016-08-13T14:27:32,98.55,Uber,US,US,2,...,,0.0,,,,False,,,False,False
1,737265056,737265056,5000.0,5000.0,2016-10-11T05:05:54,74.51,AMC #191138,US,US,9,...,,0.0,,,,True,,,False,False
2,737265056,737265056,5000.0,5000.0,2016-11-08T09:18:39,7.47,Play Store,US,US,9,...,,0.0,,,,False,,,False,False
3,737265056,737265056,5000.0,5000.0,2016-12-10T02:14:50,7.47,Play Store,US,US,9,...,,0.0,,,,False,,,False,False
4,830329091,830329091,5000.0,5000.0,2016-03-24T21:04:46,71.18,Tim Hortons #947751,US,US,2,...,,0.0,,,,True,,,False,False


In [3]:
# The data has 786363 rows. A smaller dataset is preferred for the capstone
# project, so sampling methods are used to reduce it to a size that is more
# appropriate for the available hardware.
df.shape

(786363, 29)

In [4]:
# Absolute number of transactions per class of the target variable.
# The target is highly imbalanced, so plain random sampling could easily
# produce an even more imbalanced dataset.
df['isFraud'].value_counts()

isFraud
False    773946
True      12417
Name: count, dtype: int64

In [5]:
# Class shares of the target variable expressed in percent.
# Because the target is highly imbalanced, plain random sampling could easily
# produce an even more imbalanced dataset.
df['isFraud'].value_counts(normalize=True).mul(100)

isFraud
False    98.420958
True      1.579042
Name: proportion, dtype: float64

In [6]:
# Show every row that is an exact repeat of an earlier row (empty result means no duplicates).
df.loc[df.duplicated()]

Unnamed: 0,accountNumber,customerId,creditLimit,availableMoney,transactionDateTime,transactionAmount,merchantName,acqCountry,merchantCountryCode,posEntryMode,...,echoBuffer,currentBalance,merchantCity,merchantState,merchantZip,cardPresent,posOnPremises,recurringAuthInd,expirationDateKeyInMatch,isFraud


In [7]:
# Target size of the down-sampled dataset, and the seed that makes every
# random draw below repeatable across kernel restarts.
SAMPLE_SIZE = 150000
RANDOM_SEED = 42

# Split the data into the two strata of the target variable.
fraud_true = df[df['isFraud'] == True]
fraud_false = df[df['isFraud'] == False]

# Fraud share measured on the loaded data instead of a hand-copied constant,
# so the allocation stays correct if the input data changes.
fraud_rate = len(fraud_true) / len(df)

# Calculate the number of samples required for each class.
# int() truncates, so the fraud count is rounded down and the non-fraud
# stratum receives the remainder; the total is always SAMPLE_SIZE.
n_samples_true = int(SAMPLE_SIZE * fraud_rate)    # Number of True samples
n_samples_false = SAMPLE_SIZE - n_samples_true    # Number of False samples

# Sample from each stratum without replacement.
sample_true = fraud_true.sample(n=n_samples_true, replace=False, random_state=RANDOM_SEED)
sample_false = fraud_false.sample(n=n_samples_false, replace=False, random_state=RANDOM_SEED)

# Concatenate the samples, then shuffle so the classes are interleaved,
# and rebuild a clean 0..n-1 index.
stratified_sample = (
    pd.concat([sample_true, sample_false])
    .sample(frac=1, random_state=RANDOM_SEED)
    .reset_index(drop=True)
)

In [8]:
# Display the shuffled stratified sample (the notebook renders a truncated view).
# NOTE(review): consider .head() here to keep the saved output small.
stratified_sample 

Unnamed: 0,accountNumber,customerId,creditLimit,availableMoney,transactionDateTime,transactionAmount,merchantName,acqCountry,merchantCountryCode,posEntryMode,...,echoBuffer,currentBalance,merchantCity,merchantState,merchantZip,cardPresent,posOnPremises,recurringAuthInd,expirationDateKeyInMatch,isFraud
0,302785707,302785707,15000.0,3165.82,2016-11-02T15:36:59,16.28,Dinosaur Sandwitch Bar #303701,US,US,05,...,,11834.18,,,,True,,,False,False
1,459769579,459769579,15000.0,13362.73,2016-09-24T04:56:14,5.32,Play Store,US,US,05,...,,1637.27,,,,False,,,False,False
2,668771382,668771382,5000.0,4786.28,2016-12-19T11:11:56,95.44,amazon.com,US,US,09,...,,213.72,,,,False,,,False,True
3,690307166,690307166,15000.0,8230.64,2016-11-29T04:22:19,154.17,Delta Airlines,US,US,05,...,,6769.36,,,,False,,,False,False
4,447442368,447442368,1000.0,644.36,2016-06-10T00:36:53,25.36,Sunoco Gas #931794,US,US,02,...,,355.64,,,,True,,,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
149995,536582330,536582330,2500.0,2288.18,2016-05-17T00:41:55,196.84,Rodeway Inn #105130,US,US,05,...,,211.82,,,,False,,,False,False
149996,810471359,810471359,5000.0,3221.61,2016-10-20T17:10:45,5.19,walmart.com,US,US,09,...,,1778.39,,,,False,,,False,False
149997,865635967,865635967,500.0,200.20,2016-08-16T12:25:42,365.13,1st Restaurant,US,US,05,...,,299.80,,,,True,,,False,False
149998,894938833,894938833,15000.0,4641.56,2016-01-22T17:25:44,198.40,WSC #998991,US,US,05,...,,10358.44,,,,False,,,False,False


In [9]:
# Absolute number of rows per class of the target variable in the stratified sample.
stratified_sample['isFraud'].value_counts()

isFraud
False    147632
True       2368
Name: count, dtype: int64

In [10]:
# Class shares of the target variable in the stratified sample, in percent.
stratified_sample['isFraud'].value_counts(normalize=True).mul(100)

isFraud
False    98.421333
True      1.578667
Name: proportion, dtype: float64

In [11]:
# Rows of the sample that exactly repeat an earlier row; an empty result
# means there are no duplicate values in the sample.
stratified_sample.loc[stratified_sample.duplicated()]

Unnamed: 0,accountNumber,customerId,creditLimit,availableMoney,transactionDateTime,transactionAmount,merchantName,acqCountry,merchantCountryCode,posEntryMode,...,echoBuffer,currentBalance,merchantCity,merchantState,merchantZip,cardPresent,posOnPremises,recurringAuthInd,expirationDateKeyInMatch,isFraud


In [12]:
# Write the stratified sample to disk; the row index is left out of the file.
output_csv = 'stratified_transactions.csv'
stratified_sample.to_csv(output_csv, index=False)