In [1]:
# Uncomment to install Faker into the kernel's environment if it is missing:
# %pip install faker

# Generating synthetic credit card transactions for fraud detection with AutoML 

In this notebook, we use the synthetic credit card transaction generator from the Sparkov project to generate:
- a customer data file
- transactions for the customers in that file

This is adapted from the GitHub repository [Sparkov_Data_Generation](https://github.com/namebrandon/Sparkov_Data_Generation).

In [2]:
import math
import os
import random
from datetime import date, datetime, timedelta

import faker
import pandas as pd
from faker import Faker

from sparkov_data_generation.datagen_customer import generate_customers
from sparkov_data_generation.datagen_transaction import generate_transactions

## 1 User Profile   

The profiles are composed of:
- Fraudulent / Non-fraudulent
- Male / Female
- Age groups ranging from:
    - <= 25
    - 25 - 50
    - \>= 50
- Geographical location
    - urban
    - rural
- Average number of transactions per day
- Weighted spending habits across:
    - the week
    - time of the year such as holiday or seasons
    - across each quarter of the year
    - time of the day (AM/PM)
- Weighted spending across various expenditure categories  

The profiles can be found under the directory __sparkov_data_generation/profiles__.

In [3]:
# Keep only the demographic profile files (names starting with "adults" or
# "young"); other files in the directory, e.g. main_config.json, are skipped.
# os.listdir returns entries in arbitrary order, so sort them to keep the
# processing order stable between runs and machines.
profiles = sorted(
    name
    for name in os.listdir("sparkov_data_generation/profiles")
    if name.startswith(("adults", "young"))
)
profiles

['adults_2550_female_rural.json',
 'adults_2550_female_urban.json',
 'adults_2550_male_rural.json',
 'adults_2550_male_urban.json',
 'adults_50up_female_rural.json',
 'adults_50up_female_urban.json',
 'adults_50up_male_rural.json',
 'adults_50up_male_urban.json',
 'young_adults_female_rural.json',
 'young_adults_female_urban.json',
 'young_adults_male_rural.json',
 'young_adults_male_urban.json']

## 2 Instantiating variables

In [4]:
# Faker seed number to ensure constant customer list
seed_num = 4444

# initial number of customers
initial_cust_base_size = 500

# simulation window: roughly 5 months of data (2021-06-01 to 2021-10-30)
start_dt = datetime.strptime("2021-06-01", "%Y-%m-%d")
end_dt = datetime.strptime("2021-10-30", "%Y-%m-%d")

# running list of all generated customers, filled in by the generation loop
cust_list = pd.DataFrame()

# output directory and file naming conventions
# (the trailing slash is relied on where the transaction file paths are built)
data_base_dir = "output/"
# os.path.join avoids the doubled separator ("output//customers_list.csv")
customer_output_file = os.path.join(data_base_dir, "customers_list.csv")
# suffix appended to each profile file name to form its transaction file name
txn_output_file = "_txn.csv"

# create output directory if not existing
os.makedirs(data_base_dir, exist_ok=True)

## 3 Simulated transactions

The program uses [Faker](https://faker.readthedocs.io/en/master/) to generate:
- customer details 
- transaction numbers
- credit card number
- merchant transaction locations

In order to generate the same list of customers, we will use a fixed seed number.

In [5]:
# Seed every random source the simulation draws from so reruns are repeatable.
fake = Faker()
# Faker.seed only seeds Faker's own shared generator ...
Faker.seed(seed_num)
# ... so the stdlib RNG, which picks the daily customer growth rate, needs its own seed.
random.seed(seed_num)

On the start date, we create 500 customers and their transactions from the start date to the end date. On each subsequent date before the end date, the customer base grows by between 1% and 3%, and transactions for the newly onboarded customers are generated from that date to the end date. The customer list and the transaction histories are saved in the "output" folder.

In [6]:
# Simulate customer onboarding day by day.
#   * On start_dt the initial customer base is created.
#   * On every later day (end_dt itself is excluded) the base grows by a random
#     1%-3% of its current size.
# Each day's new customers are written to the customer CSV, and their
# transactions from that day until end_dt are written to one CSV per profile.
num_days = (end_dt - start_dt).days

# Collect the daily batches and concatenate once at the end instead of growing
# a DataFrame on every iteration (DataFrame.append was removed in pandas 2.0).
customer_batches = []
total_customers = 0

for dt in (start_dt + timedelta(days=n) for n in range(num_days)):
    print("Date", dt)
    is_first_day = dt == start_dt

    if is_first_day:
        num_new_customers = initial_cust_base_size
    else:
        increment = random.uniform(0.01, 0.03)
        num_new_customers = math.ceil(total_customers * increment)

    print("Number of customers newly onboard: ", num_new_customers)

    new_cust_list = generate_customers(
        fake, num_new_customers, "sparkov_data_generation/profiles/main_config.json"
    )
    customer_batches.append(new_cust_list)
    total_customers += len(new_cust_list)
    print("Total customer: ", total_customers)

    # Start fresh files (with a header) on the first day and append afterwards,
    # so re-running this cell does not duplicate rows left by an earlier run.
    write_mode = "w" if is_first_day else "a"

    new_cust_list.to_csv(
        customer_output_file,
        mode=write_mode,
        header=is_first_day,
        index=False,
    )

    for p in profiles:
        # generate txns for the new customers from current date to the end date
        new_cust_txn_df = generate_transactions(new_cust_list, p, dt, end_dt)
        print(f"Number of transaction ({p}): ", len(new_cust_txn_df))
        txn_path = f"{data_base_dir}{p}{txn_output_file}"
        new_cust_txn_df.to_csv(
            txn_path,
            mode=write_mode,
            header=is_first_day,
            index=False,
        )

# Full customer list across all simulated days (empty if the date range is empty).
cust_list = (
    pd.concat(customer_batches, ignore_index=True)
    if customer_batches
    else pd.DataFrame()
)

Date 2021-06-01 00:00:00
Number of customers newly onboard:  500
Total customer:  500
Number of transaction (adults_2550_female_rural.json):  24166
Number of transaction (adults_2550_female_urban.json):  27872
Number of transaction (adults_2550_male_rural.json):  18578
Number of transaction (adults_2550_male_urban.json):  24987
Number of transaction (adults_50up_female_rural.json):  20914
Number of transaction (adults_50up_female_urban.json):  12861
Number of transaction (adults_50up_male_rural.json):  22518
Number of transaction (adults_50up_male_urban.json):  18419
Number of transaction (young_adults_female_rural.json):  3072
Number of transaction (young_adults_female_urban.json):  4434
Number of transaction (young_adults_male_rural.json):  4543
Number of transaction (young_adults_male_urban.json):  12947
Date 2021-06-02 00:00:00
Number of customers newly onboard:  7
Total customer:  507
Number of transaction (adults_2550_female_rural.json):  0
Number of transaction (adults_2550_fema

In [7]:
# Report the size of the final customer base.
print("Number of customers:", len(cust_list))

Number of customers: 9353
