In [1]:
pip install pandas numpy faker


Collecting faker
  Downloading Faker-30.3.0-py3-none-any.whl (1.8 MB)
     ---------------------------------------- 1.8/1.8 MB 14.6 MB/s eta 0:00:00
Installing collected packages: faker
Successfully installed faker-30.3.0


In [1]:
import pandas as pd
import numpy as np
from faker import Faker
import random 

In [2]:
## Initialize Faker and seed every random source used below
# One shared seed keeps Faker's generator and the stdlib `random` module in
# step; change it here to produce a different (but still repeatable) dataset.
SEED = 42

fake = Faker()
Faker.seed(SEED)   # seeds the generator behind `fake`
random.seed(SEED)  # seeds random.choice / random.uniform

In [3]:
# How many synthetic transaction rows to generate
num_records = 20_000

In [4]:
# Spending categories a transaction can be assigned to
categories = [
    'Groceries',
    'Rent',
    'Utilities',
    'Entertainment',
    'Dining',
    'Transport',
    'Health',
    'Shopping',
]

# Ways a transaction can be paid
payment_methods = [
    'Credit Card',
    'Debit Card',
    'Cash',
    'Bank Transfer',
]

In [5]:
# (low, high) bounds for categories that have their own amount range.
# Categories not listed here use the generic 5.00-1000.00 draw.
_AMOUNT_RANGES = {
    'Rent': (500, 2000),
    'Groceries': (20, 200),
    'Utilities': (50, 500),
}

# Column order of the returned frame. Passing it explicitly means an empty
# result still carries the full schema.
_TRANSACTION_COLUMNS = [
    'Transaction ID',
    'Date',
    'Weekday',
    'Month',
    'Category',
    'Amount',
    'Transaction Type',
    'Payment Method',
    'Merchant',
    'User ID',
]


def generate_transactions(num_records):
    """Build a DataFrame of synthetic expense/income transactions.

    Reads the module-level ``fake``, ``categories`` and ``payment_methods``
    and draws from the global ``random`` state, so results are repeatable
    only if those were seeded beforehand.

    Parameters
    ----------
    num_records : int
        Number of rows to generate. Zero or a negative value yields an empty
        frame that still has every column, so later cells that index
        ``['Date']`` keep working.

    Returns
    -------
    pandas.DataFrame
        One row per transaction with the columns listed in
        ``_TRANSACTION_COLUMNS``. ``Weekday`` is ``date.weekday()``
        (0 = Monday, 6 = Sunday) and ``Month`` is ``date.month``.

    Notes
    -----
    NOTE(review): ``Transaction Type`` is picked independently of
    ``Category``, so e.g. a 'Rent' row can be labelled 'Income'.
    NOTE(review): a new ``User ID`` is drawn for every row, so no two
    transactions share a user -- confirm this is intended.
    """
    transactions = []

    for _ in range(num_records):
        date = fake.date_this_year()
        category = random.choice(categories)

        # Drawn on every iteration, even when a category-specific range
        # replaces it below, so the sequence of `random` calls (and thus the
        # seeded output) is the same as it has always been.
        base_amount = round(random.uniform(5.0, 1000.0), 2)

        amount_range = _AMOUNT_RANGES.get(category)
        if amount_range is None:
            amount = base_amount
        else:
            low, high = amount_range
            amount = round(random.uniform(low, high), 2)

        # Values are evaluated top to bottom; keep this order so the Faker
        # and `random` draws happen in a stable sequence.
        transactions.append({
            'Transaction ID': fake.uuid4(),
            'Date': date,
            'Weekday': date.weekday(),  # 0 = Monday, 6 = Sunday
            'Month': date.month,
            'Category': category,
            'Amount': amount,
            'Transaction Type': random.choice(['Expense', 'Income']),
            'Payment Method': random.choice(payment_methods),
            'Merchant': fake.company(),
            'User ID': fake.uuid4(),
        })

    return pd.DataFrame(transactions, columns=_TRANSACTION_COLUMNS)

In [6]:
# Build the synthetic transaction table (one row per generated record)
synthetic_data = generate_transactions(num_records=num_records)

In [7]:
# Parse the Date column into pandas datetimes so it carries a real
# datetime dtype rather than plain Python objects
synthetic_data['Date'] = pd.to_datetime(synthetic_data.loc[:, 'Date'])

In [8]:
# Show the first five rows as a quick check that the dates parsed correctly
print(synthetic_data.head(n=5))

                         Transaction ID       Date  Weekday  Month   Category  \
0  46685257-bdd6-40fb-8667-1ad11c80317f 2024-09-05        3      9       Rent   
1  0822e8f3-6c03-4199-972a-846916419f82 2024-07-30        1      7       Rent   
2  8b8148f6-b38a-488c-a65e-d389b74d0fb1 2024-03-18        0      3  Groceries   
3  571aa876-6c30-4511-b2b9-437a28df6ec4 2024-11-08        4     11     Health   
4  9a8dca03-580d-4b71-98f5-64135be6128e 2024-02-07        2      2  Utilities   

    Amount Transaction Type Payment Method           Merchant  \
0   912.54          Expense     Debit Card         Walker Ltd   
1  1838.27          Expense  Bank Transfer     Davis and Sons   
2    59.35          Expense     Debit Card  Robinson-Lawrence   
3   224.34           Income    Credit Card      Moore-Bernard   
4   203.11          Expense     Debit Card        Roman-Blair   

                                User ID  
0  e465e150-bd9c-46b3-ad3c-2d6d1a3d1fa7  
1  8fadc1a6-06cb-4fb3-9a1d-e644815ef6d

In [9]:
# Store Date as a 'YYYY-MM-DD' string so the CSV shows plain dates.
# The column is re-parsed first so this cell is safe to run more than once:
# after one run the column holds strings, and a bare `.dt` access on it
# would raise AttributeError.
synthetic_data['Date'] = pd.to_datetime(synthetic_data['Date']).dt.strftime('%Y-%m-%d')


In [10]:
# Save to CSV. The path lives in one variable so the confirmation message
# always names the file that was actually written (the message used to
# report a different filename than the one passed to to_csv).
output_path = 'synthetic_expense_data1.csv'
synthetic_data.to_csv(output_path, index=False)
print(f'Synthetic dataset generated and saved to {output_path}')

Synthetic dataset generated and saved to synthetic_expense_data.csv
