# In [None]:
import csv
import random
from bisect import bisect_right
from datetime import datetime
from pathlib import Path

import pandas as pd
from faker import Faker

# Shared Faker instance; the code below uses it for names, account IDs,
# dates, SSNs, UUIDs and contact details.
fake = Faker()

# Load valid stock market dates from the specified file
valid_dates_df = pd.read_csv("valid_stock_market_dates.csv")  # Change path if needed

# Dates earlier than this cutoff are discarded.
MIN_VALID_DATE = datetime(2016, 1, 1)

# Extract and parse dates from every column, keeping unique values on or
# after the cutoff. Strict ISO "YYYY-MM-DD" is tried first; anything else is
# handed to pandas, which yields NaT for values it cannot interpret.
raw_dates = set()
for col in valid_dates_df.columns:
    for val in valid_dates_df[col].dropna().astype(str):
        try:
            parsed = datetime.strptime(val.strip(), "%Y-%m-%d")
        except ValueError:
            parsed = None

        if parsed is not None:
            if parsed >= MIN_VALID_DATE:
                raw_dates.add(parsed)
            continue

        try:
            stamp = pd.to_datetime(val, errors="coerce")
            if pd.notnull(stamp) and stamp >= pd.Timestamp(MIN_VALID_DATE):
                raw_dates.add(stamp.to_pydatetime())
        except (ValueError, TypeError, OverflowError):
            # Best effort: skip values pandas cannot parse or compare (for
            # example a timezone-aware value against the naive cutoff)
            # without also swallowing KeyboardInterrupt/SystemExit.
            continue

# Sort the valid dates so the entry after a trade date is the next business day
valid_dates = sorted(raw_dates)

# Map each date to its index for quick next-day lookup
date_index_map = {date: idx for idx, date in enumerate(valid_dates)}

# Size of the generated data set.
NUM_ACCOUNTS = 150
NUM_TRANSACTIONS = 1100

# (company name, ticker symbol) pairs that transactions are drawn from.
assets = [
    ("Apple Inc.", "AAPL"),
    ("Tesla Inc.", "TSLA"),
    ("Amazon.com, Inc.", "AMZN"),
    ("NVIDIA Corporation", "NVDA"),
    ("Alphabet Inc.", "GOOGL"),
    ("Microsoft Corp", "MSFT"),
    ("Meta Platforms", "META"),
    ("Berkshire Hathaway", "BRK.A"),
    ("Netflix", "NFLX"),
    ("Johnson & Johnson", "JNJ"),
]

# Pools that account and transaction attributes are sampled from.
account_types = [
    "Brokerage",
    "Roth IRA",
    "401(k)",
    "Joint",
    "Custodial",
    "Trust",
]
currencies = ["USD", "EUR", "CAD"]
# The "pending" note is what marks a transaction as Pending further down.
notes_pool = [
    "",
    "pending",
    "dividends reinvested",
    "auto-invest",
    "short-term",
    "watchlist",
]

def random_capitalize(s):
    """Return *s* re-cased at random.

    A single random draw picks the casing: below 0.3 gives lower case,
    below 0.6 gives title case, anything else gives upper case.
    """
    roll = random.random()
    if roll < 0.3:
        return s.lower()
    if roll < 0.6:
        return s.title()
    return s.upper()

def typo_ticker(ticker):
    """Occasionally corrupt *ticker* to simulate a data-entry typo.

    The first draw decides whether to corrupt at all (below 0.1). Only then
    does a second draw choose between dropping the last character (below
    0.5) and appending a trailing space.
    """
    if random.random() >= 0.1:
        return ticker
    if random.random() < 0.5:
        return ticker[:-1]
    return ticker + " "

def format_random_date(date_obj):
    """Render *date_obj* in one of two randomly chosen text layouts.

    Returns an empty string for ``None`` (no random draw is made). Otherwise
    the result is either ``D-Mon-YYYY`` (e.g. ``3-Jun-2024``) or ``M/D/YYYY``
    (e.g. ``6/1/2024``), both without zero padding.
    """
    if date_obj is None:
        return ""

    layout = random.choice(["dash", "slash"])
    if layout != "dash":
        # slash layout: month first, no zero padding
        return f"{date_obj.month}/{date_obj.day}/{date_obj.year}"

    short_month = date_obj.strftime("%b")  # abbreviated month name, e.g. Jun
    return f"{date_obj.day}-{short_month}-{date_obj.year}"

# Build the pool of synthetic accounts. Each entry is a tuple of
# (name, account_id, account_type, join_date, ssn).
accounts = []
for _ in range(NUM_ACCOUNTS):
    accounts.append((
        fake.name(),
        fake.bothify("ACC-####-##"),
        random.choice(account_types),
        fake.date_between(start_date="-5y", end_date="today"),
        fake.ssn(),
    ))

def _pick_trade_and_settlement(join_date):
    """Pick a trade date after *join_date* and its settlement date.

    Relies on the module-level ``valid_dates`` list being sorted ascending.
    Returns ``(trade_date, settlement_date)``; both are ``None`` when no
    valid dates were loaded.
    """
    if not valid_dates:
        return None, None

    # join_date is a plain date; promote it so it compares with datetimes.
    join_date_dt = datetime.combine(join_date, datetime.min.time())

    # Because valid_dates is sorted, every entry from `first` onward is
    # strictly later than the join date; no per-call list needs building.
    first = bisect_right(valid_dates, join_date_dt)
    if first == len(valid_dates):
        # Fallback: no valid date after joining, use join_date + 1 day
        trade = join_date_dt + pd.Timedelta(days=1)
        return trade, trade + pd.Timedelta(days=1)

    trade_idx = random.randrange(first, len(valid_dates))
    trade = valid_dates[trade_idx]
    if trade_idx + 1 < len(valid_dates):
        # Settle on the next valid market date.
        return trade, valid_dates[trade_idx + 1]
    # The last known market date has no successor; settle the same day.
    return trade, trade


def generate_transaction(account):
    """Build one deliberately messy transaction row for *account*.

    Args:
        account: tuple of (name, account_id, account_type, join_date, ssn).

    Returns:
        A list of 13 strings in this order: name, account ID, transaction
        ID, account type, asset, ticker, quantity, trade date, settlement
        date, price per share, currency, status, notes.
    """
    name, account_id, account_type, join_date, ssn = account
    name = " ".join(random_capitalize(n) for n in name.split())
    asset, ticker = random.choice(assets)

    asset = random_capitalize(asset).strip()
    if random.random() < 0.1:
        asset = f" {asset} "

    # Do not strip after typo_ticker: one of its typos is a trailing space,
    # and stripping here would silently undo it.
    ticker = random_capitalize(typo_ticker(ticker))

    # Quantity is usually a plain integer, sometimes a word, a placeholder,
    # or a comma-grouped number.
    q_chance = random.random()
    if q_chance < 0.05:
        quantity = "ten"
    elif q_chance < 0.1:
        quantity = "N/A"
    elif q_chance < 0.2:
        quantity = f"{random.randint(1, 1000):,}"
    else:
        quantity = str(random.randint(1, 1000))

    # One price rendered in several inconsistent formats; one is picked.
    price = round(random.uniform(10, 5000), 2)
    price_formats = [
        f"${price:,.2f}",
        f"{price:.2f}",
        "N/A",
        f"${int(price)}.{random.randint(0,9)}",
        f"${price:,.2f}".replace(",", ""),
        f"{price:,.2f} USD",
        f"{price:.2f}usd",
    ]
    price_str = random.choice(price_formats)

    currency = random.choice(currencies)
    note = random.choice(notes_pool)
    status = "Pending" if note == "pending" else "Completed"
    transaction_id = fake.uuid4()[:8]

    # Ensure the trade date falls after the account's join date.
    trade_date_obj, settlement_date_obj = _pick_trade_and_settlement(join_date)
    trade_date_str = format_random_date(trade_date_obj)
    settlement_date_str = format_random_date(settlement_date_obj)

    # Pending trades have not settled yet.
    if status == "Pending":
        settlement_date_str = ""

    # Occasionally blank out identifying fields to simulate missing data.
    if random.random() < 0.05:
        name = ""
    if random.random() < 0.05:
        account_id = ""

    return [
        name,
        account_id,
        transaction_id,
        account_type,
        asset,
        ticker,
        quantity,
        trade_date_str,
        settlement_date_str,
        price_str,
        currency,
        status,
        note,
    ]

def create_profile(account):
    """Build one customer-profile row for *account*.

    Args:
        account: tuple of (name, account_id, account_type, join_date, ssn).

    Returns:
        A list of: account ID, first name, last name, SSN, email, phone,
        date of birth, mailing address, account type, risk tolerance,
        income bracket, employment status, account opened date.
    """
    name, account_id, account_type, join_date, ssn = account

    # First whitespace-separated token is the first name; the rest is the last name.
    name_parts = name.split()
    first_name = name_parts[0]
    last_name = " ".join(name_parts[1:])

    email = fake.email()
    phone = fake.phone_number()
    # Flatten the multi-line address onto one line.
    address = ", ".join(fake.address().split("\n"))
    dob = format_random_date(fake.date_of_birth(minimum_age=21, maximum_age=85))

    risk_tolerance = random.choice(["Low", "Medium", "High"])
    income_bracket = random.choice(["<50k", "50k-100k", "100k-250k", "250k+"])
    employment_status = random.choice(
        ["Employed", "Unemployed", "Retired", "Self-employed", "Student"]
    )
    account_opened = format_random_date(join_date)

    # Three independent draws, taken in this order, decide which fields get dirtied.
    blank_email = random.random() < 0.05
    mask_phone = random.random() < 0.05
    upper_income = random.random() < 0.03

    if blank_email:
        email = ""
    if mask_phone:
        phone = "N/A"
    if upper_income:
        income_bracket = income_bracket.upper()

    return [
        account_id,
        first_name,
        last_name,
        ssn,
        email,
        phone,
        dob,
        address,
        account_type,
        risk_tolerance,
        income_bracket,
        employment_status,
        account_opened,
    ]

# Both CSV files are written to the current user's Downloads folder.
downloads_path = str(Path.home() / "Downloads")
# A fresh or headless environment may have no Downloads folder; create it so
# the open() calls below do not fail with FileNotFoundError.
Path(downloads_path).mkdir(parents=True, exist_ok=True)
transactions_file = Path(downloads_path) / "portfolio_unclean_with_pending_dates.csv"
profiles_file = Path(downloads_path) / "customer_profiles_unclean.csv"

transaction_header = [
    "Name", "Account ID", "Transaction ID", "Account Type",
    "Asset", "Ticker", "Quantity", "Trade Date", "Settlement Date",
    "Price Per Share", "Currency", "Status", "Notes",
]
profile_header = [
    "Account ID", "First Name", "Last Name", "SSN", "Email", "Phone",
    "Date of Birth", "Mailing Address", "Account Type",
    "Risk Tolerance", "Income Bracket", "Employment Status", "Account Opened Date",
]

# Junk trailer rows appended after the real data. They are sized from the
# header so they stay aligned with its columns.
total_row = [""] * len(transaction_header)
total_row[0] = "TOTAL"
total_row[transaction_header.index("Price Per Share")] = "$3,245,000"
blank_row = [""] * len(transaction_header)
star_row = ["*"] * len(transaction_header)

# Write transaction data
with open(transactions_file, "w", newline="", encoding="utf-8") as f:
    writer = csv.writer(f)
    writer.writerow(transaction_header)
    for _ in range(NUM_TRANSACTIONS):
        account = random.choice(accounts)
        writer.writerow(generate_transaction(account))
    for _ in range(10):  # duplicate rows: each generated row is written twice
        account = random.choice(accounts)
        txn = generate_transaction(account)
        writer.writerow(txn)
        writer.writerow(txn)
    writer.writerow(total_row)
    writer.writerow(blank_row)
    writer.writerow(star_row)

# Write profile data: one row per generated account
with open(profiles_file, "w", newline="", encoding="utf-8") as f:
    writer = csv.writer(f)
    writer.writerow(profile_header)
    for acc in accounts:
        writer.writerow(create_profile(acc))

print(f"Transactions CSV saved to: {transactions_file}")
print(f"Profiles CSV saved to: {profiles_file}")
