## 1. Setup and Configuration

In [59]:
# Import required libraries
import sys
import os
import logging
from datetime import datetime, timedelta
import pandas as pd
import numpy as np

# Add utils to path
sys.path.append(os.path.abspath('../'))

from utils.common_utils import (
    load_config, setup_logging, get_spark_session,
    print_section_header, Timer
)
from utils.data_loader import save_data_to_destination

In [60]:
# Load configuration
# `config` is read as a module-level global by every generate_* function below
# (random seed, reference date, data-source settings).
config = load_config('../config/config.yaml')
# Configure logging from the loaded settings before emitting the first log record.
setup_logging(config)

# Banner plus a log record marking the start of the run.
print_section_header("Data Generation for Next Best Product Recommendation")
logging.info("Starting data generation process...")


              Data Generation for Next Best Product Recommendation              



## 2. Generate Synthetic Data
We'll create realistic synthetic data for demonstration purposes.

In [61]:
def generate_address_data(n_addresses=1000):
    """Generate synthetic US address records.

    Every row gets a random street line, one of ten fixed cities together with
    the state that city belongs to, a random five-digit postal code, and the
    country ``'USA'``.

    The global NumPy RNG is re-seeded from ``config['random_seed']`` on each
    call, so repeated calls with the same seed return identical frames.

    Args:
        n_addresses: Number of address rows to generate.

    Returns:
        pandas.DataFrame with columns ADDRESSID (1..n_addresses), STREETLINE1,
        CITY, STATE, POSTALCODE (as a string) and COUNTRY.
    """
    # City -> state lookup; keeping the pair in one mapping means the two
    # columns can never drift apart.
    city_to_state = {
        'New York': 'NY',
        'Los Angeles': 'CA',
        'Chicago': 'IL',
        'Houston': 'TX',
        'Phoenix': 'AZ',
        'Philadelphia': 'PA',
        'San Antonio': 'TX',
        'San Diego': 'CA',
        'Dallas': 'TX',
        'San Jose': 'CA',
    }
    cities = list(city_to_state)
    street_names = ['Main', 'Oak', 'Park', 'Elm', 'Washington']

    np.random.seed(config['random_seed'])

    street_lines = [
        f"{np.random.randint(1, 9999)} {np.random.choice(street_names)} St"
        for _ in range(n_addresses)
    ]
    row_cities = np.random.choice(cities, n_addresses)
    # STATE is looked up from the CITY drawn for the same row. It used to come
    # from a second, independent city draw, which paired most cities with the
    # wrong state.
    row_states = [city_to_state[city] for city in row_cities]
    # randint's upper bound is exclusive, so codes range over 10000..99998.
    postal_codes = [f"{np.random.randint(10000, 99999)}" for _ in range(n_addresses)]

    addresses = pd.DataFrame({
        'ADDRESSID': range(1, n_addresses + 1),
        'STREETLINE1': street_lines,
        'CITY': row_cities,
        'STATE': row_states,
        'POSTALCODE': postal_codes,
        'COUNTRY': ['USA'] * n_addresses
    })

    logging.info(f"Generated {len(addresses)} address records")
    return addresses

In [62]:
def generate_banking_product_data():
    """Build the fixed catalogue of seven banking products.

    The catalogue is hard-coded (no randomness). Each product has an id, a
    name, a description, a minimum/maximum amount and a minimum/maximum term.
    Products with a maximum term of 0 (Credit Card, Savings Account) are
    treated as having no maturity by the account generator.

    Returns:
        pandas.DataFrame with columns PRODUCTID, PRODUCTNAME,
        PRODUCTDESCRIPTION, PRODUCTMINIMUMAMOUNT, PRODUCTMAXIMUMAMOUNT,
        PRODUCTMINIMUMTERM and PRODUCTMAXIMUMTERM.
    """
    column_names = [
        'PRODUCTID',
        'PRODUCTNAME',
        'PRODUCTDESCRIPTION',
        'PRODUCTMINIMUMAMOUNT',
        'PRODUCTMAXIMUMAMOUNT',
        'PRODUCTMINIMUMTERM',
        'PRODUCTMAXIMUMTERM',
    ]
    # One tuple per product, laid out in the column order above.
    catalogue = [
        (1, 'Personal Loan', 'Personal loan for various expenses', 1000, 50000, 6, 60),
        (2, 'Home Loan', 'Mortgage loan for home purchase', 50000, 500000, 120, 360),
        (3, 'Credit Card', 'Credit card with rewards', 500, 50000, 0, 0),
        (4, 'Auto Loan', 'Vehicle financing loan', 5000, 100000, 12, 84),
        (5, 'Business Loan', 'Small business lending', 10000, 500000, 12, 120),
        (6, 'Student Loan', 'Education financing', 2000, 100000, 24, 120),
        (7, 'Savings Account', 'High-yield savings account', 100, 1000000, 0, 0),
    ]
    products = pd.DataFrame(catalogue, columns=column_names)

    logging.info(f"Generated {len(products)} product records")
    return products

In [63]:
def generate_channel_data():
    """Build the fixed list of five sales channels.

    Channel ids run 1..5 in the order the names are listed, and each channel's
    CHANNELTYPEID carries the same value as its CHANNELID.

    Returns:
        pandas.DataFrame with columns CHANNELID, CHANNELNAME and CHANNELTYPEID.
    """
    channel_names = ['Online', 'Branch', 'Mobile App', 'Phone', 'Agent']
    channel_ids = list(range(1, len(channel_names) + 1))

    channels = pd.DataFrame({
        'CHANNELID': channel_ids,
        'CHANNELNAME': channel_names,
        'CHANNELTYPEID': list(channel_ids),
    })

    logging.info(f"Generated {len(channels)} channel records")
    return channels

In [64]:
def generate_party_data(n_parties=5000, addresses_df=None):
    """Generate synthetic person records ("parties").

    The global NumPy RNG is re-seeded from ``config['random_seed']`` first, so
    names, address assignments and phone numbers repeat across runs.
    DATEOFBIRTH is computed relative to ``datetime.now()`` (18 to just under
    70 years of 365 days back), so its absolute values shift with the wall
    clock.

    Args:
        n_parties: Number of party rows to generate.
        addresses_df: Frame providing an ADDRESSID column to sample primary
            addresses from (with replacement). Required despite the ``None``
            default.

    Returns:
        pandas.DataFrame with columns PARTYID (1..n_parties), PARTYTYPE
        (always ``'PERSON'``), LEGALNAME, DATEOFBIRTH, PRIMARYADDRESSID,
        PRIMARYEMAIL and PRIMARYPHONE (10-digit string).
    """
    np.random.seed(config['random_seed'])

    given_names = ['John', 'Jane', 'Michael', 'Emily', 'David', 'Sarah', 'Robert', 'Lisa']
    family_names = ['Smith', 'Johnson', 'Williams', 'Brown', 'Jones', 'Garcia', 'Miller', 'Davis']
    row_numbers = range(1, n_parties + 1)

    # The random columns are drawn in the order names -> birth dates ->
    # addresses -> phones; that order fixes the values produced for a seed.
    legal_names = []
    for _ in range(n_parties):
        given = np.random.choice(given_names)
        family = np.random.choice(family_names)
        legal_names.append(f"{given} {family}")

    youngest_days, oldest_days = 18 * 365, 70 * 365
    birth_dates = []
    for _ in range(n_parties):
        birth_dates.append(
            datetime.now() - timedelta(days=np.random.randint(youngest_days, oldest_days))
        )

    address_ids = np.random.choice(addresses_df['ADDRESSID'].values, n_parties)

    phone_numbers = []
    for _ in range(n_parties):
        phone_numbers.append(f"{np.random.randint(2000000000, 9999999999, dtype=np.int64)}")

    parties = pd.DataFrame({
        'PARTYID': row_numbers,
        'PARTYTYPE': ['PERSON'] * n_parties,
        'LEGALNAME': legal_names,
        'DATEOFBIRTH': birth_dates,
        'PRIMARYADDRESSID': address_ids,
        'PRIMARYEMAIL': [f"customer{i}@email.com" for i in row_numbers],
        'PRIMARYPHONE': phone_numbers,
    })

    logging.info(f"Generated {len(parties)} party records")
    return parties


In [65]:
def generate_customer_data(n_customers=5000, parties_df=None):
    """Generate customer records, one per party.

    Customer ``i`` is linked to the ``i``-th PARTYID in ``parties_df``. The
    established date is ``datetime.now()`` minus 1..1824 days, and the customer
    type is drawn from ``{1, 2, 3}`` with probabilities 0.7 / 0.2 / 0.1. The
    global NumPy RNG is re-seeded from ``config['random_seed']`` on each call.

    Args:
        n_customers: Number of customer rows to generate.
        parties_df: Frame providing a PARTYID column; its first ``n_customers``
            values are used. Required despite the ``None`` default.

    Returns:
        pandas.DataFrame with columns CUSTOMERID (1..n_customers), PARTYID,
        CUSTOMERESTABLISHEDDATE and CUSTOMERTYPEID.
    """
    np.random.seed(config['random_seed'])

    linked_party_ids = parties_df['PARTYID'].values[:n_customers]

    # Dates are drawn before types; that order fixes the values for a seed.
    established_dates = []
    for _ in range(n_customers):
        days_ago = np.random.randint(1, 1825)
        established_dates.append(datetime.now() - timedelta(days=days_ago))

    type_ids = np.random.choice([1, 2, 3], n_customers, p=[0.7, 0.2, 0.1])

    customers = pd.DataFrame({
        'CUSTOMERID': range(1, n_customers + 1),
        'PARTYID': linked_party_ids,
        'CUSTOMERESTABLISHEDDATE': established_dates,
        'CUSTOMERTYPEID': type_ids,
    })

    logging.info(f"Generated {len(customers)} customer records")
    return customers

In [66]:
def _draw_account_record(account_id, customer_id, product_ids, channel_ids,
                         product_specs, reference_date, direction, max_offset_days):
    """Draw one customer-account row around ``reference_date``.

    ``direction`` is -1 for an account originated before the reference date and
    +1 for one originated after it. The day offset is drawn from
    ``[1, max_offset_days - 1]`` (``np.random.randint`` excludes its upper
    bound).
    """
    product_id = np.random.choice(product_ids)
    channel_id = np.random.choice(channel_ids)
    offset_days = int(np.random.randint(1, max_offset_days))
    origination_date = reference_date + timedelta(days=direction * offset_days)

    spec = product_specs[product_id]
    principal = np.random.uniform(spec['PRODUCTMINIMUMAMOUNT'], spec['PRODUCTMAXIMUMAMOUNT'])

    # Terms are multiplied by 30 to get days; a maximum term of 0 means the
    # product has no maturity date.
    if spec['PRODUCTMAXIMUMTERM'] > 0:
        maturity_days = np.random.randint(
            spec['PRODUCTMINIMUMTERM'] * 30,
            max(spec['PRODUCTMAXIMUMTERM'] * 30, 30)
        )
    else:
        maturity_days = 0
    maturity_date = origination_date + timedelta(days=int(maturity_days)) if maturity_days > 0 else None

    return {
        'CUSTOMERACCOUNTID': account_id,
        'CUSTOMERID': customer_id,
        'PRODUCTID': product_id,
        'CHANNELID': channel_id,
        'ORIGINATIONDATE': origination_date,
        'MATURITYDATE': maturity_date,
        'PRINCIPALAMOUNT': round(principal, 2),
        'INTERESTRATE': round(np.random.uniform(0.03, 0.15), 4),
        'ACCOUNTSTATUS': np.random.choice(['ACTIVE', 'CLOSED', 'PENDING'], p=[0.7, 0.2, 0.1])
    }


def generate_customer_account_data(n_accounts=8000, customers_df=None, products_df=None, channels_df=None):
    """Generate customer account data with controlled future purchases for target creation.

    Strategy:
      - Create historical accounts (origination 1..729 days before reference_date),
        each assigned to a randomly chosen customer.
      - Force a configurable proportion of customers (training_data_ratio, default
        0.80) to have exactly 1 new account originated AFTER the reference_date and
        strictly inside the half-open prediction window
        [reference_date, reference_date + 90 days), so they become labeled.

    Future offsets are drawn from 1..89 days. Drawing day 90 (as an inclusive
    upper bound would) places the account outside the half-open window that the
    diagnostics below count, leaving some "forced" customers unlabeled.

    The global NumPy RNG is re-seeded from ``config['random_seed']`` on each call.
    If ``n_accounts`` is smaller than the number of labeled customers, no
    historical accounts are created and the result holds one future account per
    labeled customer (more rows than ``n_accounts``).

    Args:
        n_accounts: Target total number of account rows.
        customers_df: Frame with a CUSTOMERID column.
        products_df: Frame with PRODUCTID, PRODUCTMINIMUMAMOUNT,
            PRODUCTMAXIMUMAMOUNT, PRODUCTMINIMUMTERM and PRODUCTMAXIMUMTERM.
        channels_df: Frame with a CHANNELID column.

    Returns:
        pandas.DataFrame with columns CUSTOMERACCOUNTID (sequential from 1),
        CUSTOMERID, PRODUCTID, CHANNELID, ORIGINATIONDATE, MATURITYDATE (missing
        for products without a term), PRINCIPALAMOUNT, INTERESTRATE and
        ACCOUNTSTATUS.
    """
    np.random.seed(config['random_seed'])

    reference_date = datetime.strptime(config['feature_engineering']['reference_date'], '%Y-%m-%d')
    prediction_window_days = 90  # keep aligned with feature engineering notebook usage
    training_ratio = config['feature_engineering'].get('training_data_ratio', 0.80)

    customer_ids = customers_df['CUSTOMERID'].values
    product_ids = products_df['PRODUCTID'].values
    channel_ids = channels_df['CHANNELID'].values

    # Index product rows by id once instead of filtering the frame for every
    # account; setdefault keeps the first row should an id be duplicated.
    product_specs = {}
    for row in products_df.to_dict('records'):
        product_specs.setdefault(row['PRODUCTID'], row)

    total_customers = customers_df['CUSTOMERID'].nunique()
    target_labeled_customers = int(total_customers * training_ratio)

    # Sample customers who will get a future (post-reference) account
    labeled_customer_ids = set(np.random.choice(customer_ids, size=target_labeled_customers, replace=False))

    # We reserve one future account per labeled customer. Remaining accounts are historical.
    historical_accounts_needed = n_accounts - len(labeled_customer_ids)
    if historical_accounts_needed < 0:
        # n_accounts is too small: still create one account per labeled customer
        logging.warning("n_accounts smaller than labeled customers requirement; increasing n_accounts")
        historical_accounts_needed = 0

    accounts = []

    # --- Generate historical accounts (before reference_date, up to 2 years back) ---
    for account_id in range(1, historical_accounts_needed + 1):
        customer_id = np.random.choice(customer_ids)
        accounts.append(_draw_account_record(
            account_id, customer_id, product_ids, channel_ids, product_specs,
            reference_date, direction=-1, max_offset_days=730,
        ))

    # --- Generate future accounts (post reference) to create labels ---
    next_account_id = historical_accounts_needed + 1
    for customer_id in labeled_customer_ids:
        accounts.append(_draw_account_record(
            next_account_id, customer_id, product_ids, channel_ids, product_specs,
            reference_date, direction=1, max_offset_days=prediction_window_days,
        ))
        next_account_id += 1

    accounts_df = pd.DataFrame(accounts)

    # Logging diagnostics: customers with an account inside the half-open window
    window_end = reference_date + timedelta(days=prediction_window_days)
    in_window = (accounts_df['ORIGINATIONDATE'] >= reference_date) & (accounts_df['ORIGINATIONDATE'] < window_end)
    labeled_in_accounts = accounts_df.loc[in_window, 'CUSTOMERID'].nunique()
    logging.info(f"Generated {len(accounts_df)} customer account records")
    logging.info(f"Forced labeled customers (future accounts within {prediction_window_days}d): {labeled_in_accounts}/{total_customers} ({labeled_in_accounts/total_customers*100:.1f}%)")
    logging.info(f"Target labeled ratio: {training_ratio*100:.1f}%")

    return accounts_df

In [67]:
def generate_transaction_data(n_transactions=50000, accounts_df=None):
    """Generate transactions on both sides of the configured reference_date.

    The first ``n_transactions // 2`` transactions are dated 1..729 days before
    ``config['feature_engineering']['reference_date']``; the rest are dated
    1..179 days after it. Each transaction is attached to a randomly chosen
    account, with an amount in [10, 5000) rounded to 2 decimals and a type id
    from {1, 2, 3, 4}. The global NumPy RNG is re-seeded from
    ``config['random_seed']`` on each call.

    Args:
        n_transactions: Total number of transaction rows to generate.
        accounts_df: Frame with a CUSTOMERACCOUNTID column to sample from.
            Required despite the ``None`` default.

    Returns:
        pandas.DataFrame with columns TRANSACTIONID (1..n_transactions),
        CUSTOMERACCOUNTID, TRANSACTIONINITIATEDTIMESTAMP, TRANSACTIONAMOUNT and
        TRANSACTIONTYPEID.
    """
    np.random.seed(config['random_seed'])

    # Get reference date from config
    reference_date = datetime.strptime(config['feature_engineering']['reference_date'], '%Y-%m-%d')

    n_before = n_transactions // 2

    # Each phase: (transaction ids, side of reference_date, exclusive day bound).
    phases = (
        (range(1, n_before + 1), -1, 730),                 # historical side, ~2 years
        (range(n_before + 1, n_transactions + 1), 1, 180),  # future side, ~6 months
    )

    transactions = []
    for txn_ids, direction, day_bound in phases:
        for txn_id in txn_ids:
            account_id = np.random.choice(accounts_df['CUSTOMERACCOUNTID'].values)
            day_offset = np.random.randint(1, day_bound)
            txn_date = reference_date + direction * timedelta(days=day_offset)

            transactions.append({
                'TRANSACTIONID': txn_id,
                'CUSTOMERACCOUNTID': account_id,
                'TRANSACTIONINITIATEDTIMESTAMP': txn_date,
                'TRANSACTIONAMOUNT': round(np.random.uniform(10, 5000), 2),
                'TRANSACTIONTYPEID': np.random.choice([1, 2, 3, 4])
            })

    transactions_df = pd.DataFrame(transactions)

    # Log distribution for verification
    timestamps = transactions_df['TRANSACTIONINITIATEDTIMESTAMP']
    before_ref = int((timestamps < reference_date).sum())
    after_ref = int((timestamps >= reference_date).sum())
    logging.info(f"Generated {len(transactions_df)} transaction records")
    logging.info(f"  - {before_ref} transactions before reference_date ({reference_date.date()})")
    logging.info(f"  - {after_ref} transactions after reference_date")

    return transactions_df

In [68]:
def generate_communication_data(n_communications=10000, parties_df=None):
    """Generate synthetic communication records for randomly chosen parties.

    Each record is dated 1..364 days before ``datetime.now()``, carries a fixed
    description, a method id from {1, 2, 3} and an inbound/outbound flag from
    {0, 1}. The global NumPy RNG is re-seeded from ``config['random_seed']`` on
    each call; the timestamps still move with the wall clock.

    Args:
        n_communications: Number of communication rows to generate.
        parties_df: Frame with a PARTYID column to sample from. Required
            despite the ``None`` default.

    Returns:
        pandas.DataFrame with columns COMMUNICATIONID (1..n_communications),
        PARTYID, COMMUNICATIONSTARTTIMESTAMP, COMMUNICATIONDESCRIPTION,
        COMMUNICATIONMETHODID and INBOUNDOUTBOUNDCOMMUNICATIONINDICATOR.
    """
    np.random.seed(config['random_seed'])

    # Dict values are evaluated top to bottom, so the per-row draw order is:
    # party -> day offset -> method -> direction flag.
    records = [
        {
            'COMMUNICATIONID': comm_id,
            'PARTYID': np.random.choice(parties_df['PARTYID'].values),
            'COMMUNICATIONSTARTTIMESTAMP': datetime.now() - timedelta(days=np.random.randint(1, 365)),
            'COMMUNICATIONDESCRIPTION': "Communication regarding account services",
            'COMMUNICATIONMETHODID': np.random.choice([1, 2, 3]),  # Email, Phone, SMS
            'INBOUNDOUTBOUNDCOMMUNICATIONINDICATOR': np.random.choice([0, 1]),
        }
        for comm_id in range(1, n_communications + 1)
    ]

    communications_df = pd.DataFrame(records)
    logging.info(f"Generated {len(communications_df)} communication records")
    return communications_df

In [69]:
def generate_document_data(n_documents=15000, parties_df=None, accounts_df=None):
    """Generate synthetic document records.

    Every document belongs to a randomly chosen party. It is additionally
    linked to a randomly chosen account when a uniform draw exceeds 0.3, and
    otherwise has no account. Creation timestamps fall 1..729 days before
    ``datetime.now()``. The global NumPy RNG is re-seeded from
    ``config['random_seed']`` on each call.

    NOTE(review): DOCUMENTNAME and DOCUMENTTYPEID are drawn independently, and
    the account is drawn independently of the party; confirm downstream code
    does not assume they correspond.

    Args:
        n_documents: Number of document rows to generate.
        parties_df: Frame with a PARTYID column to sample from.
        accounts_df: Frame with a CUSTOMERACCOUNTID column to sample from.

    Returns:
        pandas.DataFrame with columns DOCUMENTID (1..n_documents), PARTYID,
        CUSTOMERACCOUNTID, DOCUMENTNAME, DOCUMENTTYPEID and
        DOCUMENTCREATEDTIMESTAMP.
    """
    np.random.seed(config['random_seed'])

    rows = []
    next_id = 1
    while next_id <= n_documents:
        owner_party = np.random.choice(parties_df['PARTYID'].values)

        # The uniform draw happens first; an account is only sampled on success.
        if np.random.random() > 0.3:
            linked_account = np.random.choice(accounts_df['CUSTOMERACCOUNTID'].values)
        else:
            linked_account = None

        created_at = datetime.now() - timedelta(days=np.random.randint(1, 730))

        rows.append({
            'DOCUMENTID': next_id,
            'PARTYID': owner_party,
            'CUSTOMERACCOUNTID': linked_account,
            'DOCUMENTNAME': np.random.choice(['ID Proof', 'Address Proof', 'Income Proof', 'Loan Agreement']),
            'DOCUMENTTYPEID': np.random.choice([1, 2, 3, 4]),
            'DOCUMENTCREATEDTIMESTAMP': created_at
        })
        next_id += 1

    documents_df = pd.DataFrame(rows)
    logging.info(f"Generated {len(documents_df)} document records")
    return documents_df

## 3. Generate All Tables

In [70]:
with Timer("Data Generation"):
    # Generate data in proper order (respecting foreign keys): each generator
    # only receives frames that were built on the lines above it.
    addresses = generate_address_data(n_addresses=1000)
    products = generate_banking_product_data()
    channels = generate_channel_data()
    parties = generate_party_data(n_parties=5000, addresses_df=addresses)
    customers = generate_customer_data(n_customers=5000, parties_df=parties)
    accounts = generate_customer_account_data(n_accounts=8000, customers_df=customers, 
                                              products_df=products, channels_df=channels)
    transactions = generate_transaction_data(n_transactions=50000, accounts_df=accounts)
    communications = generate_communication_data(n_communications=10000, parties_df=parties)
    documents = generate_document_data(n_documents=15000, parties_df=parties, accounts_df=accounts)
    
    # The check mark was mis-decoded (UTF-8 bytes read as cp1252); restored here.
    print("\n✅ Data generation completed successfully!")


âœ… Data generation completed successfully!


## 4. Save Generated Data

In [71]:
# Debug: show the configured data source and the folder this notebook writes to.
print("Data source type:", config['data_source']['type'])
print("Config raw path:", config['data_source']['csv']['input_path'])
print("Config output path:", config['data_source']['csv']['output_path'])
# The save cell below writes CSVs to ../data/raw (resolved against the current
# working directory), not to the config paths printed above.
print("\nActual save location (absolute path):", os.path.abspath('../data/raw'))
# The check mark was mis-decoded (UTF-8 bytes read as cp1252); restored here.
print("Files will be saved to project root data/raw folder ✅")

Data source type: csv
Config raw path: ./data/raw
Config output path: ./data/processed

Actual save location (absolute path): d:\home-credit-hyperpersonalization-poc\data\raw
Files will be saved to project root data/raw folder âœ…


In [72]:
# Get Spark session (if using Unity Catalog); CSV mode needs no Spark.
spark = None
if config['data_source']['type'] == 'unity_catalog':
    spark = get_spark_session(config)

# Save all tables: destination table name -> generated frame
tables_dict = {
    'address': addresses,
    'banking_product': products,
    'channel': channels,
    'party': parties,
    'customer': customers,
    'customer_account': accounts,
    'transaction': transactions,
    'communication': communications,
    'document': documents
}

# For data generation, save to data/raw (not data/processed)
# Use absolute path relative to project root (one level up from notebooks)
raw_data_path = os.path.abspath('../data/raw') if config['data_source']['type'] == 'csv' else None

with Timer("Saving Data"):
    for table_name, df in tables_dict.items():
        # Pass custom output path for CSV mode only; other modes rely on the
        # destination that save_data_to_destination derives from config.
        if config['data_source']['type'] == 'csv':
            save_data_to_destination(df, config, table_name, spark, output_path=raw_data_path)
        else:
            save_data_to_destination(df, config, table_name, spark)
        print(f"✅ Saved {table_name}: {len(df)} records")

print("\n✅ All data saved successfully!")
# Only report a folder when one was actually used; in non-CSV mode
# raw_data_path is None and the message would read "Files saved to: None".
if raw_data_path is not None:
    print(f"📂 Files saved to: {raw_data_path}")

âœ… Saved address: 1000 records
âœ… Saved banking_product: 7 records
âœ… Saved channel: 5 records
âœ… Saved party: 5000 records
âœ… Saved customer: 5000 records
âœ… Saved customer_account: 8000 records
âœ… Saved transaction: 50000 records
âœ… Saved communication: 10000 records
âœ… Saved document: 15000 records

âœ… All data saved successfully!
ðŸ“‚ Files saved to: d:\home-credit-hyperpersonalization-poc\data\raw
âœ… Saved transaction: 50000 records
âœ… Saved communication: 10000 records
âœ… Saved document: 15000 records

âœ… All data saved successfully!
ðŸ“‚ Files saved to: d:\home-credit-hyperpersonalization-poc\data\raw


## 5. Data Validation Summary

In [73]:
print_section_header("Data Generation Summary")

# One summary row per generated table: its name, row count and column count.
summary_data = [
    {
        'Table': name,
        'Records': len(frame),
        'Columns': len(frame.columns)
    }
    for name, frame in tables_dict.items()
]

summary_df = pd.DataFrame(summary_data)
print(summary_df.to_string(index=False))

total_records = sum(row['Records'] for row in summary_data)
# The emoji below were mis-decoded (UTF-8 bytes read as cp1252); restored here.
print(f"\n📊 Total Records Generated: {total_records}")
print(f"📅 Generation Date: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
print("\n✅ Data generation notebook completed!")


                            Data Generation Summary                             

           Table  Records  Columns
         address     1000        6
 banking_product        7        7
         channel        5        3
           party     5000        7
        customer     5000        4
customer_account     8000        9
     transaction    50000        5
   communication    10000        6
        document    15000        6

ðŸ“Š Total Records Generated: 94012
ðŸ“… Generation Date: 2025-11-12 21:44:16

âœ… Data generation notebook completed!


## Next Steps

1. Run **01_eda.py** for exploratory data analysis
2. Proceed with feature engineering in **02_feature_engineering.py**