In [3]:
# -------------------------------
# generate_new_leads.ipynb
# Create new synthetic leads with numeric percentiles and weighted categorical sampling
# -------------------------------

import sys
import os
import pandas as pd
import numpy as np

# -------------------------------
# Step 0: Fix imports
# -------------------------------
sys.path.append(os.path.abspath(os.path.join(os.getcwd(), '..')))
from src.features.select_features import select_features

# -------------------------------
# Step 1: Load original dataset
# -------------------------------
raw_path = "../data/raw/bank_leads_v4.csv"
df_orig = pd.read_csv(raw_path, sep="\t")

# -------------------------------
# Step 2: Get numeric & categorical features
# -------------------------------
X, _, numeric_features, categorical_features = select_features(df_orig)

# -------------------------------
# Step 3: Ask user for number of new leads
# -------------------------------
n_new = int(input("How many new synthetic leads do you want to generate? "))

# -------------------------------
# Step 4: Generate numeric features using percentiles
# -------------------------------
numeric_data = {}
for col in numeric_features:
    low = np.percentile(df_orig[col], 33)
    high = np.percentile(df_orig[col], 66)

    n_low = n_new // 3
    n_med = n_new // 3
    n_high = n_new - n_low - n_med

    numeric_data[col] = np.concatenate([
        np.random.uniform(df_orig[col].min(), low, n_low),
        np.random.uniform(low, high, n_med),
        np.random.uniform(high, df_orig[col].max(), n_high)
    ]).astype(int)

# -------------------------------
# Step 5: Generate categorical features
# -------------------------------
categorical_data = {}
for col in categorical_features:
    freqs = df_orig[col].value_counts(normalize=True)
    categories = freqs.index.tolist()
    probabilities = freqs.values
    categorical_data[col] = np.random.choice(categories, size=n_new, p=probabilities)

# -------------------------------
# Step 6: Combine numeric, categorical & LeadID
# -------------------------------
df_new = pd.DataFrame({**numeric_data, **categorical_data})
df_new['LeadID'] = [f"Lead_{i+1:03d}" for i in range(n_new)]

# Make LeadID the first column
cols = ['LeadID'] + [col for col in df_new.columns if col != 'LeadID']
df_new = df_new[cols]

# -------------------------------
# Step 7: Save synthetic leads
# -------------------------------
output_path = "../data/raw/new_leads.csv"
os.makedirs(os.path.dirname(output_path), exist_ok=True)
df_new.to_csv(output_path, index=False)

# -------------------------------
# Step 8: Summary
# -------------------------------
print(f"\nSynthetic dataset with {n_new} leads created and saved to {output_path}")
print("\nPreview of generated leads:")
print(df_new.head())
print("\nColumns:")
print(df_new.columns.tolist())

for col in numeric_features:
    print(f"{col} min: {df_new[col].min()}, 33pct: {np.percentile(df_new[col],33):.1f}, "
          f"66pct: {np.percentile(df_new[col],66):.1f}, max: {df_new[col].max()}")


Synthetic dataset with 999 leads created and saved to ../data/raw/new_leads.csv

Preview of generated leads:
     LeadID  Age  Income  WebsiteVisits_PreConversion  TimeOnWebsite_Minutes  \
0  Lead_001   28   32833                            1                      0   
1  Lead_002   27   34864                            1                      0   
2  Lead_003   34   33030                            1                      1   
3  Lead_004   28   59356                            1                      5   
4  Lead_005   33   27114                            1                      8   

   EmailOpenedCount  DaysSinceInquiry  CallCenterInquiries  BranchVisits  \
0                 1                28                    0             0   
1                 1                34                    0             0   
2                 2                20                    0             0   
3                 0                12                    0             0   
4                 1          