In [8]:
# -------------------------------
# generate_new_leads.ipynb
# Generate synthetic leads with percentile-based ranges (Low, Medium, High)
# -------------------------------

import sys
import os
import pandas as pd
import numpy as np

# -------------------------------
# Step 0: Fix imports for notebook or VS Code
# -------------------------------
# Add the parent of the current working directory to sys.path so the `src`
# package can be imported (presumably the notebook is run from a sub-folder of
# the project root -- adjust if that is not the case). The membership check
# keeps a re-run of this cell from appending duplicate entries.
project_root = os.path.abspath(os.path.join(os.getcwd(), '..'))
if project_root not in sys.path:
    sys.path.append(project_root)

# Import the select_features function
from src.features.select_features import select_features

# -------------------------------
# Step 1: Load original dataset
# -------------------------------
raw_path = "../data/raw/bank_leads_v4.csv"  # adjust if notebook location changes
df_orig = pd.read_csv(raw_path, sep="\t")  # the file is read as tab-separated

# -------------------------------
# Step 2: Select features (numeric & categorical)
# -------------------------------
# Only the two feature-name collections are used below; both are iterated as
# column names of df_orig.
X, _, numeric_features, categorical_features = select_features(df_orig)

# -------------------------------
# Step 3: Configure how many new leads to generate
# -------------------------------
n_new = int(input("How many new synthetic leads do you want to generate? "))
if n_new < 0:
    # Fail here with a clear message instead of later inside numpy with a
    # "negative dimensions" error.
    raise ValueError(f"Number of leads must be non-negative, got {n_new}")

# -------------------------------
# Step 4: Percentile-based ranges for numeric features
# -------------------------------
# For every numeric column store three (lower, upper) ranges:
#   low    -> [min, 30th percentile]
#   medium -> [30th percentile, 70th percentile]
#   high   -> [70th percentile, max]
percentile_ranges = {}
for col in numeric_features:
    values = df_orig[col]
    # Series.quantile skips NaN just like Series.min/max do, so all four
    # bounds are computed over the same observed values (np.percentile would
    # return NaN as soon as one value is missing). Default interpolation is
    # linear, matching np.percentile.
    p30 = values.quantile(0.30)
    p70 = values.quantile(0.70)
    min_val = values.min()
    max_val = values.max()

    percentile_ranges[col] = {
        "low": (min_val, p30),
        "medium": (p30, p70),
        "high": (p70, max_val)
    }

# -------------------------------
# Step 5: Decide distribution of new leads
# Example: 30% Low, 40% Medium, 30% High
# -------------------------------
n_low = int(0.3 * n_new)
n_medium = int(0.4 * n_new)
n_high = n_new - n_low - n_medium  # ensure total matches n_new

def sample_numeric(col, tier, size):
    """Draw ``size`` random values for ``col`` from one percentile tier.

    Parameters
    ----------
    col : str
        Column name; must be a key of the module-level ``percentile_ranges``
        and a column of the module-level ``df_orig``.
    tier : str
        Key of ``percentile_ranges[col]`` selecting the (lower, upper) range,
        e.g. "low", "medium" or "high".
    size : int
        Number of values to draw.

    Returns
    -------
    numpy.ndarray
        Integers drawn from the inclusive range [int(lower), int(upper)] when
        the original column has an integer dtype; otherwise floats drawn
        uniformly from [lower, upper).
    """
    low, high = percentile_ranges[col][tier]
    # is_integer_dtype recognises every integer dtype (int8..int64, unsigned,
    # pandas nullable integers), not just int32/int64, so narrow integer
    # columns are no longer sampled as floats.
    if pd.api.types.is_integer_dtype(df_orig[col].dtype):
        # Percentile bounds can be fractional; truncate them explicitly.
        # randint's upper bound is exclusive, hence the +1 before truncation.
        return np.random.randint(int(low), int(high + 1), size=size)
    return np.random.uniform(low, high, size=size)

# -------------------------------
# Step 6: Generate numeric features
# -------------------------------
# Rows are produced tier by tier (all Low rows, then Medium, then High), so
# row i of every column belongs to the same tier until the shuffle in Step 8.
tier_sizes = [("low", n_low), ("medium", n_medium), ("high", n_high)]

numeric_data = {name: [] for name in numeric_features}

for tier, tier_size in tier_sizes:
    for col in numeric_features:
        drawn = sample_numeric(col, tier, tier_size)
        numeric_data[col] += list(drawn)

# -------------------------------
# Step 7: Generate categorical features
# Categories are drawn uniformly from the distinct non-null values of the
# original column; the draw does not depend on the tier.
# -------------------------------
categorical_data = {name: [] for name in categorical_features}

for col in categorical_features:
    choices = df_orig[col].dropna().unique()
    # One draw per tier block so the column length matches the numeric ones.
    for tier_size in (n_low, n_medium, n_high):
        categorical_data[col].extend(np.random.choice(choices, size=tier_size))

# -------------------------------
# Step 8: Combine numeric & categorical into a new DataFrame
# -------------------------------
combined_columns = dict(numeric_data)
combined_columns.update(categorical_data)

# Build the frame, then shuffle the rows (fixed seed) so the tiers are mixed.
df_new = (
    pd.DataFrame(combined_columns)
    .sample(frac=1, random_state=42)
    .reset_index(drop=True)
)

# -------------------------------
# Step 9: Save synthetic leads
# -------------------------------
output_path = "../data/raw/new_leads.csv"  # matches pipeline expectation
df_new.to_csv(output_path, index=False)

# -------------------------------
# Step 10: Summary
# -------------------------------
print(f"Synthetic dataset with {n_new} leads created and saved to {output_path}\n")
print("Preview of generated leads:")
print(df_new.head())
print("\nDistribution of numeric features percentiles (Low/Medium/High):")
# Report the overall generated range (min/max) of each numeric column.
for col in numeric_features:
    print(f"{col}: min={df_new[col].min()}, max={df_new[col].max()}")

Synthetic dataset with 500 leads created and saved to ../data/raw/new_leads.csv

Preview of generated leads:
   Age        Income  WebsiteVisits_PreConversion  TimeOnWebsite_Minutes  \
0   58  2.137486e+05                            5              40.668006   
1   30  5.315590e+04                            2               7.725315   
2   59  1.058790e+06                           13              25.011175   
3   35  8.714785e+04                            2              11.572413   
4   31  4.854450e+04                            0               9.489149   

   EmailOpenedCount  DaysSinceInquiry  CallCenterInquiries  BranchVisits  \
0                 7               101                    5             5   
1                 2                 8                    0             0   
2                 6                84                    7             2   
3                 3                56                    1             1   
4                 1                 7                 