# In [None]:
import numpy as np
import pandas as pd

# Seed NumPy's legacy global RNG so every run reproduces the same dataset.
# The four columns below are drawn in a fixed order (income, visits,
# transactions, population); reordering them would change the values.
np.random.seed(42)

# Total number of rows in every column (random bulk + injected outliers).
n_samples = 1000


def _with_outliers(sampler, outliers, total):
    """Return ``total`` values: a random bulk followed by fixed outliers.

    The bulk size is derived from ``len(outliers)`` so the outlier list can
    be edited without having to keep a separate hard-coded count in sync.

    Args:
        sampler: Callable that takes the number of bulk values to draw and
            returns a 1-D NumPy array of that length.
        outliers: Hand-picked extreme values appended after the bulk.
        total: Required length of the returned array.

    Returns:
        A 1-D NumPy array of length ``total`` whose last ``len(outliers)``
        entries are ``outliers`` in the given order.

    Raises:
        ValueError: If there are more outliers than ``total`` rows.
    """
    n_bulk = total - len(outliers)
    if n_bulk < 0:
        raise ValueError(
            f"{len(outliers)} outliers do not fit into {total} samples"
        )
    return np.append(sampler(n_bulk), outliers)


# ----------------------------------------------------
# Column 1: Income (Log-normal + Extreme Outliers)
# ----------------------------------------------------
income = _with_outliers(
    lambda size: np.random.lognormal(mean=10.5, sigma=0.8, size=size),
    [500000, 750000, 1e6, 1.2e6, 2e6, 3e6, 5e6, 2.5e6, 4e6, 1.8e6],
    n_samples,
)

# ----------------------------------------------------
# Column 2: Website_Visits (Poisson + Severe Outliers)
# ----------------------------------------------------
# Outliers are kept as plain ints so the column stays integer-typed.
web_visits = _with_outliers(
    lambda size: np.random.poisson(lam=10, size=size),
    [1000, 1500, 2000, 800, 1200, 2500, 3000,
     1800, 950, 2200, 3500, 4000, 2800, 1500, 5000],
    n_samples,
)

# ----------------------------------------------------
# Column 3: Transaction_Amt (Gamma + Extreme Outliers)
# ----------------------------------------------------
transactions = _with_outliers(
    lambda size: np.random.gamma(shape=2, scale=100, size=size),
    [10000, 50000, 25000, 75000, 1e5, 2e5, 5e5, 3e5],
    n_samples,
)

# ----------------------------------------------------
# Column 4: City_Population (Pareto + Mega Cities)
# ----------------------------------------------------
# Pareto draws are scaled by 10,000 to form the base population sizes.
city_pop = _with_outliers(
    lambda size: np.random.pareto(a=1.5, size=size) * 10000,
    [1e7, 2.5e7, 5e7, 1.5e7, 3e7],  # Mega cities
    n_samples,
)

# Create DataFrame; every column has exactly n_samples rows, with its
# outliers occupying the final rows.
df = pd.DataFrame({
    'Income': income,
    'Website_Visits': web_visits,
    'Transaction_Amt': transactions,
    'City_Population': city_pop,
})

# Save to CSV (written to the current working directory, without the index).
df.to_csv('skewed_data_with_outliers.csv', index=False)