In [1]:
import pandas as pd
import numpy as np
import uuid
from faker import Faker
from datetime import datetime, timedelta
import random

# --- Configuration ---
NUM_DONORS = 950_000
NUM_BANK_NEEDS = 50_000
NUM_UNIQUE_BANKS = 75 # Let's assume 75 distinct physical banks

# Approximate coordinates for Punjab cities and region boundaries
cities_punjab = {
    "Ludhiana": (30.9010, 75.8573),
    "Amritsar": (31.6340, 74.8723),
    "Jalandhar": (31.3260, 75.5762),
    "Patiala": (30.3398, 76.3869),
    "Bathinda": (30.2110, 74.9455),
    "Mohali": (30.7046, 76.7179),
    "Pathankot": (32.2686, 75.6494),
    "Moga": (30.8042, 75.1719),
    "Firozpur": (30.9214, 74.6041),
    "Hoshiarpur": (31.5315, 75.9066),
    # Add more cities if needed
}
city_names = list(cities_punjab.keys())

# Realistic Blood Type Distributions (approximated)
blood_types = ["O+", "B+", "A+", "AB+", "O-", "B-", "A-", "AB-"]
blood_type_probabilities = [0.35, 0.33, 0.20, 0.07, 0.015, 0.015, 0.015, 0.005]

urgency_levels = ["low", "medium", "high", "critical"]
urgency_probabilities = [0.50, 0.30, 0.15, 0.05] # Skewed towards lower urgency

genders = ["Male", "Female"]
gender_probabilities = [0.6, 0.4] # Slight bias if observed

fake = Faker('en_IN') # Use Indian locale for potential name/address realism if needed later

# --- Helper Functions ---
def generate_random_location(city_name, radius_km=15):
    """Generates a random lat/lon near a city center."""
    base_lat, base_lon = cities_punjab[city_name]
    # Approx conversion: 1 degree lat ~= 111 km, 1 degree lon varies
    radius_deg_lat = radius_km / 111.0
    radius_deg_lon = radius_km / (111.0 * np.cos(np.radians(base_lat)))
    
    rand_lat = base_lat + np.random.uniform(-radius_deg_lat, radius_deg_lat)
    rand_lon = base_lon + np.random.uniform(-radius_deg_lon, radius_deg_lon)
    return rand_lat, rand_lon

def generate_last_donation_date(start_date=datetime.now() - timedelta(days=2*365), end_date=datetime.now()):
    """Generates a random date within the specified range."""
    time_between_dates = end_date - start_date
    days_between_dates = time_between_dates.days
    random_number_of_days = random.randrange(days_between_dates)
    random_date = start_date + timedelta(days=random_number_of_days)
    return random_date.date()

def is_eligible(last_donation_date, eligibility_days=56):
    """Checks eligibility based on last donation date."""
    if last_donation_date is None:
        return True # Assume new donors are eligible
    days_since_donation = (datetime.now().date() - last_donation_date).days
    return days_since_donation > eligibility_days

# --- Generate Bank Data ---
print(f"Generating {NUM_UNIQUE_BANKS} unique bank locations...")
unique_banks = []
for _ in range(NUM_UNIQUE_BANKS):
    bank_id = str(uuid.uuid4())
    city = np.random.choice(city_names) # Simple random city for banks
    lat, lon = generate_random_location(city, radius_km=5) # Banks usually within city limits
    unique_banks.append({"blood_bank_id": bank_id, "city": city, "latitude": lat, "longitude": lon})

print(f"Generating {NUM_BANK_NEEDS} bank need records...")
bank_needs_data = []
for i in range(NUM_BANK_NEEDS):
    bank_info = random.choice(unique_banks) # Pick one of the unique banks
    
    # Required blood type - higher chance for common types, occasional "ALL"
    if random.random() < 0.05: # 5% chance of needing "ALL"
        req_blood_type = "ALL"
    else:
        req_blood_type = np.random.choice(blood_types, p=blood_type_probabilities)

    urgency = np.random.choice(urgency_levels, p=urgency_probabilities)
    
    # Required units - maybe fewer for critical, more for low?
    if urgency == "critical":
        units = np.random.randint(1, 11)
    elif urgency == "high":
        units = np.random.randint(5, 26)
    else: # medium or low
        units = np.random.randint(10, 51)

    bank_needs_data.append({
        "blood_bank_id": bank_info["blood_bank_id"],
        "city": bank_info["city"],
        "latitude": bank_info["latitude"],
        "longitude": bank_info["longitude"],
        "required_blood_type": req_blood_type,
        "urgency": urgency,
        "required_units": units
    })
    if (i + 1) % 5000 == 0:
            print(f"...generated {i+1}/{NUM_BANK_NEEDS} bank needs")

bank_needs_df = pd.DataFrame(bank_needs_data)

# --- Generate Donor Data ---
print(f"Generating {NUM_DONORS} donor records...")
donor_data = []
# Pre-generate random numbers where possible for speed
cities_assigned = np.random.choice(city_names, size=NUM_DONORS, p=[1/len(city_names)]*len(city_names)) # Adjust weights if needed
blood_types_assigned = np.random.choice(blood_types, size=NUM_DONORS, p=blood_type_probabilities)
ages_assigned = np.random.randint(18, 66, size=NUM_DONORS) # Simple uniform age
genders_assigned = np.random.choice(genders, size=NUM_DONORS, p=gender_probabilities)
# Generate availability/regular status randomly for now
availability_assigned = np.random.choice([True, False], size=NUM_DONORS, p=[0.4, 0.6])
regular_assigned = np.random.choice([True, False], size=NUM_DONORS, p=[0.25, 0.75])

for i in range(NUM_DONORS):
    donor_id = str(uuid.uuid4())
    city = cities_assigned[i]
    lat, lon = generate_random_location(city, radius_km=25) # Donors can be further out

    # Generate last donation date, potentially None for ~10% (new donors)
    if random.random() < 0.10:
         last_donation = None
         eligible = True
    else:
         last_donation = generate_last_donation_date()
         eligible = is_eligible(last_donation)

    donor_data.append({
        "donor_id": donor_id,
        "city": city,
        "latitude": lat,
        "longitude": lon,
        "blood_type": blood_types_assigned[i],
        "age": ages_assigned[i], # Add age
        "gender": genders_assigned[i], # Add gender
        "last_donation_date": last_donation,
        "is_eligible": eligible, # Derived field
        "availability_status": availability_assigned[i], # Simplified random boolean
        "is_regular_donor": regular_assigned[i] # Simplified random boolean
    })
    if (i + 1) % 50000 == 0:
            print(f"...generated {i+1}/{NUM_DONORS} donors")


donor_df = pd.DataFrame(donor_data)

# --- Data Type Conversion & Saving ---
print("Converting data types...")
donor_df['last_donation_date'] = pd.to_datetime(donor_df['last_donation_date'])
donor_df['age'] = donor_df['age'].astype(int)
donor_df['is_eligible'] = donor_df['is_eligible'].astype(bool)
donor_df['availability_status'] = donor_df['availability_status'].astype(bool)
donor_df['is_regular_donor'] = donor_df['is_regular_donor'].astype(bool)

bank_needs_df['required_units'] = bank_needs_df['required_units'].astype(int)

print("Saving data to CSV files...")
donor_df.to_csv("synthetic_donors.csv", index=False)
bank_needs_df.to_csv("synthetic_bank_needs.csv", index=False)

print("-" * 30)
print("Generated Data Summary:")
print("\nDonor Data Sample:")
print(donor_df.head())
print(f"\nDonor Data Shape: {donor_df.shape}")
print(donor_df.info())

print("\nBank Needs Data Sample:")
print(bank_needs_df.head())
print(f"\nBank Needs Data Shape: {bank_needs_df.shape}")
print(bank_needs_df.info())
print("-" * 30)
print("Synthetic data generation complete!")
print("Files saved: synthetic_donors.csv, synthetic_bank_needs.csv")

Generating 75 unique bank locations...
Generating 50000 bank need records...
...generated 5000/50000 bank needs
...generated 10000/50000 bank needs
...generated 15000/50000 bank needs
...generated 20000/50000 bank needs
...generated 25000/50000 bank needs
...generated 30000/50000 bank needs
...generated 35000/50000 bank needs
...generated 40000/50000 bank needs
...generated 45000/50000 bank needs
...generated 50000/50000 bank needs
Generating 950000 donor records...
...generated 50000/950000 donors
...generated 100000/950000 donors
...generated 150000/950000 donors
...generated 200000/950000 donors
...generated 250000/950000 donors
...generated 300000/950000 donors
...generated 350000/950000 donors
...generated 400000/950000 donors
...generated 450000/950000 donors
...generated 500000/950000 donors
...generated 550000/950000 donors
...generated 600000/950000 donors
...generated 650000/950000 donors
...generated 700000/950000 donors
...generated 750000/950000 donors
...generated 800000/

In [2]:
import pandas as pd
import numpy as np
import uuid
from faker import Faker
from datetime import datetime, timedelta
import random
import matplotlib as plt

# --- Configuration ---
NUM_DONORS = 950_000
NUM_BANK_NEEDS = 50_000
NUM_UNIQUE_BANKS = 75
TARGET_CLUSTERS = 8 # Aiming for data suitable for K=8

# Use a random seed for reproducibility
random.seed(42)
np.random.seed(42)

# Approximate coordinates for Punjab cities
cities_punjab = {
    "Ludhiana": (30.9010, 75.8573),
    "Amritsar": (31.6340, 74.8723),
    "Jalandhar": (31.3260, 75.5762),
    "Patiala": (30.3398, 76.3869),
    "Bathinda": (30.2110, 74.9455),
    "Mohali": (30.7046, 76.7179), # SAS Nagar
    "Pathankot": (32.2686, 75.6494),
    "Moga": (30.8042, 75.1719),
    # We need 8 centers if aiming for K=8, add/reuse cities if needed
    # Let's make slightly different 'centers' even if using same city name for profile
    "Ludhiana_Sub": (30.8500, 75.9500), # Fictional distinct area near Ludhiana
    "Jalandhar_Sub": (31.4000, 75.5000), # Fictional distinct area near Jalandhar
}
city_names_all = list(cities_punjab.keys()) # All available city names

# --- Define Regional Donor Profiles (TARGET_CLUSTERS = 8 profiles) ---
# Each profile has: center_city, age_mean, age_std, recent_donor_prob, new_donor_prob
# recent_donor_prob: Probability the last donation was within ~180 days
# new_donor_prob: Probability the donor has no last donation date (None)

# Ensure we have enough unique profile centers, even if reusing city names
profile_centers = list(cities_punjab.keys())
if len(profile_centers) < TARGET_CLUSTERS:
     needed = TARGET_CLUSTERS - len(profile_centers)
     profile_centers.extend(random.choices(profile_centers, k=needed)) # Repeat some cities if necessary
     profile_centers = profile_centers[:TARGET_CLUSTERS] # Ensure exactly TARGET_CLUSTERS


donor_profiles = [
    {"name": "Profile_LDH_Young_Recent", "center_city": "Ludhiana", "age_mean": 25, "age_std": 5, "recent_donor_prob": 0.7, "new_donor_prob": 0.05},
    {"name": "Profile_ASR_Older_Infrequent", "center_city": "Amritsar", "age_mean": 45, "age_std": 8, "recent_donor_prob": 0.2, "new_donor_prob": 0.15},
    {"name": "Profile_JAL_Mid_Regular", "center_city": "Jalandhar", "age_mean": 35, "age_std": 7, "recent_donor_prob": 0.5, "new_donor_prob": 0.10},
    {"name": "Profile_PTA_V_Old_V_Infrequent", "center_city": "Patiala", "age_mean": 55, "age_std": 6, "recent_donor_prob": 0.1, "new_donor_prob": 0.20},
    {"name": "Profile_MOH_Mixed_New", "center_city": "Mohali", "age_mean": 38, "age_std": 10, "recent_donor_prob": 0.3, "new_donor_prob": 0.25},
    {"name": "Profile_PTK_Young_Infrequent", "center_city": "Pathankot", "age_mean": 28, "age_std": 6, "recent_donor_prob": 0.25,"new_donor_prob": 0.12},
    {"name": "Profile_BAT_Mid_Recent", "center_city": "Bathinda", "age_mean": 33, "age_std": 6, "recent_donor_prob": 0.6, "new_donor_prob": 0.08},
    {"name": "Profile_MOG_Old_Regular", "center_city": "Moga", "age_mean": 48, "age_std": 7, "recent_donor_prob": 0.4, "new_donor_prob": 0.10},
]

# Make sure we have the target number of profiles
if len(donor_profiles) < TARGET_CLUSTERS:
    donor_profiles.extend(random.choices(donor_profiles, k=TARGET_CLUSTERS - len(donor_profiles)))
donor_profiles = donor_profiles[:TARGET_CLUSTERS]

print(f"Defined {len(donor_profiles)} distinct donor profiles.")

# Realistic Blood Type Distributions & Genders (same as before)
blood_types = ["O+", "B+", "A+", "AB+", "O-", "B-", "A-", "AB-"]
blood_type_probabilities = [0.35, 0.33, 0.20, 0.07, 0.015, 0.015, 0.015, 0.005]
genders = ["Male", "Female"]
gender_probabilities = [0.6, 0.4]

fake = Faker('en_IN')

# --- Helper Functions ---
def generate_random_location(city_name, radius_km=10): # Smaller radius for potentially tighter clusters
    """Generates a random lat/lon near a city center."""
    base_lat, base_lon = cities_punjab[city_name]
    radius_deg_lat = radius_km / 111.0
    radius_deg_lon = radius_km / (111.0 * np.cos(np.radians(base_lat)))
    rand_lat = base_lat + np.random.uniform(-radius_deg_lat, radius_deg_lat)
    rand_lon = base_lon + np.random.uniform(-radius_deg_lon, radius_deg_lon)
    return rand_lat, rand_lon

def generate_profiled_last_donation_date(profile, today=datetime.now().date()):
    """Generates a last donation date based on profile probabilities."""
    if random.random() < profile["new_donor_prob"]:
        return None # New donor

    if random.random() < profile["recent_donor_prob"]:
        # Generate date within last 180 days (more recent)
        start_date = today - timedelta(days=180)
    else:
        # Generate date between 180 days ago and 2 years ago (less recent)
        start_date = today - timedelta(days=2*365)
        today = today - timedelta(days=181) # Ensure it's not in the recent period

    if start_date >= today: # Avoid invalid date range if today < start_date
         start_date = today - timedelta(days=1)

    time_between_dates = today - start_date
    days_between_dates = time_between_dates.days
    if days_between_dates <= 0: # Handle edge case
        return start_date
    random_number_of_days = random.randrange(days_between_dates)
    random_date = start_date + timedelta(days=random_number_of_days)
    return random_date

def generate_profiled_age(profile):
     """Generates age based on profile's normal distribution."""
     age = int(np.random.normal(profile["age_mean"], profile["age_std"]))
     return max(18, min(65, age)) # Ensure age is within valid donation range

def is_eligible(last_donation_date, eligibility_days=56):
    """Checks eligibility based on last donation date."""
    if last_donation_date is None:
        return True
    days_since_donation = (datetime.now().date() - last_donation_date).days
    return days_since_donation > eligibility_days

# --- Generate Bank Data (No changes needed here for donor clustering) ---
print(f"Generating {NUM_UNIQUE_BANKS} unique bank locations...")
unique_banks = []
for _ in range(NUM_UNIQUE_BANKS):
    bank_id = str(uuid.uuid4())
    city = np.random.choice(city_names_all)
    lat, lon = generate_random_location(city, radius_km=5)
    unique_banks.append({"blood_bank_id": bank_id, "city": city, "latitude": lat, "longitude": lon})

print(f"Generating {NUM_BANK_NEEDS} bank need records...")
bank_needs_data = []
urgency_levels = ["low", "medium", "high", "critical"]
urgency_probabilities = [0.50, 0.30, 0.15, 0.05]
for i in range(NUM_BANK_NEEDS):
    bank_info = random.choice(unique_banks)
    if random.random() < 0.05: req_blood_type = "ALL"
    else: req_blood_type = np.random.choice(blood_types, p=blood_type_probabilities)
    urgency = np.random.choice(urgency_levels, p=urgency_probabilities)
    if urgency == "critical": units = np.random.randint(1, 11)
    elif urgency == "high": units = np.random.randint(5, 26)
    else: units = np.random.randint(10, 51)
    bank_needs_data.append({
        "blood_bank_id": bank_info["blood_bank_id"], "city": bank_info["city"],
        "latitude": bank_info["latitude"], "longitude": bank_info["longitude"],
        "required_blood_type": req_blood_type, "urgency": urgency, "required_units": units
    })
    if (i + 1) % 10000 == 0: print(f"...generated {i+1}/{NUM_BANK_NEEDS} bank needs")
bank_needs_df = pd.DataFrame(bank_needs_data)


# --- Generate Donor Data Based on Profiles ---
print(f"Generating {NUM_DONORS} donor records based on profiles...")
donor_data = []
# Assign donors roughly equally to profiles
donors_per_profile = NUM_DONORS // TARGET_CLUSTERS

profile_idx = 0
for i in range(NUM_DONORS):
    # Select profile - cycle through them for somewhat balanced distribution
    current_profile = donor_profiles[profile_idx % TARGET_CLUSTERS]
    profile_idx += 1

    donor_id = str(uuid.uuid4())
    target_city = current_profile["center_city"]

    # Generate location near the profile's city center
    lat, lon = generate_random_location(target_city, radius_km=15) # Keep some spread

    # Generate age based on profile
    age = generate_profiled_age(current_profile)

    # Generate last donation date based on profile
    last_donation = generate_profiled_last_donation_date(current_profile)
    eligible = is_eligible(last_donation)

    # Other attributes (less critical for clustering score, keep random for now)
    blood_type = np.random.choice(blood_types, p=blood_type_probabilities)
    gender = np.random.choice(genders, p=gender_probabilities)
    # Make availability/regularity slightly correlated with profile perhaps? (Optional complexity)
    # Example: Recent donors more likely to be available?
    availability_status = random.choice([True, False]) if last_donation is None else (True if (datetime.now().date() - last_donation).days < 90 else random.choice([True, False]))
    is_regular_donor = False if last_donation is None else (True if current_profile["recent_donor_prob"] > 0.5 and random.random() < 0.6 else random.choice([True, False])) # Heuristic


    donor_data.append({
        "donor_id": donor_id,
        "city": target_city, # Store the profile's city
        "profile_name": current_profile["name"], # Store profile name for verification
        "latitude": lat,
        "longitude": lon,
        "blood_type": blood_type,
        "age": age,
        "gender": gender,
        "last_donation_date": last_donation,
        "is_eligible": eligible,
        "availability_status": availability_status,
        "is_regular_donor": is_regular_donor
    })
    if (i + 1) % 50000 == 0:
            print(f"...generated {i+1}/{NUM_DONORS} donors")


donor_df = pd.DataFrame(donor_data)

# --- Data Type Conversion & Saving ---
print("Converting data types...")
donor_df['last_donation_date'] = pd.to_datetime(donor_df['last_donation_date'])
donor_df['age'] = donor_df['age'].astype(int)
donor_df['is_eligible'] = donor_df['is_eligible'].astype(bool)
donor_df['availability_status'] = donor_df['availability_status'].astype(bool)
donor_df['is_regular_donor'] = donor_df['is_regular_donor'].astype(bool)

bank_needs_df['required_units'] = bank_needs_df['required_units'].astype(int)

print("Saving data to NEW CSV files...")
# SAVE WITH NEW NAMES TO AVOID OVERWRITING OLD DATA
donor_df.to_csv("synthetic_donors_profiled.csv", index=False)
bank_needs_df.to_csv("synthetic_bank_needs_profiled.csv", index=False) # Can reuse bank needs

print("-" * 30)
print("Generated Profiled Data Summary:")
print("\nDonor Data Sample:")
print(donor_df.head())
print(f"\nDonor Data Shape: {donor_df.shape}")
print(donor_df.info())

# --- Verification of Profiles (Example) ---
print("\n--- Profile Verification (Age Distribution Example) ---")
plt.figure(figsize=(15, 8))
sns.violinplot(data=donor_df, x='profile_name', y='age')
plt.title('Age Distribution per Generated Donor Profile')
plt.xticks(rotation=45, ha='right')
plt.tight_layout()
plt.show()

print("\n--- Profile Verification (Recency Example) ---")
# Recalculate days_since for verification plot
now_ts = pd.Timestamp.now()
temp_days_since = (now_ts - donor_df['last_donation_date']).dt.days.fillna(9999) # Use temp var
donor_df['temp_days_since'] = temp_days_since # Add temporarily for plotting

plt.figure(figsize=(15, 8))
sns.boxplot(data=donor_df, x='profile_name', y='temp_days_since')
plt.ylabel("Days Since Last Donation (9999=New)")
plt.title('Donation Recency Distribution per Generated Donor Profile')
plt.xticks(rotation=45, ha='right')
plt.ylim(0, 1000) # Zoom in on typical range, excluding 9999
plt.tight_layout()
plt.show()

# Drop the temporary column
donor_df.drop(columns=['temp_days_since'], inplace=True)

print("-" * 30)
print("Profiled synthetic data generation complete!")
print("Files saved: synthetic_donors_profiled.csv, synthetic_bank_needs_profiled.csv")

Defined 8 distinct donor profiles.
Generating 75 unique bank locations...
Generating 50000 bank need records...
...generated 10000/50000 bank needs
...generated 20000/50000 bank needs
...generated 30000/50000 bank needs
...generated 40000/50000 bank needs
...generated 50000/50000 bank needs
Generating 950000 donor records based on profiles...
...generated 50000/950000 donors
...generated 100000/950000 donors
...generated 150000/950000 donors
...generated 200000/950000 donors
...generated 250000/950000 donors
...generated 300000/950000 donors
...generated 350000/950000 donors
...generated 400000/950000 donors
...generated 450000/950000 donors
...generated 500000/950000 donors
...generated 550000/950000 donors
...generated 600000/950000 donors
...generated 650000/950000 donors
...generated 700000/950000 donors
...generated 750000/950000 donors
...generated 800000/950000 donors
...generated 850000/950000 donors
...generated 900000/950000 donors
...generated 950000/950000 donors
Converting

AttributeError: module 'matplotlib' has no attribute 'figure'

In [None]:
# This notebook is used to generate synthetic data using various python library. Here it has been tried to generate the data close to real world.
# 2nd block generates better data than the first block.