# In [1]:
import pandas as pd
import numpy as np
import random
from datetime import datetime, timedelta

# --- 1. Configuration and Parameters ---
# Number of synthetic survey responses to produce.
N_RECORDS = 10_000
# Survey dates are drawn between these two days.
START_DATE = datetime(2023, 10, 1)
END_DATE = datetime(2024, 12, 31)
# Destination file for the generated dataset.
CSV_FILE = 'synthetic_gts_survey_data.csv'

# --- Contextual Lists (Based on recent humanitarian crises) ---
LOCATIONS = [
    'Darfur Region (Sudan)',
    'Gaza Strip / West Bank (oPt)',
    'Jigjiga Zone (Ethiopia)',
    'North-East DRC',
    'Lviv / Odesa Region (Ukraine)',
    "Cox's Bazar (Bangladesh)",
]
AID_PROVIDERS = [
    'WFP',
    'UNHCR',
    'UNICEF',
    'IRC',
    'NRC',
    'ICRC',
]
DISPLACEMENT_STATUS = [
    'IDP (Internally Displaced Person)',
    'Refugee',
    'Host Community',
    'Returnee',
]
GENDERS = [
    'Female',
    'Male',
    'Other',
    'Prefer Not to Say',
]
AGE_GROUPS = [
    '18-25',
    '26-40',
    '41-60',
    '60+',
]

def generate_synthetic_feedback(n_records, seed=None):
    """Build a synthetic feedback-survey dataset, save it to CSV and return it.

    Each record gets an identifier, a survey date between START_DATE and
    END_DATE, categorical context/demographic fields, four 1-5 Likert
    scores and a canned free-text comment. Data-quality problems are then
    injected on purpose:

    * exactly ``int(n_records * 0.1)`` rows per column are set to NaN in
      ``aid_satisfaction``, ``gender`` and ``trust_in_aid_provider``
      (rows are chosen independently for each column);
    * ``int(n_records * 0.01)`` rows are sampled for corruption of
      ``aid_provider``; only sampled rows whose provider is ``'UNHCR'`` or
      ``'IRC'`` are actually altered, so the share of corrupted rows is
      below 1%.

    Args:
        n_records: Number of rows to generate.
        seed: Optional seed. When given, both the ``random`` module and the
            global NumPy generator are seeded so the output is repeatable.
            When ``None`` (default) the global RNG state is left untouched.

    Returns:
        The generated ``pandas.DataFrame``. As a side effect the frame is
        written (without the index) to ``CSV_FILE`` and a confirmation
        message is printed.
    """
    if seed is not None:
        # Both generators are used below, so both must be seeded.
        random.seed(seed)
        np.random.seed(seed)

    likert_scale = [1, 2, 3, 4, 5]
    data = {}

    # 2.1 Identifiers and Time
    data['response_id'] = [f"GTS-{i+1:05d}" for i in range(n_records)]
    span_days = (END_DATE - START_DATE).days
    data['survey_date'] = [
        (START_DATE + timedelta(days=random.randint(0, span_days))).strftime('%Y-%m-%d')
        for _ in range(n_records)
    ]

    # 2.2 Context and Demographics
    data['location'] = np.random.choice(LOCATIONS, n_records, p=[0.25, 0.20, 0.15, 0.15, 0.15, 0.10])
    data['aid_provider'] = np.random.choice(AID_PROVIDERS, n_records, p=[0.25, 0.20, 0.15, 0.15, 0.15, 0.10])
    data['displacement_status'] = np.random.choice(DISPLACEMENT_STATUS, n_records, p=[0.4, 0.3, 0.2, 0.1])
    data['gender'] = np.random.choice(GENDERS, n_records, p=[0.45, 0.45, 0.05, 0.05])
    data['age_group'] = np.random.choice(AGE_GROUPS, n_records, p=[0.2, 0.4, 0.3, 0.1])

    # 2.3 Core Feedback Metrics (Likert Scales 1-5)
    data['aid_satisfaction'] = np.random.choice(likert_scale, n_records, p=[0.1, 0.1, 0.2, 0.3, 0.3])
    data['trust_in_aid_provider'] = np.random.choice(likert_scale, n_records, p=[0.2, 0.2, 0.3, 0.2, 0.1])
    data['communication_clarity'] = np.random.choice(likert_scale, n_records, p=[0.1, 0.15, 0.25, 0.3, 0.2])
    data['aid_fairness'] = np.random.choice(likert_scale, n_records, p=[0.1, 0.15, 0.3, 0.25, 0.2])

    # 2.4 Open-ended Comment
    # Repeating entries in the pool weights the uniform draw towards them.
    comments_pool = (
        ["Timely assistance.", "Staff were helpful."] * 4
        + ["The waiting time was too long.", "Staff were rude."] * 3
        + ["No comment."] * 3
    )
    data['feedback_comment'] = np.random.choice(comments_pool, n_records)

    df = pd.DataFrame(data)

    # 3. Introduce Data Quality Issues (Missing and Typos)

    # Missing Data (10% of rows per column, sampled independently)
    n_missing = int(n_records * 0.1)
    for col in ['aid_satisfaction', 'gender', 'trust_in_aid_provider']:
        # An integer column cannot hold NaN. Cast explicitly instead of
        # relying on the implicit upcast during assignment, which newer
        # pandas versions deprecate. The resulting dtype is the same.
        if pd.api.types.is_integer_dtype(df[col].dtype):
            df[col] = df[col].astype('float64')
        missing_rows = np.random.choice(n_records, n_missing, replace=False)
        df.loc[missing_rows, col] = np.nan

    # Inconsistent/Corrupted Data: sample 1% of rows and corrupt the provider
    # name where a corruption is defined for it; other providers stay as-is.
    provider_typos = {
        'UNHCR': 'UNHCR ',  # Trailing space
        'IRC': 'Intl Rescue Commitee',  # Misspelling
    }
    typo_indices = np.random.choice(n_records, int(n_records * 0.01), replace=False)
    if typo_indices.size:
        sampled = df.loc[typo_indices, 'aid_provider']
        df.loc[typo_indices, 'aid_provider'] = sampled.map(
            lambda name: provider_typos.get(name, name)
        )

    # Save to CSV
    df.to_csv(CSV_FILE, index=False)
    print(f"Successfully generated {n_records} records and saved to {CSV_FILE}")
    return df

# Generate the dataset once at module level; the call also writes the CSV file.
df_synthetic = generate_synthetic_feedback(n_records=N_RECORDS)

# Output: Successfully generated 10000 records and saved to synthetic_gts_survey_data.csv
