# Synthetic QoE Session Generator

- Simulates large-scale, privacy-safe application sessions for eight Asia/US regions.
- Operator and RAT-aware: realistic mixes, radio context, and device variety.
- Physics-based: SINR-to-throughput mapping, realistic RTT/jitter, and noise variance scaled by a radio quality score.
- Core observables: `page_load_time_ms`, `startup_delay_ms`, `buffering_ratio`, plus radio and context features.
- Sane distributions: per-device page sizes, modern access (4G/5G/Wi-Fi), and injected missingness (about 0.5–1% NaNs per affected column).
- Rich identifiers (session, operator, cell, band/channel, timestamp) for slicing, joining, and time analysis.
- Output: `synthetic_qoe_sessions.csv` for downstream cleaning and analytics.


In [None]:
# Step 1: Imports
import pandas as pd
import numpy as np
import uuid
from typing import Optional

# Step 2: Fix the global NumPy seed so the random draws below repeat across runs
np.random.seed(42)

# Step 3: Number of synthetic sessions (rows) to generate
N = 100_000

# Step 4: Categorical domains
countries = ['ID', 'SG', 'MY', 'US', 'IN', 'CN', 'PH', 'VN']
devices = ['Desktop', 'Mobile', 'Tablet']
network_types = ['4G', '5G', 'WiFi']  # 3G is intentionally not part of the mix

# Operator market shares per country; the shares of each country sum to 1.
operator_shares = {
    'ID': {'Telkomsel': 0.40, 'Indosat': 0.25, 'XL': 0.25, 'Tri': 0.10},
    'SG': {'Singtel': 0.50, 'StarHub': 0.25, 'M1': 0.25},
    'MY': {'Maxis': 0.40, 'CelcomDigi': 0.40, 'U Mobile': 0.20},
    'US': {'Verizon': 0.33, 'AT&T': 0.33, 'T-Mobile': 0.34},
    'IN': {'Jio': 0.50, 'Airtel': 0.35, 'Vi': 0.15},
    'CN': {'China Mobile': 0.50, 'China Unicom': 0.25, 'China Telecom': 0.25},
    'PH': {'Globe': 0.50, 'Smart': 0.50},
    'VN': {'Viettel': 0.50, 'MobiFone': 0.25, 'VinaPhone': 0.25},
}

# Same data as (operator names, weights) pairs, keyed by country code.
operators_by_country = {
    country: (list(shares), list(shares.values()))
    for country, shares in operator_shares.items()
}

# Step 5: RAT physics/latency settings (smooth mapping)
peak_mbps_map = {'4G': 150, '5G': 1200, 'WiFi': 200}  # illustrative peak rates, Mbps
base_rtt_map  = {'5G': 25, 'WiFi': 35, '4G': 50}      # baseline round-trip time, ms

# Step 6: Time axis and per-session base categories
# One timestamp per session, one minute apart, starting 2024-08-01 00:00.
timestamps = pd.date_range(start='2024-08-01', periods=N, freq='min')

# Countries and devices are drawn uniformly.
country_list = np.random.choice(countries, size=N)
device_list = np.random.choice(devices, size=N)

# 4G/5G/WiFi shares of 0.40/0.20/0.30, renormalized over their 0.90 total so
# they sum to 1 (presumably the missing 0.10 belonged to the removed 3G tier).
network_type_weights = [0.40/0.90, 0.20/0.90, 0.30/0.90]
network_types_list = np.random.choice(network_types, size=N, p=network_type_weights)

# Helper: operator for a session (WiFi sessions carry a generic 'WiFi' tag)
def pick_operator(country, rat):
    """Return the operator label for one session.

    Cellular sessions get an operator drawn from the country's entry in
    ``operators_by_country``, weighted by the listed shares. WiFi sessions
    are not tied to a mobile operator and always return the string 'WiFi'
    without consuming a random draw.
    """
    if rat != 'WiFi':
        names, weights = operators_by_country[country]
        return np.random.choice(names, p=weights)
    return 'WiFi'

# Helper: band/channel + radio KPIs
# Per-RAT sampling parameters for cellular sessions.
#   bands     : (band name, inclusive downlink channel-number range);
#               EARFCN for 4G, NR-ARFCN for 5G
#   rsrp/rsrq/sinr : (mean, standard deviation) of the normal draw
#   pci_count : number of physical cell IDs (LTE: 0..503, NR: 0..1007)
#   cid_max   : exclusive upper bound for the synthetic cell id
_CELLULAR_RADIO_PROFILES = {
    '4G': {
        'bands': [('B3', (1200, 1949)), ('B7', (2750, 3449)), ('B8', (3450, 3799))],
        'rsrp': (-95, 8),
        'rsrq': (-9, 2),
        'sinr': (12, 5),
        'pci_count': 504,
        'cid_max': 50_000_000,
    },
    '5G': {
        # n78 covers 3300-3800 MHz, i.e. NR-ARFCN 620000-653333; the wider
        # 620000-680000 range belongs to n77 and must not be emitted for n78.
        'bands': [('n78', (620000, 653333)), ('n41', (499200, 537999)), ('n28', (151600, 160600))],
        'rsrp': (-90, 7),
        'rsrq': (-8, 1.8),
        'sinr': (18, 6),
        'pci_count': 1008,
        'cid_max': 200_000_000,
    },
}


def sample_radio(rat):
    """Draw one synthetic radio snapshot for a session on the given RAT.

    Parameters
    ----------
    rat : str
        Radio access technology: '4G', '5G' or 'WiFi'.

    Returns
    -------
    tuple
        ``(rsrp, rsrq, sinr, pci, tac, cid, band, chan)``.

        * '4G' / '5G': every field is populated. RSRP is clipped to
          [-125, -60] dBm, RSRQ to [-20, -3] dB and SINR to [-3, 35] dB;
          ``chan`` lies inside the valid channel range of ``band``.
        * 'WiFi': only ``sinr`` (clipped to [0, 40] dB) is populated; all
          other fields are ``np.nan``.
        * any other value: every field is ``np.nan``.
    """
    # Everything starts as "not applicable"; branches fill in what the RAT has.
    rsrp = rsrq = sinr = np.nan
    pci = tac = cid = np.nan
    band = chan = np.nan

    profile = _CELLULAR_RADIO_PROFILES.get(rat)
    if profile is not None:
        # The draw order (band, channel, RSRP, RSRQ, SINR, PCI, TAC, cell id)
        # is fixed so results stay reproducible under a seeded global RNG.
        bands = profile['bands']
        band, (chan_lo, chan_hi) = bands[np.random.randint(len(bands))]
        chan = np.random.randint(chan_lo, chan_hi + 1)  # upper bound inclusive
        rsrp = float(np.clip(np.random.normal(*profile['rsrp']), -125, -60))
        rsrq = float(np.clip(np.random.normal(*profile['rsrq']), -20, -3))
        sinr = float(np.clip(np.random.normal(*profile['sinr']), -3, 35))
        pci = int(np.random.randint(0, profile['pci_count']))
        tac = int(np.random.randint(1, 65536))
        cid = int(np.random.randint(10_000, profile['cid_max']))
    elif rat == 'WiFi':
        # WiFi has no cellular identifiers; only a signal-quality figure.
        sinr = float(np.clip(np.random.normal(25, 7), 0, 40))

    return rsrp, rsrq, sinr, pci, tac, cid, band, chan

# Helper: radio quality score in [0,1]
def radio_quality_score(rat: str,
                        rsrp: Optional[float],
                        sinr: Optional[float]) -> float:
    """
    Map radio measurements to a quality score in [0, 1] (higher is better).

    * '4G' / '5G': 0.4 * normalized RSRP (-125..-60 dBm) plus
      0.6 * normalized SINR (-3..35 dB).
    * 'WiFi': SINR normalized over 0..40 dB; RSRP is ignored.
    * anything else: a neutral 0.5.

    A metric that is None or NaN is replaced by the bottom of its range, so
    it contributes 0 to the score. The result is always clipped to [0, 1].
    """
    def _value_or(metric, floor):
        # Substitute the range floor for a missing (None / NaN) measurement.
        is_missing = metric is None or np.isnan(metric)
        return floor if is_missing else float(metric)

    if rat == 'WiFi':
        score = _value_or(sinr, 0.0) / 40.0                    # 0..40 → 0..1
    elif rat in ('4G', '5G'):
        rsrp_part = (_value_or(rsrp, -125.0) + 125.0) / 65.0   # -125..-60 → 0..1
        sinr_part = (_value_or(sinr, -3.0) + 3.0) / 38.0       #  -3..35  → 0..1
        score = 0.4 * rsrp_part + 0.6 * sinr_part
    else:
        score = 0.5                                            # unknown RAT: neutral

    return float(np.clip(score, 0.0, 1.0))

# Device-level typical page size (KB) medians
# Used below as the median of the lognormal page-size draw for each device.
device_page_median = {'Desktop': 1800.0, 'Mobile': 900.0, 'Tablet': 1400.0}

# Step 7: Generate rows (smooth radio→QoE; variance tied to q)
# One accumulator list per output column; each loop iteration appends exactly
# one value to every list, so all lists end up with N entries.
page_load_time_ms, buffering_ratio, startup_delay_ms = [], [], []
operators, rsrp_dbm, rsrq_db, sinr_db = [], [], [], []
pci_list, tac_list, cell_id_list, band_list, chan_list = [], [], [], [], []
app_size_kb_list, rtt_ms_list = [], []

# NOTE: every random value comes from the global np.random stream, so the
# order of the draws inside this loop determines the generated dataset.
# Reordering, adding or removing a draw changes all subsequent values.
for i in range(N):
    nt = network_types_list[i]   # network type of this session
    dev = device_list[i]         # device of this session

    # Radio & IDs
    # Fields that do not apply to the RAT come back as NaN from sample_radio.
    rsrp, rsrq, sinr, pci, tac, cid, band, chan = sample_radio(nt)
    rsrp_dbm.append(rsrp); rsrq_db.append(rsrq); sinr_db.append(sinr)
    pci_list.append(pci); tac_list.append(tac); cell_id_list.append(cid)
    band_list.append(band); chan_list.append(chan)

    # Operator
    op = pick_operator(country_list[i], nt)
    operators.append(op)

    # Quality + load
    q: float = radio_quality_score(nt, rsrp, sinr)   # 0..1 (↑ better)
    cell_load = np.random.beta(2, 5)                 # 0..1 (mostly light)

    # ---- Throughput (Mbps) ----
    # peak rate × logistic SINR efficiency × quality factor × load factor.
    peak_mbps = peak_mbps_map[nt]
    # Treat a missing SINR as 0 dB so the logistic below stays finite.
    sinr_eff: float = 0.0 if (sinr is None or np.isnan(sinr)) else float(sinr)
    # Efficiency rises from 0.1 towards 1.0; midpoint at 5 dB, 4 dB scale.
    eff = 0.1 + 0.9 * (1.0 / (1.0 + np.exp(-(sinr_eff - 5.0) / 4.0)))  # logistic(SINR)
    # Quality scales throughput by 0.5..1.0; full load removes up to 60%.
    throughput_mbps = peak_mbps * eff * (0.5 + 0.5*q) * (1 - 0.6*cell_load)
    # Floor at 0.5 Mbps, which also bounds transfer_ms below.
    throughput_mbps = max(throughput_mbps, 0.5)

    # ---- Latency / RTT (ms), variance depends on q ----
    base_rtt = base_rtt_map[nt]
    # np.log(jitter_mean) is the log-space mean of the lognormal draw, so
    # jitter_mean (8..28 ms) is the median of the sampled jitter.
    jitter_mean = 8.0 + 20.0*(1.0 - q)              # ms
    jitter_sigma = 0.35 + 0.25*(1.0 - q)            # lognormal sigma
    jitter_ms = np.random.lognormal(mean=np.log(jitter_mean), sigma=jitter_sigma)
    rtt_ms = base_rtt + jitter_ms + 20.0*(1.0 - q)  # small extra penalty on bad radio
    rtt_ms_list.append(float(rtt_ms))

    # ---- App/page size (KB), variance depends on q & load ----
    base_median = device_page_median[dev]
    # The median is inflated by up to 12% for poor radio and 10% for a loaded cell.
    overhead = 1.0 + 0.12*(1.0 - q) + 0.10*cell_load
    size_sigma = 0.45 + 0.25*(1.0 - q)
    size_mean  = np.log(base_median * overhead)     # log-space mean of the draw
    page_kb = np.random.lognormal(mean=size_mean, sigma=size_sigma)
    page_kb = float(np.clip(page_kb, 100.0, 8000.0))  # keep within 100 KB..8 MB
    app_size_kb_list.append(page_kb)

    # ---- Page load time ----
    # transfer time + three round trips + zero-mean noise.
    transfer_ms  = (page_kb / (throughput_mbps * 125.0)) * 1000.0  # 1 Mbps = 125 KB/s
    handshake_ms = 3.0 * rtt_ms
    op_noise_sd = 40.0 + 80.0*(1.0 - q)            # noisier when radio is bad
    op_bias = np.random.normal(0.0, op_noise_sd)
    # Clipped to 400..8000 ms, then truncated to a whole number of ms.
    plt_val = int(np.clip(transfer_ms + handshake_ms + op_bias, 400.0, 8000.0))
    page_load_time_ms.append(plt_val)

    # ---- Startup delay ----
    # Normal draw centered on two round trips plus a radio penalty (up to
    # 160 ms), standard deviation 40..80 ms, clipped to 80..2500 ms.
    sdl = int(np.clip(np.random.normal(loc=rtt_ms*2.0 + 160.0*(1.0 - q),
                                       scale=40.0 + 40.0*(1.0 - q)),
                      80.0, 2500.0))
    startup_delay_ms.append(sdl)

    # ---- Buffering ratio (0..0.20) ----
    # Page load time rescaled from its clip range 400..8000 ms onto 0..1.
    page_load_norm = (plt_val - 400.0) / (8000.0 - 400.0)
    # Linear in load time and radio badness, plus Gaussian noise (sd 0.01).
    buf = 0.01 + 0.22*page_load_norm + 0.18*(1.0 - q) + np.random.normal(0.0, 0.01)
    buffering_ratio.append(float(np.clip(buf, 0.0, 0.20)))

# Step 8: Assemble DataFrame
# Session IDs are built from a dedicated seeded generator so the exported
# dataset is identical on every run: uuid.uuid4() reads OS entropy and is not
# affected by np.random.seed(). A separate generator is used (instead of the
# global np.random stream) so that the NaN injection performed afterwards keeps
# drawing from an undisturbed global stream.
session_id_rng = np.random.default_rng(42)  # same seed value as Step 2
session_ids = [
    # version=4 sets the RFC 4122 variant/version bits on the random bytes.
    str(uuid.UUID(bytes=session_id_rng.bytes(16), version=4))
    for _ in range(N)
]

# One row per session; every per-session list/array has length N.
df = pd.DataFrame({
    'timestamp': timestamps,
    'country': country_list,
    'device': device_list,
    'network_type': network_types_list,
    'operator': operators,
    'page_load_time_ms': page_load_time_ms,
    'buffering_ratio': buffering_ratio,
    'startup_delay_ms': startup_delay_ms,
    'rsrp_dbm': rsrp_dbm,
    'rsrq_db': rsrq_db,
    'sinr_db': sinr_db,
    'pci': pci_list,
    'tac': tac_list,
    'cell_id': cell_id_list,
    'band': band_list,
    'channel_number': chan_list,
    'app_size_kb': app_size_kb_list,
    'rtt_ms': rtt_ms_list,
    'session_id': session_ids,
})

# Steps 9-10: inject missing values.
# Each entry is (fraction of rows to blank, columns to blank). Every column
# gets its own independent sample of row labels, drawn without replacement:
#   Step 9  - radio/ID columns at 1%, then the newer observables at 0.5%
#   Step 10 - QoE columns at 1%
missing_plan = [
    (0.01, ['rsrp_dbm','rsrq_db','sinr_db','pci','tac','cell_id','band','channel_number']),
    (0.005, ['app_size_kb', 'rtt_ms']),
    (0.01, ['page_load_time_ms', 'buffering_ratio', 'startup_delay_ms']),
]
for nan_fraction, target_cols in missing_plan:
    for col in target_cols:
        nan_idx = np.random.choice(df.index, size=int(nan_fraction * len(df)), replace=False)
        df.loc[nan_idx, col] = np.nan

# Step 11: Write the dataset to disk (the row index is not exported)
output_csv = "synthetic_qoe_sessions.csv"
df.to_csv(output_csv, index=False)

# Step 12: Sanity check - report the dimensions and preview the first rows
print(f"Generated dataset with shape: {df.shape}")
preview = df.head()
display(preview)
