# Data Generation 

In [6]:
import csv, uuid
import pandas as pd
import numpy as np
from datetime import datetime, timedelta
import matplotlib.pyplot as plt
import seaborn as sns

In [7]:
# --- CONFIG ---
CONFIG = {
    "days": 30,
    "interval_min": 30,
    "intervals": 1440 // 30,
    "start_date": datetime(2025, 8, 1),

    "queues": ["support", "billing", "customer service", "sales"],
    "queue_p": [0.5, 0.15, 0.25, 0.10],

    "sentiments": ["Positive", "Neutral", "Negative"],
    "sentiment_p": [0.6, 0.25, 0.15],

    "csat": [1, 2, 3, 4, 5],
    "csat_p": [0.1, 0.1, 0.15, 0.3, 0.35],

    # weighted (prob, (low, high)) in seconds
    "wait_dist":   [(0.7, (0, 30)), (0.2, (31, 60)), (0.1, (61, 180))],
    "handle_dist": [(0.50, (180, 300)), (0.30, (301, 600)), (0.20, (601, 1200))],

    "resolve_p": 0.85,
    "fcr_p": 0.65,

    "col_names": [
        "call_id","agent_id","queue","date","time","interval_num",
        "wait_time","abandoned","customer_id","handle_time",
        "resolution_time","resolved","fcr","sentiment","csat_score"
    ]
}

SLA = {"max_wait_time": 60}

In [61]:
# --- RNG helpers (NumPy) ---
rng      = np.random.default_rng(42)
pick     = lambda items, p=None: rng.choice(items, p=p)
randint  = lambda lo, hi: rng.integers(lo, hi + 1)

def sample_dist(dist):
    """Pick a bucket by probability, then sample uniformly inside."""
    probs  = np.array([p for p, _ in dist], dtype=float)
    probs /= probs.sum()
    lows   = np.array([lo for _, (lo, hi) in dist])
    highs  = np.array([hi for _, (lo, hi) in dist])
    idx    = rng.choice(len(dist), p=probs)
    return int(randint(lows[idx], highs[idx]))

def modifier(day, interval):
    """Weekend busier; mid-day peak higher."""
    weekday = 1.0 if day < 5 else 1.4
    peak    = 1.3 if CONFIG["intervals"] * 0.3 <= interval <= CONFIG["intervals"] * 0.7 else 0.8
    return weekday * peak

In [62]:
# --- One call record ---
def gen_call(day, interval):
    # timestamp within interval (random minute)
    minutes  = interval * CONFIG["interval_min"] + randint(0, CONFIG["interval_min"] - 1)
    ts       = CONFIG["start_date"] + timedelta(days=day, minutes=int(minutes))

    wait     = sample_dist(CONFIG["wait_dist"])
    # abandon only if past SLA and with some probability (not all)
    abandoned = (wait > SLA["max_wait_time"]) and (rng.random() < 0.75)

    handle   = 0 if abandoned else sample_dist(CONFIG["handle_dist"])
    resolved = False if abandoned else (rng.random() < CONFIG["resolve_p"])
    fcr      = bool(resolved and (rng.random() < CONFIG["fcr_p"]))

    # csat only if not abandoned and not skipped (~15% skip)
    csat = None
    if not abandoned and rng.random() >= 0.15:
        csat = int(pick(CONFIG["csat"], CONFIG["csat_p"]))

    return [
        str(uuid.uuid4())[:13],
        f"agent_{int(randint(1, 100))}" if not abandoned else None,
        pick(CONFIG["queues"], CONFIG["queue_p"]),
        ts.date().isoformat(),
        ts.time().strftime("%H:%M:%S"),
        int(interval + 1),
        int(wait),
        bool(abandoned),
        f"cust_{int(randint(1000, 10_000_000))}" if not abandoned else None,
        int(handle),
        int(handle + wait) if not abandoned else 0,  # simple total time proxy
        bool(resolved),
        bool(fcr),
        (pick(CONFIG["sentiments"], CONFIG["sentiment_p"]) if not abandoned else ""),
        csat,
    ]

# --- All calls ---
def generate_calls(base_calls=100):
    calls = []
    for d in range(CONFIG["days"]):
        for i in range(CONFIG["intervals"]):
            # Poisson arrivals per interval (more realistic than uniform jitter)
            lam = base_calls * modifier(d, i)
            n_calls = int(rng.poisson(lam=lam))
            # Append actual records (not a generator)
            for _ in range(n_calls):
                calls.append(gen_call(d, i))
    return calls

def write_csv(filename, rows, headers):
    with open(filename, "w", newline="") as f:
        writer = csv.writer(f)
        writer.writerow(headers)
        writer.writerows(rows)
    print(f"✅ {len(rows)} calls → {filename}")

# --- Run ---
calls = generate_calls(base_calls=100)
write_csv("../data/raw_calls.csv", calls, CONFIG["col_names"])


✅ 191605 calls → ../data/raw_calls.csv


In [8]:
df = pd.read_csv("../data/raw_calls.csv")
df.head()

         call_id  agent_id             queue        date      time  \
0  1deebd4b-9fb4  agent_98           support  2025-08-01  00:12:00   
1  ae780011-18da  agent_86           billing  2025-08-01  00:11:00   
2  ae9e302b-d4f9  agent_20  customer service  2025-08-01  00:21:00   
3  c9853ff5-3c37  agent_95  customer service  2025-08-01  00:29:00   
4  d48e35ce-9e3f  agent_39           support  2025-08-01  00:02:00   

   interval_num  wait_time  abandoned   customer_id  handle_time  \
0             1         26      False  cust_5004019          243   
1             1         12      False  cust_8276484          279   
2             1        103      False  cust_3667060          528   
3             1          2      False  cust_4372082          236   
4             1         40      False  cust_8049187         1106   

   resolution_time  resolved    fcr sentiment  csat_score  
0              269      True  False  Negative         NaN  
1              291      True   True   Neutral     

- Call Volume → total calls per interval/day/week.
- Arrival Rate → calls per minute/hour.
- Service Level (SL) → % of calls answered within SLA wait time.
- Average Wait Time (AWT) → mean time customers spend waiting.
- Max Wait Time → longest wait experienced.
- Queue Length → how many customers are waiting per interval.
- Abandonment Rate → % of calls dropped before answer.
- Re-Queue Rate → % of calls transferred between queues.

👩‍💻 Agent Performance Metrics
- Average Handle Time (AHT) → talk + wrap-up.
- Occupancy → % of agent time spent on calls vs idle.
- Utilization → calls handled vs capacity.
- First Call Resolution (FCR) → % resolved on first attempt.
- Agent Transfers → number of transfers per agent.
- Calls per Agent → workload distribution.
- Adherence → time agents stick to scheduled shifts.

😊 Customer Experience Metrics
- Customer Satisfaction (CSAT) → post-call ratings (1–5).
- Net Promoter Score (NPS) (if you simulate it).
- Sentiment Analysis → positive/neutral/negative tone of call.
- Resolution Rate → % of calls resolved successfully.
- Repeat Calls → same customer calling multiple times in period.

💰 Business/Strategic Metrics
- Call Distribution by Queue → e.g., tech vs billing vs sales.
- Trend Analysis → by day, week, peak vs off-peak.

Main Metrics:

- Call Volume
- Service Level
- Average Wait Time
- Abandonment Rate
- Average Handle Time
- FCR
- CSAT
- Sentiment


In [10]:
call_volume = len(df)
answered_calls = df['abandoned'].value_counts()[False]
calls_per_day = pd.DataFrame(df.groupby('date')['date'].agg('count'))

calls_per_interval = df.groupby(['interval_num',"date"])["interval_num"].agg('count')
# sns.lineplot(data=calls_per_day)
# plt.title('Simple Line Plot')
# plt.pie(df['sentiment'].value_counts().sort_index(),x="sentiment")
# pt.show()
# plt.show()
print(calls_per_day)
# # Create a line plot with hue for multiple lines
# sns.lineplot(data=df, x='time', y='value_A', hue='value_B') # Note: 'value_B' as hue will treat it as categories if not continuous
# plt.title('Line Plot with Hue')
# plt.show()

            date
date            
2025-08-01  4789
2025-08-02  4785
2025-08-03  4771
2025-08-04  4857
2025-08-05  4793
2025-08-06  6624
2025-08-07  6772
2025-08-08  6650
2025-08-09  6784
2025-08-10  6656
2025-08-11  6642
2025-08-12  6605
2025-08-13  6642
2025-08-14  6668
2025-08-15  6595
2025-08-16  6686
2025-08-17  7007
2025-08-18  6811
2025-08-19  6575
2025-08-20  6778
2025-08-21  6825
2025-08-22  6671
2025-08-23  6625
2025-08-24  6657
2025-08-25  6733
2025-08-26  6616
2025-08-27  6764
2025-08-28  6775
2025-08-29  6743
2025-08-30  6706
