# 04 — Batch Scoring & Export for Power BI

**Goal (Day 2):** Score all cases either **locally** (saved model) or via **Azure ML Online Endpoint**, then export CSVs for Power BI:
- `artifacts/powerbi/cases.csv` — case-level KPIs + risk
- `artifacts/powerbi/events.csv` — event log (Case ID, Activity, Timestamp, Resource)
- `artifacts/powerbi/edges.csv` — directly-follows edges with counts/avg gap (minutes)

Set `SCORING_MODE` to `local` or `endpoint`. For endpoint scoring, set env vars:
- `AML_ENDPOINT_URL` — scoring URI
- `AML_ENDPOINT_KEY` — primary key (Bearer)

In [None]:
import os, json, math, time, joblib, requests
import pandas as pd, numpy as np
from pathlib import Path

DATA_DIR = Path('../data')
ART_DIR = Path('../artifacts'); ART_DIR.mkdir(exist_ok=True)
PBI_DIR = ART_DIR / 'powerbi'; PBI_DIR.mkdir(parents=True, exist_ok=True)

# Inputs from Day 1
LABELED = ART_DIR / 'processed_case_features_labeled.csv'
MODEL_DIR = ART_DIR / 'model'
MODEL_PATH = MODEL_DIR / 'model.joblib'
FEATS_JSON = MODEL_DIR / 'feature_names.json'

# Event log for Power BI visualizations
CSV_PATH = DATA_DIR / 'bpi2012.csv'
XES_PATH = DATA_DIR / 'BPI_Challenge_2012.xes'

# Output CSVs
CASES_CSV = PBI_DIR / 'cases.csv'
EVENTS_CSV = PBI_DIR / 'events.csv'
EDGES_CSV = PBI_DIR / 'edges.csv'

# Choose: 'local' or 'endpoint'
# SCORING_MODE = os.getenv('SCORING_MODE', 'local')
SCORING_MODE = 'endpoint'
BATCH_SIZE = int(os.getenv('BATCH_SIZE', '128'))

In [None]:
# Utilities
def ensure_schema(df, case_col, act_col, ts_col, res_col=None):
    df = df.rename(columns={
        case_col: 'case_id',
        act_col: 'activity',
        ts_col: 'timestamp',
        **({res_col: 'resource'} if res_col and res_col in df.columns else {})
    })
    df['timestamp'] = pd.to_datetime(df['timestamp'], utc=True, errors='coerce')
    df = df.dropna(subset=['case_id','activity','timestamp'])
    return df

def load_event_log():
    if CSV_PATH.exists():
        df = pd.read_csv(CSV_PATH)
        return ensure_schema(df, 'case_id','activity','timestamp','resource')
    elif XES_PATH.exists():
        from pm4py.objects.log.importer.xes import importer as xes_importer
        from pm4py import convert_to_dataframe
        log = xes_importer.apply(str(XES_PATH))
        df = convert_to_dataframe(log)
        df = df.rename(columns={
            'case:concept:name':'case_id',
            'concept:name':'activity',
            'time:timestamp':'timestamp',
            'org:resource':'resource'
        })
        return ensure_schema(df, 'case_id','activity','timestamp','resource')
    else:
        raise FileNotFoundError('Place CSV or XES in data/.')

def make_edges(df):
    # directly-follows edges with counts and avg inter-event minutes
    df = df.sort_values(['case_id','timestamp']).copy()
    df['next_activity'] = df.groupby('case_id')['activity'].shift(-1)
    df['next_ts'] = df.groupby('case_id')['timestamp'].shift(-1)
    edges = df.dropna(subset=['next_activity']).copy()
    edges['gap_min'] = (edges['next_ts'] - edges['timestamp']).dt.total_seconds()/60.0
    g = edges.groupby(['activity','next_activity']).agg(
        count=('case_id','count'),
        avg_gap_min=('gap_min','mean')
    ).reset_index().rename(columns={'activity':'from','next_activity':'to'})
    return g

def predict_local(X):
    import json
    model = joblib.load(MODEL_PATH)
    with open(FEATS_JSON,'r') as f:
        feats = json.load(f)
    X = X.reindex(columns=feats, fill_value=0)
    prob = model.predict_proba(X)[:,1]
    return prob

def predict_endpoint(X):
    url = os.getenv('AML_ENDPOINT_URL'); key = os.getenv('AML_ENDPOINT_KEY')
    # url = 'https://bpi2012-risk-endpoint.koreacentral.inference.ml.azure.com/score'
    
    assert url and key, 'Set AML_ENDPOINT_URL and AML_ENDPOINT_KEY'
    out = np.zeros(len(X), dtype=float)
    cols = list(X.columns)
    i = 0
    while i < len(X):
        chunk = X.iloc[i:i+BATCH_SIZE]
        payload = {'instances':[ {c: float(v) for c,v in row.items()} for _,row in chunk.iterrows() ]}
        t0 = time.time()
        r = requests.post(url, headers={'Authorization': f'Bearer {key}','Content-Type':'application/json'},
                          data=json.dumps(payload), timeout=60)
        if r.status_code != 200:
            raise RuntimeError(f'HTTP {r.status_code}: {r.text[:200]}')
        resp = json.loads(r.json())
        prob = resp.get('probabilities') or resp.get('prob') or []
        out[i:i+len(chunk)] = np.array(prob, dtype=float)
        i += len(chunk)
    return out

In [29]:
# Load labeled features from Day 1
labeled = pd.read_csv(LABELED)
feature_cols = [c for c in labeled.columns if c not in ('case_id','label_late')]
X = labeled[feature_cols].copy()

if SCORING_MODE == 'local':
    print('[SCORING] Local model')
    prob = predict_local(X)
else:
    print('[SCORING] Endpoint')
    prob = predict_endpoint(X)

labeled['risk_score'] = prob
labeled['risk_bin'] = pd.qcut(labeled['risk_score'], q=10, labels=False, duplicates='drop')
labeled['pred_late@0.5'] = (labeled['risk_score'] >= 0.5).astype(int)
labeled.rename(columns={'label_late':'y_late'}, inplace=True)

# Case-level export for Power BI
CASES = labeled.copy()
CASES.to_csv(CASES_CSV, index=False)
print('[OK] Wrote', CASES_CSV)

[SCORING] Endpoint
<class 'dict'>
{'prob': [6.251074768925946e-09], 'pred': [0]}
<class 'dict'>
{'prob': [6.251074768925946e-09], 'pred': [0]}
<class 'dict'>
{'prob': [6.251074768925946e-09], 'pred': [0]}
<class 'dict'>
{'prob': [6.251074768925946e-09], 'pred': [0]}
<class 'dict'>
{'prob': [6.251074768925946e-09], 'pred': [0]}
<class 'dict'>
{'prob': [6.251074768925946e-09], 'pred': [0]}
<class 'dict'>
{'prob': [6.251074768925946e-09], 'pred': [0]}
<class 'dict'>
{'prob': [6.251074768925946e-09], 'pred': [0]}
<class 'dict'>
{'prob': [6.251074768925946e-09], 'pred': [0]}
<class 'dict'>
{'prob': [6.251074768925946e-09], 'pred': [0]}
<class 'dict'>
{'prob': [6.251074768925946e-09], 'pred': [0]}
<class 'dict'>
{'prob': [6.251074768925946e-09], 'pred': [0]}
<class 'dict'>
{'prob': [6.251074768925946e-09], 'pred': [0]}
<class 'dict'>
{'prob': [6.251074768925946e-09], 'pred': [0]}
<class 'dict'>
{'prob': [6.251074768925946e-09], 'pred': [0]}
<class 'dict'>
{'prob': [6.251074768925946e-09], 'p

In [30]:
# Build event and edges tables for Power BI
events = load_event_log()[['case_id','activity','timestamp','resource']].copy()
events.sort_values(['case_id','timestamp'], inplace=True)
events.to_csv(EVENTS_CSV, index=False)
print('[OK] Wrote', EVENTS_CSV)

edges = make_edges(events)
edges.to_csv(EDGES_CSV, index=False)
print('[OK] Wrote', EDGES_CSV)

parsing log, completed traces :: 100%|██████████| 13087/13087 [00:05<00:00, 2591.70it/s]


[OK] Wrote artifacts/powerbi/events.csv
[OK] Wrote artifacts/powerbi/edges.csv


### Notes
- To get **scoring URI** and **key** for Azure ML Online Endpoint, use CLI (replace name):
  ```bash
  az ml online-endpoint show -n <endpoint> --query scoring_uri -o tsv
  az ml online-endpoint get-credentials -n <endpoint> --query primaryKey -o tsv
  ```
- Set them as environment variables: `AML_ENDPOINT_URL`, `AML_ENDPOINT_KEY` then rerun with `SCORING_MODE='endpoint'`.