# 04 — Correlation Timeline (Staging → Exfil)

This notebook correlates endpoint staging (archive creation), DLP incidents, and proxy uploads to build an investigation timeline.

**Data:** `edr.csv`, `dlp.csv`, `proxy.csv`

In [None]:
import pandas as pd
import numpy as np
from pathlib import Path

# The three telemetry exports sit side by side under ./data. Each is read with
# its `timestamp` column passed to parse_dates so later cells can do time
# arithmetic on it.
data_dir = Path('data')
edr, dlp, proxy = (
    pd.read_csv(data_dir / csv_name, parse_dates=['timestamp'])
    for csv_name in ('edr.csv', 'dlp.csv', 'proxy.csv')
)

# Quick look at the endpoint (EDR) telemetry.
edr.head()

## Find archive creation events on endpoints

In [None]:
# Staging signal: file_create events by the subject user whose path ends in an
# archive extension.
is_subject = edr['user'] == 'engineer.a'
is_file_create = edr['event_type'] == 'file_create'
# Non-capturing group (?:...): a capturing group makes Series.str.contains emit
# pandas' "pattern ... has match groups" UserWarning on every run.
# case=False so upper/mixed-case extensions (.ZIP, .7Z) are not silently missed.
# na=False treats rows with a missing file_path as non-matches.
is_archive = edr['file_path'].str.contains(r'\.(?:zip|7z|tar)$', case=False, na=False)

archive_creates = edr[is_subject & is_file_create & is_archive].copy()
archive_creates = archive_creates.sort_values('timestamp')
archive_creates[['timestamp','device','file_path','file_size','data_sensitivity','process_name']].head(50)

## Join to DLP incidents (±2 hours)

In [None]:
# Restricted-AI-IP policy hits for the subject user, oldest first.
hits = dlp[(dlp['user']=='engineer.a') & (dlp['policy']=='Restricted-AI-IP')].copy().sort_values('timestamp')

# Correlation half-width: a DLP hit is paired with an archive when its timestamp
# lies in [archive_time - window, archive_time + window], both bounds inclusive.
# `window` is also read by the proxy join cell further down, so keep the name.
window = pd.Timedelta('2h')

# Output schema, declared up front so the result keeps its columns even when
# no (archive, DLP hit) pair falls inside the window.
stage_to_dlp_columns = [
    'archive_time', 'archive_path', 'archive_size', 'archive_sensitivity',
    'dlp_time', 'dlp_action', 'destination', 'file_name', 'file_size',
]

rows=[]
for _, a in archive_creates.iterrows():
    t=a['timestamp']
    # Series.between is inclusive on both ends by default.
    cand = hits[hits['timestamp'].between(t - window, t + window)]
    for _, e in cand.iterrows():
        rows.append({
            'archive_time': t,
            'archive_path': a['file_path'],
            'archive_size': a['file_size'],
            'archive_sensitivity': a['data_sensitivity'],
            'dlp_time': e['timestamp'],
            'dlp_action': e['action'],
            'destination': e['destination'],
            'file_name': e['file_name'],
            'file_size': e['file_size'],
        })

# Without columns=, an empty `rows` yields a column-less frame and
# sort_values('archive_time') raises KeyError instead of showing "no matches".
stage_to_dlp = pd.DataFrame(rows, columns=stage_to_dlp_columns).sort_values('archive_time')
stage_to_dlp

## Join to proxy uploads (±2 hours)

In [None]:
# POST/PUT requests by the subject user to destinations categorised as
# personal cloud storage, oldest first.
proxy_personal = proxy[(proxy['user']=='engineer.a') & (proxy['dest_category']=='personal_cloud_storage') & (proxy['http_method'].isin(['POST','PUT']))].copy().sort_values('timestamp')

# Output schema, declared up front so the result keeps its columns even when
# no upload falls inside the window of any archive.
stage_to_proxy_columns = [
    'archive_time', 'archive_path', 'archive_size',
    'proxy_time', 'dest_domain', 'bytes_out', 'process_name', 'user_agent',
]

rows=[]
for _, a in archive_creates.iterrows():
    t=a['timestamp']
    # `window` comes from the DLP join cell; Series.between is inclusive on
    # both ends by default, matching a >= / <= pair.
    cand = proxy_personal[proxy_personal['timestamp'].between(t - window, t + window)]
    for _, p in cand.iterrows():
        rows.append({
            'archive_time': t,
            'archive_path': a['file_path'],
            'archive_size': a['file_size'],
            'proxy_time': p['timestamp'],
            'dest_domain': p['dest_domain'],
            'bytes_out': p['bytes_out'],
            'process_name': p['process_name'],
            'user_agent': p['user_agent']
        })

# Without columns=, an empty `rows` yields a column-less frame and
# sort_values('archive_time') raises KeyError instead of showing "no matches".
stage_to_proxy = pd.DataFrame(rows, columns=stage_to_proxy_columns).sort_values('archive_time')
stage_to_proxy.head(50)

## Build a single timeline view

In [None]:
# Proxy uploads at or below this size are left out of the timeline.
# NOTE(review): assumes bytes_out is measured in bytes (i.e. 50 MB) — confirm
# against the proxy export.
MIN_UPLOAD_BYTES = 50_000_000

# Declared up front so the frame keeps its columns even when no events qualify.
timeline_columns = ['time', 'type', 'detail', 'bytes']

# Events are appended source by source: staging, then DLP, then proxy.
timeline=[]
for _, r in archive_creates.iterrows():
    timeline.append({'time': r['timestamp'], 'type': 'archive_created', 'detail': r['file_path'], 'bytes': r['file_size']})
for _, r in hits.iterrows():
    timeline.append({'time': r['timestamp'], 'type': f"dlp_{r['action']}", 'detail': f"{r['destination']}:{r['file_name']}", 'bytes': r['file_size']})
for _, r in proxy_personal.iterrows():
    if r['bytes_out'] > MIN_UPLOAD_BYTES:
        timeline.append({'time': r['timestamp'], 'type': 'proxy_upload_personal_cloud', 'detail': f"{r['dest_domain']} via {r['process_name']}", 'bytes': r['bytes_out']})

# columns= avoids a KeyError on sort_values('time') when `timeline` is empty.
# mergesort is stable, so events sharing a timestamp keep their append order
# (archive -> DLP -> proxy) instead of an arbitrary one.
# The name `t` is kept in case later cells refer to it.
t = pd.DataFrame(timeline, columns=timeline_columns).sort_values('time', kind='mergesort')
t.head(100)

## Investigation notes

Use the timeline to:
- justify containment decisions
- prioritize which devices/repos to image and preserve
- create an evidence-backed executive narrative
