# 02 — DLP Event Analysis

This notebook reviews DLP incidents for user `engineer.a` under the `Restricted-AI-IP` policy and correlates them with that user's proxy uploads to personal cloud storage.

**Data:** `dlp.csv`, `proxy.csv`

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from pathlib import Path

# Folder holding the CSV exports, relative to the notebook's working directory.
data_dir = Path('data')

# Read both logs the same way: the 'timestamp' column is parsed into datetimes
# so later cells can sort on it and do time arithmetic.
dlp, proxy = (
    pd.read_csv(data_dir / csv_name, parse_dates=['timestamp'])
    for csv_name in ('dlp.csv', 'proxy.csv')
)

# Preview the earliest DLP records.
dlp.sort_values('timestamp').head()

## Focus on restricted policy hits

In [None]:
# Keep only engineer.a's events raised by the Restricted-AI-IP policy,
# ordered oldest to newest.
is_restricted_policy = dlp['policy'] == 'Restricted-AI-IP'
is_target_user = dlp['user'] == 'engineer.a'
hits = dlp.loc[is_restricted_policy & is_target_user].sort_values('timestamp')

# Show the columns relevant for triage.
hits[['timestamp','action','destination','file_name','file_size','data_sensitivity','device']]

In [None]:
# Roll the restricted-policy hits up per (destination, action) pair.
# `events` uses 'count', i.e. the number of rows with a non-null file_name;
# `total_bytes` is the sum of file_size over the group.
per_destination_action = hits.groupby(['destination', 'action'])
totals = per_destination_action.agg(
    events=pd.NamedAgg(column='file_name', aggfunc='count'),
    total_bytes=pd.NamedAgg(column='file_size', aggfunc='sum'),
)

# Flatten the group keys back into columns and list the heaviest pairs first.
summary = totals.reset_index().sort_values('total_bytes', ascending=False)
summary

## Correlate with proxy uploads

For each DLP event, we look for the nearest POST/PUT upload to personal cloud storage within ±30 minutes. No size threshold is applied, so uploads of any size can match.

In [None]:
# engineer.a's POST/PUT requests to personal cloud storage, oldest first.
proxy_personal = proxy[(proxy['user']=='engineer.a') & (proxy['dest_category']=='personal_cloud_storage') & (proxy['http_method'].isin(['POST','PUT']))].copy()
proxy_personal = proxy_personal.sort_values('timestamp')

# For each DLP event, find the nearest proxy upload within +/- `window`.
window = pd.Timedelta('30min')

# Explicit schema for the result: without it, a run in which no event has a
# matching upload yields a column-less frame and the final sort_values raises
# KeyError('dlp_time').
corr_columns = [
    'dlp_time',
    'dlp_action',
    'file_name',
    'file_size',
    'destination',
    'proxy_time',
    'proxy_bytes_out',
    'proxy_process',
    'proxy_user_agent',
]

rows = []
for _, e in hits.iterrows():
    t = e['timestamp']

    # Uploads whose timestamp lies in [t - window, t + window] (both ends inclusive).
    in_window = proxy_personal['timestamp'].between(t - window, t + window)
    cand = proxy_personal[in_window]
    if cand.empty:
        # No upload close enough in time; this event is left out of `corr`.
        continue

    # Closest upload in absolute time. argmin returns the first minimum, and
    # `cand` is time-ordered, so an exact tie resolves to the earlier upload.
    gap = (cand['timestamp'] - t).abs()
    best = cand.iloc[gap.to_numpy().argmin()]

    rows.append({
        'dlp_time': t,
        'dlp_action': e['action'],
        'file_name': e['file_name'],
        'file_size': e['file_size'],
        'destination': e['destination'],
        'proxy_time': best['timestamp'],
        'proxy_bytes_out': best['bytes_out'],
        'proxy_process': best['process_name'],
        'proxy_user_agent': best['user_agent'],
    })

# One row per DLP event that had at least one upload inside the window.
corr = pd.DataFrame(rows, columns=corr_columns)
corr.sort_values('dlp_time')

In [None]:
# Daily total of bytes sent by engineer.a to personal cloud storage.
# `pp` is a copy of proxy_personal with an added calendar-date column.
pp = proxy_personal.assign(date=proxy_personal['timestamp'].dt.date)
per_day = pp.groupby('date', as_index=False)['bytes_out'].sum()

# One line chart: x = calendar date, y = summed bytes_out for that date.
fig, ax = plt.subplots()
ax.plot(per_day['date'], per_day['bytes_out'])
ax.set(
    title='Total personal-cloud upload bytes per day (engineer.a)',
    xlabel='Date',
    ylabel='Bytes out',
)
plt.xticks(rotation=45)  # slant the date labels so they do not overlap
plt.tight_layout()
plt.show()

## Interpretation checklist

- Are there **allowed** uploads for restricted policy hits?
- Do uploads correlate with **archiving** events on endpoints?
- Are uploads performed by **non-browser** processes (curl/python)?
