# 01 — Access Baselining (Docs + Code)

This notebook identifies abnormal repository access and bulk operations for the subject account (`engineer.a`) compared to baseline and peers.

**Data:** `notebooks/data/docs_audit.csv`, `notebooks/data/git_audit.csv`

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from pathlib import Path

# Both audit exports sit under the relative `data/` folder and are read the
# same way: the `timestamp` column is parsed into datetimes at load time.
data_dir = Path('data')
docs, git = (
    pd.read_csv(data_dir / csv_name, parse_dates=['timestamp'])
    for csv_name in ('docs_audit.csv', 'git_audit.csv')
)

# Preview the first rows of the docs audit log.
docs.head()

## Docs repository anomalies

We compute per-user daily metrics:
- unique objects accessed
- total bytes downloaded/exported
- breadth of spaces accessed

In [None]:
# Calendar date of each event (time of day dropped) so events can be grouped per day.
docs['date'] = docs['timestamp'].dt.date
# Keep only events that touched restricted or crown-jewel data. The filter is on
# the sensitivity label of the data, not on the kind of action performed.
sensitive_docs = docs[docs['data_sensitivity'].isin(['restricted','crown_jewel'])].copy()

# One row per (user, date) summarising the volume and breadth of sensitive access.
daily = (sensitive_docs
         .groupby(['user','date'])
         .agg(uniq_objects=('object_id','nunique'),
              total_bytes=('bytes','sum'),
              # 'nunique' matches how uniq_objects is counted and skips missing
              # values, so an event with no space recorded is not counted as
              # an additional space.
              spaces=('space','nunique'),
              actions=('action','count'))
         .reset_index())

# Last ten days (by date) on which engineer.a has a row in `daily`.
daily[daily['user']=='engineer.a'].sort_values('date').tail(10)

In [None]:
# Score each engineer.a day against that account's own history.
a = daily[daily['user'] == 'engineer.a'].copy()


def _zscore(values):
    """Population z-score (ddof=0); a zero spread falls back to a divisor of 1."""
    spread = values.std(ddof=0)
    return (values - values.mean()) / (spread if spread else 1)


baseline_metrics = ['uniq_objects', 'total_bytes', 'spaces', 'actions']
a = a.assign(**{f'z_{metric}': _zscore(a[metric]) for metric in baseline_metrics})

# Combined score: plain average of the object, byte and action z-scores
# (z_spaces is computed above but is not part of this average).
score_inputs = ['z_uniq_objects', 'z_total_bytes', 'z_actions']
a['anomaly_score'] = a[score_inputs].mean(axis=1)

# Ten highest-scoring days, highest first.
anom = a.sort_values(by='anomaly_score', ascending=False).iloc[:10]
anom[['date', 'uniq_objects', 'total_bytes', 'actions', 'anomaly_score']]

In [None]:
# One line chart per metric, drawn over engineer.a's days in date order.
a_sorted = a.sort_values('date')

# (column to plot, figure title, y-axis label)
chart_specs = [
    ('uniq_objects',
     'Sensitive unique objects accessed per day (engineer.a)',
     'Unique objects'),
    ('total_bytes',
     'Sensitive bytes downloaded/exported per day (engineer.a)',
     'Bytes'),
]

for column, title, y_label in chart_specs:
    fig, ax = plt.subplots()
    ax.plot(a_sorted['date'], a_sorted[column])
    ax.set_title(title)
    ax.set_xlabel('Date')
    ax.set_ylabel(y_label)
    # Tilt the date labels so neighbouring ticks do not overlap.
    plt.xticks(rotation=45)
    plt.tight_layout()
    plt.show()

## Peer comparison

Compare `engineer.a` to a peer group for the same metrics.

In [None]:
# Peer pool: every account except the subject. Leaving engineer.a in the pool
# would let the subject's own heavy days raise the peer mean and p95, which
# shrinks the very ratios this comparison is meant to surface.
peer = daily[daily['user'] != 'engineer.a'].copy()

# Per-day distribution of the peer accounts' metrics (mean and 95th percentile).
day_stats = (peer.groupby('date')
             .agg(peer_mean_objects=('uniq_objects','mean'),
                  peer_p95_objects=('uniq_objects', lambda x: np.percentile(x,95)),
                  peer_mean_bytes=('total_bytes','mean'),
                  peer_p95_bytes=('total_bytes', lambda x: np.percentile(x,95)))
             .reset_index())

# Left join keeps every engineer.a day. A day with no peer rows has no match in
# day_stats, so its peer columns (and therefore its ratios) come out as NaN.
merged = a.merge(day_stats, on='date', how='left')
# A zero p95 is swapped for NaN so the ratio is NaN instead of inf.
merged['objects_vs_p95'] = merged['uniq_objects'] / merged['peer_p95_objects'].replace(0,np.nan)
merged['bytes_vs_p95'] = merged['total_bytes'] / merged['peer_p95_bytes'].replace(0,np.nan)

# Ten days on which engineer.a most exceeded the peer p95 for unique objects.
comparison_cols = ['date','uniq_objects','peer_p95_objects','objects_vs_p95',
                   'total_bytes','peer_p95_bytes','bytes_vs_p95']
merged.sort_values('objects_vs_p95', ascending=False).head(10)[comparison_cols]

## Git bulk operations

Look for clone/mirror/bundle/archive activity and large bytes out.

In [None]:
# Calendar date of each git event (time of day dropped).
git['date'] = git['timestamp'].dt.date
# Actions treated as bulk operations in this cell.
bulk_actions = ['git_clone','git_clone_mirror','git_bundle_create','git_archive_download']

# Three independent row filters, combined below.
by_subject = git['user'] == 'engineer.a'
is_bulk_action = git['action'].isin(bulk_actions)
on_sensitive_repo = git['repo_sensitivity'].isin(['restricted','crown_jewel'])
bulk = git[by_subject & is_bulk_action & on_sensitive_repo]

# Twenty largest transfers by bytes_out, largest first.
report_cols = ['timestamp','action','repo','repo_sensitivity','bytes_out','device','ip']
largest_first = bulk.sort_values('bytes_out', ascending=False)
largest_first.head(20)[report_cols]

## Output: recommended escalation artifacts

- Top anomaly days and what happened
- Top repositories/spaces and object sets accessed

In [None]:
# Drill into the highest-scoring day: which spaces were touched, plus a sample
# of the crown-jewel objects accessed. Nothing is printed when `anom` is empty.
if not anom.empty:
    top_day = anom['date'].iloc[0]
    from_subject = sensitive_docs['user'] == 'engineer.a'
    on_top_day = sensitive_docs['date'] == top_day
    day_events = sensitive_docs[from_subject & on_top_day]
    print('Top anomaly day:', top_day)
    print('Top spaces:')
    space_counts = day_events['space'].value_counts()
    print(space_counts.head(10))
    print('\nSample crown-jewel objects accessed:')
    crown_jewel_events = day_events[day_events['data_sensitivity'] == 'crown_jewel']
    # Up to 20 distinct object ids, missing ids dropped.
    print(crown_jewel_events['object_id'].dropna().unique()[:20])