# Zeek Log Loading & Inspection

Load `conn.log` and `dns.log` from Zeek JSON output, convert timestamps,
handle missing fields, and verify the resulting DataFrames.

In [None]:
import sys
from pathlib import Path

# Make the scripts/ package importable from notebooks/
sys.path.insert(0, str(Path.cwd().parent))

import pandas as pd
import matplotlib.pyplot as plt

from scripts.zeek_to_dataframe import (
    load_zeek_log,
    CONN_SCHEMA,
    DNS_SCHEMA,
)

# Display tweaks for wide Zeek tables: never elide columns, and allow output
# lines up to 160 characters wide.
pd.options.display.max_columns = None
pd.options.display.width = 160

## 1 — Load conn.log

Passing `CONN_SCHEMA` enforces nullable dtypes (`Int64`, `Float64`, `string`)
and backfills any columns absent from the file with `pd.NA`.

In [None]:
# Folder holding the sample Zeek logs, relative to the working directory
ZEEK_DIR = Path("../data/zeek_logs/sample")

conn = load_zeek_log(ZEEK_DIR / "conn.log", schema=CONN_SCHEMA)

# DataFrame.shape is (row count, column count)
print(f"Rows:    {conn.shape[0]}")
print(f"Columns: {conn.shape[1]}")
conn.head()

In [None]:
# Timestamp check: the "ts" dtype should print as datetime64[ns, UTC]
print(conn.dtypes["ts"])
conn.loc[:, ["ts", "uid", "id.orig_h", "id.resp_h", "id.resp_p"]].head()

In [None]:
# Verify nullable integer dtypes — NaN-safe without float coercion.
# Lists the dtype of every column; with CONN_SCHEMA applied (see the section 1
# intro) integer fields are expected to show as Int64 rather than float64.
conn.dtypes

In [None]:
# Count nulls per column, then keep only the columns that have at least one
missing = conn.isnull().sum()
missing.loc[missing.gt(0)]

## 2 — Load dns.log

In [None]:
# Read dns.log from the same sample folder, applying DNS_SCHEMA
dns = load_zeek_log(ZEEK_DIR / "dns.log", schema=DNS_SCHEMA)

# DataFrame.shape is (row count, column count)
print(f"Rows:    {dns.shape[0]}")
print(f"Columns: {dns.shape[1]}")
dns.head()

In [None]:
# List the dtype of every dns column to confirm DNS_SCHEMA was applied
dns.dtypes

In [None]:
# "answers" stays as a Python list — Zeek JSON arrays load natively
dns.loc[:, ["query", "rcode_name", "answers"]].head()

In [None]:
# Show the DNS rows that carry no answers at all (NXDOMAIN, timeouts, etc.),
# filtering rows and picking columns in a single .loc lookup.
dns.loc[dns["answers"].isna(), ["query", "rcode_name", "answers"]]

## 3 — Quick sanity checks

In [None]:
# Connection state distribution — S0 (no SYN-ACK) may indicate scanning.
# value_counts() tallies rows per conn_state, most frequent first; missing
# values are left out of the tally by default.
conn["conn_state"].value_counts()

In [None]:
# DNS response code distribution — NXDOMAIN spikes may indicate DGA.
# value_counts() tallies rows per rcode_name, most frequent first; missing
# values are left out of the tally by default.
dns["rcode_name"].value_counts()

In [None]:
# Left-join dns onto conn by uid: every conn row is kept, and rows with no
# matching dns entry get a missing query/answers.
# NOTE(review): a uid that appears on several dns rows repeats its conn row
# once per match — confirm that is acceptable for this preview.
# NOTE(review): in Zeek, dns.log's uid presumably identifies the connection
# that carried the DNS transaction itself — verify this matches the intent of
# "connections that triggered DNS lookups".
merged = pd.merge(conn, dns[["uid", "query", "answers"]], how="left", on="uid")
merged.loc[:, ["ts", "id.orig_h", "id.resp_h", "id.resp_p", "query"]].head(10)

In [None]:
# Export cleaned DataFrames to Parquet for downstream use
out_dir = Path("../data/processed")
# to_parquet() does not create missing parent directories, so make sure the
# output folder exists first (a no-op when it is already there).
out_dir.mkdir(parents=True, exist_ok=True)
conn.to_parquet(out_dir / "conn.parquet", index=False)
dns.to_parquet(out_dir / "dns.parquet", index=False)
print(f"Wrote conn.parquet ({len(conn)} rows) and dns.parquet ({len(dns)} rows)")