# SEC Anomaly Detection: Exploratory Data Analysis (EDA)

This notebook explores filing patterns in our SEC database to understand:
- Normal filing behavior
- Potential anomalies
- Key features for detection

In [None]:
import sys
from pathlib import Path
sys.path.insert(0, str(Path.cwd().parent))

import pandas as pd
import sqlite3
from datetime import datetime
import matplotlib.pyplot as plt
import seaborn as sns
from collections import Counter

# Notebook-wide plotting defaults: seaborn's "whitegrid" theme and a wide
# 14x6 inch default canvas (individual figures may still override the size).
sns.set_style(style="whitegrid")
plt.rcParams.update({'figure.figsize': (14, 6)})

In [None]:
# Load data from the SQLite database.
# The path is resolved relative to the current working directory's parent,
# i.e. <cwd>/../data/sec_anomaly.db.
db_path = Path.cwd().parent / "data" / "sec_anomaly.db"

# sqlite3.connect() creates a new, empty database when the file is missing,
# which would only surface later as a confusing "no such table" error.
# Fail early with a clear message instead.
if not db_path.exists():
    raise FileNotFoundError(f"SQLite database not found: {db_path}")

conn = sqlite3.connect(db_path)
try:
    # Load the full filings and companies tables into DataFrames.
    filings = pd.read_sql_query("SELECT * FROM filing_events", conn)
    companies = pd.read_sql_query("SELECT * FROM companies", conn)
finally:
    # Release the connection even when one of the queries raises.
    conn.close()

# Convert date columns to datetime so the .dt accessors and resampling
# used further down work on them.
filings['filed_at'] = pd.to_datetime(filings['filed_at'])
companies['updated_at'] = pd.to_datetime(companies['updated_at'])

print(f"✅ Loaded {len(companies)} companies and {len(filings)} filings")
print(f"\nDate range: {filings['filed_at'].min()} to {filings['filed_at'].max()}")
print(f"Filing types: {filings['filing_type'].unique().tolist()}")

## 1. Filing Volume by Company

Which companies file most frequently? Anomalous companies often stand out through unusual filing patterns.

In [None]:
# Filing count per company, with name/ticker attached for readable output.
# Counts are derived from the filings table, so a company with no filings
# at all does not appear in `filing_counts`.
filing_counts = filings.groupby('cik').size().reset_index(name='filing_count')
filing_counts = filing_counts.merge(companies[['cik', 'name', 'ticker']], on='cik', how='left')

counts_per_company = filing_counts['filing_count']
mean_count = counts_per_company.mean()  # computed once, reused for line + legend

print("Top 10 companies by filing count:")
print(filing_counts.nlargest(10, 'filing_count')[['ticker', 'name', 'filing_count']])

# Visualize distribution
fig, axes = plt.subplots(1, 2, figsize=(15, 5))

# Histogram with the mean marked as a dashed reference line
axes[0].hist(counts_per_company, bins=30, edgecolor='black', color='steelblue')
axes[0].set_xlabel('Number of Filings')
axes[0].set_ylabel('Number of Companies')
axes[0].set_title('Distribution of Filing Counts per Company')
axes[0].axvline(mean_count, color='red', linestyle='--', label=f'Mean: {mean_count:.1f}')
axes[0].legend()

# Box plot to spot outliers
axes[1].boxplot(counts_per_company)
axes[1].set_ylabel('Number of Filings')
axes[1].set_title('Filing Count Distribution (Box Plot)')

plt.tight_layout()
plt.show()

# Identify outliers (unusually high/low filers) with the 1.5 * IQR rule:
# anything below Q1 - 1.5*IQR or above Q3 + 1.5*IQR is flagged.
Q1 = counts_per_company.quantile(0.25)
Q3 = counts_per_company.quantile(0.75)
IQR = Q3 - Q1
lower_fence = Q1 - 1.5 * IQR
upper_fence = Q3 + 1.5 * IQR
outliers = filing_counts[(counts_per_company < lower_fence) | (counts_per_company > upper_fence)]
print(f"\n🚨 {len(outliers)} companies with unusual filing counts (potential anomalies)")
print(outliers[['ticker', 'name', 'filing_count']].sort_values('filing_count', ascending=False))

## 2. Filing Type Distribution

Which filing types appear most? Are there type anomalies (e.g., unusual form types)?

In [None]:
# Tally the number of filings recorded for each form type.
filing_type_counts = filings['filing_type'].value_counts()

print("Filing type distribution:")
print(filing_type_counts)

fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(15, 5))
bar_ax, pie_ax = axes

# Left panel: absolute number of filings per form type.
filing_type_counts.plot.bar(ax=bar_ax, color='steelblue')
bar_ax.set_title('Filing Type Distribution')
bar_ax.set_xlabel('Filing Type')
bar_ax.set_ylabel('Count')
bar_ax.tick_params(axis='x', rotation=45)

# Right panel: the same counts expressed as shares of the total.
pie_ax.pie(
    filing_type_counts.values,
    labels=filing_type_counts.index,
    autopct='%1.1f%%',
    startangle=90,
)
pie_ax.set_title('Filing Type Proportion')

plt.tight_layout()
plt.show()

## 3. Filing Frequency Over Time

Is filing activity uniform or do we see patterns/spikes?

In [None]:
# Bucket filings into calendar days (keyed on `filed_at`) and count each bucket.
daily_filings = (
    filings
    .set_index('filed_at')
    .resample('D')
    .size()
)

fig, ax = plt.subplots(figsize=(15, 5))
daily_filings.plot(ax=ax, color='steelblue', linewidth=1.5)
ax.set_title('Daily Filing Volume')
ax.set_xlabel('Date')
ax.set_ylabel('Number of Filings')
ax.grid(True, alpha=0.3)
plt.tight_layout()
plt.show()

# Headline numbers for the daily series, including which dates hold the extremes.
busiest_day = daily_filings.idxmax()
quietest_day = daily_filings.idxmin()

print("\nDaily filing statistics:")
print(f"  Mean: {daily_filings.mean():.1f} filings/day")
print(f"  Std Dev: {daily_filings.std():.1f}")
print(f"  Max: {daily_filings.max()} (date: {busiest_day})")
print(f"  Min: {daily_filings.min()} (date: {quietest_day})")

## 4. Inter-Filing Gap Analysis

How much time passes between a company's consecutive filings? Unusual gaps may signal anomalies.

In [None]:
# Calculate gaps (in whole days) between consecutive filings per company.
# Sorting by (cik, filed_at) and diffing within each cik group replaces a
# per-company Python loop: the first filing of each company yields NaT
# (there is no predecessor) and is dropped, so companies with a single
# filing contribute no gap.
gaps_series = (
    filings
    .sort_values(['cik', 'filed_at'])
    .groupby('cik')['filed_at']
    .diff()
    .dropna()
    .dt.days
    .reset_index(drop=True)
)
# Plain-list view of the same values, kept under the original name.
gaps = gaps_series.tolist()

print("Inter-filing gap statistics (in days):")
print(f"  Mean: {gaps_series.mean():.1f} days")
print(f"  Median: {gaps_series.median():.1f} days")
print(f"  Std Dev: {gaps_series.std():.1f} days")
print(f"  Min: {gaps_series.min()} days")
print(f"  Max: {gaps_series.max()} days")

mean_gap = gaps_series.mean()  # reused for the reference line and its label

fig, ax = plt.subplots(figsize=(15, 5))
ax.hist(gaps_series, bins=50, edgecolor='black', color='steelblue')
ax.set_xlabel('Gap Between Filings (days)')
ax.set_ylabel('Frequency')
ax.set_title('Distribution of Inter-Filing Gaps')
ax.axvline(mean_gap, color='red', linestyle='--', label=f'Mean: {mean_gap:.1f}')
ax.legend()
plt.tight_layout()
plt.show()

# Identify unusually long gaps: the 95th percentile of all observed gaps is
# used as the cut-off above which a gap is considered suspicious.
long_gaps_threshold = gaps_series.quantile(0.95)
print(f"\n🚨 Filing gaps > {long_gaps_threshold:.0f} days (95th percentile) = potential anomalies")

## 5. Day-of-Week & Hour Analysis

Do companies file on specific days/times? Unusual timing = anomalies.

In [None]:
# Derive calendar features from the filing timestamp. Both columns are added
# to `filings` itself because the summary cell further down reads them.
filings['dayofweek'] = filings['filed_at'].dt.day_name()
filings['hour'] = filings['filed_at'].dt.hour

# Day of week, in calendar order. fill_value=0 keeps weekdays that have no
# filings at all (typically the weekend) as an integer 0 instead of NaN,
# which would otherwise turn every count into a float.
day_order = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']
day_counts = filings['dayofweek'].value_counts().reindex(day_order, fill_value=0)

# Hour of day, ordered 0..23 (only hours that actually occur are present).
hour_counts = filings['hour'].value_counts().sort_index()

fig, axes = plt.subplots(1, 2, figsize=(15, 5))

# Day of week
day_counts.plot(kind='bar', ax=axes[0], color='steelblue')
axes[0].set_title('Filings by Day of Week')
axes[0].set_xlabel('Day')
axes[0].set_ylabel('Count')
axes[0].tick_params(axis='x', rotation=45)

# Hour of day
# NOTE(review): the title labels hours as ET, but no timezone conversion is
# applied to `filed_at` in this notebook -- confirm the stored timestamps are ET.
hour_counts.plot(kind='bar', ax=axes[1], color='coral')
axes[1].set_title('Filings by Hour of Day (ET)')
axes[1].set_xlabel('Hour')
axes[1].set_ylabel('Count')

plt.tight_layout()
plt.show()

print("Filings by day of week:")
print(day_counts)
print("\nFilings by hour:")
print(hour_counts)

## 6. Key Features for Anomaly Detection

Based on this EDA, here are the features to track:

In [None]:
# Candidate detection features suggested by the analysis above,
# as "feature name" -> "what to look for".
features_for_detection = {
    "Filing Frequency": "Unusual filing counts per company (IQR-based outliers)",
    "Inter-Filing Gap": "Abnormally long delays between consecutive filings",
    "Filing Type Mix": "Unusual ratio of 8-K to 10-Q/K filings",
    "Temporal Pattern": "Filings outside normal business hours/days",
    "Volume Spike": "Sudden increase in daily filing volume",
    "Company Deviation": "Company filing pattern change from historical baseline",
    "Primary Document": "Missing or anomalous primary_document field"
}

print("✅ FEATURES FOR ANOMALY DETECTION:\n")
for feature, description in features_for_detection.items():
    print(f"  • {feature}: {description}")

# Print summary statistics (nothing is written to disk).
# This part relies on earlier cells having run: `filing_counts`, `gaps_series`
# and the `hour` / `dayofweek` columns on `filings`.

# Avoid a ZeroDivisionError when the companies table is empty.
avg_filings_per_company = len(filings) / len(companies) if len(companies) else float('nan')

# between(8, 17) is inclusive on both ends, so hours 8 through 17 count as
# "inside" and everything else is reported as outside.
outside_hours = ~filings['hour'].between(8, 17)
on_weekend = filings['dayofweek'].isin(['Saturday', 'Sunday'])
multi_filers = filing_counts['filing_count'] > 1

print("\n" + "="*60)
print("SUMMARY STATISTICS FOR ANOMALY DETECTION")
print("="*60)
print(f"Total companies: {len(companies)}")
print(f"Total filings: {len(filings)}")
print(f"Filing type diversity: {filings['filing_type'].nunique()} types")
print(f"Avg filings/company: {avg_filings_per_company:.1f}")
print(f"Max gap between filings: {gaps_series.max()} days")
print(f"Companies with >1 filing: {multi_filers.sum()}")
print(f"Filings outside 8-17h ET: {outside_hours.sum()}")
print(f"Filings on weekends: {on_weekend.sum()}")