# UAP Explorer - Per-Report Anomaly Detection

This notebook combines all previous work to generate final per-report anomaly scores.

## Objectives
1. Merge cleaned sightings with spatiotemporal anomalies and text clusters
2. Engineer features combining all data sources
3. Train Isolation Forest for per-report anomaly detection
4. Generate final anomaly scores (0-1 scale)
5. Export complete dataset ready for frontend

In [None]:
# Import required libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path
import json
import warnings

# ML libraries
from sklearn.ensemble import IsolationForest
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.decomposition import PCA

# Silence library warnings so cell output stays readable.
warnings.filterwarnings('ignore')

# pandas display: never truncate columns, cap printed rows at 100.
pd.options.display.max_columns = None
pd.options.display.max_rows = 100

# Plot defaults: seaborn whitegrid theme and wide 14x6 figures.
sns.set_style('whitegrid')
plt.rc('figure', figsize=(14, 6))

## 1. Load All Data Sources

In [None]:
# Load every upstream artifact this notebook consumes.
print("Loading data sources...\n")


def _read_optional_parquet(path, loaded_label, missing_message):
    """Read a parquet file if it exists; otherwise warn and return None.

    Args:
        path: ``Path`` of the parquet file to read.
        loaded_label: Text appended to the "Loaded N ..." success message.
        missing_message: Warning printed when ``path`` does not exist.

    Returns:
        The loaded DataFrame, or ``None`` when the file is absent.
    """
    if not path.exists():
        print(missing_message)
        return None
    frame = pd.read_parquet(path)
    print(f"‚úì Loaded {len(frame):,} {loaded_label}")
    return frame


# Cleaned sightings are mandatory: no existence check, so a missing file raises here.
cleaned_path = Path('../data/processed/cleaned_sightings.parquet')
df = pd.read_parquet(cleaned_path)
print(f"‚úì Loaded {len(df):,} cleaned sightings")

# Cluster assignments (optional; None when the file is absent)
clusters_path = Path('../data/processed/sightings_with_clusters.parquet')
df_clusters = _read_optional_parquet(
    clusters_path,
    "cluster assignments",
    "‚ö†Ô∏è  Cluster data not found. Please run 03_text_clusters.ipynb first.",
)

# Spatiotemporal (grid cell x month) anomaly scores (optional; None when absent)
grid_anomalies_path = Path('../data/processed/grid_time_anomalies.parquet')
df_grid_anomalies = _read_optional_parquet(
    grid_anomalies_path,
    "grid-time anomaly scores",
    "‚ö†Ô∏è  Grid anomaly data not found. Please run 02_spatiotemporal_baseline.ipynb first.",
)

# Cluster labels (optional JSON mapping of cluster id -> label)
labels_path = Path('../data/processed/cluster_labels.json')
if labels_path.exists():
    # Read as UTF-8 explicitly so the result does not depend on the
    # platform's default text encoding.
    with open(labels_path, encoding='utf-8') as f:
        cluster_labels = json.load(f)
    # JSON object keys are always strings; convert them to integers
    cluster_labels = {int(k): v for k, v in cluster_labels.items()}
    print(f"‚úì Loaded {len(cluster_labels)} cluster labels")
else:
    print("‚ö†Ô∏è  Cluster labels not found")
    cluster_labels = None

print(f"\nInitial dataset shape: {df.shape}")

## 2. Merge All Data Sources

In [None]:
# Attach text-cluster assignments; the left join keeps every sighting row.
if df_clusters is not None:
    df = df.merge(df_clusters[['id', 'cluster_id', 'cluster_label']], on='id', how='left')
    print(f"‚úì Merged cluster data")
    print(f"  Records with cluster: {df['cluster_id'].notna().sum():,}")
else:
    # No cluster artifact available: fall back to a single placeholder cluster
    df['cluster_id'] = 0
    df['cluster_label'] = 'unknown'
    print("‚ö†Ô∏è  Using dummy clusters (all set to 0)")

# Attach the spatiotemporal anomaly score of each sighting's grid cell and month.
if df_grid_anomalies is not None:
    # Convert year_month to string on both frames so the join keys compare equal
    df['year_month_str'] = df['year_month'].astype(str)
    df_grid_anomalies['year_month_str'] = df_grid_anomalies['year_month'].astype(str)

    # Only carry the join keys and the two score columns into the merge
    merge_cols = ['grid_id', 'year_month_str', 'anomaly_score_cell', 'predicted_count']
    df_grid_subset = df_grid_anomalies[merge_cols]

    df = df.merge(
        df_grid_subset,
        on=['grid_id', 'year_month_str'],
        how='left',
        suffixes=('', '_grid')
    )

    # Count genuine matches BEFORE filling: once NaNs are replaced every row
    # is non-null and the diagnostic would always equal the row count.
    matched_cell_scores = df['anomaly_score_cell'].notna().sum()

    # Unmatched cells get a neutral score of 0 and the mean predicted count
    df['anomaly_score_cell'] = df['anomaly_score_cell'].fillna(0)
    df['predicted_count'] = df['predicted_count'].fillna(df['predicted_count'].mean())

    print(f"‚úì Merged spatiotemporal anomalies")
    print(f"  Records with cell anomaly score: {matched_cell_scores:,}")
else:
    # No grid anomaly artifact available: use constant placeholder values
    df['anomaly_score_cell'] = 0
    df['predicted_count'] = 1
    print("‚ö†Ô∏è  Using dummy cell anomaly scores (all set to 0)")

print(f"\nMerged dataset shape: {df.shape}")
print(f"\nSample merged data:")
df[['id', 'datetime', 'location', 'cluster_label', 'anomaly_score_cell']].head()

## 3. Feature Engineering for Anomaly Detection

Create features combining:
- Temporal: hour, day_of_week, month
- Duration: log-transformed duration (`log1p` of `duration_seconds`)
- Spatial: cell anomaly score
- Text: cluster ID (one-hot encoded) and normalized description length
- Shape: shape category (one-hot encoded)

In [None]:
# Build the list of model features, adding derived columns to df as needed.
print("Engineering features for anomaly detection...\n")

features_list = []

# 1. Duration (log-transformed to handle outliers).
#    Clip at 0 first: log1p(-1) is -inf and log1p of anything lower is NaN.
df['duration_log'] = np.log1p(df['duration_seconds'].clip(lower=0))
features_list.append('duration_log')
print("‚úì Added duration_log")

# 2. Time of day
features_list.append('hour')
print("‚úì Added hour")

# 3. Day of week
features_list.append('day_of_week')
print("‚úì Added day_of_week")

# 4. Month (seasonality)
features_list.append('month')
print("‚úì Added month")

# 5. Cell anomaly score
features_list.append('anomaly_score_cell')
print("‚úì Added anomaly_score_cell")

# 6. Description length, scaled by the longest description.
#    Guard the division so a zero (or all-missing) maximum yields 0 rather than NaN/inf.
max_description_length = df['description_length'].max()
if max_description_length > 0:
    df['description_length_norm'] = df['description_length'] / max_description_length
else:
    df['description_length_norm'] = 0.0
features_list.append('description_length_norm')
print("‚úì Added description_length_norm")

# 7. Cluster ID (one-hot encoded).
#    Drop same-named columns left by a previous run of this cell so re-running
#    it does not append duplicate one-hot columns to df.
cluster_dummies = pd.get_dummies(df['cluster_id'], prefix='cluster')
cluster_cols = cluster_dummies.columns.tolist()
df = pd.concat([df.drop(columns=cluster_cols, errors='ignore'), cluster_dummies], axis=1)
features_list.extend(cluster_cols)
print(f"‚úì Added {len(cluster_cols)} cluster features")

# 8. Shape (one-hot encoded) - keep the 10 most frequent shapes, bucket the rest
#    (including missing shapes) as 'other'.
top_shapes = df['shape'].value_counts().head(10).index.tolist()
# Cast to object so 'other' can be assigned even if the column is categorical
shape_values = df['shape'].astype(object)
df['shape_category'] = shape_values.where(shape_values.isin(top_shapes), 'other')
shape_dummies = pd.get_dummies(df['shape_category'], prefix='shape')
shape_cols = shape_dummies.columns.tolist()
df = pd.concat([df.drop(columns=shape_cols, errors='ignore'), shape_dummies], axis=1)
features_list.extend(shape_cols)
print(f"‚úì Added {len(shape_cols)} shape features")

print(f"\nTotal features: {len(features_list)}")

In [None]:
# Prepare the feature matrix from the engineered feature columns.
# Infinite values are treated like missing ones and zeroed: fillna alone would
# leave ±inf in place, and the scaling step rejects non-finite input.
X = (
    df[features_list]
    .replace([np.inf, -np.inf], np.nan)
    .fillna(0)
)

print(f"Feature matrix shape: {X.shape}")
print(f"\nFeature summary:")
print(X.describe())

## 4. Train Isolation Forest for Anomaly Detection

In [None]:
# Put every feature on a zero-mean / unit-variance scale before modelling.
print("Standardizing features...\n")

# Fit the scaler on the full feature matrix, then apply it to the same data.
scaler = StandardScaler().fit(X)
X_scaled = scaler.transform(X)

print(f"‚úì Features standardized")
print(f"  Shape: {X_scaled.shape}")

In [None]:
# Fit the per-report Isolation Forest and collect its outputs.
print("Training Isolation Forest...\n")

forest_params = dict(
    n_estimators=100,
    max_samples='auto',
    contamination=0.1,  # Expect ~10% anomalies
    random_state=42,    # fixed seed for reproducible trees
    n_jobs=-1,
    verbose=1,
)
iso_forest = IsolationForest(**forest_params)

# Fit and label in one pass; a label of -1 marks an anomaly
anomaly_predictions = iso_forest.fit_predict(X_scaled)

# Raw scores from the fitted forest (lower is more anomalous)
anomaly_scores_raw = iso_forest.score_samples(X_scaled)

flagged_count = (anomaly_predictions == -1).sum()
flagged_pct = flagged_count / len(anomaly_predictions) * 100

print("\n‚úì Isolation Forest training complete")
print(f"  Anomalies detected: {flagged_count:,} ({flagged_pct:.1f}%)")

In [None]:
# Rescale the raw forest scores onto [0, 1] where higher = more anomalous.
# The raw scores run the other way (lower = more anomalous), so flip the sign
# before min-max scaling.
flipped_scores = (-anomaly_scores_raw).reshape(-1, 1)
scaler_anomaly = MinMaxScaler()
anomaly_scores_normalized = scaler_anomaly.fit_transform(flipped_scores).ravel()

# Store the continuous score and the binary flag (1 where the forest predicted -1)
df['anomaly_score_report'] = anomaly_scores_normalized
df['is_anomaly'] = (anomaly_predictions == -1).astype(int)

print("‚úì Anomaly scores normalized to 0-1 range")
print(f"\nAnomaly score statistics:")
print(df['anomaly_score_report'].describe())

## 5. Analyze Results

In [None]:
# Two-panel view of the per-report score distribution: histogram and empirical CDF.
report_scores = df['anomaly_score_report']
median_score = report_scores.median()

fig, axes = plt.subplots(1, 2, figsize=(14, 5))
hist_ax, cdf_ax = axes

# Left panel: histogram with the median marked
hist_ax.hist(report_scores, bins=100, edgecolor='black', alpha=0.7)
hist_ax.set(
    xlabel='Anomaly Score (0-1)',
    ylabel='Frequency',
    title='Distribution of Per-Report Anomaly Scores',
)
hist_ax.axvline(median_score, color='red', linestyle='--',
                label=f'Median: {median_score:.2f}')
hist_ax.legend()

# Right panel: cumulative distribution (rank / n against the sorted scores)
sorted_scores = np.sort(report_scores)
n_scores = len(sorted_scores)
cumulative = np.arange(1, n_scores + 1) / n_scores
cdf_ax.plot(sorted_scores, cumulative, linewidth=2)
cdf_ax.set(
    xlabel='Anomaly Score',
    ylabel='Cumulative Probability',
    title='Cumulative Distribution of Anomaly Scores',
)
cdf_ax.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

In [None]:
# Print the 20 sightings with the highest per-report anomaly score.
print("üî• TOP 20 MOST ANOMALOUS SIGHTINGS\n")
print("=" * 100)

top_anomalies = df.nlargest(20, 'anomaly_score_report')[[
    'datetime', 'location', 'shape', 'duration_seconds',
    'cluster_label', 'anomaly_score_cell', 'anomaly_score_report', 'description'
]]

for _, row in top_anomalies.iterrows():
    # A missing description (None/NaN) is not a string and cannot be sliced;
    # show a placeholder instead of raising.
    description = row['description']
    if isinstance(description, str):
        # Only add an ellipsis when the text was actually truncated
        preview = description[:150] + ('...' if len(description) > 150 else '')
    else:
        preview = '(no description)'

    print(f"\nScore: {row['anomaly_score_report']:.3f} | {row['datetime']} | {row['location']}")
    print(f"Shape: {row['shape']} | Duration: {row['duration_seconds']:.0f}s | Cluster: {row['cluster_label']}")
    print(f"Cell Anomaly: {row['anomaly_score_cell']:.2f}")
    print(f"Description: {preview}")
    print("-" * 100)

In [None]:
# Summarise the per-report score within each description cluster,
# ordered from the highest mean score to the lowest.
cluster_anomalies = (
    df.groupby('cluster_label')['anomaly_score_report']
    .agg(['mean', 'median', 'max', 'count'])
    .sort_values('mean', ascending=False)
)

print("\nAverage Anomaly Score by Cluster:")
print(cluster_anomalies)

# Horizontal bar chart of the mean score per cluster
fig, ax = plt.subplots(figsize=(12, 6))
cluster_anomalies['mean'].plot(kind='barh', ax=ax)
ax.set_xlabel('Average Anomaly Score')
ax.set_title('Average Anomaly Score by Description Cluster')
fig.tight_layout()
plt.show()

In [None]:
# Compare the two anomaly signals: correlation matrix plus a scatter plot.
score_columns = ['anomaly_score_cell', 'anomaly_score_report']
correlation = df[score_columns].corr()
print("\nCorrelation between Cell and Report Anomaly Scores:")
print(correlation)

# Tiny, mostly transparent markers (s=1, alpha=0.1)
fig, ax = plt.subplots(figsize=(10, 6))
ax.scatter(df['anomaly_score_cell'], df['anomaly_score_report'], alpha=0.1, s=1)
ax.set(
    xlabel='Cell Anomaly Score (Spatiotemporal)',
    ylabel='Report Anomaly Score (Per-Sighting)',
    title='Relationship Between Cell and Report Anomaly Scores',
)
ax.grid(True, alpha=0.3)
fig.tight_layout()
plt.show()

## 6. Export Final Dataset

Create the final dataset with all information ready for the frontend.

In [None]:
# Columns written to the exported dataset, grouped by theme.
identifier_cols = ['id']
datetime_cols = ['datetime', 'year', 'month', 'day_of_week', 'hour']
location_cols = [
    'city', 'state', 'country', 'location',
    'latitude', 'longitude',
    'grid_lat', 'grid_lon', 'grid_id',
]
shape_duration_cols = ['shape', 'duration_seconds']
text_cols = ['description', 'description_length']
cluster_export_cols = ['cluster_id', 'cluster_label']
score_cols = [
    'anomaly_score_cell',      # Spatiotemporal anomaly
    'anomaly_score_report',    # Per-report anomaly (MAIN SCORE)
    'is_anomaly',              # Binary flag
]
additional_cols = ['date posted']

final_columns = [
    *identifier_cols,
    *datetime_cols,
    *location_cols,
    *shape_duration_cols,
    *text_cols,
    *cluster_export_cols,
    *score_cols,
    *additional_cols,
]

# Final frame: only the export columns, most anomalous reports first
df_final = df[final_columns].sort_values('anomaly_score_report', ascending=False)

print(f"Final dataset shape: {df_final.shape}")
print(f"\nColumn list:")
for position, column_name in enumerate(final_columns, start=1):
    print(f"  {position:2d}. {column_name}")

In [None]:
# Write the full dataset to parquet (row index omitted)
output_path = Path('../data/processed/sightings_with_scores.parquet')
df_final.to_parquet(output_path, index=False)

print(f"‚úì Exported final dataset to: {output_path}")
print(f"  Records: {len(df_final):,}")
print(f"  Columns: {len(final_columns)}")
print(f"  File size: {output_path.stat().st_size / (1024**2):.2f} MB")

# Write the first 1000 rows to a CSV beside the parquet file for easy inspection
csv_path = output_path.with_suffix('.csv')
preview_rows = df_final.head(1000)
preview_rows.to_csv(csv_path, index=False)
print(f"‚úì Exported top 1000 to CSV: {csv_path}")

In [None]:
# Closing report: headline statistics of the exported dataset.
banner = "=" * 100
years = df_final['year']
final_scores = df_final['anomaly_score_report']
flagged_total = df_final['is_anomaly'].sum()

print("\n" + banner)
print("FINAL DATASET SUMMARY")
print(banner)

# Size and coverage of the dataset
print(f"\nüìä Dataset Statistics:")
print(f"  Total sightings: {len(df_final):,}")
print(f"  Date range: {years.min():.0f} - {years.max():.0f}")
print(f"  Countries: {df_final['country'].nunique()}")
print(f"  Unique locations: {df_final['location'].nunique():,}")
print(f"  Grid cells: {df_final['grid_id'].nunique():,}")

# Cluster count and the five most common cluster labels
print(f"\nüî∑ Clusters:")
print(f"  Total clusters: {df_final['cluster_id'].nunique()}")
cluster_dist = df_final['cluster_label'].value_counts().head(5)
for label, count in cluster_dist.items():
    print(f"  ‚Ä¢ {label}: {count:,} sightings")

# Distribution of the per-report score and the share of flagged reports
print(f"\nüéØ Anomaly Scores:")
print(f"  Mean: {final_scores.mean():.3f}")
print(f"  Median: {final_scores.median():.3f}")
print(f"  95th percentile: {final_scores.quantile(0.95):.3f}")
print(f"  99th percentile: {final_scores.quantile(0.99):.3f}")
print(f"  Flagged as anomaly: {flagged_total:,} ({flagged_total/len(df_final)*100:.1f}%)")

print("\n" + banner)
print("‚úÖ TASK 2.5 COMPLETE! ALL ML PIPELINE TASKS DONE!")
print(banner)

print("\nüéâ Phase 2 Complete!\n")
print("Next steps:")
for step in (
    "  1. Review the final dataset: data/processed/sightings_with_scores.parquet",
    "  2. Proceed to Task 3.1: Design tile format & zoom strategy",
    "  3. Then Task 3.2: Implement export script to generate frontend data",
):
    print(step)
print("\nThe dataset is now ready to be exported as JSON tiles for the frontend!")