# UAP Explorer - Data Exploration

This notebook provides initial exploration of the UAP sighting dataset.

## Objectives
1. Load and inspect the raw dataset
2. Understand the data structure and quality
3. Identify key columns for analysis
4. Detect missing or invalid data
5. Perform basic statistical analysis

In [None]:
# Import required libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path

# Pandas display: never truncate columns, cap printed rows at 100
pd.options.display.max_columns = None
pd.options.display.max_rows = 100

# Plot defaults: seaborn "whitegrid" theme, then a 12x6 default figure size
sns.set_style('whitegrid')
plt.rcParams.update({'figure.figsize': (12, 6)})

## 1. Load Dataset

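Place your UAP/UFO dataset CSV file in the `data/raw/` directory. The load cell below reads `../data/raw/uap_sightings.csv`, so either use that filename or update the path in that cell.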

Expected columns (adjust based on your dataset):
- Date/Time
- Location (city, state, country)
- Latitude/Longitude
- Shape
- Duration
- Description

In [None]:
# Load the raw sightings CSV into `df`.
# Update the filename to match your dataset; the path is resolved relative to
# the kernel's working directory.
data_path = Path('../data/raw/uap_sightings.csv')

if data_path.exists():
    df = pd.read_csv(data_path)
    print("✓ Dataset loaded successfully")
    print(f"  Rows: {len(df):,}")
    print(f"  Columns: {len(df.columns)}")
else:
    # `df` is not assigned on this branch, so any later cell that references
    # it will raise NameError until the file is in place and this cell is re-run.
    print(f"⚠️  Dataset not found at {data_path}")
    print("\nPlease:")
    print("1. Download a UAP/UFO sighting dataset (e.g., NUFORC)")
    # Derive the directory from data_path so this hint cannot drift from the
    # location that is actually checked above.
    print(f"2. Place it in the {data_path.parent}/ directory (relative to the working directory)")
    print("3. Update the filename in this cell")

## 2. Initial Inspection

In [None]:
# Display first few rows
# (uncomment once `df` has been created by the load cell above)
# df.head(10)

In [None]:
# Display column names and types
# (uncomment once `df` has been created by the load cell above)
# df.info()

In [None]:
# Basic statistics for numeric columns
# (uncomment once `df` has been created by the load cell above)
# df.describe()

## 3. Data Quality Analysis

In [None]:
# Check for missing values
# Builds a per-column table of null counts and their share of all rows
# (percentage rounded to 2 decimals), sorted by count descending, and prints
# only the columns that have at least one missing value.
# Uncomment once `df` has been created by the load cell above.
# missing_data = df.isnull().sum()
# missing_percent = (missing_data / len(df) * 100).round(2)
# 
# missing_df = pd.DataFrame({
#     'Missing Count': missing_data,
#     'Percentage': missing_percent
# }).sort_values('Missing Count', ascending=False)
# 
# print("Missing Values by Column:")
# print(missing_df[missing_df['Missing Count'] > 0])

## 4. Key Field Analysis

In [None]:
# Analyze shape field (adjust column name as needed)
# Prints the full value counts of the 'shape' column, then draws a bar chart
# of the 15 most frequent values. The cell is a no-op when `df` has no
# 'shape' column. Uncomment once `df` has been created by the load cell above.
# if 'shape' in df.columns:
#     print("Shape Distribution:")
#     print(df['shape'].value_counts())
#     
#     plt.figure(figsize=(12, 6))
#     df['shape'].value_counts().head(15).plot(kind='bar')
#     plt.title('Top 15 UFO Shapes')
#     plt.xlabel('Shape')
#     plt.ylabel('Count')
#     plt.xticks(rotation=45)
#     plt.tight_layout()
#     plt.show()

In [None]:
# Analyze temporal patterns (adjust column name as needed)
# Uses the 'datetime' column when present, otherwise 'date'; does nothing if
# neither exists. Uncomment once `df` has been created by the load cell above.
# NOTE: this cell mutates `df` in place - the date column is overwritten with
# parsed datetimes and a new 'year' column is added.
# if 'datetime' in df.columns or 'date' in df.columns:
#     date_col = 'datetime' if 'datetime' in df.columns else 'date'
#     # errors='coerce' turns unparseable values into NaT instead of raising
#     df[date_col] = pd.to_datetime(df[date_col], errors='coerce')
#     
#     # Extract year
#     df['year'] = df[date_col].dt.year
#     
#     # Plot sightings over time
#     plt.figure(figsize=(14, 6))
#     yearly_counts = df.groupby('year').size()
#     yearly_counts.plot(kind='line', marker='o')
#     plt.title('UAP Sightings Over Time')
#     plt.xlabel('Year')
#     plt.ylabel('Number of Sightings')
#     plt.grid(True, alpha=0.3)
#     plt.tight_layout()
#     plt.show()

## 5. Geographic Distribution

In [None]:
# Analyze geographic distribution (adjust column names as needed)
# Requires both 'latitude' and 'longitude' columns; otherwise does nothing.
# Uncomment once `df` has been created by the load cell above.
# NOTE: "valid" here only means both coordinates are non-null - values are
# not checked for numeric type or for plausible latitude/longitude ranges.
# if 'latitude' in df.columns and 'longitude' in df.columns:
#     valid_coords = df[df['latitude'].notna() & df['longitude'].notna()]
#     
#     print(f"Records with valid coordinates: {len(valid_coords):,} ({len(valid_coords)/len(df)*100:.1f}%)")
#     
#     # Simple scatter plot of coordinates
#     # (tiny, mostly transparent markers so dense areas remain readable)
#     plt.figure(figsize=(14, 8))
#     plt.scatter(valid_coords['longitude'], valid_coords['latitude'], alpha=0.1, s=1)
#     plt.title('Geographic Distribution of Sightings')
#     plt.xlabel('Longitude')
#     plt.ylabel('Latitude')
#     plt.grid(True, alpha=0.3)
#     plt.tight_layout()
#     plt.show()

## 6. Text Analysis Preview

In [None]:
# Look at sample descriptions (adjust column name as needed)
# Uses the 'description' column when present, otherwise 'comments'; does
# nothing if neither exists. Uncomment once `df` has been created by the load
# cell above.
# NOTE: this cell adds a 'text_length' column to `df` in place.
# if 'description' in df.columns or 'comments' in df.columns:
#     text_col = 'description' if 'description' in df.columns else 'comments'
#     
#     # Show length distribution
#     # Lengths are measured on the string form of each value, so missing
#     # values are presumably counted as the length of their string
#     # representation rather than as 0 - TODO confirm for your dataset.
#     df['text_length'] = df[text_col].astype(str).str.len()
#     
#     print("\nText Length Statistics:")
#     print(df['text_length'].describe())
#     
#     # Sample a few descriptions
#     # df.sample(3) is called without random_state, so the three rows differ
#     # between runs; `idx` is the row's index label in `df`.
#     print("\n=== Sample Descriptions ===")
#     for idx, row in df.sample(3).iterrows():
#         print(f"\n[Sample {idx}]")
#         # Truncates to 300 characters plus "..." when the text is longer.
#         # NOTE(review): the length check uses str(value) but the slice is
#         # applied to the raw value - assumes long entries are strings.
#         print(row[text_col][:300] + "..." if len(str(row[text_col])) > 300 else row[text_col])

## 7. Summary & Next Steps

Based on this exploration, document your findings:

### Key Observations
- Total records:
- Date range:
- Geographic coverage:
- Data quality issues:

### Next Steps
1. Create a `clean_data.py` script to normalize and clean the data
2. Handle missing values (drop or impute)
3. Standardize date/time formats
4. Validate coordinates
5. Clean and normalize text fields