# 01 - Exploration

Initial data exploration and setup.

**Tasks:**
- Load and explore eBird data
- Preprocess data
- Extract embeddings
- Add project-specific labels/features
- Save processed data

## Setup Paths

In [None]:
import sys
from pathlib import Path
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Make the shared backbone importable: the repository root (bird-embeddings/)
# sits two directory levels above the notebook's working directory, and it is
# put at the front of sys.path so later `src` imports resolve against it.
notebook_dir = Path.cwd()
project_dir = notebook_dir.parent
project_root = project_dir.parent
sys.path.insert(0, str(project_root))

print(f"Project root: {project_root}")
print(f"Current project: {project_dir.name}")

## Load eBird Data

In [None]:
from src.data import load_ebird_data

# The shared raw eBird export lives under <project_root>/data/raw/.
raw_dir = project_root / 'data' / 'raw'
data_path = raw_dir / 'ebd_IN-KL_smp_relSep-2025.txt'

# nrows is forwarded to the loader (presumably a cap on rows read — confirm
# against load_ebird_data); adjust it as needed.
df = load_ebird_data(str(data_path), nrows=50000)

# Report how much was loaded and which columns are available.
n_observations = len(df)
column_names = df.columns.tolist()
print(f"Loaded {n_observations} observations")
print(f"Columns: {column_names}")
df.head()

## Explore Data

In [None]:
# Starter summary of the loaded observations; add your own exploration below.

# Number of distinct sampling events (checklists).
checklist_ids = df['SAMPLING EVENT IDENTIFIER']
print(f"Unique checklists: {checklist_ids.nunique()}")

# Number of distinct species, counted by common name.
species_names = df['COMMON NAME']
print(f"Unique species: {species_names.nunique()}")

# Smallest and largest values found in the observation date column.
observation_dates = df['OBSERVATION DATE']
earliest, latest = observation_dates.min(), observation_dates.max()
print(f"Date range: {earliest} to {latest}")

## Preprocess Data

In [None]:
from src.data import EBirdPreprocessor

# Fit the backbone preprocessor on the raw observations and transform them
# in a single fit_transform call.
preprocessor = EBirdPreprocessor()
processed_df = preprocessor.fit_transform(df)

# Read the shape once; its second entry is reported as the feature count.
processed_shape = processed_df.shape
print(f"Processed shape: {processed_shape}")
print(f"Features (species): {processed_shape[1]}")
processed_df.head()

## Extract Embeddings

In [None]:
from src.inference import EmbeddingExtractor

# The shared VAE checkpoint is stored under <project_root>/models/.
models_dir = project_root / 'models'
model_path = models_dir / 'vae_model_inference_ready.pth'

# Build the extractor on CPU and embed the preprocessed data.
# NOTE(review): use_mean=True presumably selects the latent mean rather than
# a sampled latent — confirm against EmbeddingExtractor.
extractor = EmbeddingExtractor(str(model_path), device='cpu')
embeddings = extractor.extract_embeddings(processed_df, use_mean=True)

# The second entry of the shape is reported as the latent dimensionality.
embedding_shape = embeddings.shape
print(f"Embeddings shape: {embedding_shape}")
print(f"Latent dimensions: {embedding_shape[1]}")

## Add Project-Specific Features/Labels

**TODO**: Add your project-specific code here.

Examples:
- Extract labels from location/date/other columns
- Compute additional features
- Filter data for specific analysis

In [None]:
# Placeholder cell: derive project-specific labels or features here so they
# can be saved alongside the embeddings in the next step.
#
# Example: Add labels
# labels = df.loc[processed_df.index, 'YOUR_LABEL_COLUMN']
# NOTE(review): this example assumes processed_df.index contains row labels
# of df. The save cell stores processed_df.index as checklist IDs, so confirm
# the two indexes actually align before relying on this lookup.

# Your code here
pass

## Save Processed Data

In [None]:
# Save embeddings and metadata to the project folder (relative to the
# notebook's working directory).
save_path = Path('../data/processed/embeddings.npz')

# np.savez does not create missing directories, so make sure the target
# folder exists first; this is a no-op when it is already there.
save_path.parent.mkdir(parents=True, exist_ok=True)

# NOTE: if processed_df.index has object dtype, NumPy pickles that array and
# np.load will need allow_pickle=True to read `checklist_ids` back.
np.savez(
    save_path,
    embeddings=embeddings,
    # Add your labels/metadata here
    # labels=labels.values,
    checklist_ids=processed_df.index.values,
)

# The check mark was previously mis-encoded (UTF-8 bytes read as cp1252).
print(f"✓ Saved processed data to {save_path}")

## Summary

**TODO**: Summarize what you found in this exploration.