# Expedition Clustering Showcase
Run the CLI end-to-end (same as `uv run expedition-cluster ...`) from the notebook, then summarize and visualize the results.

In [None]:
import subprocess, shlex, os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from expedition_clustering import (
    plot_geographical_heatmap,
    plot_geographical_positions,
    plot_time_histogram,
)

# Notebook parameters. The first three are forwarded to the
# `expedition-cluster` CLI as flag values; OUTPUT_PATH is where its CSV lands.
E_DIST_KM = 10.0  # value for the CLI's --e-dist flag (km, per the name)
E_DAYS = 7.0  # value for the CLI's --e-days flag (days, per the name)
LIMIT = 10000  # value for --limit; set None to process all rows
OUTPUT_PATH = 'data/clustered_showcase.csv'  # value for --output; read back later

# Echo the effective configuration so it is recorded in the cell output.
run_config = dict(
    E_DIST_KM=E_DIST_KM,
    E_DAYS=E_DAYS,
    LIMIT=LIMIT,
    OUTPUT_PATH=OUTPUT_PATH,
)
print('Config ->', run_config)


## Run CLI
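Invokes `uv run expedition-cluster` as a subprocess (the same code path as running the command from a terminal), echoes its stdout and stderr, and raises an error if it exits with a non-zero status.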

In [None]:
# Assemble the argv for the CLI from the notebook parameters. Every value is
# passed as a string; --limit is appended only when a row cap is configured.
cmd = ['uv', 'run', 'expedition-cluster']
cmd += ['--e-dist', str(E_DIST_KM)]
cmd += ['--e-days', str(E_DAYS)]
cmd += ['--output', OUTPUT_PATH]
if LIMIT is not None:
    cmd += ['--limit', str(int(LIMIT))]

# Print a shell-quoted form of the command so it can be copy-pasted.
print('Running CLI:', shlex.join(cmd))

# Run with an argument list (no shell) and capture both streams as text so
# they can be echoed into the cell output below.
res = subprocess.run(cmd, capture_output=True, text=True)
print('Return code:', res.returncode)
for banner, stream_text in (
    ('--- STDOUT ---', res.stdout),
    ('--- STDERR ---', res.stderr),
):
    print(banner)
    print(stream_text)

# Abort the notebook on a non-zero exit status instead of continuing.
if res.returncode != 0:
    raise RuntimeError('CLI failed')


## Load clustered output
Loads the CSV written by the CLI.

In [None]:
# Read back the CSV at OUTPUT_PATH. low_memory=False tells pandas not to
# parse the file in internal chunks, which avoids mixed-type column inference.
df = pd.read_csv(
    OUTPUT_PATH,
    low_memory=False,
)
row_count = len(df)
print('Loaded', row_count, 'rows from', OUTPUT_PATH)

# Trailing expression: the notebook renders the first rows as the cell result.
df.head()

## Summary stats
Basic size distribution and counts.

In [None]:
# Rows per cluster id, and the number of distinct cluster ids.
cluster_col = 'spatiotemporal_cluster_id'
cluster_sizes = df.groupby(cluster_col).size()
num_clusters = df[cluster_col].nunique()

# Headline numbers: overall row count, cluster count, and size statistics.
summary_lines = [
    f'Total specimens: {len(df):,}',
    f'Total expeditions (clusters): {num_clusters:,}',
    f'Average size: {cluster_sizes.mean():.2f}',
    f'Median size: {cluster_sizes.median():.0f}',
    f'Largest: {cluster_sizes.max()}',
    f'Smallest: {cluster_sizes.min()}',
]
print('\n'.join(summary_lines))

# Histogram of cluster sizes (50 bins) drawn through pandas onto our own axes.
fig, ax = plt.subplots(figsize=(8, 4))
cluster_sizes.hist(ax=ax, bins=50)
ax.set(
    xlabel='Cluster size',
    ylabel='Count',
    title='Cluster size distribution',
)
fig.tight_layout()
plt.show()

## Geospatial visualizations
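Heatmap and scatter plot of specimen coordinates, drawn from a random sample of at most 5,000 rows (fixed seed, so the plots are reproducible).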

In [None]:
# Draw a random subset of at most 5000 rows; the fixed seed keeps the
# selection identical across runs.
sample_size = min(5000, len(df))
sample = df.sample(sample_size, random_state=42)

# Coordinate columns and zoom setting shared by both map plots.
geo_kwargs = {
    'lat_col': 'latitude1',
    'lon_col': 'longitude1',
    'zoom': 'auto',
}

# Heatmap of the sampled coordinates.
plot_geographical_heatmap(sample, grid_size=80, **geo_kwargs)

# Positions of the same sample, with the date column supplied and
# cluster_line switched off.
plot_geographical_positions(
    sample,
    datetime_col='startdate',
    cluster_line=False,
    **geo_kwargs,
)


## Temporal distribution
Histogram of collection dates.

In [None]:
# Histogram of the `startdate` column over the full DataFrame (not a sample).
# Binning is left to the helper's 'auto' setting.
plot_time_histogram(
    df,
    datetime_col='startdate',
    bins='auto',
)
