# Debugging spatiotemporal DBSCAN
Use this notebook to walk step-by-step through the pipeline and spot where spatially disconnected pieces might get merged. The cells mirror the code paths in `expedition_clustering/pipeline.py` and `cli.py`.

In [None]:
import pandas as pd
import numpy as np
from sklearn.cluster import DBSCAN
from expedition_clustering import Preprocessor, SpatialDBSCAN, TemporalDBSCAN, CombineClusters

# Adjust these paths/params as needed
# CSV_PATH: input file loaded into df_raw below.
CSV_PATH = 'data/all_expeditions.csv'
# E_DIST_KM: spatial eps passed to SpatialDBSCAN(e_dist=...) and divided by 6371
# (to get radians) in the connectivity checks further down.
E_DIST_KM = 7.0
# E_DAYS: temporal eps passed to TemporalDBSCAN(e_days=...).
E_DAYS = 7.0

# low_memory=False makes pandas parse the file in one pass instead of in chunks,
# which avoids mixed-dtype inference on wide/heterogeneous columns.
df_raw = pd.read_csv(CSV_PATH, low_memory=False)
# Cell output: preview of the first rows of the raw data.
df_raw.head()

## 1) Preprocess
Run the same preprocessing as the pipeline.

In [None]:
# Preprocessor comes from expedition_clustering; what it filters or derives is
# defined in the package, not in this notebook.
pre = Preprocessor()
# df_pre is the input to both the spatial step and the optional joint step later on.
df_pre = pre.fit_transform(df_raw)
# Cell output: preview of the first rows plus the row count after preprocessing.
df_pre.head(), len(df_pre)

## 2) Spatial DBSCAN
Clusters on the haversine metric, with `eps` given in km and converted to radians, `min_samples=1`, and `algorithm='ball_tree'`.

In [None]:
# e_dist receives E_DIST_KM; presumably interpreted as kilometres — confirm against SpatialDBSCAN.
spatial = SpatialDBSCAN(e_dist=E_DIST_KM)
# The transformed frame is expected to carry a 'spatial_cluster_id' column (read just below).
df_spatial = spatial.fit_transform(df_pre)
# Cell output: number of distinct spatial cluster labels.
df_spatial['spatial_cluster_id'].nunique()

### Check spatial connectivity of a few clusters
If these are already disconnected, the issue is in the spatial step.

In [None]:
def spatial_components(df, cluster_id, e_dist_km=None):
    """Count the spatially connected pieces inside one spatial cluster.

    Re-runs a haversine DBSCAN (``min_samples=1``, ball tree) on only the rows
    of ``df`` whose ``spatial_cluster_id`` equals ``cluster_id``.

    Args:
        df: DataFrame with ``spatial_cluster_id``, ``latitude1`` and
            ``longitude1`` columns; the coordinates are converted from degrees
            to radians here.
        cluster_id: Spatial cluster label to inspect.
        e_dist_km: Neighbourhood radius in kilometres. When omitted, the
            notebook-level ``E_DIST_KM`` is used, as before.

    Returns:
        Tuple ``(n_components, n_rows)``. ``(0, 0)`` when no row carries
        ``cluster_id``.
    """
    sub = df[df['spatial_cluster_id'] == cluster_id]
    if sub.empty:
        # DBSCAN refuses an empty sample matrix, so an unknown cluster id would
        # otherwise abort the whole comparison with a ValueError.
        return 0, 0

    if e_dist_km is None:
        e_dist_km = E_DIST_KM

    # Mean Earth radius in km: turns a distance in km into an angle in radians,
    # which is the unit the haversine metric works in.
    earth_radius_km = 6371
    coords = np.radians(sub[['latitude1', 'longitude1']].to_numpy(float))
    labels = DBSCAN(
        eps=e_dist_km / earth_radius_km,
        min_samples=1,
        metric='haversine',
        algorithm='ball_tree',
    ).fit_predict(coords)
    # With min_samples=1 no point is labelled as noise (-1), so the labels are
    # 0..k-1 and max + 1 is the number of components.
    return labels.max() + 1, len(sub)

# value_counts() sorts by size (largest first), so the first ten index entries
# are the ten most populous spatial clusters.
test_ids = df_spatial['spatial_cluster_id'].value_counts().index[:10]
# Cell output: {cluster id: (number of components, number of rows)}.
{
    cluster: spatial_components(df_spatial, cluster)
    for cluster in test_ids
}

## 3) Temporal DBSCAN per spatial cluster
Temporal DBSCAN is run separately within each spatial cluster, with `eps` in days and `min_samples=1`.

In [None]:
# e_days receives E_DAYS; presumably interpreted as days — confirm against TemporalDBSCAN.
temporal = TemporalDBSCAN(e_days=E_DAYS)
# Input is the spatially labelled frame; the result is expected to add 'temporal_cluster_id'.
df_temporal = temporal.fit_transform(df_spatial)
# Cell output: the two label columns side by side for the first rows.
df_temporal[['spatial_cluster_id','temporal_cluster_id']].head()

## 4) Combine labels
Assign a unique integer ID to each (spatial, temporal) label pair.

In [None]:
# CombineClusters comes from expedition_clustering; how it maps label pairs to
# IDs is defined in the package, not in this notebook.
combiner = CombineClusters()
# The result is expected to carry a 'spatiotemporal_cluster_id' column (read just below).
df_combined = combiner.fit_transform(df_temporal)
# Cell output: spatial, temporal and combined labels for the first rows.
df_combined[['spatial_cluster_id','temporal_cluster_id','spatiotemporal_cluster_id']].head()

### Validate connectivity of the final clusters
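Re-run spatial DBSCAN on the points of each spatiotemporal cluster ID. If any ID splits into more than one component, spatially disconnected points were merged under that ID, which points to the merge logic as the culprit.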

In [None]:
# Distance threshold as an angle: km divided by the Earth radius in km.
eps_rad = E_DIST_KM / 6371
# One estimator is enough; fit_predict refits it from scratch on every call.
component_finder = DBSCAN(eps=eps_rad, min_samples=1, metric='haversine', algorithm='ball_tree')

# Each entry: (spatiotemporal id, number of spatial components, number of rows).
bad = []
for cid, sub in df_combined.groupby('spatiotemporal_cluster_id'):
    # A single point is trivially connected, so only larger groups are checked.
    if len(sub) > 1:
        latlon_rad = np.radians(sub[['latitude1', 'longitude1']].to_numpy(float))
        component_labels = component_finder.fit_predict(latlon_rad)
        n_components = int(component_labels.max() + 1)
        # More than one component means the group is not spatially connected at eps.
        if n_components > 1:
            bad.append((cid, n_components, len(sub)))

# Cell output: how many groups failed, plus the first ten offenders.
len(bad), bad[:10]


## 5) Optional: single-pass spatiotemporal component growth
Compare against the joint spatiotemporal connected-components logic to see if DBSCAN chaining is the issue.

In [None]:
from expedition_clustering.pipeline import SpatioTemporalConnectedComponents
# Same thresholds as the two-step pipeline above, so the cluster counts are comparable.
st = SpatioTemporalConnectedComponents(e_dist=E_DIST_KM, e_days=E_DAYS)
# Runs on the preprocessed frame directly, not on the DBSCAN-labelled frames.
df_st = st.fit_transform(df_pre)
# Cell output: number of distinct spatiotemporal cluster labels from the joint approach.
df_st['spatiotemporal_cluster_id'].nunique()
