In [None]:
import pandas as pd
from expedition_clustering.plotting import plot_geographical_positions

In [None]:
# Load the expedition table (the path is relative to the working directory).
EXPEDITIONS_CSV = "../data/all_expeditions.csv"
cluster_df = pd.read_csv(EXPEDITIONS_CSV)

In [None]:
# Rows belonging to cluster 1000030 (the "big" cluster examined below).
# `.copy()` makes this an independent frame, so adding columns to it later
# does not trigger pandas' SettingWithCopyWarning on a filtered slice.
BIG_CLUSTER_ID = 1000030
big_cluster_df = cluster_df[cluster_df.spatiotemporal_cluster_id == BIG_CLUSTER_ID].copy()

In [None]:
# Earliest and latest `startdate` values in the big cluster, shown as a (min, max) tuple.
big_cluster_startdates = big_cluster_df['startdate']
(big_cluster_startdates.min(), big_cluster_startdates.max())

In [None]:
# Parse `startdate` and order the big cluster chronologically.
# `.assign` returns a new frame, so the columns written below land in an
# independent DataFrame rather than in a filtered slice of `cluster_df`
# (assigning to such a slice is what raises SettingWithCopyWarning).
# Unparseable dates become NaT (errors='coerce') and sort to the end.
big_cluster_df = (
    big_cluster_df
    .assign(startdate_dt=lambda frame: pd.to_datetime(frame['startdate'], errors='coerce'))
    .sort_values('startdate_dt')
)

# Gap between each record and the chronologically previous one
# (Timedelta; NaT for the first row and for rows whose date failed to parse).
big_cluster_df['time_diff'] = big_cluster_df['startdate_dt'].diff()

# The same gap in fractional days (float; NaN where the gap is missing).
big_cluster_df['time_diff_days'] = big_cluster_df['time_diff'].dt.total_seconds() / 86400.0

# The frame is already in chronological order, so the "sorted" differences are
# the same quantity as `time_diff`. Both names are kept for any later cell that
# reads them, but the column is now created directly as floats instead of
# overwriting a datetime (NaT) placeholder column with float values.
sorted_diffs = big_cluster_df['startdate_dt'].diff()
big_cluster_df['time_diff_sorted_days'] = sorted_diffs.dt.total_seconds() / 86400.0

In [None]:
# Distribution of the gaps (in days) between chronologically adjacent records.
gap_days = big_cluster_df['time_diff_days']
gap_days.hist(bins=30)

In [None]:
# Number of records per spatiotemporal cluster, largest cluster first.
# Naming the index and the counts explicitly yields the columns
# ['spatiotemporal_cluster_id', 'count'] on both pandas 1.x and 2.x; a bare
# `.reset_index()` only produces a 'count' column on pandas >= 2.0, and the
# filtering further down relies on that column name.
cluster_counts = (
    cluster_df['spatiotemporal_cluster_id']
    .value_counts()
    .rename_axis('spatiotemporal_cluster_id')
    .reset_index(name='count')
)
cluster_counts


In [None]:
# Clusters with fewer than 100 records: keep their ids for the cells below
# and display the filtered counts table.
small_cluster_counts = cluster_counts.loc[cluster_counts['count'] < 100]
cluster_ids = small_cluster_counts['spatiotemporal_cluster_id'].tolist()

small_cluster_counts


In [None]:
# Select the first id in `cluster_ids` and show the date and coordinate
# columns of its rows.
i = 0
cluster_id = cluster_ids[i]

in_selected_cluster = cluster_df['spatiotemporal_cluster_id'] == cluster_id
cluster_df.loc[in_selected_cluster, ['startdate', 'latitude1', 'longitude1']]

In [None]:
# Peek at the second id in `cluster_ids`.
cluster_ids[1]

In [None]:

# Select the second id in `cluster_ids` and map its records.
i = 1
cluster_id = cluster_ids[i]

selected_cluster_rows = cluster_df.loc[cluster_df['spatiotemporal_cluster_id'] == cluster_id]

# Alternative settings tried earlier: zoom='auto', cluster_line=True
plot_geographical_positions(
    selected_cluster_rows,
    lat_col='latitude1',
    lon_col='longitude1',
    datetime_col='startdate',
    zoom=3,
    cluster_line=True,
    plot_towns=False,
    plot_rivers=True,
    plot_roads=True,
)

In [None]:
# Rows of the currently selected cluster, used as input for the DBSCAN experiment.
test_df = cluster_df.loc[cluster_df['spatiotemporal_cluster_id'] == cluster_id]

In [None]:
import numpy as np
from sklearn.cluster import DBSCAN

# Spatial-only DBSCAN over the selected cluster using great-circle distance.
# Each row of `coords` is [latitude, longitude].
coords = test_df[['latitude1', 'longitude1']].to_numpy()

# Neighbourhood radius in kilometres (adjust as needed).
epsilon_km = 10

# Approximate Earth radius in kilometres, i.e. kilometres per radian of arc.
kms_per_radian = 6371

# The haversine metric works in radians, so express the radius in radians too.
epsilon_radians = epsilon_km / kms_per_radian

db = DBSCAN(
    eps=epsilon_radians,
    min_samples=1,  # neighbourhood size (the point itself included) needed for a core point
    algorithm='ball_tree',
    metric='haversine',
)
# The coordinates are converted to radians before fitting.
db.fit(np.radians(coords))

# One label per input row.
cluster_labels = db.labels_

# Count distinct labels, leaving out the noise label (-1) if it occurs.
num_clusters = len(set(cluster_labels) - {-1})

print(f'Number of clusters found: {num_clusters}')


In [None]:

from sklearn.cluster import DBSCAN
import numpy as np

# Toy sanity check of DBSCAN on six 2-D points.
X = np.array([[1, 2], [2, 2], [2, 3],
              [8, 7], [8, 8], [25, 80]])

# The original call left `metric=` without a value, which is a SyntaxError.
# 'euclidean' is scikit-learn's default metric for DBSCAN, so it is spelled
# out here to make the cell runnable.
clustering = DBSCAN(eps=3, min_samples=2, metric='euclidean').fit(X)

# A bare expression in the middle of a cell is not displayed, so print the labels.
print(clustering.labels_)
clustering

In [None]:
import numpy as np
from sklearn.neighbors import BallTree

# Largest nearest-neighbour distance within the selected cluster.
cl = cluster_df.loc[cluster_df['spatiotemporal_cluster_id'] == cluster_id]
coords = np.radians(cl[['latitude1', 'longitude1']].to_numpy())
tree = BallTree(coords, metric="haversine")

# The tree is queried with its own points, so the closest hit (column 0) is at
# distance 0; column 1 holds the distance to the next-closest point.
dists, _ = tree.query(coords, k=2)
nearest_other_rad = dists[:, 1]

# Haversine distances are in radians; 6371 (approximate Earth radius in km) converts them to km.
max_nn_km = nearest_other_rad.max() * 6371
print("Max nearest-neighbor distance (km):", max_nn_km)


In [None]:
from sklearn.neighbors import BallTree

# Compact variant of the nearest-neighbour check; reuses `np` and `cl`
# from earlier cells.
coords = np.radians(cl[['latitude1', 'longitude1']])
dists, _ = BallTree(coords, metric="haversine").query(coords, k=2)

# Column 1 is the distance (radians) to the second-closest hit; 6371 scales it to km.
largest_nn_km = dists[:, 1].max() * 6371
print(largest_nn_km, "km max nearest-neighbor")


In [None]:
# Rows of the selected cluster ordered by latitude, with `lat_diff` holding the
# difference between each row's latitude and the previous row's in that order
# (a plain numeric difference in the units of `latitude1`; NaN for the first row).
spread_cluster_df = (
    cluster_df.loc[cluster_df['spatiotemporal_cluster_id'] == cluster_id]
    .sort_values('latitude1')
    .assign(lat_diff=lambda frame: frame['latitude1'].diff())
)

In [None]:
# Display every row of the currently selected cluster.
cluster_df.loc[cluster_df['spatiotemporal_cluster_id'] == cluster_id]