# Secondary Clustering

In this notebook we will see how effectively we can glom on the remaining 2/3 of records that lack locality lat/lon data.

We'll be using the geography centroid lat/lon instead.

In [72]:
import os
import sys
module_path = os.path.abspath(os.path.join('..'))
if module_path not in sys.path:
    sys.path.append(module_path)

In [79]:
import numpy as np
import pandas as pd
from datetime import datetime

In [80]:
clean_df = pd.read_csv('../data/clean_df.csv', index_col=0)
processed_df = pd.read_csv('../data/full_processed_df.csv', index_col=0)

  clean_df = pd.read_csv('../data/clean_df.csv', index_col=0)


In [81]:
clean_df.columns = map(str.lower, clean_df.columns)
df = clean_df.merge(right=processed_df[['collectingeventid', 'spatiotemporal_cluster_id']], on='collectingeventid', how='left')

In [82]:
df.columns

Index(['collectingeventid', 'startdate', 'enddate', 'remarks', 'localityid',
       'collectionobjectid', 'text1', 'minelevation', 'maxelevation',
       'elevationaccuracy', 'latitude1', 'longitude1', 'localityname',
       'namedplace', 'geographyid', 'centroidlat', 'centroidlon', 'commonname',
       'fullname', 'name', 'spatial_flag', 'spatiotemporal_cluster_id'],
      dtype='object')

In [83]:
df.__len__()

867852

In [84]:
df.head(1)

Unnamed: 0,collectingeventid,startdate,enddate,remarks,localityid,collectionobjectid,text1,minelevation,maxelevation,elevationaccuracy,...,localityname,namedplace,geographyid,centroidlat,centroidlon,commonname,fullname,name,spatial_flag,spatiotemporal_cluster_id
0,1.0,2005-08-17,,Rhododendron-bamboo thicket with scattered Abies.,1.0,335013,Perennial herb 5-10 cm tall. Dry fruit brown.,3840.0,,,...,"Yaduo Cun, NE of Yaping Yakou at the Myanmar b...",,33223.0,,,,"Lishadi Xiang, Fugong County, Yunnan, China",Lishadi Xiang,1.0,26345.0


In [88]:
# Define the regex pattern for the desired date format
date_pattern = r'^\d{4}-\d{2}-\d{2}$'

# Keep rows where 'startdate' matches the date pattern
df = df[df['startdate'].str.match(date_pattern, na=True)]

# Convert 'startdate' to datetime
df['startdate'] = pd.to_datetime(df['startdate'], errors='coerce')

# Define date range
min_date = datetime(1700, 1, 1)
max_date = datetime.today()

# Filter rows within the date range
df = df[(df['startdate'] >= min_date) & (df['startdate'] <= max_date)]

df['centroidlat'] = df['centroidlat'].astype(float)
df['centroidlon'] = df['centroidlon'].astype(float)


Restrict df to only coarse spatial records, i.e. those without a spatiotemporal cluster id

In [89]:
record_df = df[df.spatiotemporal_cluster_id.isna()]

In [91]:
len(record_df)

628335

In [92]:
summary_df = pd.read_csv('../data/cluster_summary_stats.csv', index_col=0)

In [93]:
summary_df

Unnamed: 0,spatiotemporal_cluster_id,startdate_min,startdate_max,latitude_mean,latitude_std,longitude_mean,longitude_std,count,max_time_diff,max_lat_diff,max_lon_diff,max_time_diff_consecutive,max_lat_diff_consecutive,max_lon_diff_consecutive,day_range
0,0,1808-04-20,1808-04-20,26.495565,0.000000,88.114457,0.000000,2,0,0.000,0.000,0.0,0.000,0.000,0 days
1,1,1809-04-01,1809-04-01,26.520000,0.000000,88.730000,0.000000,3,0,0.000,0.000,0.0,0.000,0.000,0 days
2,2,1824-08-19,1824-08-19,34.325200,,-117.428100,,1,0,0.000,0.000,,,,0 days
3,3,1830-07-02,1830-07-02,27.830000,,95.670000,,1,0,0.000,0.000,,,,0 days
4,4,1834-01-01,1834-01-01,22.558676,0.000000,88.291099,0.000000,9,0,0.000,0.000,0.0,0.000,0.000,0 days
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
35351,35351,2024-09-26,2024-09-26,47.055429,,-122.912680,,1,0,0.000,0.000,,,,0 days
35352,35352,2024-09-30,2024-09-30,47.053971,,-122.719350,,1,0,0.000,0.000,,,,0 days
35353,35353,2024-10-09,2024-10-09,46.857071,,-122.306897,,1,0,0.000,0.000,,,,0 days
35354,35354,2024-10-14,2024-10-14,47.031033,,-122.912741,,1,0,0.000,0.000,,,,0 days


In [94]:
summary_df['startdate_min'] = pd.to_datetime(summary_df['startdate_min'])
summary_df['startdate_max'] = pd.to_datetime(summary_df['startdate_max'])
summary_df['latitude_mean'] = summary_df['latitude_mean'].astype(float)
summary_df['latitude_std'] = summary_df['latitude_std'].astype(float)
summary_df['longitude_mean'] = summary_df['longitude_mean'].astype(float)
summary_df['longitude_std'] = summary_df['longitude_std'].astype(float)


# 1. Spatially Coarse Subset Proximity to Clusters

In [101]:
import numpy as np
import pandas as pd

def is_record_in_cluster(record, cluster_df, time_weight=0.5, space_weight=0.5):
    """
    Assess whether a record is close enough to a spatiotemporal cluster.
    
    Args:
        record (pd.Series): A single record from record_df.
        cluster_df (pd.DataFrame): The dataframe containing spatiotemporal clusters.
        time_weight (float): Weight for temporal proximity in confidence calculation.
        space_weight (float): Weight for spatial proximity in confidence calculation.
        
    Returns:
        tuple: (is_in_cluster (bool), closest_cluster_id (int or None), confidence (float))
    """
    closest_cluster_id = None
    highest_confidence = 0
    is_in_cluster = False

    # Skip record if coordinates are invalid
    if pd.isna(record['centroidlat']) or pd.isna(record['centroidlon']):
        return False, None, 0.0
    
    for _, cluster in cluster_df.iterrows():
        # Skip cluster if coordinates are invalid
        if pd.isna(cluster['latitude_mean']) or pd.isna(cluster['longitude_mean']):
            continue

        # Temporal proximity: Check if startdate falls within the cluster's date range
        temporal_proximity = cluster['startdate_min'] <= record['startdate'] <= cluster['startdate_max']
        
        # Spatial proximity (using degrees of latitude and longitude)
        lat_diff = abs(record['centroidlat'] - cluster['latitude_mean'])
        lon_diff = abs(record['centroidlon'] - cluster['longitude_mean'])
        
        spatial_proximity = (lat_diff <= 2 * cluster['latitude_std']) and (lon_diff <= 2 * cluster['longitude_std'])
        
        # Confidence calculation
        temporal_confidence = 1 if temporal_proximity else 0
        spatial_confidence = (
            np.exp(-lat_diff / (2 * cluster['latitude_std'])) *
            np.exp(-lon_diff / (2 * cluster['longitude_std']))
            if spatial_proximity else 0
        )
        confidence = time_weight * temporal_confidence + space_weight * spatial_confidence
        
        # Determine if it's the closest cluster
        if confidence > highest_confidence:
            highest_confidence = confidence
            closest_cluster_id = cluster['spatiotemporal_cluster_id']
            is_in_cluster = temporal_proximity and spatial_proximity
    
    return is_in_cluster, closest_cluster_id, highest_confidence




In [102]:
# Example: Applying the function to df
record_df['is_in_cluster'], record_df['closest_cluster_id'], record_df['confidence'] = zip(
    *record_df.apply(lambda record: is_record_in_cluster(record, summary_df), axis=1)
)




KeyboardInterrupt: 

In [111]:
import numpy as np
import pandas as pd
from datetime import timedelta

def compute_cluster_membership(record_df, cluster_df, time_weight=0.5, space_weight=0.5):
    """
    Compute whether records belong to spatiotemporal clusters using vectorized operations,
    considering a ±2° lat/lon and ±10-day temporal range.

    Args:
        record_df (pd.DataFrame): The DataFrame containing records.
        cluster_df (pd.DataFrame): The DataFrame containing spatiotemporal clusters.
        time_weight (float): Weight for temporal proximity in confidence calculation.
        space_weight (float): Weight for spatial proximity in confidence calculation.

    Returns:
        pd.DataFrame: Updated record_df with 'is_in_cluster', 'closest_cluster_id', and 'confidence' columns.
    """
    # Initialize results
    closest_cluster_ids = []
    confidences = []
    is_in_cluster_flags = []

    # Iterate over records
    for _, record in record_df.iterrows():
        # Define the filtering bounds
        startdate_lower = record['startdate'] - timedelta(days=10)
        startdate_upper = record['startdate'] + timedelta(days=10)
        lat_lower = record['centroidlat'] - 2
        lat_upper = record['centroidlat'] + 2
        lon_lower = record['centroidlon'] - 2
        lon_upper = record['centroidlon'] + 2

        # Filter clusters within the defined bounds
        candidate_clusters = cluster_df[
            (cluster_df['startdate_min'] <= startdate_upper) &
            (cluster_df['startdate_max'] >= startdate_lower) &
            (cluster_df['latitude_mean'] >= lat_lower) &
            (cluster_df['latitude_mean'] <= lat_upper) &
            (cluster_df['longitude_mean'] >= lon_lower) &
            (cluster_df['longitude_mean'] <= lon_upper)
        ]

        if candidate_clusters.empty:
            # No matching clusters
            closest_cluster_ids.append(None)
            confidences.append(0.0)
            is_in_cluster_flags.append(False)
            continue

        # Calculate lat/lon differences (broadcasted)
        lat_diffs = abs(candidate_clusters['latitude_mean'] - record['centroidlat'])
        lon_diffs = abs(candidate_clusters['longitude_mean'] - record['centroidlon'])

        # Check spatial proximity
        spatial_proximities = (
            (lat_diffs <= 2 * candidate_clusters['latitude_std']) &
            (lon_diffs <= 2 * candidate_clusters['longitude_std'])
        )

        # Compute confidence for each cluster
        spatial_confidences = np.exp(-lat_diffs / (2 * candidate_clusters['latitude_std'])) * \
                              np.exp(-lon_diffs / (2 * candidate_clusters['longitude_std']))
        spatial_confidences[~spatial_proximities] = 0  # Zero out confidence for non-matching clusters

        temporal_confidences = pd.Series(1.0, index=candidate_clusters.index)  # Temporal confidence is binary
        temporal_confidences[~spatial_proximities] = 0

        confidences_per_cluster = (
            time_weight * temporal_confidences +
            space_weight * spatial_confidences
        )

        # Find the best match
        if confidences_per_cluster.max() > 0:
            best_cluster = confidences_per_cluster.idxmax()
            closest_cluster_ids.append(candidate_clusters.loc[best_cluster, 'spatiotemporal_cluster_id'])
            confidences.append(confidences_per_cluster.max())
            is_in_cluster_flags.append(True)
        else:
            closest_cluster_ids.append(None)
            confidences.append(0.0)
            is_in_cluster_flags.append(False)

    # Add results back to record_df
    record_df['is_in_cluster'] = is_in_cluster_flags
    record_df['closest_cluster_id'] = closest_cluster_ids
    record_df['confidence'] = confidences

    return record_df


In [112]:
clustered_record_df = compute_cluster_membership(record_df, summary_df)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  record_df['is_in_cluster'] = is_in_cluster_flags
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  record_df['closest_cluster_id'] = closest_cluster_ids
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  record_df['confidence'] = confidences


In [113]:
clustered_record_df.is_in_cluster.sum()/len(clustered_record_df)

np.float64(3.819618515600754e-05)