# Followup: Determination, Taxon, and Collector Tables

In this notebook we will investigate the determination, taxon, collector, and agent tables, and decide whether they can be used to improve our clustering model.

In [19]:
import os
import sys

# Put the parent directory on the import path (once) so that modules living
# one level above this notebook can be imported.
module_path = os.path.abspath('..')
if sys.path.count(module_path) == 0:
    sys.path.append(module_path)

In [20]:
import pandas as pd
import pymysql

In [69]:
# Load the cleaned records and the previously processed (clustered) records.
clean_df = pd.read_csv('../data/clean_df.csv', index_col=0)
processed_df = pd.read_csv('../data/full_processed_df.csv', index_col=0)

# Lower-case the column names so 'collectingeventid' matches the merge key below.
clean_df.columns = [column.lower() for column in clean_df.columns]

# Left merge: every cleaned record is kept; records without a match in
# processed_df end up with NaN in 'spatiotemporal_cluster_id'.
df = clean_df.merge(
    processed_df[['collectingeventid', 'spatiotemporal_cluster_id']],
    how='left',
    on='collectingeventid',
)

  clean_df = pd.read_csv('../data/clean_df.csv', index_col=0)


In [70]:
# Display the merged frame; 'spatiotemporal_cluster_id' is the last column and is empty for records without a cluster.
df

Unnamed: 0,collectingeventid,startdate,enddate,remarks,localityid,collectionobjectid,text1,minelevation,maxelevation,elevationaccuracy,...,localityname,namedplace,geographyid,centroidlat,centroidlon,commonname,fullname,name,spatial_flag,spatiotemporal_cluster_id
0,1.0,2005-08-17,,Rhododendron-bamboo thicket with scattered Abies.,1.0,335013,Perennial herb 5-10 cm tall. Dry fruit brown.,3840.0,,,...,"Yaduo Cun, NE of Yaping Yakou at the Myanmar b...",,33223.0,,,,"Lishadi Xiang, Fugong County, Yunnan, China",Lishadi Xiang,1.0,26345.0
1,3.0,1922-08-10,1922-08-10,,3.0,10675,,4700.0,4700.0,0.0,...,Medow W of Gutzman's.,,17158.0,41.57,-100.0,,"Modoc County, California, United States",Modoc County,1.0,2295.0
2,4.0,1950-04-24,,atop sea bluffs. With Dudleya edulis over a l...,295658.0,392458,,,,,...,"San Clemente, atop sea bluffs",,19808.0,33.70,-100.0,,"Orange County, California, United States",Orange County,0.0,
3,7.0,1888-01-01,1888-01-01,,7.0,266069,,,,,...,Emigrant Gap.,,20834.0,39.07,-100.0,,"Placer County, California, United States",Placer County,0.0,
4,8.0,2006-08-18,2006-08-18,Wetland vegetation.,8.0,36409,Perennial herb 40-80 cm tall. Young fruit green.,3470.0,3470.0,0.0,...,Along N side of Nianwaluo He on the trail from...,Along N side of Nianwaluo He on the trail from...,33268.0,,,,"Bingzhongluo Xiang, Gongshan Autonomous County...",Bingzhongluo Xiang,1.0,27060.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
867847,1055140.0,1990-03-21,,on soil along the trail in part shade,984982.0,1220335,,50.0,,,...,"Pacific Spirit Park, in Vancouver, along the D...",,558.0,54.00,-100.0,,"British Columbia, Canada",British Columbia,0.0,
867848,1055141.0,2009-03-28,,on an oak tree trunk in part shade,984983.0,1220336,,110.0,,,...,"Ducktrap River Preserve, Coastal Mountains Lan...",,26853.0,44.47,-69.1,,"Waldo County, Maine, United States",Waldo County,1.0,28779.0
867849,1055142.0,2005-06-16,,in crevices and the face of the volcanic cliff...,984984.0,1220337,,6740.0,,,...,"Yellowstone National Park, at the entrance to ...",,20195.0,44.42,-100.0,,"Park County, Wyoming, United States",Park County,1.0,26233.0
867850,1055143.0,1940-07-08,,submerged along stream above the Springs,984985.0,1220338,,,,,...,"Jasper National Park, Miette Hot Springs",,312.0,52.28,-100.0,,"Alberta, Canada",Alberta,1.0,4804.0


In [23]:
# Database connection parameters.
# Every value can be overridden with an environment variable so that real
# credentials never need to be written into (or committed with) the notebook.
# The fallbacks are the original local-development values.
db_config = {
    'host': os.environ.get('EXPED_DB_HOST', 'localhost'),  # or the IP address of your MySQL server
    'port': int(os.environ.get('EXPED_DB_PORT', '3306')),  # default MySQL port
    'user': os.environ.get('EXPED_DB_USER', 'myuser'),
    'password': os.environ.get('EXPED_DB_PASSWORD', 'mypassword'),
    'database': os.environ.get('EXPED_DB_NAME', 'exped_cluster_db'),
}

# Establishing the connection. The keys of db_config are exactly the keyword
# arguments passed to pymysql.connect, so the dict is unpacked directly.
# The connection is reused by the query cells that follow.
connection = pymysql.connect(**db_config)



# 1. Determination

Let's pull the table from the CAS Botany backup, and look into it!

## a.) EDA

In [7]:

# Select every column of the `determination` table.
query = "SELECT * FROM determination"

# Run the query over the open connection and load the result as a DataFrame.
determination_df = pd.read_sql(sql=query, con=connection)

# Preview the first five rows.
determination_df.head(5)

  determination_df = pd.read_sql(query, connection)


Unnamed: 0,DeterminationID,TimestampCreated,TimestampModified,Version,CollectionMemberID,Addendum,AlternateName,Confidence,DeterminedDate,DeterminedDatePrecision,...,Number5,Text3,Text4,Text5,Text6,Text7,Text8,YesNo3,YesNo4,YesNo5
0,1,2007-03-21 17:17:55,2016-10-19 09:19:30,2,4,,,,,1.0,...,,,,,,,,,,
1,3,2011-08-05 12:12:29,NaT,1,4,,,,,1.0,...,,,,,,,,,,
2,4,2007-03-21 17:17:54,2016-05-13 14:43:44,2,4,,,,,1.0,...,,,,,,,,,,
3,5,2007-03-21 17:17:53,NaT,1,4,,,,1955-07-01,2.0,...,,,,,,,,,,
4,6,2011-08-03 09:43:44,NaT,1,4,,,,,1.0,...,,,,,,,,,,


In [None]:
# List every column of the determination table so the relevant ones can be picked out.
determination_df.columns

Index(['DeterminationID', 'TimestampCreated', 'TimestampModified', 'Version',
       'CollectionMemberID', 'Addendum', 'AlternateName', 'Confidence',
       'DeterminedDate', 'DeterminedDatePrecision', 'FeatureOrBasis',
       'IsCurrent', 'Method', 'NameUsage', 'Number1', 'Number2', 'Qualifier',
       'VarQualifer', 'Remarks', 'SubSpQualifier', 'Text1', 'Text2',
       'TypeStatusName', 'VarQualifier', 'YesNo1', 'YesNo2', 'GUID', 'TaxonID',
       'CollectionObjectID', 'ModifiedByAgentID', 'DeterminerID',
       'PreferredTaxonID', 'CreatedByAgentID', 'Integer1', 'Integer2',
       'Integer3', 'Integer4', 'Integer5', 'Number3', 'Number4', 'Number5',
       'Text3', 'Text4', 'Text5', 'Text6', 'Text7', 'Text8', 'YesNo3',
       'YesNo4', 'YesNo5'],
      dtype='object')

In [15]:
# Show only the identifier columns linking a determination to its specimen and taxon.
determination_df.loc[:, ['DeterminationID', 'CollectionObjectID', 'PreferredTaxonID']]

Unnamed: 0,DeterminationID,CollectionObjectID,PreferredTaxonID
0,1,1,92652.0
1,3,123487,57547.0
2,4,7,100228.0
3,5,8,62636.0
4,6,9848,61417.0
...,...,...,...
1092796,1221629,1220335,94020.0
1092797,1221630,1220336,62903.0
1092798,1221631,1220337,62239.0
1092799,1221632,1220338,53943.0


Columns of interest: DeterminationID, CollectionObjectID, PreferredTaxonID

# 2. Taxon

Let's pull the table from the CAS Botany backup, and look into it!

## a.) EDA

In [24]:

# Select every column of the `taxon` table.
query = "SELECT * FROM taxon"

# Run the query over the open connection and load the result as a DataFrame.
taxon_df = pd.read_sql(sql=query, con=connection)

# Preview the first five rows.
taxon_df.head(5)

  taxon_df = pd.read_sql(query, connection)


Unnamed: 0,TaxonID,TimestampCreated,TimestampModified,Version,Author,CitesStatus,COLStatus,CommonName,CultivarName,EnvironmentalProtectionStatus,...,YesNo18,YesNo19,YesNo4,YesNo5,YesNo6,YesNo7,YesNo8,YesNo9,LSID,TaxonAttributeID
0,1,2011-09-21 13:48:50,2011-09-21 13:48:50,20,,,,,,,...,,,,,,,,,,
1,2,2011-09-21 01:48:03,NaT,19,,,,,,,...,,,,,,,,,,
2,3,2011-09-21 01:48:03,2021-02-24 21:11:54,5,,,,,,,...,,,,,,,,,,
3,4,2011-09-21 01:48:03,2021-02-25 16:13:58,3,,,,,,,...,,,,,,,,,,
4,5,2011-09-21 01:48:03,NaT,1,,,,,,,...,,,,,,,,,,


In [25]:
# List every column of the taxon table so the relevant ones can be picked out.
taxon_df.columns

Index(['TaxonID', 'TimestampCreated', 'TimestampModified', 'Version', 'Author',
       'CitesStatus', 'COLStatus', 'CommonName', 'CultivarName',
       'EnvironmentalProtectionStatus', 'EsaStatus', 'FullName', 'GroupNumber',
       'GUID', 'HighestChildNodeNumber', 'IsAccepted', 'IsHybrid',
       'IsisNumber', 'LabelFormat', 'Name', 'NcbiTaxonNumber', 'NodeNumber',
       'Number1', 'Number2', 'RankID', 'Remarks', 'Source',
       'TaxonomicSerialNumber', 'Text1', 'Text2', 'UnitInd1', 'UnitInd2',
       'UnitInd3', 'UnitInd4', 'UnitName1', 'UnitName2', 'UnitName3',
       'UnitName4', 'UsfwsCode', 'Visibility', 'AcceptedID', 'TaxonTreeDefID',
       'ParentID', 'HybridParent1ID', 'ModifiedByAgentID', 'CreatedByAgentID',
       'VisibilitySetByID', 'TaxonTreeDefItemID', 'HybridParent2ID', 'Number3',
       'Number4', 'Number5', 'Text3', 'Text4', 'Text5', 'YesNo1', 'YesNo2',
       'YesNo3', 'Integer1', 'Integer2', 'Integer3', 'Integer4', 'Integer5',
       'Text10', 'Text11', 'Text12',

In [26]:
# Show the taxon id, its names, and the id of its parent taxon.
taxon_df.loc[:, ['TaxonID', 'FullName', 'Name', 'ParentID']]

Unnamed: 0,TaxonID,FullName,Name,ParentID
0,1,Life,Life,
1,2,Plantae,Plantae,1.0
2,3,Magnoliophyta,Magnoliophyta,14641.0
3,4,Liliopsida,Liliopsida,179282.0
4,5,Poales,Poales,4.0
...,...,...,...,...
191375,261025,Saxifraga tangutica var. tangutica,tangutica,261024.0
191376,261026,Pittosporum arborescens,arborescens,21441.0
191377,261027,Chiloscyphus profundus,profundus,19035.0
191378,261028,Couroupita nicaraguensis,nicaraguensis,19252.0


Columns of interest: TaxonID, FullName, Name, ParentID

# 3. Collector

Let's pull the table from the CAS Botany backup, and look into it!

## a.) EDA

In [27]:

# Select every column of the `collector` table.
query = "SELECT * FROM collector"

# Run the query over the open connection and load the result as a DataFrame.
collector_df = pd.read_sql(sql=query, con=connection)

# Preview the first five rows.
collector_df.head(5)

  collector_df = pd.read_sql(query, connection)


Unnamed: 0,CollectorID,TimestampCreated,TimestampModified,Version,IsPrimary,OrderNumber,Remarks,CreatedByAgentID,DivisionID,ModifiedByAgentID,CollectingEventID,AgentID,Text1,Text2,YesNo1,YesNo2
0,1,2007-05-31 00:00:00,NaT,0,b'\x01',0,,,,,1,56882,,,,
1,2,2007-05-31 00:00:00,2017-06-07 16:46:53,1,b'\x01',0,,,,78286.0,2,5043,,,,
2,3,2007-05-31 00:00:00,NaT,0,b'\x01',0,,,,,3,4934,,,,
3,5,2009-05-22 16:17:06,2009-05-22 16:19:01,0,b'\x01',0,,,,,5,3909,,,,
4,6,2007-05-31 00:00:00,NaT,0,b'\x01',0,,,,,6,3879,,,,


In [28]:
# List every column of the collector table so the relevant ones can be picked out.
collector_df.columns

Index(['CollectorID', 'TimestampCreated', 'TimestampModified', 'Version',
       'IsPrimary', 'OrderNumber', 'Remarks', 'CreatedByAgentID', 'DivisionID',
       'ModifiedByAgentID', 'CollectingEventID', 'AgentID', 'Text1', 'Text2',
       'YesNo1', 'YesNo2'],
      dtype='object')

In [29]:
# Show the columns linking a collector row to its collecting event and agent.
collector_df.loc[:, ['CollectorID', 'CollectingEventID', 'AgentID']]

Unnamed: 0,CollectorID,CollectingEventID,AgentID
0,1,1,56882
1,2,2,5043
2,3,3,4934
3,5,5,3909
4,6,6,3879
...,...,...,...
1385482,1470170,1055141,88951
1385483,1470171,1055142,88951
1385484,1470172,1055143,7738
1385485,1470173,1055143,84592


Columns of interest: CollectorID, CollectingEventID, AgentID

# 4. Agent

Let's pull the table from the CAS Botany backup, and look into it!

## a.) EDA

In [30]:

# Select every column of the `agent` table.
query = "SELECT * FROM agent"

# Run the query over the open connection and load the result as a DataFrame.
agent_df = pd.read_sql(sql=query, con=connection)

# Preview the first five rows.
agent_df.head(5)

  agent_df = pd.read_sql(query, connection)


Unnamed: 0,AgentID,TimestampCreated,TimestampModified,Version,Abbreviation,AgentType,DateOfBirth,DateOfBirthPrecision,DateOfDeath,DateOfDeathPrecision,...,Date2Precision,Integer1,Integer2,Text1,Text2,VerbatimDate1,VerbatimDate2,Text3,Text4,Text5
0,1,2011-09-21 13:48:48,2011-09-21 13:48:48,3,,1,,1.0,,1.0,...,,,,,,,,,,
1,2,2011-09-21 14:01:13,2012-04-20 18:48:31,8,,1,,1.0,,1.0,...,,,,,,,,,,
2,3,2007-03-21 17:44:36,2007-03-21 17:44:36,1,,1,,1.0,,1.0,...,,,,,,,,,,
3,4,2007-03-21 17:44:37,2023-04-28 11:16:34,3,,1,,1.0,,1.0,...,,,,,,,,,,
4,5,2007-03-21 17:44:36,2007-03-21 17:44:36,1,,1,,1.0,,1.0,...,,,,,,,,,,


In [31]:
# List every column of the agent table so the relevant ones can be picked out.
agent_df.columns

Index(['AgentID', 'TimestampCreated', 'TimestampModified', 'Version',
       'Abbreviation', 'AgentType', 'DateOfBirth', 'DateOfBirthPrecision',
       'DateOfDeath', 'DateOfDeathPrecision', 'DateType', 'Email', 'FirstName',
       'GUID', 'Initials', 'Interests', 'JobTitle', 'LastName',
       'MiddleInitial', 'Remarks', 'Title', 'URL', 'ParentOrganizationID',
       'InstitutionTCID', 'CreatedByAgentID', 'CollectionTCID',
       'CollectionCCID', 'ModifiedByAgentID', 'InstitutionCCID',
       'SpecifyUserID', 'DivisionID', 'Suffix', 'Date1', 'Date1Precision',
       'Date2', 'Date2Precision', 'Integer1', 'Integer2', 'Text1', 'Text2',
       'VerbatimDate1', 'VerbatimDate2', 'Text3', 'Text4', 'Text5'],
      dtype='object')

In [33]:
# Narrow the agent table to the identity and affiliation columns.
# NOTE(review): the saved output of this cell shows e-mail addresses; consider
# clearing the output or leaving 'Email' out before sharing the notebook.
agent_df[['AgentID', 'AgentType', 'DateOfBirth', 'DateOfDeath', 'Email', 'FirstName', 'JobTitle', 'LastName', 'ParentOrganizationID',
       'InstitutionTCID', 'CollectionTCID', 'CollectionCCID', 'InstitutionCCID']]

Unnamed: 0,AgentID,AgentType,DateOfBirth,DateOfDeath,Email,FirstName,JobTitle,LastName,ParentOrganizationID,InstitutionTCID,CollectionTCID,CollectionCCID,InstitutionCCID
0,1,1,,,dtrock@calacademy.org,Debra,,Trock,,,,,
1,2,1,,,jfong@calacademy.org,Jon,,Fong,,,,,
2,3,1,,,,H.,,Inoue,,,,,
3,4,1,,,,D.,,Breedlove,,,,,
4,5,1,,,,D.,,Tao,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...
96815,106391,1,,,,Sverre,,Schou,,,,,
96816,106392,1,,,,,,Ex. herb. Joseph Crawford,,,,,
96817,106393,1,,,,A.,,Vaarama,,,,,
96818,106394,1,,,,K.,,Khanna,,,,,


In [44]:
# Which distinct agent-type codes occur in the table?
agent_df['AgentType'].unique()

array([1, 2, 0, 3])

Columns of interest: AgentID, FirstName, LastName

Let's do some merging to get a collector reference table complete with personal information

In [48]:
# Build the collector reference table: one row per collector entry, with the
# agent's first and last name attached (left merge, so every collector row is kept),
# then lower-case the column names.
collector_agent_df = (
    collector_df[['CollectorID', 'CollectingEventID', 'AgentID']]
    .merge(agent_df[['AgentID', 'FirstName', 'LastName']], how='left', on='AgentID')
    .rename(columns=str.lower)
)


In [49]:
# Display the collector reference table (collector id, collecting event id, agent id, and the agent's name).
collector_agent_df

Unnamed: 0,collectorid,collectingeventid,agentid,firstname,lastname
0,1,1,56882,,Gaoligong Shan Biodiversity Survey
1,2,2,5043,E.,Palacios E.
2,3,3,4934,Leland,Smith
3,5,5,3909,A.,Shilom Ton
4,6,6,3879,Frank,Almeda
...,...,...,...,...,...
1385482,1470170,1055141,88951,Judy,Harpel
1385483,1470171,1055142,88951,Judy,Harpel
1385484,1470172,1055143,7738,E.,Frye
1385485,1470173,1055143,84592,T.,Frye


And now let's use this collector_agent reference table to merge this data into the overall df

In [71]:
# Attach collector/agent details to every record.
# A collecting event can have several collectors, so this left merge repeats a
# record once per collector and the frame grows in length.
# `df` is overwritten with its own merge result, so running this cell twice would
# merge the collector columns in a second time. The guard makes a re-run a no-op.
if 'agentid' not in df.columns:
    df = df.merge(right=collector_agent_df, on='collectingeventid', how='left')

In [72]:
# How many rows does each agent (collector) account for? Most frequent first.
df['agentid'].value_counts()

agentid
5116.0      60885
4804.0      56004
1637.0      27188
5534.0      25707
56882.0     18599
            ...  
92558.0         1
80216.0         1
6282.0          1
92565.0         1
106395.0        1
Name: count, Length: 34000, dtype: int64

In [73]:
# Look up the name of the most frequent agent id (5116).
agent_df.loc[agent_df['AgentID'] == 5116, ['FirstName', 'LastName']]

Unnamed: 0,FirstName,LastName
5100,John,Howell


In [74]:
# Dump every field of that agent's record as a {column: {row label: value}} dict.
agent_df.loc[agent_df['AgentID'] == 5116].to_dict()

{'AgentID': {5100: 5116},
 'TimestampCreated': {5100: Timestamp('2007-05-31 15:42:47')},
 'TimestampModified': {5100: Timestamp('2007-05-31 15:42:47')},
 'Version': {5100: 1},
 'Abbreviation': {5100: None},
 'AgentType': {5100: 1},
 'DateOfBirth': {5100: None},
 'DateOfBirthPrecision': {5100: 1.0},
 'DateOfDeath': {5100: None},
 'DateOfDeathPrecision': {5100: 1.0},
 'DateType': {5100: None},
 'Email': {5100: None},
 'FirstName': {5100: 'John'},
 'GUID': {5100: '75a2af2f-f8be-11e2-a0e5-60eb693e819a'},
 'Initials': {5100: None},
 'Interests': {5100: 'Phan.'},
 'JobTitle': {5100: '1903-1994'},
 'LastName': {5100: 'Howell'},
 'MiddleInitial': {5100: 'Thomas'},
 'Remarks': {5100: 'C. Amer. & Mexico: CAS, F, GH,.  United States of America Mexico Ecuador'},
 'Title': {5100: None},
 'URL': {5100: None},
 'ParentOrganizationID': {5100: None},
 'InstitutionTCID': {5100: None},
 'CreatedByAgentID': {5100: nan},
 'CollectionTCID': {5100: None},
 'CollectionCCID': {5100: None},
 'ModifiedByAgentID'

In [75]:
# Inspect the cluster labels after the collector merge; rows without a cluster show up as NaN.
df['spatiotemporal_cluster_id']

0          26345.0
1           2295.0
2              NaN
3              NaN
4          27060.0
            ...   
1194709    28779.0
1194710    26233.0
1194711     4804.0
1194712     4804.0
1194713        NaN
Name: spatiotemporal_cluster_id, Length: 1194714, dtype: float64

In [76]:
# Confirm that the collector columns (collectorid, agentid, firstname, lastname) are now part of df.
df.columns

Index(['collectingeventid', 'startdate', 'enddate', 'remarks', 'localityid',
       'collectionobjectid', 'text1', 'minelevation', 'maxelevation',
       'elevationaccuracy', 'latitude1', 'longitude1', 'localityname',
       'namedplace', 'geographyid', 'centroidlat', 'centroidlon', 'commonname',
       'fullname', 'name', 'spatial_flag', 'spatiotemporal_cluster_id',
       'collectorid', 'agentid', 'firstname', 'lastname'],
      dtype='object')

In [80]:
from collections import Counter

import pandas as pd
from geopy.distance import great_circle

# # Example: Load your clustered data
# df = pd.read_csv("botany_collections.csv")  # Your dataset with lat, lon, date, collector, and cluster labels

# Collector-based rescue of unclustered records.
#
# A record without a cluster is assigned to an existing cluster when
#   * its collector is that cluster's majority collector,
#   * the closest point of the cluster is within `merge_threshold_km`, and
#   * the closest start date in the cluster is within `merge_threshold_days`.
# When several clusters qualify, the spatially nearest one is chosen, so each
# record is reassigned (and counted) at most once.
#
# Records without a cluster carry NaN in 'spatiotemporal_cluster_id' after the
# left merge that built `df`, so NaN is treated as "unclustered" alongside -1.
# Note that `df` holds one row per record *and* collector, so "points" below
# are rows of `df`.

merge_threshold_km = 5  # Maximum great-circle distance (km) to the nearest cluster point
merge_threshold_days = 10  # Maximum start-date gap (days) to the nearest cluster point

cluster_labels = df['spatiotemporal_cluster_id']
is_unclustered = cluster_labels.isna() | (cluster_labels == -1)

# Parse the dates once instead of once per comparison; unparseable dates become NaT.
start_dates = pd.to_datetime(df['startdate'], errors='coerce')

# Step 1: Find the majority collector for each cluster (rows without a collector do not vote)
clustered = df[~is_unclustered]
majority_collectors = (
    clustered.dropna(subset=['agentid'])
    .groupby('spatiotemporal_cluster_id')['agentid']
    .apply(lambda x: Counter(x).most_common(1)[0][0])  # Most frequent collector in the cluster
    .to_dict()
)

# Invert the mapping so each point only visits the clusters led by its own collector.
clusters_by_collector = {}
for cluster_id, majority_collector in majority_collectors.items():
    clusters_by_collector.setdefault(majority_collector, []).append(cluster_id)

# Row labels of the members of every cluster.
cluster_members = clustered.groupby('spatiotemporal_cluster_id').groups

# Step 2: Identify unclustered points; only those with coordinates and a collector can be matched
unclustered = df[is_unclustered]
candidates = unclustered[['latitude1', 'longitude1', 'agentid']].dropna()

# Step 3: Merge unclustered points if they match the collector and are nearby
df['new_cluster'] = float('nan')  # Reset so that re-running the cell starts from a clean column
reassigned_count = 0
affected_clusters = set()

for index, lat, lon, collector in candidates.itertuples(name=None):
    date = start_dates.at[index]
    if pd.isna(date):
        continue  # No usable start date, so the temporal test cannot pass

    best_cluster, best_dist = None, None
    for cluster_id in clusters_by_collector.get(collector, ()):
        members = cluster_members[cluster_id]

        # Cheap vectorised temporal test first; NaN (no dated member) fails it.
        min_time_diff = (start_dates.loc[members] - date).abs().dt.days.min()
        if not min_time_diff <= merge_threshold_days:
            continue

        # Spatial test against the members that actually have coordinates.
        coords = df.loc[members, ['latitude1', 'longitude1']].dropna()
        if coords.empty:
            continue
        min_dist = min(
            great_circle((lat, lon), point).km
            for point in coords.itertuples(index=False, name=None)
        )

        if min_dist <= merge_threshold_km and (best_dist is None or min_dist < best_dist):
            best_cluster, best_dist = cluster_id, min_dist

    if best_cluster is not None:
        df.at[index, 'new_cluster'] = best_cluster  # Assign cluster label
        reassigned_count += 1
        affected_clusters.add(best_cluster)

# Summary statistics
total_unclustered = len(unclustered)
reassignment_percentage = (reassigned_count / total_unclustered) * 100 if total_unclustered > 0 else 0

print(f"Total points reassigned from -1 to a valid cluster: {reassigned_count}")
print(f"Number of unique clusters affected: {len(affected_clusters)}")
print(f"Percentage of originally unclustered points reassigned: {reassignment_percentage:.2f}%")





Total points reassigned from -1 to a valid cluster: 0
Number of unique clusters affected: 0
Percentage of originally unclustered points reassigned: 0.00%


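The run recorded above reassigned no points, so accounting for the collector did not change our clustering at all. Taken at face value, the collectors and the clusters agree well, but see the TODO below before relying on this result.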

This is not surprising because the records missing lat/lon and start date are probably also missing agentid. Therefore this step is more of a check than anything.

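TODO: Check to make sure this code is actually working! In the `spatiotemporal_cluster_id` output above, records without a cluster show up as `NaN`, whereas the run recorded above looked for unclustered records with `== -1`, so first confirm that it examined any records at all. To test further, one could assign an *incorrect* cluster id to a record far in space and time from the cluster specs, and check to make sure that it is identified and removed from the cluster in this step.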