# Expedition Clustering: Botany EDA

### In this notebook, we will perform exploratory data analysis on the tables in the botany database. 

### The goal is to determine which fields in which tables may hold information useful in clustering records to individual expeditions.

## Tables
-  **CollectingEvent**
-  **CollectingEventAttribute**
-  **CollectionObject**
-  **CollectionObjectAttachment**
-  **Attachment**
-  **Locality**
-  **LocalityDetail**
-  **Geography**
-  **GeoCoordDetail**

In [2]:
import os

import numpy as np
import pandas as pd
import pymysql

In [3]:
# Database connection parameters.
# Every value can be overridden through an environment variable so that real
# credentials never have to be saved inside the notebook; the fallbacks are
# the original local-development placeholders.
db_config = {
    'host': os.environ.get('DB_HOST', 'localhost'),  # or the IP address of your MySQL server
    'port': int(os.environ.get('DB_PORT', '3306')),  # default MySQL port
    'user': os.environ.get('DB_USER', 'myuser'),
    'password': os.environ.get('DB_PASSWORD', 'mypassword'),
    'database': os.environ.get('DB_NAME', 'exped_cluster_db'),
}

# Establishing the connection; the keys of db_config match the keyword
# arguments of pymysql.connect, so the dict is unpacked directly.
connection = pymysql.connect(**db_config)



# Collecting Event Table

In [4]:
# List every table in the connected database.
# The cursor is used as a context manager so it is closed as soon as the
# result set has been fetched.
with connection.cursor() as cursor:
    cursor.execute("SHOW TABLES")
    tables = cursor.fetchall()

# Each fetched row is indexed at position 0 to get the table name
for table in tables:
    print(table[0])

# The connection itself is intentionally left open: the cells below keep
# querying through it.

accession
accession_copy
accessionagent
accessionattachment
accessionauthorization
accessioncitation
address
addressofrecord
agent
agentattachment
agentgeography
agentidentifier
agentspecialty
agentvariant
appraisal
attachment
attachmentdataset
attachmentimageattribute
attachmentmetadata
attachmenttag
attributedef
auth_group
auth_group_permissions
auth_permission
author
autonumberingscheme
autonumsch_coll
autonumsch_div
autonumsch_dsp
borrow
borrowagent
borrowattachment
borrowmaterial
borrowreturnmaterial
botmap
botportal
bryo_images
bryo_nobarcode
bryoportal
cch2_bryophyte
cch2_extra
cch2_images
cch2_nobarcode
cch2_not_in_specify
collectingevent
collectingevent_dupes
collectingeventattachment
collectingeventattr
collectingeventattribute
collectingeventauthorization
collectingtrip
collectingtripattachment
collectingtripattribute
collectingtripauthorization
collection
collectionobject
collectionobject_dupes
collectionobjectattachment
collectionobjectattr
collectionobjectattribute
collec

In [5]:

# Pull the whole collectingevent table into a DataFrame.
# NOTE(review): pandas warns when `con` is a raw DBAPI connection rather than
# a SQLAlchemy connectable -- confirm whether that warning is acceptable here.
query = "SELECT * FROM collectingevent"
collecting_event_df = pd.read_sql(sql=query, con=connection)

# Preview the first rows
collecting_event_df.head()

  collecting_event_df = pd.read_sql(query, connection)


Unnamed: 0,CollectingEventID,TimestampCreated,TimestampModified,Version,EndDate,EndDatePrecision,EndDateVerbatim,EndTime,Method,Remarks,...,StationFieldNumberModifier1,StationFieldNumberModifier2,StationFieldNumberModifier3,Text3,Text4,Text5,Text6,Text7,Text8,UniqueIdentifier
0,1,2007-03-21 16:23:34,2007-03-21 16:23:34,2,,0.0,,,,Rhododendron-bamboo thicket with scattered Abies.,...,,,,,,,,,,
1,2,2007-03-21 16:23:27,2017-06-07 16:46:53,2,,0.0,,,,"Selva baja caducifolia, Ruderal. Secund.",...,,,,,,,,,,
2,3,2007-03-21 16:23:48,2010-10-05 14:21:32,1,1922-08-10,1.0,,,,,...,,,,,,,,,,
3,4,2012-05-16 16:31:38,2012-05-17 14:43:24,2,,1.0,,,,atop sea bluffs. With Dudleya edulis over a l...,...,,,,,,,,,,
4,5,2009-05-22 16:17:06,2009-05-22 16:19:59,1,1964-03-01,2.0,,,,Slope with Quercus,...,,,,,,,,,,


In [6]:
len(collecting_event_df)

777807

In [7]:
# NOTE(review): bare literal that only echoes the row count shown by the previous
# cell; it looks like a leftover and can probably be deleted -- confirm.
777807

777807

In [8]:
collecting_event_df.columns

Index(['CollectingEventID', 'TimestampCreated', 'TimestampModified', 'Version',
       'EndDate', 'EndDatePrecision', 'EndDateVerbatim', 'EndTime', 'Method',
       'Remarks', 'StartDate', 'StartDatePrecision', 'StartDateVerbatim',
       'StartTime', 'StationFieldNumber', 'VerbatimDate', 'VerbatimLocality',
       'Visibility', 'ModifiedByAgentID', 'LocalityID', 'CreatedByAgentID',
       'DisciplineID', 'VisibilitySetByID', 'CollectingTripID',
       'CollectingEventAttributeID', 'SGRStatus', 'GUID', 'Integer1',
       'Integer2', 'ReservedInteger3', 'ReservedInteger4', 'ReservedText1',
       'ReservedText2', 'Text1', 'Text2', 'PaleoContextID',
       'StationFieldNumberModifier1', 'StationFieldNumberModifier2',
       'StationFieldNumberModifier3', 'Text3', 'Text4', 'Text5', 'Text6',
       'Text7', 'Text8', 'UniqueIdentifier'],
      dtype='object')

In [9]:
collecting_event_df[['CollectingEventID', 'StartDate', 'EndDate', 'Remarks', 'LocalityID']]

Unnamed: 0,CollectingEventID,StartDate,EndDate,Remarks,LocalityID
0,1,2005-08-17,,Rhododendron-bamboo thicket with scattered Abies.,1.0
1,2,1988-08-19,,"Selva baja caducifolia, Ruderal. Secund.",2.0
2,3,1922-08-10,1922-08-10,,3.0
3,4,1950-04-24,,atop sea bluffs. With Dudleya edulis over a l...,295658.0
4,5,1964-01-01,1964-03-01,Slope with Quercus,5.0
...,...,...,...,...,...
777802,823109,2020-04-09,,Growing in talus below limestone rock outcrops,758830.0
777803,823110,2023-06-02,,Growing with Sarcobatus baileyi on the valley ...,758831.0
777804,823111,2022-04-08,,"Heavy clay and rocky ""pebble plain"" soil, appa...",758832.0
777805,823112,2023-08-22,,Growing at the edge of a small pond along the ...,758833.0


In [14]:
collecting_event_df.CollectingEventAttributeID

0         25177.0
1             NaN
2             NaN
3             NaN
4             NaN
           ...   
777802        NaN
777803        NaN
777804        NaN
777805        NaN
777806        NaN
Name: CollectingEventAttributeID, Length: 777807, dtype: float64

## Questions

- What do the date-precision codes mean? Is it {0: Null, 1: Excellent, 2: Good, 3: Bad}?
- Should we use date times?
- What is verbatim date?
- What is StationFieldNumber?



# Collecting Event Attribute Table

In [104]:

# Pull the whole collectingeventattribute table into a DataFrame.
query = "SELECT * FROM collectingeventattribute"
collecting_event_attribute_df = pd.read_sql(sql=query, con=connection)

# Preview the last rows of the table
collecting_event_attribute_df.tail()

  collecting_event_attribute_df = pd.read_sql(query, connection)


Unnamed: 0,CollectingEventAttributeID,TimestampCreated,TimestampModified,Version,Number1,Number10,Number11,Number12,Number13,Number2,...,Integer1,Integer10,Integer2,Integer3,Integer4,Integer5,Integer6,Integer7,Integer8,Integer9
28080,29704,2021-09-01 12:58:34,2021-09-01 12:58:34,0,213688.0,,,,,,...,,,,,,,,,,
28081,29705,2021-10-18 13:47:50,2021-10-18 13:47:50,0,129827.0,,,,,,...,,,,,,,,,,
28082,29706,2021-11-17 11:40:15,2021-11-17 11:40:15,0,221450.0,,,,,,...,,,,,,,,,,
28083,29707,2023-01-05 15:24:08,2023-01-05 15:24:08,0,106953.0,,,,,,...,,,,,,,,,,
28084,29708,2023-08-09 13:19:53,2023-08-09 13:19:53,0,48670.0,,,,,,...,,,,,,,,,,


In [105]:
len(collecting_event_attribute_df)

28085

## Questions

- Is this a table we even care about? Or is it some artifact?
- Where can we find a key explaining the column names (e.g. Number1, Integer1)?

# Collection Object Table

In [15]:

# Pull the whole collectionobject table into a DataFrame.
query = "SELECT * FROM collectionobject"
collection_object_df = pd.read_sql(sql=query, con=connection)

  collection_object_df = pd.read_sql(query, connection)


In [None]:
# Displaying the first few rows of the DataFrame
collection_object_df.head()

Unnamed: 0,CollectionObjectID,TimestampCreated,TimestampModified,Version,CollectionMemberID,AltCatalogNumber,Availability,CatalogNumber,CatalogedDate,CatalogedDatePrecision,...,EmbargoReleaseDatePrecision,EmbargoStartDate,EmbargoStartDatePrecision,Text4,Text5,Text6,Text7,Text8,UniqueIdentifier,EmbargoAuthorityID
0,1,2007-03-21 16:29:20,2020-07-24 16:47:38,3,4,702772,,522744,2007-03-21,1.0,...,,,,,,,,,,
1,4,2007-03-21 16:29:24,2024-06-24 15:14:03,24,4,1263814,,550184,2007-03-21,1.0,...,,,,,,,,,,
2,5,2007-03-21 16:29:20,2013-11-11 16:46:22,2,4,1060568,,319960,2007-03-21,1.0,...,,,,,,,,,,
3,7,2007-03-21 16:29:18,2016-05-13 14:43:44,2,4,674815,,504936,2007-03-21,1.0,...,,,,,,,,,,
4,8,2007-03-21 16:29:17,2009-05-20 11:41:43,1,4,385059,,5321,2007-03-21,1.0,...,,,,,,,,,,


In [None]:
collection_object_df.columns

NameError: name 'collection_object_df' is not defined

In [None]:
len(collection_object_df)

777812

In [None]:
collection_object_df[['CollectionObjectID', 'Text1', 'CollectingEventID']]

Unnamed: 0,CollectionObjectID,Text1,CollectingEventID
0,1,Shrub 10 feet tall,126372.0
1,4,Growing on Quercus trunk.,195645.0
2,5,Tree ca. 4 m tall. Fruit purplish black.,66157.0
3,7,Tree 60 feet tall.,81225.0
4,8,,44023.0
...,...,...,...
777807,988271,Annual,823109.0
777808,988272,"Annual to 1.2 m tall, multi-branched.",823110.0
777809,988273,"Annual, petals yellow, sepals with yellow marg...",823111.0
777810,988274,petals yellow,823112.0


In [None]:
# Share of rows carrying a distinct CollectionObjectID (1.0 means the ID is unique per row)
collection_object_df['CollectionObjectID'].nunique(dropna=False) / len(collection_object_df)

1.0

In [None]:
# Share of rows carrying a distinct CollectingEventID; missing IDs are counted
# together as one distinct value (dropna=False), so they pull the ratio below 1.
collection_object_df['CollectingEventID'].nunique(dropna=False) / len(collection_object_df)

0.9999794294765316

In [None]:
# Number of CollectingEventIDs referenced by more than one collection object.
# value_counts() skips missing IDs, and its `sort` argument is a boolean, so
# the previous sort='desc' was simply dropped (ordering is irrelevant to the count).
(collection_object_df['CollectingEventID'].value_counts() > 1).sum()

0

In [None]:
collection_object_df[collection_object_df.CollectingEventID.isna()]

Unnamed: 0,CollectionObjectID,TimestampCreated,TimestampModified,Version,CollectionMemberID,AltCatalogNumber,Availability,CatalogNumber,CatalogedDate,CatalogedDatePrecision,...,EmbargoReleaseDatePrecision,EmbargoStartDate,EmbargoStartDatePrecision,Text4,Text5,Text6,Text7,Text8,UniqueIdentifier,EmbargoAuthorityID
416480,592370,2014-01-28 09:24:13,2014-04-21 18:21:32,9,4,99999.0,,,2014-01-28,1.0,...,,,,,,,,,,
599732,781521,2018-03-22 14:05:45,2018-03-22 14:05:45,1,4,384751.0,,582690.0,2018-03-22,1.0,...,,,,,,,,,,
600240,782080,2018-04-04 14:27:04,2018-04-04 14:27:04,1,4,166593.0,,581424.0,2018-04-04,1.0,...,,,,,,,,,,
600541,782391,2018-04-12 10:01:04,2018-04-12 10:01:04,1,4,1283307.0,,584176.0,2018-04-12,1.0,...,,,,,,,,,,
600546,782401,2018-04-12 10:15:02,2018-04-12 10:15:02,1,4,1283307.0,,584173.0,2018-04-12,1.0,...,,,,,,,,,,
602874,784864,2018-05-31 13:29:45,2018-05-31 13:30:35,2,4,589578.0,,397499.0,2018-05-31,1.0,...,,,,,,,,,,
605084,787251,2018-07-25 10:21:15,2018-07-25 10:22:57,2,4,,,,2018-07-25,1.0,...,,,,,,,,,,
606534,788831,2018-08-27 16:39:03,2018-08-28 14:45:30,2,4,24651.0,,591137.0,2018-08-27,1.0,...,,,,,,,,,,
608915,791369,2018-10-15 13:58:07,2018-10-15 14:07:39,3,4,,,,2018-10-15,1.0,...,,,,,,,,,,
611217,793775,2018-11-01 14:45:00,2018-11-01 14:45:00,1,4,598971.0,,597219.0,2018-11-01,1.0,...,,,,,,,,,,


It seems that collection object -> collecting event is 1-1. There are 17 records in collection object that are missing a collecting event ID, but that's it!

## Questions

- What is (alt) catalog number?
- What is count amount?
- Agent vs Cataloger? Appraisal? Container Owner? 
- What is reserved text?


In [None]:
collection_object_df['ReservedText'].unique()

array([None, 'Gaoligong Shan', 'PBI Miconieae', 'Madagascar',
       'Gary Li Masters Project', 'Brazil', 'Symplocos',
       'Costa Rica Melastomes', 'Symplocos-Antilles', 'Chiapas',
       'Camporupestre', 'Picturae Project'], dtype=object)

In [None]:
collection_object_df['ReservedText2'].unique()

array(['Chiapas', None, 'California Plants'], dtype=object)

# Collection Object Attachment Table

In [117]:
# Pull the whole collectionobjectattachment table into a DataFrame.
query = "SELECT * FROM collectionobjectattachment"
collection_object_attachment_df = pd.read_sql(sql=query, con=connection)

  collection_object_attachment_df = pd.read_sql(query, connection)


In [118]:
# Row count and column names as plain text (one per line), then a rich
# preview of the first rows as the cell output.
print(len(collection_object_attachment_df), collection_object_attachment_df.columns, sep='\n')

collection_object_attachment_df.head()



492474
Index(['CollectionObjectAttachmentID', 'TimestampCreated', 'TimestampModified',
       'Version', 'CollectionMemberID', 'Ordinal', 'Remarks',
       'CreatedByAgentID', 'CollectionObjectID', 'ModifiedByAgentID',
       'AttachmentID'],
      dtype='object')


Unnamed: 0,CollectionObjectAttachmentID,TimestampCreated,TimestampModified,Version,CollectionMemberID,Ordinal,Remarks,CreatedByAgentID,CollectionObjectID,ModifiedByAgentID,AttachmentID
0,94484,2022-06-02 18:16:53,2022-06-02 18:16:53,0,4,0,,95728,214585,,94485
1,94485,2022-06-02 18:17:01,2022-06-02 18:17:01,0,4,0,,95728,46696,,94486
2,94486,2022-06-02 18:17:09,2022-06-02 18:17:09,0,4,0,,95728,13549,,94487
3,94487,2022-06-02 18:17:18,2022-06-02 18:17:18,0,4,0,,95728,353869,,94488
4,94488,2022-06-02 18:17:26,2022-06-02 18:17:26,0,4,0,,95728,250304,,94489


# Attachment

In [119]:
# Pull the whole attachment table into a DataFrame.
query = "SELECT * FROM attachment"
attachment_df = pd.read_sql(sql=query, con=connection)

  attachment_df = pd.read_sql(query, connection)


In [120]:
attachment_df.head()

Unnamed: 0,AttachmentID,TimestampCreated,TimestampModified,Version,AttachmentLocation,CopyrightDate,CopyrightHolder,Credit,DateImaged,FileCreatedDate,...,VisibilitySetByID,IsPublic,CreatorID,CaptureDevice,LicenseLogoUrl,MetadataText,SubjectOrientation,Subtype,Type,AttachmentStorageConfig
0,94485,2022-06-02 18:16:53,2022-06-02 18:16:53,0,f1428550-8047-41aa-bcec-00031e08784c.jpg,,,,,2022-06-02,...,,b'\x01',,,,,,,,
1,94486,2022-06-02 18:17:01,2022-06-02 18:17:01,0,5d18edf1-5a48-4d2b-8676-53054b3d6e54.jpg,,,,,2022-06-02,...,,b'\x01',,,,,,,,
2,94487,2022-06-02 18:17:09,2022-06-02 18:17:09,0,09e2db83-419f-4430-8da0-ec8eeaed63f7.jpg,,,,,2022-06-02,...,,b'\x01',,,,,,,,
3,94488,2022-06-02 18:17:17,2022-06-02 18:17:17,0,bcf2aae6-1fd5-4d21-8b90-2a857953edf1.jpg,,,,,2022-06-02,...,,b'\x01',,,,,,,,
4,94489,2022-06-02 18:17:26,2022-06-02 18:17:26,0,fdfd57aa-6e0b-4145-8b41-2e12421c985b.jpg,,,,,2022-06-02,...,,b'\x01',,,,,,,,


In [121]:
# Number of rows in the attachment table (built-in len() rather than calling the dunder directly)
len(attachment_df)

492474

In [122]:
attachment_df.columns

Index(['AttachmentID', 'TimestampCreated', 'TimestampModified', 'Version',
       'AttachmentLocation', 'CopyrightDate', 'CopyrightHolder', 'Credit',
       'DateImaged', 'FileCreatedDate', 'License', 'MimeType', 'origFilename',
       'Remarks', 'title', 'TableID', 'ScopeID', 'ScopeType', 'GUID',
       'Visibility', 'AttachmentImageAttributeID', 'ModifiedByAgentID',
       'CreatedByAgentID', 'VisibilitySetByID', 'IsPublic', 'CreatorID',
       'CaptureDevice', 'LicenseLogoUrl', 'MetadataText', 'SubjectOrientation',
       'Subtype', 'Type', 'AttachmentStorageConfig'],
      dtype='object')

In [123]:
attachment_df[['AttachmentID', 'AttachmentLocation', 'Remarks']]

Unnamed: 0,AttachmentID,AttachmentLocation,Remarks
0,94485,f1428550-8047-41aa-bcec-00031e08784c.jpg,http://ibss-images.calacademy.org:80/static/bo...
1,94486,5d18edf1-5a48-4d2b-8676-53054b3d6e54.jpg,http://ibss-images.calacademy.org:80/static/bo...
2,94487,09e2db83-419f-4430-8da0-ec8eeaed63f7.jpg,http://ibss-images.calacademy.org:80/static/bo...
3,94488,bcf2aae6-1fd5-4d21-8b90-2a857953edf1.jpg,http://ibss-images.calacademy.org:80/static/bo...
4,94489,fdfd57aa-6e0b-4145-8b41-2e12421c985b.jpg,http://ibss-images.calacademy.org:80/static/bo...
...,...,...,...
492469,606579,02d99ad2-5c8a-4398-9a37-f6b81761e48f.JPG,
492470,606580,5a40aacb-44aa-4a2d-8f7d-72ef518d0c70.JPG,
492471,606581,3c4cde12-3ba9-41e3-af62-1afc51981e98.JPG,
492472,606582,1383cc38-9254-472a-8e1c-b52a976603d1.JPG,


# Locality

In [124]:
# Pull the whole locality table into a DataFrame.
query = "SELECT * FROM locality"
locality_df = pd.read_sql(sql=query, con=connection)

  locality_df = pd.read_sql(query, connection)


In [125]:
locality_df.head()

Unnamed: 0,LocalityID,TimestampCreated,TimestampModified,Version,Datum,ElevationAccuracy,ElevationMethod,GML,GUID,Lat1Text,...,Text5,VerbatimLatitude,VerbatimLongitude,PaleoContextID,YesNo1,YesNo2,YesNo3,YesNo4,YesNo5,UniqueIdentifier
0,1,2007-03-21 16:19:44,NaT,1,,,,,62e63c63-f8be-11e2-a0e5-60eb693e819a,"27°13'3.5""N",...,,,,,,,,,,
1,2,2007-03-21 16:19:24,NaT,1,,,,,62e64086-f8be-11e2-a0e5-60eb693e819a,,...,,,,,,,,,,
2,3,2007-03-21 16:19:27,NaT,1,,0.0,,,62e64286-f8be-11e2-a0e5-60eb693e819a,41.3044°N,...,,,,,,,,,,
3,5,2008-07-21 10:00:22,NaT,1,,0.0,,,62e6447b-f8be-11e2-a0e5-60eb693e819a,,...,,,,,,,,,,
4,6,2007-03-21 16:19:17,NaT,1,,,,,62e6469e-f8be-11e2-a0e5-60eb693e819a,,...,,,,,,,,,,


In [126]:
# Number of rows in the locality table (built-in len() rather than calling the dunder directly)
len(locality_df)

167954

In [127]:
locality_df[['LocalityID', 'MinElevation', 'MaxElevation', 'ElevationAccuracy', 'Latitude1', 'Longitude1', 'LocalityName', 'NamedPlace', 'GeographyID']]

Unnamed: 0,LocalityID,MinElevation,MaxElevation,ElevationAccuracy,Latitude1,Longitude1,LocalityName,NamedPlace,GeographyID
0,1,3840.0,,,27.217642,98.705223,"Yaduo Cun, NE of Yaping Yakou at the Myanmar b...",,33223.0
1,2,820.0,,,,,"El Zapotal, al SE de Tuxtla Gutierrez.",,28316.0
2,3,4700.0,4700.0,0.0,41.304400,-121.036800,Medow W of Gutzman's.,,17158.0
3,5,6700.0,6700.0,0.0,,,Near Tenejapa Center.,,28307.0
4,6,,,,,,Limón Province. Rainforest slopes of Cerro Sk...,Limón Province. Rainforest slopes of Cerro Sk...,27649.0
...,...,...,...,...,...,...,...,...,...
167949,168945,3100.0,3100.0,0.0,,,Cup Valley Cmpd. Anza Borrego Park,,22916.0
167950,168946,2600.0,,,27.727800,98.344002,"Taron Taru divide, Bucashwang valley.",,33272.0
167951,168947,2620.0,,,,,Along road to Adquem and Las Flores above Huis...,,33537.0
167952,168948,,,,,,Cultivated at the San Francisco Conservatory o...,,72.0


In [128]:
locality_df.Remarks.unique()[0:20]

array([None, 'bracket (Hsing Shan Hsien)', 'bracket',
       'bracket (Hangchow)', 'bracket (Taipei Hsien)',
       '[locality from another sheet this collection; locality on this sheet not easily discerned.]',
       'bracket (De-xin)', 'Mpo. specified as El Zapotal, not current',
       'bracket (Hsiushui)', 'bracket (I-Hing)',
       'Verbatim: UTM2N  #¡Núm!     UTM1E  #¡Núm! ', 'bracket (Taitung)',
       'bracket (Nanking)', 'bracket (Kaohsiung Hsien)',
       'breacket (Tsing-tien)', 'bracket (Taipei)'], dtype=object)

# Locality Detail Table

In [129]:
# # SQL query to fetch data
# query = "SELECT * FROM localitydetail"

# # Reading the data into a pandas DataFrame
# locality_detail_df = pd.read_sql(query, connection)

In [130]:
# locality_detail_df.head()

In [82]:
# locality_detail_df.__len__()

In [83]:
# locality_detail_df[['LocalityDetailID', 'RangeDesc', 'Section', 'Text1', 'Township', 'UtmEasting', 'LocalityID']]

# Geography

In [131]:
# Pull the whole geography table into a DataFrame.
query = "SELECT * FROM geography"
geography_df = pd.read_sql(sql=query, con=connection)

  geography_df = pd.read_sql(query, connection)


In [132]:
geography_df

Unnamed: 0,GeographyID,TimestampCreated,TimestampModified,Version,Abbrev,CentroidLat,CentroidLon,CommonName,FullName,GeographyCode,...,Remarks,Text1,Text2,TimestampVersion,ModifiedByAgentID,CreatedByAgentID,ParentID,GeographyTreeDefID,AcceptedID,GeographyTreeDefItemID
0,1,2011-09-21 13:52:33,2011-09-21 13:52:33,8,,,,,Earth,,...,,,,,,,,1,,1
1,2,2011-09-21 13:48:03,2011-09-21 13:48:03,1,AF,7.19,21.09,,Africa,,...,,,,,,1.0,1.0,1,,2
2,3,2011-09-21 13:48:03,2011-09-21 13:48:03,3,AS,29.84,89.30,,Asia,,...,,,,,,1.0,1.0,1,,2
3,4,2011-09-21 13:48:03,2011-09-21 13:48:03,3,EU,48.69,9.14,,Europe,,...,,,,,,1.0,1.0,1,,2
4,5,2011-09-21 13:48:03,2011-09-21 13:48:03,1,,46.07,-100.00,,North America,,...,,,,,,1.0,1.0,1,,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
31297,34401,2024-02-28 13:50:04,2024-02-28 13:50:04,1,,,,,"Bago City, Province of Negros Occidental, Phil...",,...,,,,,96761.0,96761.0,2891.0,1,,5
31298,34402,2024-02-28 13:55:29,2024-02-28 13:55:29,1,,,,,"Baguio, Province of Benguet, Philippines",,...,,,,,96761.0,96761.0,2855.0,1,,5
31299,34403,2024-02-28 15:07:56,2024-02-28 15:07:56,1,,,,,"Bais, Province of Negros Oriental, Philippines",,...,,,,,96761.0,96761.0,2892.0,1,,5
31300,34404,2024-03-01 10:26:39,2024-03-01 10:26:39,1,,,,,"Batangas City, Province of Batangas, Philippines",,...,,,,,96761.0,96761.0,2854.0,1,,5


In [133]:
geography_df.columns

Index(['GeographyID', 'TimestampCreated', 'TimestampModified', 'Version',
       'Abbrev', 'CentroidLat', 'CentroidLon', 'CommonName', 'FullName',
       'GeographyCode', 'GML', 'GUID', 'HighestChildNodeNumber', 'IsAccepted',
       'IsCurrent', 'Name', 'NodeNumber', 'Number1', 'Number2', 'RankID',
       'Remarks', 'Text1', 'Text2', 'TimestampVersion', 'ModifiedByAgentID',
       'CreatedByAgentID', 'ParentID', 'GeographyTreeDefID', 'AcceptedID',
       'GeographyTreeDefItemID'],
      dtype='object')

In [134]:
geography_df[['GeographyID', 'CentroidLat', 'CentroidLon', 'CommonName', 'FullName', 'Name']]

Unnamed: 0,GeographyID,CentroidLat,CentroidLon,CommonName,FullName,Name
0,1,,,,Earth,Earth
1,2,7.19,21.09,,Africa,Africa
2,3,29.84,89.30,,Asia,Asia
3,4,48.69,9.14,,Europe,Europe
4,5,46.07,-100.00,,North America,North America
...,...,...,...,...,...,...
31297,34401,,,,"Bago City, Province of Negros Occidental, Phil...",Bago City
31298,34402,,,,"Baguio, Province of Benguet, Philippines",Baguio
31299,34403,,,,"Bais, Province of Negros Oriental, Philippines",Bais
31300,34404,,,,"Batangas City, Province of Batangas, Philippines",Batangas City


# Geo Coord Detail Table

In [135]:
# Pull the whole geocoorddetail table into a DataFrame.
query = "SELECT * FROM geocoorddetail"
geo_coord_detail_df = pd.read_sql(sql=query, con=connection)

  geo_coord_detail_df = pd.read_sql(query, connection)


In [136]:
geo_coord_detail_df

Unnamed: 0,GeoCoordDetailID,TimestampCreated,TimestampModified,Version,GeoRefAccuracyUnits,GeoRefDetDate,GeoRefDetRef,GeoRefRemarks,GeoRefVerificationStatus,MaxUncertaintyEst,...,Number3,Number4,Number5,Text4,Text5,YesNo1,YesNo2,YesNo3,YesNo4,YesNo5
0,1,2010-01-05 11:30:15,NaT,0,,NaT,,Lat/long est.,,,...,,,,,,,,,,
1,2,2010-01-05 11:30:15,NaT,0,,NaT,,Lat/long est.,,,...,,,,,,,,,,
2,3,2010-01-05 11:30:15,NaT,0,,NaT,,Lat/long est.,,,...,,,,,,,,,,
3,4,2010-01-05 11:30:15,NaT,0,,NaT,,Lat/long est.,,,...,,,,,,,,,,
4,5,2010-01-05 11:30:15,NaT,0,,NaT,,Lat/long est.,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
27626,59135,2024-08-01 11:06:11,2024-08-01 11:06:11,0,,NaT,,,,2.77,...,,,,,,,,,,
27627,59136,2024-08-01 11:14:11,2024-08-01 11:14:11,0,,NaT,,,,7.73,...,,,,,,,,,,
27628,59137,2024-08-01 11:18:06,2024-08-01 11:18:06,0,,NaT,,,,7.73,...,,,,,,,,,,
27629,59138,2024-08-01 11:19:39,2024-08-01 11:19:39,0,,NaT,,,,7.73,...,,,,,,,,,,


In [137]:
geo_coord_detail_df[['GeoCoordDetailID', 'GeoRefRemarks', 'LocalityID']]

Unnamed: 0,GeoCoordDetailID,GeoRefRemarks,LocalityID
0,1,Lat/long est.,235352
1,2,Lat/long est.,235353
2,3,Lat/long est.,235560
3,4,Lat/long est.,238882
4,5,Lat/long est.,238883
...,...,...,...
27626,59135,,757822
27627,59136,,757825
27628,59137,,757826
27629,59138,,757827


The GeoCoordDetail table seems to be unrelated to our task!


## Questions
- What is Latitude1 vs Latitude2?
- Do we really care about lat/lon/elevation accuracy and original unit?


In [143]:
# Build one wide table starting from the collection objects and left-joining
# each related table in turn, so objects without a match keep NaN in the
# joined columns.
# NOTE(review): the left join on the attachment link table can yield more rows
# than collection_object_df when an object has several attachment rows -- confirm
# that this fan-out is intended.
full_df = (
    collection_object_df[['CollectionObjectID', 'Text1', 'CountAmt', 'CollectingEventID']]
    # Collection Object Attachment table (object -> attachment link)
    .merge(
        right=collection_object_attachment_df[
            ['CollectionObjectID', 'CollectionObjectAttachmentID', 'AttachmentID']
        ],
        on='CollectionObjectID',
        how='left',
    )
    # Attachment table; its 'Remarks' column is not pulled in (an earlier
    # variant of this join included it)
    .merge(
        right=attachment_df[['AttachmentID', 'AttachmentLocation']],
        on='AttachmentID',
        how='left',
    )
    # Collecting Event table
    .merge(
        right=collecting_event_df[
            ['CollectingEventID', 'StartDate', 'EndDate', 'Remarks', 'LocalityID']
        ],
        on='CollectingEventID',
        how='left',
    )
    # Locality table
    .merge(
        right=locality_df[
            [
                'LocalityID', 'MinElevation', 'MaxElevation', 'ElevationAccuracy',
                'Latitude1', 'Longitude1', 'LocalityName', 'NamedPlace', 'GeographyID',
            ]
        ],
        on='LocalityID',
        how='left',
    )
)




In [144]:
full_df

Unnamed: 0,CollectionObjectID,Text1,CountAmt,CollectingEventID,CollectionObjectAttachmentID,AttachmentID,AttachmentLocation,StartDate,EndDate,Remarks,LocalityID,MinElevation,MaxElevation,ElevationAccuracy,Latitude1,Longitude1,LocalityName,NamedPlace,GeographyID
0,1,Shrub 10 feet tall,1.0,126372.0,177213.0,177214.0,191c7bd6-83cd-4917-8196-c79e6414c4bf.jpg,1976-11-22,1976-11-22,Steep slopes and dry ravines,543494.0,,,,,,,,
1,4,Growing on Quercus trunk.,5.0,195645.0,,,,1988-05-23,,"Ridge with Pinus, Quercus and Ostrya, Prunus a...",195645.0,,,,,,,,
2,5,Tree ca. 4 m tall. Fruit purplish black.,8.0,66157.0,,,,2004-10-29,,Tsuga dumosa forest mixed with elements of sub...,66157.0,2500.0,,,27.717865,98.421631,Vicinity of Sandui campsite between Shigong Qi...,Vicinity of Sandui campsite between Shigong Qi...,33272.0
3,7,Tree 60 feet tall.,1.0,81225.0,,,,1982-01-14,1982-01-14,Montane Rain Forest.,81225.0,760.0,760.0,0.0,,,80 km SW of Palenque on road to Ocosingo along...,,28268.0
4,8,,1.0,44023.0,98924.0,98925.0,8560a4a4-af49-4981-af42-c692d9138617.jpg,1954-05-01,1954-05-01,Ridge,44023.0,,,,,,Above Sucker Creek,,16253.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
788480,988271,Annual,,823109.0,,,,2020-04-09,,Growing in talus below limestone rock outcrops,758830.0,,,,,,,,
788481,988272,"Annual to 1.2 m tall, multi-branched.",,823110.0,,,,2023-06-02,,Growing with Sarcobatus baileyi on the valley ...,758831.0,,,,,,,,
788482,988273,"Annual, petals yellow, sepals with yellow marg...",,823111.0,,,,2022-04-08,,"Heavy clay and rocky ""pebble plain"" soil, appa...",758832.0,,,,,,,,
788483,988274,petals yellow,,823112.0,,,,2023-08-22,,Growing at the edge of a small pond along the ...,758833.0,,,,,,,,


In [145]:
# First record whose CountAmt exceeds 1: the whole row, then its Remarks
# value printed on its own so the text is not abbreviated.
first_multi_count_row = full_df.loc[full_df['CountAmt'] > 1].iloc[0]
print(first_multi_count_row)

print(first_multi_count_row['Remarks'])

CollectionObjectID                                                              4
Text1                                                   Growing on Quercus trunk.
CountAmt                                                                      5.0
CollectingEventID                                                        195645.0
CollectionObjectAttachmentID                                                  NaN
AttachmentID                                                                  NaN
AttachmentLocation                                                            NaN
StartDate                                                              1988-05-23
EndDate                                                                      None
Remarks                         Ridge with Pinus, Quercus and Ostrya, Prunus a...
LocalityID                                                               195645.0
MinElevation                                                                  NaN
MaxElevation    

In [146]:
# Another sample from the CountAmt > 1 records (position 1000 within that subset)
multi_count_rows = full_df.loc[full_df['CountAmt'] > 1]
print(multi_count_rows.iloc[1000])

CollectionObjectID                                                         9673
Text1                           Perennial herb ca. 1 m tall. Young fruit green.
CountAmt                                                                    8.0
CollectingEventID                                                      223923.0
CollectionObjectAttachmentID                                           106599.0
AttachmentID                                                           106600.0
AttachmentLocation                     45a5e0be-23c1-464a-9662-6cc0c397ccf0.jpg
StartDate                                                            2002-10-06
EndDate                                                              2002-10-06
Remarks                                                                    None
LocalityID                                                             223923.0
MinElevation                                                                NaN
MaxElevation                            

## Trying with non-null lat/lon set first

In [153]:
# Keep only rows that have a latitude, and renumber the index from 0
full_df = full_df.dropna(subset=['Latitude1']).reset_index(drop=True)

In [172]:
# Keep only rows that have a start date, and renumber the index from 0
full_df = full_df.dropna(subset=['StartDate']).reset_index(drop=True)

In [179]:
import pandas as pd
from sklearn.cluster import DBSCAN
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
import numpy as np

# Tunable parameters for the three clustering passes
GEO_EPS = 0.04            # DBSCAN radius in standardized lat/lon units (not km or degrees)
GEO_MIN_SAMPLES = 10
TFIDF_MAX_FEATURES = 100
N_TEXT_CLUSTERS = 5       # Choose an appropriate number of clusters
DATE_EPS_DAYS = 10        # DBSCAN radius in days for the temporal pass
DATE_MIN_SAMPLES = 2
RANDOM_SEED = 42          # fixes the KMeans initialisation so EventIDs are reproducible

# Work on a copy so full_df itself is not modified by this cell
df = full_df.copy()

# Keep only rows usable by every step: both coordinates present and a StartDate
# that parses to a timestamp. Filtering once, up front, guarantees that each
# label array below has exactly one entry per row of df (previously the
# coordinates were dropna()'d separately, which breaks the label assignment as
# soon as a row has a latitude but no longitude) and that no NaT reaches
# toordinal(), which raises on NaT.
parsed_dates = pd.to_datetime(df['StartDate'], errors='coerce')
usable_rows = df[['Latitude1', 'Longitude1']].notna().all(axis=1) & parsed_dates.notna()
df = df[usable_rows].reset_index(drop=True)

# Step 1: Geographical clustering using DBSCAN
# NOTE(review): coordinates are standardized before clustering, so GEO_EPS is
# not a physical distance -- confirm this is the intended notion of "nearby".
coords = df[['Latitude1', 'Longitude1']]
scaler = StandardScaler()
coords_scaled = scaler.fit_transform(coords)  # Scale for DBSCAN

geo_clustering = DBSCAN(eps=GEO_EPS, min_samples=GEO_MIN_SAMPLES).fit(coords_scaled)
df['GeoCluster'] = geo_clustering.labels_

# Step 2: Text clustering using TF-IDF and KMeans
text_data = df['Text1'].fillna('')  # Handle missing text

tfidf_vectorizer = TfidfVectorizer(max_features=TFIDF_MAX_FEATURES)
text_vectors = tfidf_vectorizer.fit_transform(text_data)

kmeans = KMeans(n_clusters=N_TEXT_CLUSTERS, random_state=RANDOM_SEED)
text_clusters = kmeans.fit_predict(text_vectors)
df['TextCluster'] = text_clusters

# Step 3: Date-based clustering (neighbourhood radius of DATE_EPS_DAYS days)
# StartDate is converted to datetime; unparsable values were already removed above
df['Date'] = pd.to_datetime(df['StartDate'], errors='coerce')

# Convert dates to ordinal day numbers, shaped as a single-feature column for DBSCAN
date_ordinal = df['Date'].map(lambda x: x.toordinal()).values.reshape(-1, 1)

# DBSCAN for temporal clustering (adjust DATE_EPS_DAYS based on the range of days)
date_clustering = DBSCAN(eps=DATE_EPS_DAYS, min_samples=DATE_MIN_SAMPLES).fit(date_ordinal)
df['DateCluster'] = date_clustering.labels_

# Combine clusters and assign EventID: one integer code per distinct
# (geo, text, date) label triple.
# NOTE(review): DBSCAN marks noise points with the label -1, so every geo-noise
# row that shares the same text and date labels ends up in the same EventID --
# confirm whether noise rows should instead be treated as singletons.
df['CombinedCluster'] = (
    df['GeoCluster'].astype(str) + '-' +
    df['TextCluster'].astype(str) + '-' +
    df['DateCluster'].astype(str)
)
df['EventID'] = df['CombinedCluster'].astype('category').cat.codes

# Display the resulting dataframe with EventID
print(df[['CollectionObjectID', 'EventID']])

       CollectionObjectID  EventID
0                       5      301
1                       9      270
2                      11      319
3                      20      308
4                      24      674
...                   ...      ...
44800              924772     1942
44801              936416     1067
44802              941113      780
44803              941114      780
44804              956015      282

[44805 rows x 2 columns]


In [180]:
df['EventID'].value_counts()

EventID
247     1549
271     1335
280      962
282      701
316      612
        ... 
203        1
2340       1
97         1
228        1
1481       1
Name: count, Length: 2391, dtype: int64

In [181]:
df[df.EventID == 61]

Unnamed: 0,index,CollectionObjectID,Text1,CountAmt,CollectingEventID,CollectionObjectAttachmentID,AttachmentID,AttachmentLocation,StartDate,EndDate,...,Longitude1,LocalityName,NamedPlace,GeographyID,GeoCluster,TextCluster,Date,DateCluster,CombinedCluster,EventID
10098,49711,86390,Twining vine. Petals dark orange.,1.0,4694.0,,,,2005-10-09,,...,-49.615833,14 km NE of Jaguariaíva on BR-151 (the highway...,,27576.0,-1,0,2005-10-09,51,-1-0-51,61
40973,202284,352754,Florets pink-purple. Locally common.,1.0,119141.0,,,,2005-10-09,,...,-49.615833,14 km NE of Jaguariaíva on BR-151 (the highway...,,27576.0,-1,0,2005-10-09,51,-1-0-51,61


In [182]:
# Save the merged table to disk for later use.
# By this point full_df has been narrowed to rows with a non-null Latitude1 and
# StartDate by the filtering cells above, so the file holds that subset only.
# The DataFrame index is written as well (to_csv default).
full_df.to_csv('../data/full_df.csv')

In [None]:
# Closing the connection
connection.close()