In [None]:
# imports
import os
import numpy as np
import pandas as pd
import json
import seaborn as sns
import matplotlib.pyplot as plt
import ijson
import jsonlines
import requests
from PIL import Image
from io import BytesIO
import matplotlib.pyplot as plt
from PIL import Image
from io import BytesIO
from collections import Counter
import folium
from geopy.geocoders import Nominatim
from geopy.exc import GeocoderTimedOut
from geopy.extra.rate_limiter import RateLimiter

import datashader as ds
import datashader.transfer_functions as tf
from datashader.colors import Hot
import cartopy.crs as ccrs
import cartopy.feature as cfeature
import matplotlib.pyplot as plt
from datashader.utils import export_image
from datashader import transfer_functions as tf

### Data Loading

In [None]:
# Location of the raw digital-specimen export that is read by the loading cell.
# Set the DIGITAL_SPECIMEN_PATH environment variable to use a copy stored
# elsewhere; the original local path is kept as the fallback so existing
# setups keep working unchanged.
file_path = os.environ.get(
    "DIGITAL_SPECIMEN_PATH",
    "/Users/albertoscinetti/Desktop/Thesis/exploratory-data-analysis/digital-specimen.json",
)

In [None]:
# Only load a partition of the data where latitude and longitude coordinates are present.
#
# The data is loaded in batches to limit ingestion and avoid overloading memory:
# the first SKIP_MATCHES matching specimens (ingested by previous batches) are
# skipped and at most BATCH_LIMIT specimens are kept for this batch.
#
# NOTE: every specimen is counted exactly once, no matter how many of its
# events carry coordinates. Previously the counter advanced once per matching
# *event* while skipping but once per *specimen* while collecting, so the skip
# offset could drift from the number of specimens written by earlier batches.
# Batches exported with the old counting rule may need to be regenerated so
# that all batch boundaries line up.

SKIP_MATCHES = 1_000_000  # matching specimens to skip (previously ingested data)
BATCH_LIMIT = 500_000     # maximum number of specimens kept in this batch


def _event_has_coordinates(event):
    """Return True when the event's georeference holds both decimal latitude and longitude."""
    location = event.get("ods:hasLocation") or {}
    georeference = location.get("ods:hasGeoreference") or {}
    return "dwc:decimalLatitude" in georeference and "dwc:decimalLongitude" in georeference


filtered_data = []
counter = 0  # number of matching specimens seen so far (skipped + kept)

with jsonlines.open(file_path) as reader:
    for obj in reader:
        source = obj.get("_source") or {}
        events = source.get("ods:hasEvents") or []

        # A specimen matches as soon as any one of its events has coordinates.
        if any(_event_has_coordinates(event) for event in events):
            counter += 1
            if counter > SKIP_MATCHES:
                filtered_data.append(source)

        if len(filtered_data) >= BATCH_LIMIT:  # limit
            break

df = pd.json_normalize(filtered_data)

### Standardisation of the df (extract values from nested columns, filter for only relevant columns...)


In [None]:
# The helpers below extract the information related to the location of a
# specimen's first collecting event (the first entry of 'ods:hasEvents').
def extract_georeferences(row):
    """Return the georeference dict of the first event in ``row``.

    Parameters
    ----------
    row : list
        The list of event dicts (this function is mapped over the
        'ods:hasEvents' column).

    Returns
    -------
    dict or None
        The value stored under 'ods:hasLocation' -> 'ods:hasGeoreference' of
        the first event, or None when ``row`` is empty / not a list or when
        the first event lacks either key (previously this raised KeyError).
    """
    if not isinstance(row, (list, tuple)) or not row:
        return None
    location = row[0].get('ods:hasLocation') or {}
    return location.get('ods:hasGeoreference')

def extract_latitude(row):
    """Return 'dwc:decimalLatitude' from the first event's georeference.

    ``row`` is the list of event dicts. Returns None when the latitude is
    absent, and also (instead of raising KeyError/IndexError) when ``row`` is
    empty / not a list or the first event has no location or georeference.
    """
    if not isinstance(row, (list, tuple)) or not row:
        return None
    location = row[0].get('ods:hasLocation') or {}
    georeference = location.get('ods:hasGeoreference') or {}
    return georeference.get('dwc:decimalLatitude')

def extract_longitude(row):
    """Return 'dwc:decimalLongitude' from the first event's georeference.

    ``row`` is the list of event dicts. Returns None when the longitude is
    absent, and also (instead of raising KeyError/IndexError) when ``row`` is
    empty / not a list or the first event has no location or georeference.
    """
    if not isinstance(row, (list, tuple)) or not row:
        return None
    location = row[0].get('ods:hasLocation') or {}
    georeference = location.get('ods:hasGeoreference') or {}
    return georeference.get('dwc:decimalLongitude')

def extract_country(row):
    """Return 'dwc:country' from the first event's location.

    ``row`` is the list of event dicts. Returns None when the country is
    absent, and also (instead of raising KeyError/IndexError) when ``row`` is
    empty / not a list or the first event has no 'ods:hasLocation' entry.
    """
    if not isinstance(row, (list, tuple)) or not row:
        return None
    location = row[0].get('ods:hasLocation') or {}
    return location.get('dwc:country')
    
def extract_stateprovince(row):
    """Return 'dwc:stateProvince' from the first event's location.

    ``row`` is the list of event dicts. Returns None when the value is
    absent, and also (instead of raising KeyError/IndexError) when ``row`` is
    empty / not a list or the first event has no 'ods:hasLocation' entry.
    """
    if not isinstance(row, (list, tuple)) or not row:
        return None
    location = row[0].get('ods:hasLocation') or {}
    return location.get('dwc:stateProvince')

def extract_localty(row):
    """Return 'dwc:locality' from the first event's location.

    ``row`` is the list of event dicts. Returns None when the locality is
    absent, and also (instead of raising KeyError/IndexError) when ``row`` is
    empty / not a list or the first event has no 'ods:hasLocation' entry.
    (The function name keeps its original spelling because callers use it.)
    """
    if not isinstance(row, (list, tuple)) or not row:
        return None
    location = row[0].get('ods:hasLocation') or {}
    return location.get('dwc:locality')
    
def extract_island(row):
    """Return 'dwc:island' from the first event's location.

    ``row`` is the list of event dicts. Returns None when the island is
    absent, and also (instead of raising KeyError/IndexError) when ``row`` is
    empty / not a list or the first event has no 'ods:hasLocation' entry.
    """
    if not isinstance(row, (list, tuple)) or not row:
        return None
    location = row[0].get('ods:hasLocation') or {}
    return location.get('dwc:island')

def extract_collector_name(row):
    """Return the 'schema:name' of the first agent of the first event.

    ``row`` is the list of event dicts. Returns None when the first event has
    no 'ods:hasAgents' entry or the first agent has no 'schema:name'. An
    empty agent list (previously an IndexError) and an empty / non-list
    ``row`` also yield None.
    """
    if not isinstance(row, (list, tuple)) or not row:
        return None
    agents = row[0].get('ods:hasAgents')
    if not agents:
        return None
    return agents[0].get('schema:name')
    
def extract_collection_date(row):
    """Return the 'dwc:eventDate' of the first event in ``row``, or None if it has none."""
    first_event = row[0]
    return first_event['dwc:eventDate'] if 'dwc:eventDate' in first_event else None

# Flatten the nested 'ods:hasEvents' column: every entry below becomes a new
# column of df_georeference, filled by mapping the matching extractor over
# the events list of each row. The raw frame `df` is left untouched.
event_extractors = {
    'georeference': extract_georeferences,
    'latitude': extract_latitude,
    'longitude': extract_longitude,
    'country': extract_country,
    'stateProvince': extract_stateprovince,
    'localty': extract_localty,
    'island': extract_island,
    'collector': extract_collector_name,
    'collection_date': extract_collection_date,
}

df_georeference = df.copy()
for column_name, extractor in event_extractors.items():
    df_georeference[column_name] = df_georeference['ods:hasEvents'].map(extractor)

In [None]:
# Columns kept for the exported batch: identifier and specimen metadata first,
# followed by the fields extracted from the first collecting event.
final_columns = [
    '@id',
    'ods:sourceSystemName',
    'ods:livingOrPreserved',
    'ods:organisationName',
    'ods:topicOrigin',
    'ods:topicDomain',
    'ods:topicDiscipline',
    'ods:specimenName',
    'latitude',
    'longitude',
    'country',
    'stateProvince',
    'localty',
    'island',
    'collector',
    'collection_date',
]
df_final = df_georeference[final_columns]

In [None]:
# Display the selected columns to inspect the result of the extraction step.
df_final

In [None]:
# Write the current batch to disk (without the index column) so it can be
# combined with the other batch files later on.
df_final.to_csv(
    'df_final_3.csv',
    index=False,
)

In [None]:
# check for duplicates
# Reload the three exported batches so they can be compared and then put
# together into a single final df.
batch_files = ['df_final_1.csv', 'df_final_2.csv', 'df_final_3.csv']
df1, df2, df3 = (pd.read_csv(batch_file) for batch_file in batch_files)

In [None]:
# Duplicate check between batch 2 and batch 3: number of specimen ids present
# in BOTH batches. 0 means the two batches share no specimen. (The former set
# difference counted the ids of df2 that are missing from df3, which is large
# even when nothing is duplicated and therefore does not reveal duplicates.)
len(set(df2['@id']) & set(df3['@id']))

In [None]:
# Stack the three batches on top of each other with a fresh 0..n-1 index.
batches = [df1, df2, df3]
df_combined = pd.concat(batches, ignore_index=True)

# Show how many rows the combined DataFrame holds.
len(df_combined.index)

In [None]:
# Save the combined batches as one CSV file (index column omitted).
df_combined.to_csv(
    'df_finalissimo.csv',
    index=False,
)

In [None]:
# Export only the coordinate columns of the combined data to their own CSV.
coordinate_columns = ['latitude', 'longitude']
df_s2 = df_combined[coordinate_columns]
df_s2.to_csv('s2_df_test_lat_long.csv', index=False)

### Data points visualization

In [None]:

# Obtain a visualization of the data points on a world map to understand their distribution.

# Rows without coordinates cannot be placed on the map.
# NOTE: this rebinds `df`, which previously held the raw normalized batch.
df = df_combined.dropna(subset=['latitude', 'longitude'])

# Datashader: aggregate the points on a 1800x900 lon/lat grid covering the
# whole globe, colour the counts with the Hot colormap (histogram-equalized)
# and spread every pixel by 1 px so isolated points stay visible.
cvs = ds.Canvas(plot_width=1800, plot_height=900,
                x_range=(-180, 180), y_range=(-90, 90))
agg = cvs.points(df, 'longitude', 'latitude')
img = tf.shade(agg, cmap=Hot, how='eq_hist')
img = tf.spread(img, px=1)
img_pil = img.to_pil().convert("RGBA")

# Map plot. The figure itself gets a black background: the title is drawn in
# white, and on matplotlib's default white figure background it was invisible.
fig = plt.figure(figsize=(15, 8), facecolor='black')
ax = fig.add_subplot(1, 1, 1, projection=ccrs.PlateCarree())
ax.set_global()
ax.add_feature(cfeature.LAND.with_scale('110m'), facecolor='black')
ax.add_feature(cfeature.OCEAN.with_scale('110m'), facecolor='white')
ax.add_feature(cfeature.COASTLINE, edgecolor='white', linewidth=0.3)
ax.add_feature(cfeature.BORDERS, edgecolor='white', linewidth=0.2, linestyle=':')

# Heatmap overlay, stretched over the same lon/lat extent as the canvas.
ax.imshow(img_pil, origin='upper', extent=(-180, 180, -90, 90),
          transform=ccrs.PlateCarree(), alpha=0.8)

# Title and final layout.
ax.set_title("Global Data Point Density (Dark Map)", color='white')
ax.set_facecolor('black')
plt.tight_layout()
plt.show()



### Convert Data into a textual format (pseudo sentences) for Ingestion into the model 


In [None]:
# Load the specimen table used to build the pseudo sentences.
analysis_csv = 'specimens_with_adaptive_cells_v2.csv'
df_analysis = pd.read_csv(analysis_csv)

In [None]:
df_analysis  # display the loaded frame for a quick visual check

In [None]:
# OBTAIN A DF WITH ONLY FREE TEXT FORM AND THEN THE CELL TOKEN
def safe_fill(col, frame=None, placeholder="Unkn"):
    """Return column ``col`` with its missing values replaced by a placeholder.

    Parameters
    ----------
    col : str
        Name of the column to read.
    frame : pandas.DataFrame, optional
        Frame to read the column from. Defaults to the notebook-level
        ``df_analysis`` (looked up at call time), which keeps existing
        ``safe_fill("column")`` calls working unchanged.
    placeholder : str, optional
        Value substituted for missing entries; defaults to "Unkn".

    Returns
    -------
    pandas.Series
        A new Series; the source frame is not modified.
    """
    source = df_analysis if frame is None else frame
    return source[col].fillna(placeholder)



# Build the pseudo sentence of every specimen from its name and its place of
# collection; missing values are filled in by safe_fill.
# Variants tried in earlier experiments and currently disabled: collector and
# collection date ("Collected by ... on ... in ..."), the raw coordinates, and
# the topic discipline. This version uses "Collected in" without the collector.
specimen_part = "Specimen: " + safe_fill("ods:specimenName") + ". "
location_part = (
    "Collected in "
    + safe_fill("localty") + ", "
    + safe_fill("stateProvince") + ", "
    + safe_fill("country") + ". "
)
df_analysis['text_combined'] = specimen_part + location_part



In [None]:
# Number of rows per topic discipline.
discipline_counts = df_analysis['ods:topicDiscipline'].value_counts()
discipline_counts

In [None]:
# Keep only the pseudo sentence and the adaptive-cell columns
# (country / stateProvince / localty are deliberately not included).
model_columns = [
    'text_combined',
    'adaptive_cell_token',
    'adaptive_cell_id',
    'adaptive_cell_level',
]
df_analysis_final = df_analysis[model_columns]

In [None]:
# Keep only the rows whose adaptive cell token is present.
has_cell_token = df_analysis_final['adaptive_cell_token'].notna()
df_analysis_final = df_analysis_final[has_cell_token]

In [None]:
# here also saved with locality as of now
# index=False matches the other CSV exports in this notebook: without it the
# (non-contiguous, post-filtering) row index is written as an extra unnamed
# first column, which shows up as 'Unnamed: 0' when the file is read back.
df_analysis_final.to_csv('df_for_model_nocoord_nocollector.csv', index=False)