In [1]:
import pandas as pd
from s2sphere import CellId, Cell, LatLng
import plotly.graph_objects as go
import bisect

### Perform Adpative Cell Partitioning based on data points


In [None]:
# load this df, which contains all the points that have coorinates already 
df = pd.read_csv("s2_df_test_lat_long.csv")
points = [LatLng.from_degrees(lat, lon) for lat, lon in zip(df.latitude, df.longitude)]

# define function to group points by S2 cells
def group_points_by_cell(points, level=4):
    cell_to_points = {}
    for p in points:
        cell_id = CellId.from_lat_lng(p).parent(level)
        if cell_id not in cell_to_points:
            cell_to_points[cell_id] = []
        cell_to_points[cell_id].append(p)
    return cell_to_points

# define function to do adaptive disvision based on points dentisty in each cell 
def adaptive_subdivide(cell_id, points, min_points=10, max_points=50, max_level=10):
    if len(points) <= max_points or cell_id.level() >= max_level:
        return [cell_id]

    children = []
    child = cell_id.child_begin()
    while child != cell_id.child_end():
        child_points = [p for p in points if child.contains(CellId.from_lat_lng(p))]
        if len(child_points) >= min_points:
            # if wnough points in the child cell, subdivide further
            children.extend(adaptive_subdivide(child, child_points, min_points, max_points, max_level))
        elif len(child_points) > 5:
            children.append(child)
       
        child = child.next()

    # if no children qualified, return current cell to avoid dropping data
    return children if children else [cell_id]


# adaptive partitioning run here
cell_to_points = group_points_by_cell(points, level=4) # starting from level 4 as defined and it goes to level 10 max, which is defined below 
adaptive_cells = []

for cell_id, pts in cell_to_points.items():
    adaptive_cells.extend(adaptive_subdivide(cell_id, pts, min_points=10, max_points=50, max_level=10)) # here maximum points set to 50! It can be changed 

# results 
adaptive_cell_data = [{
    "cell_token": cell.to_token(),
    "level": cell.level(),
    "center_lat": LatLng.from_point(cell.to_point()).lat().degrees,
    "center_lng": LatLng.from_point(cell.to_point()).lng().degrees,
    "id": cell.id()
} for cell in adaptive_cells]

adaptive_df = pd.DataFrame(adaptive_cell_data)
adaptive_df.to_csv("adaptive_s2_cells.csv", index=False)
print(adaptive_df.head())


  cell_token  level  center_lat  center_lng                   id
0      46404      7   60.606203    9.649086  5062116349908615168
1      4640c      7   59.878398    9.366203  5062257087396970496
2     464101     10   60.147451   10.144370  5062328555652775936
3     464105     10   60.045479   10.252519  5062332953699287040
4     464113     10   59.931900   10.505315  5062348346862075904


In [None]:
# this is the cell token in which the earth is divede into based on the division that is applied  
adaptive_df

Unnamed: 0,cell_token,level,center_lat,center_lng,id
0,46404,7,60.606203,9.649086,5062116349908615168
1,4640c,7,59.878398,9.366203,5062257087396970496
2,464101,10,60.147451,10.144370,5062328555652775936
3,464105,10,60.045479,10.252519,5062332953699287040
4,464113,10,59.931900,10.505315,5062348346862075904
...,...,...,...,...,...
16866,8d3,4,5.751290,-47.449040,10173631558229950464
16867,973,4,-25.456329,-82.214860,10894207498609229824
16868,595,4,59.134547,143.504579,6435643867512438784
16869,20b,4,-39.604989,64.337170,2355382605114769408


In [None]:
adaptive_df['level'].value_counts()

level
10    7188
9     3081
8     2515
7     1823
6     1127
4      615
5      522
Name: count, dtype: int64

In [3]:
# reload in case previous part is already run
df = pd.read_csv("adaptive_s2_cells.csv")
df

Unnamed: 0,cell_token,level,center_lat,center_lng,id
0,46404,7,60.606203,9.649086,5062116349908615168
1,4640c,7,59.878398,9.366203,5062257087396970496
2,464101,10,60.147451,10.144370,5062328555652775936
3,464105,10,60.045479,10.252519,5062332953699287040
4,464113,10,59.931900,10.505315,5062348346862075904
...,...,...,...,...,...
16866,8d3,4,5.751290,-47.449040,10173631558229950464
16867,973,4,-25.456329,-82.214860,10894207498609229824
16868,595,4,59.134547,143.504579,6435643867512438784
16869,20b,4,-39.604989,64.337170,2355382605114769408


### Cell partitioning Plot and Visualization 

In [5]:
cell_ids = [CellId.from_token(token) for token in df["cell_token"]]

# plotting
lats, lons = [], []
hover_lats, hover_lons = [], []
hover_texts = []

for cell_id in cell_ids:
    cell = Cell(cell_id) # cell boarder and define polygons
    for i in range(4):
        vertex = LatLng.from_point(cell.get_vertex(i))
        lats.append(vertex.lat().degrees)
        lons.append(vertex.lng().degrees)
    vertex = LatLng.from_point(cell.get_vertex(0))
    lats.append(vertex.lat().degrees)
    lons.append(vertex.lng().degrees)

    # separator
    lats.append(None)
    lons.append(None)

    # center
    center = LatLng.from_point(cell_id.to_point())
    hover_lats.append(center.lat().degrees)
    hover_lons.append(center.lng().degrees)
    hover_texts.append(f"Token: {cell_id.to_token()}<br>Level: {cell_id.level()}")


# use ploty to create and obtain a figure 
fig = go.Figure()

# border and centers
fig.add_trace(go.Scattergeo(
    lat=lats,
    lon=lons,
    mode='lines',
    line=dict(color='blue', width=0.5),
    showlegend=False
))

fig.add_trace(go.Scattergeo(
    lat=hover_lats,
    lon=hover_lons,
    mode='markers',
    marker=dict(size=2, color='red'),
    text=hover_texts,
    hoverinfo='text',
    name='Cell Centers'
))

# view 
fig.update_geos(
    projection_type="orthographic",
    showland=True,
    showcountries=True,
    showcoastlines=True,
)

fig.update_layout(
    title="Adaptive S2 Cell Partitioning Based on Specimen Density",
    height=800,
    margin=dict(l=0, r=0, t=50, b=0)
)

fig.show()


In [33]:
# total Number of adaptive cells 
len(adaptive_cells)

16871

In [None]:
# HOW TO VISUALIZE THE SPECIMEN LOCATION GIVEN LATITUDE AND LONGITUDE

adaptive_df = pd.read_csv("adaptive_s2_cells.csv")

# specimen location 
specimen_lat = 48.8584
specimen_lng = 2.2945
latlng = LatLng.from_degrees(specimen_lat, specimen_lng)
specimen_cell_id = CellId.from_lat_lng(latlng)

# find the adaptive cell that contains the specimen location
matched_row = None
for _, row in adaptive_df.iterrows():
    adaptive_cell = CellId.from_token(row["cell_token"])
    if adaptive_cell.contains(specimen_cell_id):
        matched_row = row
        break

# borders and plot 
lats, lons = [], []
for token in adaptive_df["cell_token"]:
    cell = Cell(CellId.from_token(token))
    for i in range(4):
        v = LatLng.from_point(cell.get_vertex(i))
        lats.append(v.lat().degrees)
        lons.append(v.lng().degrees)
    v = LatLng.from_point(cell.get_vertex(0))
    lats.append(v.lat().degrees)
    lons.append(v.lng().degrees)
    lats.append(None)
    lons.append(None)

# plot
fig = go.Figure()

# uncomment to also show borders of alla daptive cells 
# # borders 
# fig.add_trace(go.Scattergeo(
#     lat=lats,
#     lon=lons,
#     mode='lines',
#     line=dict(color='blue', width=0.5),
#     name='Adaptive Cells',
#     showlegend=False
# ))

# gighlight specimen cell in red 
if matched_row is not None:
    highlight_cell = Cell(CellId.from_token(matched_row["cell_token"]))
    cell_lats, cell_lons = [], []
    for i in range(4):
        v = LatLng.from_point(highlight_cell.get_vertex(i))
        cell_lats.append(v.lat().degrees)
        cell_lons.append(v.lng().degrees)
    v = LatLng.from_point(highlight_cell.get_vertex(0))
    cell_lats.append(v.lat().degrees)
    cell_lons.append(v.lng().degrees)

    fig.add_trace(go.Scattergeo(
        lat=cell_lats,
        lon=cell_lons,
        mode='lines',
        line=dict(color='red', width=2),
        name='Specimen Cell'
    ))

    # specimen location 
    fig.add_trace(go.Scattergeo(
        lat=[specimen_lat],
        lon=[specimen_lng],
        mode='markers',
        marker=dict(size=6, color='orange'),
        name='Specimen Location'
    ))

# plot
fig.update_geos(
    projection_type="orthographic",
    showland=True,
    showcountries=True,
    showcoastlines=True,
)

fig.update_layout(
    title="Specimen Location within Adaptive S2 Cells",
    height=800,
    margin=dict(l=0, r=0, t=50, b=0)
)

fig.show()


### Processing to annotate each cell id to records given species coordinates


In [None]:
# annotate the adpative cell id to each speicmne 

specimens = pd.read_csv("df_finalissimo.csv")
adaptive = pd.read_csv("adaptive_s2_cells.csv")
 

# get adpative cell id 
adaptive["cell_id_obj"] = adaptive["cell_token"].apply(CellId.from_token)

# assing adaptive cell info 
def assign_adaptive_cell(row, adaptive_cells):
    latlng = LatLng.from_degrees(row["latitude"], row["longitude"])
    specimen_cell = CellId.from_lat_lng(latlng)
    
    for _, cell_row in adaptive_cells.iterrows():
        adaptive_cell = cell_row["cell_id_obj"]
        if adaptive_cell.contains(specimen_cell):
            return pd.Series({
                "adaptive_cell_token": cell_row["cell_token"],
                "adaptive_cell_id": cell_row["id"],
                "adaptive_cell_level": cell_row["level"]
            })
    return pd.Series({
        "adaptive_cell_token": None,
        "adaptive_cell_id": None,
        "adaptive_cell_level": None
    })

# annotate each row 
annotated = specimens.join(specimens.apply(assign_adaptive_cell, axis=1, adaptive_cells=adaptive))

# sa
annotated.to_csv("specimens_with_adaptive_cells_v2.csv", index=False)
print(annotated.head())


Columns (11,13) have mixed types. Specify dtype option on import or set low_memory=False.



                                   @id         ods:sourceSystemName  \
0  https://doi.org/SANDBOX/3L8-AS3-E1T  NHMD Ornithology Collection   
1  https://doi.org/SANDBOX/CDW-AZW-QCL  NHMD Ornithology Collection   
2  https://doi.org/SANDBOX/62W-XWV-R9Y  NHMD Ornithology Collection   
3  https://doi.org/SANDBOX/4HA-96B-7L9  NHMD Ornithology Collection   
4  https://doi.org/SANDBOX/PGN-C54-5P2  NHMD Ornithology Collection   

  ods:livingOrPreserved               ods:organisationName ods:topicOrigin  \
0             Preserved  Natural History Museum of Denmark         Natural   
1             Preserved  Natural History Museum of Denmark         Natural   
2             Preserved  Natural History Museum of Denmark         Natural   
3             Preserved  Natural History Museum of Denmark         Natural   
4             Preserved  Natural History Museum of Denmark         Natural   

  ods:topicDomain ods:topicDiscipline  \
0            Life             Zoology   
1            Life     

In [None]:
# above approachh was very slow, so the below approach was used 

specimens = pd.read_csv("df_finalissimo.csv")
adaptive = pd.read_csv("adaptive_s2_cells.csv")

# indexed cell bu range 
indexed_cells = []
for _, row in adaptive.iterrows():
    cell = CellId.from_token(row["cell_token"])
    indexed_cells.append({
        "range_min": cell.range_min().id(),
        "range_max": cell.range_max().id(),
        "token": row["cell_token"],
        "id": row["id"],
        "level": row["level"]
    })

indexed_cells.sort(key=lambda x: x["range_min"])
range_starts = [cell["range_min"] for cell in indexed_cells]

# binary search 
def find_adaptive_cell(cell_id):
    id_int = cell_id.id()
    idx = bisect.bisect_right(range_starts, id_int) - 1
    if idx >= 0 and indexed_cells[idx]["range_min"] <= id_int <= indexed_cells[idx]["range_max"]:
        return indexed_cells[idx]
    return None

# process rows with printing progress 
tokens, ids, levels = [], [], []

for i, row in specimens.iterrows():
    latlng = LatLng.from_degrees(row["latitude"], row["longitude"])
    specimen_cell = CellId.from_lat_lng(latlng)
    result = find_adaptive_cell(specimen_cell)

    if result:
        tokens.append(result["token"])
        ids.append(result["id"])
        levels.append(result["level"])
    else:
        tokens.append(None)
        ids.append(None)
        levels.append(None)

    if (i + 1) % 100000 == 0:
        print(f"Processed {i + 1} rows")

specimens["adaptive_cell_token"] = tokens
specimens["adaptive_cell_id"] = ids
specimens["adaptive_cell_level"] = levels

specimens.to_csv("specimens_with_adaptive_cells_v2.csv", index=False)
print("Annotation complete. File saved as 'specimens_with_adaptive_cells.csv'")



Columns (11,13) have mixed types. Specify dtype option on import or set low_memory=False.



Processed 100000 rows
Processed 200000 rows
Processed 300000 rows
Processed 400000 rows
Processed 500000 rows
Processed 600000 rows
Processed 700000 rows
Processed 800000 rows
Processed 900000 rows
Processed 1000000 rows
Processed 1100000 rows
Processed 1200000 rows
✅ Annotation complete. File saved as 'specimens_with_adaptive_cells.csv'


In [42]:
specimens

Unnamed: 0,@id,ods:sourceSystemName,ods:livingOrPreserved,ods:organisationName,ods:topicOrigin,ods:topicDomain,ods:topicDiscipline,ods:specimenName,latitude,longitude,country,stateProvince,localty,island,collector,collection_date,adaptive_cell_token,adaptive_cell_id,adaptive_cell_level
0,https://doi.org/SANDBOX/3L8-AS3-E1T,NHMD Ornithology Collection,Preserved,Natural History Museum of Denmark,Natural,Life,Zoology,"Columba palumbus palumbus Linnaeus, 1758",55.806300,12.379700,Denmark,Region Hovedstaden,"Fiskebæk, Sjælland",,"Pedersen, H.",1915-05-24,465245,5.067188e+18,10.0
1,https://doi.org/SANDBOX/CDW-AZW-QCL,NHMD Ornithology Collection,Preserved,Natural History Museum of Denmark,Natural,Life,Zoology,Pachycephala citreogaster citreogaster E. P. R...,-3.138611,151.926944,Papua New Guinea,,"Lemkamin, Lelet Plateau",,Noona Dan Expedition,1962-04-16,68bdcf,7.547416e+18,10.0
2,https://doi.org/SANDBOX/62W-XWV-R9Y,NHMD Ornithology Collection,Preserved,Natural History Museum of Denmark,Natural,Life,Zoology,"Strix aluco aluco Linnaeus, 1758",56.022180,12.359960,Denmark,Region Hovedstaden,Tumlingevang,,"Wedelsborg, W.",1880-05-07,46523f,5.067182e+18,10.0
3,https://doi.org/SANDBOX/4HA-96B-7L9,NHMD Ornithology Collection,Preserved,Natural History Museum of Denmark,Natural,Life,Zoology,"Aplonis cantoroides (G. R. Gray, 1862)",-11.630378,160.289336,Solomon Islands,,"Rennell Island, Lake Tegano",,"Wolff, Torben",1962-08-23,6ed0a5,7.985064e+18,10.0
4,https://doi.org/SANDBOX/PGN-C54-5P2,NHMD Ornithology Collection,Preserved,Natural History Museum of Denmark,Natural,Life,Zoology,"Strix aluco aluco Linnaeus, 1758",55.771020,12.542870,Denmark,Region Hovedstaden,Ermelund,,"Christensen, Roar",1908-03-27,46524d,5.067197e+18,10.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1271536,https://doi.org/SANDBOX/1Y6-C4V-M62,Naturalis Biodiversity Center (NL) - Mollusca,Preserved,Naturalis Biodiversity Center,Natural,Life,Zoology,"Conus circumcisus Born, 1778",-3.720000,128.200000,Indonesia,Maluku (Moluccas),,,,,2d6cc3,3.273205e+18,10.0
1271537,https://doi.org/SANDBOX/5EL-M7C-8NA,Naturalis Biodiversity Center (NL) - Mollusca,Preserved,Naturalis Biodiversity Center,Natural,Life,Zoology,"Parvamussium thetidis (Hedley, 1902)",-27.033300,153.466700,Australia,Queensland,"Cape Moreton, alive",,,0-0-1964,6b93b,7.751733e+18,8.0
1271538,https://doi.org/SANDBOX/11W-9W1-2ND,Naturalis Biodiversity Center (NL) - Mollusca,Preserved,Naturalis Biodiversity Center,Natural,Life,Zoology,"Conus sanguinolentus Quoy & Gaimard, 1834",-5.766667,134.216667,Indonesia,Maluku (Moluccas),"Kepulauan Aru, Wamar, Dobo",,,--,2d2d91,3.255418e+18,10.0
1271539,https://doi.org/SANDBOX/Q5H-JR2-P52,Naturalis Biodiversity Center (NL) - Mollusca,Preserved,Naturalis Biodiversity Center,Natural,Life,Zoology,"Volva volva (Linnaeus, 1758)",36.000000,138.000000,Japan,,Tosa,,,--,601cfd,6.925689e+18,10.0
