In [1]:
import os
from pathlib import Path

import geopandas as gpd
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

In [2]:
# Root directory of the iNaturalist open-data export that every read/write in
# this notebook is relative to. Set the INAT_ODS_DIR environment variable to
# point at a different copy of the export; when it is unset, the original
# location is used unchanged.
inat_ods_dir = Path(
    os.environ.get(
        "INAT_ODS_DIR",
        "/data-ssd/alex/datasets/inat-open-data/inaturalist-open-data-20240327/",
    )
)

In [3]:
# Load only the observation columns used downstream. The export is
# tab-separated despite its .csv extension.
all_obs = pd.read_csv(
    inat_ods_dir.joinpath("observations.csv"),
    usecols=[
        "observation_uuid", "observer_id",
        "latitude", "longitude",
        "taxon_id", "quality_grade", "observed_on",
    ],
    # read quality_grade as a categorical rather than as plain strings
    dtype={"quality_grade": "category"},
    sep="\t",
)

In [4]:
# Discard every observation that is missing a value in any of the loaded columns.
all_obs = all_obs.dropna(axis=0, how="any")

In [5]:
# Summarize column dtypes and memory footprint after the incomplete rows were dropped.
all_obs.info()

<class 'pandas.core.frame.DataFrame'>
Index: 131397505 entries, 0 to 133656233
Data columns (total 7 columns):
 #   Column            Dtype   
---  ------            -----   
 0   observation_uuid  object  
 1   observer_id       int64   
 2   latitude          float64 
 3   longitude         float64 
 4   taxon_id          float64 
 5   quality_grade     category
 6   observed_on       object  
dtypes: category(1), float64(3), int64(1), object(2)
memory usage: 7.0+ GB


# SAMPLE 1M COMPLETE OBSERVATIONS (no missing fields; not filtered by quality grade)

In [6]:
# Draw the 1M-row working sample. The seed is fixed so that the same rows are
# selected on every run; without it, none of the downstream outputs (or the
# exported CSV) can be reproduced.
obs = all_obs.sample(1_000_000, random_state=42)

In [7]:
# Confirm the sample's row count, null counts, dtypes and memory footprint.
obs.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1000000 entries, 110649014 to 113060401
Data columns (total 7 columns):
 #   Column            Non-Null Count    Dtype   
---  ------            --------------    -----   
 0   observation_uuid  1000000 non-null  object  
 1   observer_id       1000000 non-null  int64   
 2   latitude          1000000 non-null  float64 
 3   longitude         1000000 non-null  float64 
 4   taxon_id          1000000 non-null  float64 
 5   quality_grade     1000000 non-null  category
 6   observed_on       1000000 non-null  object  
dtypes: category(1), float64(3), int64(1), object(2)
memory usage: 54.4+ MB


In [8]:
# Finish cleaning on the 1M sample only: narrow both id columns to 32-bit
# integers (taxon_id was loaded as float64).
obs["taxon_id"] = obs["taxon_id"].astype(np.int32)
obs["observer_id"] = obs["observer_id"].astype(np.int32)

# observed_on is left as a string: Elasticsearch will presumably infer the
# date type on ingest (TODO confirm), so the conversion below stays disabled.
#obs.observed_on = pd.to_datetime(obs.observed_on)

# Join with the world GeoDataFrame to get continent and country from latitude and longitude

In [9]:
# Wrap the sample in a GeoDataFrame with one EPSG:4326 point per observation.
# Note the argument order: x is longitude, y is latitude.
obs_gdf = gpd.GeoDataFrame(
    obs,
    geometry=gpd.points_from_xy(x=obs["longitude"], y=obs["latitude"]),
    crs="EPSG:4326",
)

In [10]:
# Load the "naturalearth_lowres" dataset bundled with geopandas; later cells
# use its "continent" and "name" columns.
# NOTE(review): geopandas.datasets was deprecated in geopandas 0.14 and removed
# in 1.0 — on newer versions this needs a locally downloaded Natural Earth
# file instead; confirm against the installed version.
world = gpd.read_file(gpd.datasets.get_path('naturalearth_lowres'))

  world = gpd.read_file(gpd.datasets.get_path('naturalearth_lowres'))


In [11]:
# Attach the attributes of the polygon each observation point falls in.
# This is an inner join, so points that intersect no polygon are dropped.
obs_gdf = gpd.sjoin(obs_gdf, world, how="inner", predicate="intersects")

In [12]:
# Keep the identifying observation columns plus the "continent" and "name"
# columns contributed by the spatial join; latitude, longitude, the point
# geometry and every other joined column are dropped here.
obs_gdf = obs_gdf[[
    "observation_uuid",
    "observer_id",
    "taxon_id",
    "quality_grade",
    "observed_on",
    "continent",
    "name",
]]

In [13]:
# Rename the joined "name" column to "country_name" so it cannot be confused
# with the taxon name merged in later. Reassigning (instead of inplace=True)
# avoids mutating a frame that was derived from a column subset, which can
# raise SettingWithCopyWarning on some pandas versions.
obs_gdf = obs_gdf.rename(columns={"name": "country_name"})

In [14]:
# Number of joined observations per continent.
obs_gdf["continent"].value_counts()

continent
North America              540413
Europe                     201072
Asia                        56951
Oceania                     56093
South America               39181
Africa                      29490
Antarctica                     23
Seven seas (open ocean)         1
Name: count, dtype: int64

In [15]:
# Spot-check three random rows of the joined result (unseeded, so the rows differ between runs).
obs_gdf.sample(3)

Unnamed: 0,observation_uuid,observer_id,taxon_id,quality_grade,observed_on,continent,country_name
73869876,3e422ebd-7dc6-470b-9d20-141b47adb56d,2675493,48957,research,2022-05-17,North America,United States of America
26916547,2389e476-035d-4cff-80c9-2926fb2fa115,2882121,4993,research,2019-06-29,Asia,India
109357878,69c3e941-5a96-4a1e-a73e-48113628d03a,6961643,54704,needs_id,2023-06-25,Europe,Austria


# Merge in the rest of the iNaturalist open-data export to get taxonomy, photos, and observer login names

In [16]:
# Load the taxonomy table (tab-separated), restricted to the columns used below.
tax = pd.read_csv(
    inat_ods_dir.joinpath("taxa.csv"),
    usecols=["taxon_id", "ancestry", "name", "active"],
    sep="\t",
)

In [17]:
# Keep active taxa only, then drop any row with a missing value in the
# loaded columns.
tax = tax.loc[tax["active"] == True].dropna()

In [18]:
# Rename "name" to "taxon_name" and keep just the three columns that are
# merged onto the observations ("active" is no longer needed).
tax = (
    tax
    .rename(columns={"name": "taxon_name"})
    [["taxon_id", "taxon_name", "ancestry"]]
)

In [19]:
# Spot-check three random taxa (unseeded, so the rows differ between runs).
tax.sample(3)

Unnamed: 0,taxon_id,taxon_name,ancestry
524932,584825,Erica eustacei,48460/47126/211194/47125/47124/47181/133387/63...
680818,732043,Turdus plumbeus schistaceus,48460/1/2/355675/3/7251/15977/12705/12718
1131990,1170696,Amphioplus causatus,48460/1/47549/481959/48836/774983/774984/77498...


In [20]:
# Load observer ids and logins (tab-separated); "login" is renamed to
# "observer_login" as part of the same expression.
observers = pd.read_csv(
    inat_ods_dir.joinpath("observers.csv"),
    usecols=["observer_id", "login"],
    sep="\t",
).rename(columns={"login": "observer_login"})


In [21]:
# Spot-check three random observers (unseeded, so the rows differ between runs).
observers.sample(3)

Unnamed: 0,observer_id,observer_login
667504,6712072,christopherantoniou
290931,3329606,eaceelgrassteam
218708,2726218,jstout


In [22]:
# Load the photo table (tab-separated). photo_id is read directly as int32
# and extension as a categorical.
photos = pd.read_csv(
    inat_ods_dir.joinpath("photos.csv"),
    usecols=["photo_id", "observation_uuid", "extension"],
    dtype={"photo_id": np.int32, "extension": "category"},
    sep="\t",
)

In [23]:
# Inspect the dtypes and memory footprint of the photo table.
photos.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 233050418 entries, 0 to 233050417
Data columns (total 3 columns):
 #   Column            Dtype   
---  ------            -----   
 0   photo_id          int32   
 1   observation_uuid  object  
 2   extension         category
dtypes: category(1), int32(1), object(1)
memory usage: 2.8+ GB


In [24]:
# Spot-check three random photo rows (unseeded, so the rows differ between runs).
photos.sample(3)

Unnamed: 0,photo_id,observation_uuid,extension
106420910,169473886,57a10b4a-59a8-4377-b254-14480b179c9b,jpg
29504216,49880332,4db62dec-c278-41e6-b922-e4822c033c16,jpg
72587975,117306198,92c14a8d-f2eb-4581-bc1e-17ae196e4023,jpeg


In [25]:
# Inner join on the observation UUID: the result has one row per matching
# (observation, photo) pair, and observations with no photo row are dropped.
obs_with_photos = pd.merge(
    left=obs_gdf,
    right=photos,
    on="observation_uuid",
    how="inner",
)

In [26]:
# Inner join on taxon_id to add taxon_name and ancestry; rows whose taxon is
# absent from the filtered taxonomy table are dropped.
obs_photos_with_taxa = pd.merge(
    left=obs_with_photos,
    right=tax,
    on="taxon_id",
    how="inner",
)

In [27]:
# Inner join on observer_id to add each observer's login name.
obs_photos_taxa_with_observers = pd.merge(
    left=obs_photos_with_taxa,
    right=observers,
    on="observer_id",
    how="inner",
)

In [28]:
# Inspect the fully merged table: row count, null counts and dtypes.
obs_photos_taxa_with_observers.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1615620 entries, 0 to 1615619
Data columns (total 12 columns):
 #   Column            Non-Null Count    Dtype   
---  ------            --------------    -----   
 0   observation_uuid  1615620 non-null  object  
 1   observer_id       1615620 non-null  int32   
 2   taxon_id          1615620 non-null  int32   
 3   quality_grade     1615620 non-null  category
 4   observed_on       1615620 non-null  object  
 5   continent         1615620 non-null  object  
 6   country_name      1615620 non-null  object  
 7   photo_id          1615620 non-null  int32   
 8   extension         1615620 non-null  category
 9   taxon_name        1615620 non-null  object  
 10  ancestry          1615620 non-null  object  
 11  observer_login    1615620 non-null  object  
dtypes: category(2), int32(3), object(7)
memory usage: 107.9+ MB


In [29]:
# Total number of rows in the merged table.
obs_photos_taxa_with_observers.shape[0]

1615620

In [30]:
# Write the merged table next to the source export, without the index column.
# The table holds one row per (observation, photo) pair, so its row count is
# not the number of sampled observations.
obs_photos_taxa_with_observers.to_csv(
    inat_ods_dir.joinpath("complete_1M_obs_sample.csv"), index=False
)