* Download Insects of Micronesia records since 2021-08-01.
* Get the list of unique taxon names and match each one against the GBIF backbone taxonomy.
* Get the Guam GBIF occurrence records for each matched taxon.

Run using the Python 3 kernel.

In [1]:
from pygbif import occurrences as occ
from pygbif import species
import pandas as pd
import pprint

In [2]:
%%time

def get_gbif_taxa(taxon_list):
    """
    Look up every name in taxon_list with species.name_backbone and return
    the responses as a dataframe: one row per name, in input order.
    """
    backbone_matches = [species.name_backbone(name=name) for name in taxon_list]
    return pd.DataFrame(backbone_matches)

# get_gbif_taxa(taxon_list)

CPU times: user 2 µs, sys: 1 µs, total: 3 µs
Wall time: 4.05 µs


In [3]:
# CSV file loaded by the main cell with pd.read_csv; its `scientific_name`
# column supplies the taxon names that are looked up in GBIF.
# NOTE(review): the filename looks like an iNaturalist observations export - confirm provenance.
INSECTS_OF_MICRONESIA_CSV = 'observations-201397.csv'

In [4]:
def get_inat_count(data):
    """
    Count the entries of data['results'] whose 'references' value contains
    the substring 'inaturalist'. Entries without a 'references' key are
    not counted.
    """
    return sum(
        'inaturalist' in record.get('references', '')
        for record in data['results']
    )

In [5]:
def get_gbif_count(df_gbif_taxa):
    """
    Returns a dataframe with one row per taxon in df_gbif_taxa summarising
    its Guam (country='GU') GBIF occurrence records.

    Columns:
        taxon                  - r.scientificName of the input row
        taxon_rank             - 'taxonRank' of the first record, '' if there are none
        total_gbif_records     - 'count' reported by the search
        gbif_records_from_inat - records whose 'references' mention iNaturalist
                                 (see get_inat_count), counted over ALL pages

    The occurrence search returns at most 300 records per request regardless
    of the requested limit, so the results are fetched page by page; counting
    only the first response would undercount taxa with more than 300 records.
    """
    page_size = 300  # largest page the GBIF occurrence search will return
    rows = []
    for _, r in df_gbif_taxa.iterrows():
        total = 0
        taxon_rank = ''
        inat_count = 0
        offset = 0
        while True:
            data = occ.search(country='GU', scientificName=r.scientificName,
                              limit=page_size, offset=offset)
            results = data.get('results', [])
            if offset == 0:
                # The total and the rank only need to be read once, from the first page.
                total = data['count']
                if results:
                    taxon_rank = results[0].get('taxonRank', '')
            inat_count += get_inat_count(data)
            offset += len(results)
            # Stop on an empty page too, so a misreported count cannot loop forever.
            if not results or data.get('endOfRecords', True) or offset >= total:
                break
        rows.append({
            'taxon': r.scientificName,
            'taxon_rank': taxon_rank,
            'total_gbif_records': total,
            'gbif_records_from_inat': inat_count})
    return pd.DataFrame(rows)

In [6]:
%%time

def get_gbif_gu_occ(df_gbif_taxa):
    """
    Returns a dataframe containing all Guam GBIF occurrence records for each taxon in df_gbif_taxa

    df_gbif_taxa must provide a 'scientificName' column; rows where it is
    missing are skipped, since there is no name to search for.

    The occurrence search returns at most 300 records per request regardless
    of the requested limit, so each taxon is fetched page by page until the
    service reports the end of the records.

    Nested (list/dict valued) fields are removed from every record so that
    each remaining field maps onto a plain dataframe column.
    """
    page_size = 300  # largest page the GBIF occurrence search will return
    nested_keys = ('extensions', 'facts', 'gadm', 'identifiedByIDs', 'identifiers',
                   'issues', 'media', 'recordedByIDs', 'relations')

    records = []
    for _, r in df_gbif_taxa.iterrows():
        name = r.get('scientificName')
        if pd.isna(name):
            continue
        offset = 0
        while True:
            data = occ.search(country='GU', scientificName=name,
                              limit=page_size, offset=offset)
            results = data.get('results', [])
            records.extend(results)
            offset += len(results)
            # Stop on an empty page too, so a misreported count cannot loop forever.
            if not results or data.get('endOfRecords', True) or offset >= data.get('count', 0):
                break

    # Remove child dicts; pop() tolerates records that lack some of the keys.
    for item in records:
        for key in nested_keys:
            item.pop(key, None)

    return pd.DataFrame(records)

# df_gbif_gu_occ = get_gbif_gu_occ(df_gbif_taxa[:10])
# df_gbif_gu_occ

CPU times: user 2 µs, sys: 1 µs, total: 3 µs
Wall time: 3.58 µs


In [8]:
%%time

# Main

# Number of taxa to process. A small value keeps test runs short;
# set to None to process every taxon in the CSV.
MAX_TAXA = 10

print('creating df_iom')
df_iom = pd.read_csv(INSECTS_OF_MICRONESIA_CSV)

# Unique taxon names in sorted order. Missing names are dropped first: pandas
# reads an empty cell as NaN (a float), which cannot be ordered against the
# str names, so sorting without dropna() would raise TypeError.
taxon_list = sorted(df_iom.scientific_name.dropna().unique())

if MAX_TAXA is not None:
    print(f'Selecting first {MAX_TAXA} rows of taxon_list for testing.')
    taxon_list = taxon_list[:MAX_TAXA]

print('creating df_gbif_taxa')
df_gbif_taxa = get_gbif_taxa(taxon_list)

print('creating df_gbif_gu_occ')
df_gbif_gu_occ = get_gbif_gu_occ(df_gbif_taxa)

print('FINISHED')

# df_gbif_count = get_gbif_count(df_gbif_taxa)
# df_gbif_count

creating df_iom
Selecting first 10 rows of taxon_list for testing.
creating df_gbif_taxa
creating df_gbif_gu_occ
FINISHED
CPU times: user 596 ms, sys: 35.1 ms, total: 631 ms
Wall time: 33 s
