In [9]:
import pandas as pd
from pygbif import registry
from requests_cache import remove_expired_responses
import re

In [10]:
def get_dataset_metadata(uuid):
    """Fetch description, DOI link and title for a dataset from the GBIF registry.

    The registry is searched with ``uuid``; the ``key`` of the first hit is
    then used to fetch the full dataset record.

    Args:
        uuid: Identifier passed to ``registry.dataset_search`` as its first
            positional argument (presumably the free-text query -- confirm
            against the pygbif signature).

    Returns:
        A ``(description, doi, title)`` tuple of strings. ``doi`` is a
        ``http://doi.org/...`` URL when the record has a DOI. Any missing
        field, or a search with no hits, yields the placeholders
        ``'no description'``, ``'no doi'`` and ``'no title'``.
    """
    print(f'Getting GBIF metadata for dataset with iuuid={uuid}')
    ds_search_dict = registry.dataset_search(uuid)
    count = ds_search_dict['count']
    if count <= 0:
        return 'no description', 'no doi', 'no title'
    if count > 1:
        # Only the first hit is used; make the ambiguity visible.
        print(f'WARNING: More than 1 dataset with uuid = {uuid}')

    key = ds_search_dict['results'][0]['key']
    ds_dict = registry.datasets(uuid=key)

    description = ds_dict.get('description', 'no description')
    title = ds_dict.get('title', 'no title')
    # Build a link only when a DOI exists; previously a missing DOI produced
    # the broken URL 'http://doi.org/No doi'.
    raw_doi = ds_dict.get('doi')
    doi = f'http://doi.org/{raw_doi}' if raw_doi else 'no doi'
    return description, doi, title

#uuid = '2A01FF71FFF051065F5BAF5AFFFAFF96'
#uuid = 'FFACC92EFFE0FFEBFFB2FFC1FFFB1840'
#get_dataset_metadata(uuid)

In [11]:
# Load the tab-separated uuid list (no header row), discard columns 0 and 3,
# and name the two remaining columns.
df = pd.read_csv('data/guam_uuids.txt', sep='\t', header=None).drop(columns=[0, 3])
df.columns = ['uuid', 'imf']

# Look up each uuid in GBIF and spread the returned
# (description, doi, title) tuple over three new columns.
gbif_metadata = df.apply(lambda row: pd.Series(get_dataset_metadata(row['uuid'])), axis=1)
df[['description', 'doi', 'title']] = gbif_metadata

# Save the table as HTML with URLs rendered as links, then display it.
df.to_html('data/uuid_list.html', render_links=True)
df

Getting GBIF metadata for dataset with iuuid=2A01FF71FFF051065F5BAF5AFFFAFF96
Getting GBIF metadata for dataset with iuuid=4125F147872BFF9DFFF5FFAC140DFF83
Getting GBIF metadata for dataset with iuuid=8A034648715AFFEADD11FF809F446C3F
Getting GBIF metadata for dataset with iuuid=9668142AFFAC355BFFD8FFF0FFC6FFA4
Getting GBIF metadata for dataset with iuuid=A676FD1EF22D3F34FF8F8907FFDAFC58
Getting GBIF metadata for dataset with iuuid=AD79FFBAEA10FFDBFFFE8726FFBFFFFE
Getting GBIF metadata for dataset with iuuid=B525F8594A7852476D53FFF00A6FFFD2
Getting GBIF metadata for dataset with iuuid=BE7D4354FFEEFF8BFFBDFFBC0B359010
Getting GBIF metadata for dataset with iuuid=C5751610FFAD3E7CE078FFB1FFCFFF82
Getting GBIF metadata for dataset with iuuid=CB52FF9DFFBE1B09FFDAFFB507064636
Getting GBIF metadata for dataset with iuuid=F27607317E645A2FB552FF8416577654
Getting GBIF metadata for dataset with iuuid=FE566D11FFD2FFF5383F9056FFE3FFEF
Getting GBIF metadata for dataset with iuuid=FF82F923FF8F8849FFA

Unnamed: 0,uuid,imf,description,doi,title
0,2A01FF71FFF051065F5BAF5AFFFAFF96,InsectsOfGuamI.172.73-146.pdf.imf,This dataset contains the digitized treatments...,http://doi.org/10.5281/zenodo.5159964,Curculionidae of Guam
1,4125F147872BFF9DFFF5FFAC140DFF83,InsectsOfGuamI.172.199-200.pdf.imf,This dataset contains the digitized treatments...,http://doi.org/10.5281/zenodo.5173998,Culicidae of Guam
2,8A034648715AFFEADD11FF809F446C3F,InsectsOfGuamI.172.147-149.pdf.imf,This dataset contains the digitized treatments...,http://doi.org/10.5281/zenodo.5160072,Barkbeetles of Guam
3,9668142AFFAC355BFFD8FFF0FFC6FFA4,InsectsOfGuamI.172.25-30.pdf.imf,This dataset contains the digitized treatments...,http://doi.org/10.5281/zenodo.5159923,Neuropteroid Insects from Guam
4,A676FD1EF22D3F34FF8F8907FFDAFC58,InsectsOfGuamI.172.191-194.pdf.imf,This dataset contains the digitized treatments...,http://doi.org/10.5281/zenodo.5160456,Halictine Bees from Rota Island
5,AD79FFBAEA10FFDBFFFE8726FFBFFFFE,InsectsOfGuamI.172.65-72.pdf.imf,This dataset contains the digitized treatments...,http://doi.org/10.5281/zenodo.5159835,Anthribidae Of Guam
6,B525F8594A7852476D53FFF00A6FFFD2,InsectsOfGuamI.172.184-187.pdf.imf,This dataset contains the digitized treatments...,http://doi.org/10.5281/zenodo.5160297,Wasps of Guam
7,BE7D4354FFEEFF8BFFBDFFBC0B359010,InsectsOfGuamI.172.53-55.pdf.imf,This dataset contains the digitized treatments...,http://doi.org/10.5281/zenodo.5159555,Elaterid And Eucnemid Beetles Of Guam
8,C5751610FFAD3E7CE078FFB1FFCFFF82,InsectsOfGuamI.172.39-40.pdf.imf,This dataset contains the digitized treatments...,http://doi.org/10.5281/zenodo.5160080,Sphingidae Of Guam
9,CB52FF9DFFBE1B09FFDAFFB507064636,InsectsOfGuamI.172.150-171.pdf.imf,This dataset contains the digitized treatments...,http://doi.org/10.5281/zenodo.5167701,Miscellaneous Families of Guam Coleoptera


In [None]:
# Print the GBIF citation text and DOI for the rows of `df`, stopping once
# the index label reaches 20 to keep the number of registry requests small.
for i, r in df.iterrows():
    if i == 20:
        break
    ds_search_dict = registry.dataset_search(r.uuid)
    if ds_search_dict['count'] > 0:
        # Use the first search hit's key to fetch the full dataset record.
        key = ds_search_dict['results'][0]['key']
        ds_dict = registry.datasets(uuid=key)
        gbif_dataset_citation = ds_dict.get('citation', {}).get('text', 'No citation')
        print(gbif_dataset_citation)
        gbif_dataset_doi = ds_dict.get('doi', 'No doi')
        print(gbif_dataset_doi)
        print()
    else:
        # Report the uuid of the current row; the bare name `uuid` is a
        # notebook global that is either undefined or left over from another cell.
        print(f'No GBIF dataset with uuid = {r.uuid}')

In [None]:
# Scratch check: citation text of the dataset record currently held in
# `ds_dict` (assigned by another cell), or 'No citation' when it is missing.
ds_dict.get('citation', {}).get('text', 'No citation')

In [None]:
def get_description(uuid):
    """Return the description of the first GBIF search hit for ``uuid``.

    The boilerplate sentence that opens these descriptions is removed so
    that only the text following it remains.

    Args:
        uuid: Identifier passed to ``registry.dataset_search``.

    Returns:
        The trimmed description string, or
        ``'Cannot get description for uuid <uuid>'`` when the lookup fails,
        there are no hits, or the hit carries no textual description.
    """
    failure = f'Cannot get description for uuid {uuid}'
    boilerplate = 'This dataset contains the digitized treatments in Plazi based on the original book chapter '
    try:
        s = registry.dataset_search(uuid)['results'][0]['description']
    except Exception:
        # Best-effort lookup: any request or parsing error becomes the
        # placeholder, but KeyboardInterrupt/SystemExit still propagate.
        return failure
    if not isinstance(s, str):
        # A null/non-text description would otherwise crash on .replace below.
        return failure
    s = s.replace(boilerplate, '')
    #s = doi_to_link(s)
    return s

# Spot-check the helper on entry 10 of the uuid column of `df`.
get_description(df.uuid[10])

In [None]:
def doi_to_link(text):
    """Wrap every doi.org URL found in ``text`` in an HTML anchor.

    Args:
        text: String that may contain URLs of the form
            ``http://doi.org/10.<digits>/<suffix>`` (``https`` is accepted too).
            The suffix ends at whitespace, parentheses, angle brackets or a
            double quote.

    Returns:
        ``text`` with each such URL replaced by ``<a href="URL">URL</a>``.
        Text without a DOI URL is returned unchanged instead of raising.
    """
    # Raw string with escaped dots: the previous pattern relied on invalid
    # escape sequences and let '.' match any character.
    doi_re = re.compile(r'https?://doi\.org/10\.\d+/[^()\s<>"]+')
    return doi_re.sub(lambda m: f'<a href="{m.group(0)}">{m.group(0)}</a>', text)

In [None]:
def get_gbif_link(uuid):
    """Return an HTML anchor pointing at the GBIF dataset search for ``uuid``.

    The search URL is used both as the link target and as the link text.
    """
    url = f'https://www.gbif.org/dataset/search?q={uuid}'
    return f'<a href="{url}">{url}</a>'

#get_gbif_link(df.uuid[1])

In [None]:
# Rebuild the table from the uuid list: discard columns 0 and 3 and name
# the two remaining columns.
df = pd.read_csv('data/guam_uuids.txt', sep='\t', header=None).drop(columns=[0, 3])
df.columns = ['uuid', 'imf']

# One GBIF search anchor and one trimmed description per row.
df['gbif'] = df.apply(lambda row: get_gbif_link(row['uuid']), axis=1)
df['zenodo'] = df.apply(lambda row: get_description(row['uuid']), axis=1)

# escape=False writes the <a> tags in the cells as markup instead of escaping them.
df.to_html('data/uuid_list.html', escape=False)

In [None]:
def get_doi(text):
    """Extract the first doi.org URL from ``text``.

    Args:
        text: String that may contain a URL of the form
            ``http://doi.org/10.<digits>/<suffix>`` (``https`` is accepted too).
            The suffix ends at whitespace, parentheses, angle brackets or a
            double quote.

    Returns:
        The matched URL, or the placeholder ``'no DOI'`` when none is found.
    """
    # Raw string with escaped dots: the previous pattern relied on invalid
    # escape sequences and let '.' match any character.
    doi_re = re.compile(r'https?://doi\.org/10\.\d+/[^()\s<>"]+')
    m = doi_re.search(text)
    return m.group(0) if m else 'no DOI'

In [None]:
# Rebuild the table from the uuid list: discard columns 0 and 3 and name
# the two remaining columns.
df = pd.read_csv('data/guam_uuids.txt', sep='\t', header=None).drop(columns=[0, 3])
df.columns = ['uuid', 'imf']

# Plain GBIF search URL, trimmed description, and the DOI URL found in that description.
df['gbif'] = df.apply(lambda row: f'https://www.gbif.org/dataset/search?q={row.uuid}', axis=1)
df['zenodo'] = df.apply(lambda row: get_description(row['uuid']), axis=1)
df['doi'] = df.apply(lambda row: get_doi(row['zenodo']), axis=1)

# render_links=True makes to_html turn the URL cells into hyperlinks.
df.to_html('data/uuid_list1.html', render_links=True)

In [None]:
# Inspect the current table as a plain Python dict.
df.to_dict()

In [4]:
# Look up a single dataset in the GBIF registry and print its citation and DOI.
uuid = '2A01FF71FFF051065F5BAF5AFFFAFF96'
#uuid = 'FFACC92EFFE0FFEBFFB2FFC1FFFB1840'

ds_search_dict = registry.dataset_search(uuid)
if ds_search_dict['count'] > 0:
    # Use the first search hit's key to fetch the full dataset record.
    key = ds_search_dict['results'][0]['key']
    ds_dict = registry.datasets(uuid=key)
    # .get with fallbacks so a record lacking a citation or DOI prints a
    # placeholder instead of raising KeyError.
    gbif_dataset_citation = ds_dict.get('citation', {}).get('text', 'No citation')
    gbif_dataset_doi = ds_dict.get('doi', 'No doi')
    print(gbif_dataset_citation, gbif_dataset_doi)
else:
    print(f'No GBIF dataset with uuid = {uuid}')

Zimmerman E C, carolina (1942). Curculionidae of Guam. Plazi.org taxonomic treatments database. Checklist dataset https://doi.org/10.5281/zenodo.5159964 accessed via GBIF.org on 2021-10-10. 10.5281/zenodo.5159964


In [8]:
# Title of the dataset record currently held in `ds_dict`, or 'no title'
# when the record has no 'title' key.
ds_dict.get('title','no title')

'Curculionidae of Guam'

In [None]:
# Fetch one registry record directly, skipping the search step; the literal
# is presumably a GBIF dataset key -- TODO confirm.
registry.datasets(uuid='d0309e8b-3179-4162-946c-08cef1c82013')