# uuid-urls.ipynb
This Jupyter notebook reads a list of unique identifiers (UUIDs), looks up the metadata for each corresponding dataset on GBIF, and builds a web page showing a table of that metadata for the datasets extracted from *Insects of Guam* I and II.

In [11]:
import pandas as pd
pd.options.mode.chained_assignment = None  # default='warn'

from pygbif import registry
from jinja2 import Template
from datetime import datetime
import arrow

In [2]:
def get_dataset_metadata(uuid):
    """
    Look up a dataset on GBIF by its uuid and return the fields shown in the table.

    The uuid is passed to ``registry.dataset_search``; the first hit's ``key``
    is then used to fetch the full dataset record with ``registry.datasets``.

    Parameters
    ----------
    uuid : str
        Identifier used as the GBIF dataset search term.

    Returns
    -------
    tuple
        ``(description, doi, title, record_count, dwca)``.
        ``doi`` and ``dwca`` are HTML anchors when GBIF supplies a value and the
        plain-text placeholders ``'no doi'`` / ``'no dwca url'`` otherwise, so
        the generated page never contains a link to a placeholder string.
        When the search returns no results every text field is a placeholder
        and ``record_count`` is 0.
    """
    from html import escape  # local import: only needed to quote href values

    print(f'Getting GBIF metadata for dataset with iuuid={uuid}')
    ds_search_dict = registry.dataset_search(uuid)
    results = ds_search_dict.get('results') or []
    if not results:
        return 'no description', 'no doi', 'no title', 0, 'no dwca url'
    if ds_search_dict.get('count', len(results)) > 1:
        print(f'WARNING: More than 1 dataset with uuid = {uuid}')

    # Only the first search hit is used, even when the warning above fires.
    first_hit = results[0]
    ds_dict = registry.datasets(uuid=first_hit['key'])

    description = ds_dict.get('description', 'no description')
    title = ds_dict.get('title', 'no title')

    doi = ds_dict.get('doi')
    if doi:
        doi = f'<a href="http://doi.org/{escape(str(doi))}">doi</a>'
    else:
        doi = 'no doi'

    # A dataset may have zero or several endpoints. Prefer the one GBIF tags
    # as a Darwin Core Archive; otherwise fall back to the first endpoint.
    endpoints = ds_dict.get('endpoints') or []
    fallback = endpoints[0] if endpoints else {}
    dwca_endpoint = next(
        (ep for ep in endpoints if ep.get('type') == 'DWC_ARCHIVE'),
        fallback,
    )
    dwca_url = dwca_endpoint.get('url')
    if dwca_url:
        dwca = f'<a href="{escape(str(dwca_url))}">dwca</a>'
    else:
        dwca = 'no dwca url'

    record_count = first_hit.get('recordCount', 0)
    return description, doi, title, record_count, dwca

#uuid = '2A01FF71FFF051065F5BAF5AFFFAFF96'
#uuid = 'FFACC92EFFE0FFEBFFB2FFC1FFFB1840'
#get_dataset_metadata(uuid)

In [3]:
# MAIN

# Read the tab-separated uuid list (the file has no header row). Columns 0 and
# 3 are not used; the two remaining columns are named 'uuid' and 'imf'.

df = pd.read_csv('data/guam_uuids.txt', delimiter='\t', header=None)
df = df.drop(columns=[0, 3])
df.columns = ['uuid', 'imf']

# Add metadata from GBIF: one lookup per row, collected into a single frame
# that shares df's index so the new columns line up row for row.

metadata_columns = ['description', 'doi', 'title', 'record_count', 'dwca']
metadata = pd.DataFrame(
    [get_dataset_metadata(uuid) for uuid in df['uuid']],
    columns=metadata_columns,
    index=df.index,
)
df[metadata_columns] = metadata

# Reorder columns, then sort by record_count, ascending

df = df[['title', 'description', 'record_count', 'uuid', 'imf', 'doi', 'dwca']]
df = df.sort_values(by=['record_count'])

# Create an HTML page containing the dataframe as a table

template = """
<!DOCTYPE html>
<html>
    <head>
        <meta charset="utf-8">
        <meta name="viewport" content="width=device-width, initial-scale=1">
        <link rel='stylesheet' href='https://cdnjs.cloudflare.com/ajax/libs/bulma/0.7.5/css/bulma.css'>
    </head>
    <body>
        <div class="container">
            <h1 class="title">Datasets extracted from Guam I and II</h1>
            <h2 class="subtitle">Generated by <a href="https://github.com/aubreymoore/data-mining-insects-of-guam/blob/main/uuid-urls.ipynb">
            uuid-urls.ipynb</a> at {{ timestamp }}</h2>
            {{ table }}
        </div>
    </body>
</html>"""

# escape=False keeps the <a> anchors in the 'doi' and 'dwca' columns as live
# links instead of showing their markup as text.
table = df.to_html(render_links=True, index=False, classes=['table', 'is-striped'], escape=False)
html_page = Template(template).render({'table': table, 'timestamp': arrow.now().format()})

# The page declares charset utf-8, so write it as UTF-8 explicitly rather than
# with the platform's default encoding.
with open('data/dataset-list.html', 'w', encoding='utf-8') as f:
    f.write(html_page)

print('FINISHED')

Getting GBIF metadata for dataset with iuuid=2A01FF71FFF051065F5BAF5AFFFAFF96
Getting GBIF metadata for dataset with iuuid=4125F147872BFF9DFFF5FFAC140DFF83
Getting GBIF metadata for dataset with iuuid=8A034648715AFFEADD11FF809F446C3F
Getting GBIF metadata for dataset with iuuid=9668142AFFAC355BFFD8FFF0FFC6FFA4
Getting GBIF metadata for dataset with iuuid=A676FD1EF22D3F34FF8F8907FFDAFC58
Getting GBIF metadata for dataset with iuuid=AD79FFBAEA10FFDBFFFE8726FFBFFFFE
Getting GBIF metadata for dataset with iuuid=B525F8594A7852476D53FFF00A6FFFD2
Getting GBIF metadata for dataset with iuuid=BE7D4354FFEEFF8BFFBDFFBC0B359010
Getting GBIF metadata for dataset with iuuid=C5751610FFAD3E7CE078FFB1FFCFFF82
Getting GBIF metadata for dataset with iuuid=CB52FF9DFFBE1B09FFDAFFB507064636
Getting GBIF metadata for dataset with iuuid=F27607317E645A2FB552FF8416577654
Getting GBIF metadata for dataset with iuuid=FE566D11FFD2FFF5383F9056FFE3FFEF
Getting GBIF metadata for dataset with iuuid=FF82F923FF8F8849FFA

In [16]:
# Write a markdown checklist with one row per dataset, each starting with the
# status 'not started'. assign() adds the status column to a new frame, so this
# does not write into a slice of df and does not depend on chained-assignment
# warnings being switched off.
df1 = df[['uuid', 'title']].assign(**{'last step completed': 'not started'})
df1[['uuid', 'last step completed', 'title']].to_markdown('dataset-list.md', index=False)