<a href="https://colab.research.google.com/github/aubreymoore/OccuTree/blob/main/OccuTree.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install python-dwca-reader
!pip install htmlmin

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [2]:
import pandas as pd
from collections import defaultdict
import jinja2
from dwca.read import DwCAReader
import xml.etree.ElementTree as ET
from zipfile import ZipFile
import htmlmin
import re
import os

In [3]:
URL = 'https://api.gbif.org/v1/occurrence/download/request/0221151-230224095556074.zip'
DWCA = os.path.basename(URL)
VERSION = '2023-05-07'

In [4]:
def build_dataframe(dwca):
    """ 
    Builds a dataframe from the core data file of a Darwin core archive.
    """
    with DwCAReader(DWCA) as dwca:
        print("Core data file is: {}".format(dwca.descriptor.core.file_location)) # => 'occurrence.txt'
        df = dwca.pd_read('occurrence.txt', parse_dates=True, low_memory=False)

    df.dropna(axis=1, how='all', inplace=True)
    # df.rename(columns={'tclass': 'class'})
    df.sort_values(by=['kingdom', 'phylum', 'class', 'order','family','scientificName'], ignore_index=True, inplace=True)
    df.reset_index(drop=True, inplace=True)
    return df

In [5]:
def link2gbif(sciname):
    try:
        taxid = df.loc[df['scientificName'] == sciname, 'taxonKey'].iloc[0]
        return f'<a href="https://www.gbif.org/species/{taxid}" target="_blank"> GBIF</a>'
    except:
        return ''

In [6]:
def build_taxonomy_dict(df):
    """
    Returns a defaultdict containing selected columns in dataframe.
    """

    def tree(): 
        return defaultdict(tree)

    taxonomy = tree()
    for i, r in df.iterrows():
        kingdom = r['kingdom']
        phylum = r['phylum']
        tclass = r['class']
        order = r['order']
        family= r['family']
        sciname = r['scientificName']
        taxonomy[kingdom][phylum][tclass][order][family][sciname]
    return taxonomy
# build_taxonomy_dict(df)

In [7]:
def build_treeview(taxonomy_dict, indent):
    """
    Returns html code for an interactive hierarchical tree view.
    Data come from a defaultdict.
    """
    html_string = ''
    html_string = html_string + '   '*indent + '<ul class ="nested"> \n'
    for k, v in taxonomy_dict.items():
        if isinstance(v, dict):
            if "name" in v.keys():
                html_string = html_string + '   '*indent + '<li><span class="caret">' + str(v["name"]) + '</span> \n '
            else:
                # html_string = html_string + '   '*indent + '<li><span class="caret">' + str(k) + '</span> \n '
                # html_string = html_string + '   '*indent + '<li><span class="caret">' + str(k) + '</span>  <a href="https://www.gbif.org/species/4995642" target="_blank">GBIF</a> \n '
                html_string = html_string + '   '*indent + '<li><span class="caret">' + str(k) + '</span>' + link2gbif(str(k)) + ' \n '
            html_string = html_string + build_treeview(v, indent+1) + '  '*indent + '</li> \n '
    html_string = html_string + '  '*indent + '</ul> \n '
    
    return html_string

In [8]:
def get_metadata():
    metadata = {}
    with ZipFile(DWCA) as myzip:
        with myzip.open('metadata.xml') as myfile:
            xml = myfile.read()

    tree = ET.ElementTree(ET.fromstring(xml))
    root = tree.getroot()

    metadata['abstract'] = root.find('./dataset/abstract/para').text
    metadata['title'] = root.find('./dataset/title').text
    metadata['pubdate'] = root.find('./dataset/pubDate').text.strip()
    metadata['altid'] = root.find('./dataset/alternateIdentifier').text.strip()
    metadata['url'] = f'https://www.gbif.org/occurrence/download/{metadata["altid"]}'
    return metadata

# print(get_metadata())

In [9]:
def get_gbif_query( metadata_abstract ):
    s = metadata['abstract']
    start = s.index( '{' )
    end = s.index( '}' )
    s = s[start:end+1]
    s = s.replace( '\n', '' )
    s = s.replace( '  ', '' ) 
    return s

# get_gbif_query( metadata['abstract'] )

In [10]:
def html_filename(metadata):
    """
    Returns a filename for the output file by parsing the GBIF query
    """
    regex = r"is (.*?)\""
    test_str = get_gbif_query( metadata['abstract'] )
    print(test_str)
    matches = re.finditer(regex, test_str)
    mylist = []
    for match in matches:
        mylist.append(match.groups(0)[0])
    s = '-'.join(mylist)
    s = s.replace( ' ', '' )
    return f'gbif-occurrences-{s}.html'

In [11]:
def calculate_taxoncounts():
    taxoncounts = {}
    taxoncounts['occurrences'] = f"{df.shape[0]:,d}"
    taxoncounts['kingdoms'] = f"{df['kingdom'].nunique():,d}"
    taxoncounts['phyla'] = f"{df['phylum'].nunique():,d}"
    taxoncounts['classes'] = f"{df['class'].nunique():,d}"
    taxoncounts['orders'] = f"{df['order'].nunique():,d}"
    taxoncounts['families'] = f"{df['family'].nunique():,d}"
    taxoncounts['genera'] = f"{df['genus'].nunique():,d}"
    taxoncounts['species'] = f"{df['species'].nunique():,d}"
    return taxoncounts

# calculate_taxoncounts()

In [12]:
# HTML TEMPLATE

template_string = """
<!DOCTYPE html>
<html>
<head>
<meta http-equiv="Content-Type" content="text/html; charset=utf-8">
<meta name="viewport" content="width=device-width, initial-scale=1">
<title>GBIF Occurrences</title>

<!--
<link rel="stylesheet" href="https://cdn.jsdelivr.net/npm/bulma@0.9.4/css/bulma.min.css">
-->

<style>

body {
    margin: 20px;
}

#mybox {
  background-color: #cfc ;
  padding: 10px ;
  padding-right: 10px
  border: 1px solid green ;
}

ul, #myUL {
  list-style-type: none;
}

#myUL {
  margin: 0;
  padding: 0;
}

.caret {
  cursor: pointer;
  -webkit-user-select: none; /* Safari 3.1+ */
  -moz-user-select: none; /* Firefox 2+ */
  -ms-user-select: none; /* IE 10+ */
  user-select: none;
}

.caret::before {
  content: "\u25B6";
  color: black;
  display: inline-block;
  margin-right: 6px;
}

.caret-down::before {
  -ms-transform: rotate(90deg); /* IE 9 */
  -webkit-transform: rotate(90deg); /* Safari */'
  transform: rotate(90deg);
}

.nested {
  display: none;
}

.active {
  display: block;
}
</style>
</head>
<body>

<section class="section">
<div class="container">

<h1>GBIF Occurrence Records</h1>

GBIF query: <b>{{ gbif_query }}</b>

<div id="mybox">
<pre>HTML generated by OccuTree v.{{ version }} by Aubrey Moore (<a href="mailto: aubreymoore@triton.uog.edu">aubreymoore@triton.uog.edu</a>)
Data source: <a href={{ url }}>GBIF.org ({{ pubdate }}) {{ title }}</a>
{{ occurrences }} occurrences | {{ kingdoms }} kingdoms | {{ phyla }} phyla | {{ classes }} classes | {{ orders }} orders | {{ families }} families | {{ genera }} genera | {{ species }} species</pre>
</div>

<ul id="myUL">
<li><span class="caret">Click here to open tree</span>

{{ treeview }}

</li>
</ul>

</div>
</section>

<script>
var toggler = document.getElementsByClassName("caret");
var i;

for (i = 0; i < toggler.length; i++) {
  toggler[i].addEventListener("click", function() {
    this.parentElement.querySelector(".nested").classList.toggle("active");
    this.classList.toggle("caret-down");
  });
}
</script>

</body>
</html>
"""

In [13]:
# MAIN

os.system(f'wget {URL}')
df = build_dataframe(DWCA)
taxonomy_dict = build_taxonomy_dict(df)
treeview = build_treeview(taxonomy_dict, 2)

metadata = get_metadata()
taxoncounts = calculate_taxoncounts()

environment = jinja2.Environment()
template = environment.from_string(template_string)
html = template.render(
    treeview = treeview, 
    title = metadata['title'], 
    pubdate = metadata['pubdate'], 
    url = metadata['url'],
    occurrences = taxoncounts['occurrences'],
    kingdoms = taxoncounts['kingdoms'],
    phyla = taxoncounts['phyla'],
    classes = taxoncounts['classes'],
    orders = taxoncounts['orders'],
    families = taxoncounts['families'],
    genera = taxoncounts['genera'],
    species = taxoncounts['species'],
    version = VERSION,
    gbif_query = get_gbif_query( metadata['abstract'] )
)
print(len(html))
html = htmlmin.minify(html)
print(len(html))
with open(html_filename(metadata), mode='w', encoding='utf-8') as f:
    f.write(html)    
print('FINISHED')
print('NOTE: THERE MAY BE A DELAY OF A FEW SECONDS BEFORE THE OUTPUT HTML FILE APPEARS ON COLAB')

Core data file is: occurrence.txt
479332
301605
{"and" : ["Country is Guam","TaxonKey is Insecta"]}
FINISHED
NOTE: THERE MAY BE A DELAY OF A FEW SECONDS BEFORE THE OUTPUT HTML FILE APPEARS ON COLAB


# Find the first occurrence record for each taxon

# Playpen