# catalog1

Run using **jupyter lab** and the **Python 3 kernel**.

In [1]:
import glob
from dwca.read import DwCAReader
import pandas as pd
import os

In [2]:
INSECTS_OF_MICRONESIA_CSV = 'observations-203843.csv'
STUDENTS = 'albi345-2021.csv'
PRINT = True

In [3]:
def get_dwca_path(search_string):
    mylist = glob.glob(search_string)
    if len(mylist) == 0:
        raise Exception(f'No files found with search string = {search_string}')
    if len(mylist) > 1:
        raise Exception(f'Multiple files found with search string = {search_string}')
    return mylist[0]

# get_dwca_path('*.zip')    

In [4]:
def create_occurrence_dataframe(dwca_path):
    '''
    Note: low_memory=False is set to suppress a DtypeWarning. 
    This is considered to be bad practice. 
    See oelpeters.be/solved-dtypewarning-columns-have-mixed-types-specify-dtype-option-on-import-or-set-low-memory-in-pandas/    
    '''
    with DwCAReader(dwca_path) as dwca:
        df = dwca.pd_read('occurrence.txt', parse_dates=True, low_memory=False)
    return df

# dwca_path = get_dwca_path('*.zip')
# df = create_occurrence_dataframe(dwca_path)
# df

In [5]:
%%time

# MAIN

dwca_path = get_dwca_path('*.zip')
print(f'dwca path: {dwca_path}')
df = create_occurrence_dataframe(dwca_path)
df.dropna(how='all', axis=1, inplace=True)

df_iom = pd.read_csv(INSECTS_OF_MICRONESIA_CSV)

df_students = pd.read_csv(STUDENTS)

for i,r in df_students.iterrows():
#     print()
#     print(r['name'], r['inat_user_login'])
    
    # Create a table for the current student's specimens - data comes from iNaturalist (df_iom)

    df_student_specimens = df_iom.loc[df_iom['user_login']==r['inat_user_login']]
    field_list = ['id', 'taxon_order_name', 'taxon_family_name', 'taxon_genus_name', 'taxon_species_name', 'scientific_name']
    df_student_specimens = df_student_specimens[field_list]
    df_student_specimens.sort_values(by=field_list[1:], inplace=True)
    df_student_specimens.columns = ['id', 'order', 'family', 'genus', 'species', 'scientific_name']
#     print(df_student_specimens.shape[0])
#     print(df_student_specimens)

    s = f'<h1>{r["name"]}</h1>\n'
    s += f'<p>{df_student_specimens.shape[0]} specimens</p>\n'
    s += '<table>\n'
    
    # Create a table row for each specimen
    
    for i1, r1 in df_student_specimens.iterrows():        
#         print(r1['id'],r1['order'],r1['family'],r1['genus'],r1['species'],r1['scientific_name'])
        s += '<tr>\n'
        id = r1['id']
        link = f'<a href="https://www.inaturalist.org/observations/{id}">{id}</a>'
        s += f'<td>{link}</td>\n'
        s += f'<td>{r1["order"]}</td>\n'
        s += f'<td>{r1["family"]}</td>\n'
        s += f'<td>{r1["genus"]}</td>\n'
#         s += f'<td>{r1["species"]}</td>\n'
        s += f'<td>{r1["scientific_name"]}</td>\n'
        
        # Count the number of occurrences in GBIF (df) and add these to the table row
        
        df_sciname = df.loc[df['scientificName'].str.startswith(r1['scientific_name'])]
        df_sciname.fillna('', inplace=True)        
        total_occurrences = df_sciname.shape[0]
        inat_occurrences = df_sciname.loc[df_sciname["references"].str.contains("inaturalist")].shape[0]       
#         print(f'total occurrences: {total_occurrences}   iNaturalist occurrences: {inat_occurrences}')
        s += f'<td>{total_occurrences}</td>\n'
        s += f'<td>{inat_occurrences}</td>\n'
        s += f'</tr>\n'
    s += '</table>'
    
    # Save html in a file
    
    html_file = f'{r["name"]}.html'
    html_file = html_file.replace(' ', '_')
    with open(html_file, 'w') as f:
        f.write(s)
        
    # Create a PDF version as well
    
    pdf_file = html_file.replace(".html", ".pdf")
    commandline = f'pandoc {html_file} -o {pdf_file}'
    os.system(commandline)
    
    if PRINT:
        commandline = f'lp {pdf_file}'
        os.system(commandline)
    
print('FINISHED')

dwca path: 0077358-210914110416597.zip


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().fillna(


FINISHED
CPU times: user 8.99 s, sys: 55 ms, total: 9.04 s
Wall time: 15.5 s
