In [1]:
import pandas as pd
import os
from pygbif import occurrences as occ

In [2]:
def read_dataset_list():
    '''
    Reads ../dataset-list.md and returns a pandas dataframe
    '''
    
    df = pd.read_table('../dataset-list.md', sep="|", header=0, skipinitialspace=True)

    # Drop the left-most and right-most null columns 
    
    df = df.dropna(axis=1, how='all')

    # Drop the header underline row
    
    df = df.iloc[1:]  

    # Strip whitespace from end of strings
    
    df = df.applymap(lambda x: x.strip() if isinstance(x, str) else x)

    # Strip whitespace from end of column headers
    
    df.columns = df.columns.str.strip()

    # Drop datasets with no title - we don't need to process these
    
    df = df.drop(df[df.title == 'no title'].index)
    df = df.reset_index()
    return df

# read_dataset_list()

In [3]:
# MAIN

df_uuids = read_dataset_list()
for i, r in df_uuids.iterrows():
    print(i)
    
    # garbage collection
    
    dwca_files = ['temp.zip', 'meta.xml','eml.xml','metadata.json','taxa.txt','occurrences.txt','multimedia.txt','description.txt',
                  'distribution.txt','media.txt','references.txt','vernaculars.txt']
    for dwca_file in dwca_files:
        command = f'rm --force {dwca_file}'
        result = os.system(command)
        assert result==0, f'{command} FAILED'

    # break loop for testing    
        
    if i==1000:
        break
    
    # download dwca
    
    command = f'wget http://tb.plazi.org/GgServer/dwca/{r.uuid}.zip -O temp.zip'
    result = os.system(command)
    assert result==0, f'{command} FAILED'
    
    # unzip dwca
    
    command = 'unzip temp.zip'
    result = os.system(command)
    assert result==0, f'{command} FAILED' 
    
    # create a dataframe containing all records from the taxa.txt files
    
    if i==0:     # read taxa.txt into df_taxa
        df_taxa = pd.read_csv('taxa.txt', sep='\t')        
    else:        # read taxa.txt and append to df_taxa
        df_taxa = pd.concat([df_taxa, pd.read_csv('taxa.txt', sep='\t')])
        
df_taxa = df_taxa.reindex()
for i, r in df_taxa.iterrows():
    treatment_id = r.taxonID[:32]
    print(treatment_id)
    
    # download treatment xml file
    
    command = f'wget https://tb.plazi.org/GgServer/xml/{treatment_id} -O {treatment_id}.xml'
    result = os.system(command)
    assert result==0, f'{command} FAILED'

print('FINISHED')

0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
03C90A6EFFA41642FEA5FBBCFD9C4EB0
03C90A6EFFA41642FEA5FBBCFD9C4EB0
5A4F8566F22D3F34FE948C98FBDCFBA4
5A4F8566F22D3F34FE918AA9FBC4F938
5A4F8566F22D3F34FE91813FFC96F59D
5A4F8566F22C3F36FD7288C6F94FF86D
03B5DF0EFF947F25FEBB0EF4FCB2FB8D
03B5DF0EFF947F25FEBB0AC7FB2FF790
03B5DF0EFF947F25FEBA08EEFBB6F964
03B5DF0EFF947F24FEB9056EFA37FC82
03E787F1B713A955FEFE90B4F92CF770
03E787F1B713A955FEF99BEEFCE6FB4F
03E787F1B713A955FEFC9CD9FD8FF94D
03E787F1B713A955FEFE9ED8FCE4F7E8
9D0E87CA9C68FFD2FCE15EAAF931F857
9D0E87CA9C68FFD2FCA05F96F937F601
9D0E87CA9C69FFD3FE1359CCF9C4F633
9D0E87CA9C6AFFD1FE865906FAB2FDEC
9D0E87CA9C6BFFD6FE2B5E6EFB5CFB1D
9D0E87CA9C6CFFD7FE835D2EF9ABF8BD
394C6E68FFAD3E7CE140FCCEFDFEF828
394C6E68FFAD3E7CE142F87EFBD0F634
394C6E68FFAC3E7DE1E8F79AF933F66C
394C6E68FFAC3E7DE1EEFCC5FD09F78B
394C6E68FFAC3E7DE1EEFCC5FD09F78B
394C6E68FFAC3E7DE1E2FE31F8C6FCCA
394C6E68FFAC3E7DE1E2FE31F8C6FCCA
39

In [2]:
import xml.etree.ElementTree as ET

# xmlfile = '03938792FFDBFFCAFEC2FBF880DCF876.xml'
xmlfile = 'AD79FFBAEA10FFDBFFFE8726FFBFFFFE'
tree = ET.parse(xmlfile)
root = tree.getroot()
print(root)
  
#     # create empty list for news items
#     newsitems = []
  
#     # iterate news items
#     for item in root.findall('./channel/item'):

FileNotFoundError: [Errno 2] No such file or directory: 'AD79FFBAEA10FFDBFFFE8726FFBFFFFE'

In [9]:
for treatment in root:
    print(treatment.attrib)

{'id': '49ED46F3A358F21E7A0A6E952146C160'}
{'id': '03938792FFDBFFCAFEC2FBF880DCF876', 'ID-DOI': 'http://doi.org/10.5281/zenodo.5211512', 'ID-GBIF-Taxon': '183925047', 'ID-Zenodo-Dep': '5211512', 'LSID': 'urn:lsid:plazi:treatment:03938792FFDBFFCAFEC2FBF880DCF876', 'httpUri': 'http://treatment.plazi.org/id/03938792FFDBFFCAFEC2FBF880DCF876', 'lastPageNumber': '149', 'pageId': '0', 'pageNumber': '149'}


In [8]:
[elem.tag for elem in root.iter()]

['document',
 '{http://www.loc.gov/mods/v3}mods',
 '{http://www.loc.gov/mods/v3}titleInfo',
 '{http://www.loc.gov/mods/v3}title',
 '{http://www.loc.gov/mods/v3}name',
 '{http://www.loc.gov/mods/v3}role',
 '{http://www.loc.gov/mods/v3}roleTerm',
 '{http://www.loc.gov/mods/v3}namePart',
 '{http://www.loc.gov/mods/v3}affiliation',
 '{http://www.loc.gov/mods/v3}typeOfResource',
 '{http://www.loc.gov/mods/v3}relatedItem',
 '{http://www.loc.gov/mods/v3}originInfo',
 '{http://www.loc.gov/mods/v3}dateIssued',
 '{http://www.loc.gov/mods/v3}dateOther',
 '{http://www.loc.gov/mods/v3}publisher',
 '{http://www.loc.gov/mods/v3}place',
 '{http://www.loc.gov/mods/v3}placeTerm',
 '{http://www.loc.gov/mods/v3}titleInfo',
 '{http://www.loc.gov/mods/v3}title',
 '{http://www.loc.gov/mods/v3}part',
 '{http://www.loc.gov/mods/v3}extent',
 '{http://www.loc.gov/mods/v3}start',
 '{http://www.loc.gov/mods/v3}end',
 '{http://www.loc.gov/mods/v3}classification',
 '{http://www.loc.gov/mods/v3}identifier',
 '{http:/

In [2]:
# command = f'grep '<materialsCitation ' *.xml'
# result = os.system(command)
# assert result==0, f'{command} FAILED'

import re

s = 'key|collectedFrom\n'
with open("materialsCitations.txt") as file:
    for line in file:
        regex = "ID-GBIF-Occurrence=\"(\d+)\""
        match = re.findall(regex, line)
#         print(match, len(match))
        if len(match) == 0:
            occid = ''
        else:
            occid = match[0]
        
        regex = "collectedFrom=\"(.*?)\""
        match = re.findall(regex, line, re.IGNORECASE)
#         print(match, len(match))
        if len(match) == 0:
            collectedFrom = ''
        else:
            collectedFrom = match[0]
            
        s += f'{occid}|{collectedFrom}\n'
        
with open('collectedFrom.csv', 'w') as file:
    file.write(s)        

In [2]:
df_collectedFrom = pd.read_csv('collectedFrom.csv', sep='|', dtype=str)

In [3]:
df_gloch = df_collectedFrom[df_collectedFrom.collectedFrom.str.contains('Glochidion', na=False)]
df_gloch.reset_index(inplace=True, drop=True)
df_gloch["key"] = pd.to_numeric(df_gloch["key"])
df_gloch.dtypes

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_gloch["key"] = pd.to_numeric(df_gloch["key"])


key               int64
collectedFrom    object
dtype: object

In [4]:
# Download occurrence records from GBIF

occlist = []
for i, r in df_gloch.iterrows():
    key = r['key']
    print(key)
    occdict = occ.get(key)
    occlist.append(occdict)  
    
# Save occurrence records in df_occ

df_occ = pd.DataFrame(occlist)
df_occ    
    
    
#     if i==0:
#         df_occ = pd.DataFrame.from_dict(occdict)
#     else:
#         pd.cocatenate([df_occ, pd.DataFrame.from_dict(occdict)])
#     print(occdict)
    
#     order = occdict.get('order')
#     family = occdict.get('family')
#     sciname = occdict.get('scientificName')
#     accepted_sciname = occdict.get('acceptScientificName')
   
#     print(key, occdict['order'], occdict['family'], occdict['scientificName'], 'accepted name = ', occdict['acceptedScientificName'])

3390911310
3914035340
3336062316
3331244304
3441224333
3994027426
3332978341
3332978379
3334698330
3404213302
3901040361
3901040342
3345364355
3990703308
3990703307
3990703303
3333874417
3994028386
3994028487
3336136333
3336136317
4002932301
4002932302
3994028342
3994028476
3336136313
3994028406


Unnamed: 0,key,datasetKey,publishingOrgKey,installationKey,publishingCountry,protocol,lastCrawled,lastParsed,crawlId,hostingOrganizationKey,...,taxonID,http://unknown.org/canonicalName,verbatimLabel,typeStatus,nomenclaturalStatus,namePublishedIn,http://unknown.org/combinationYear,http://unknown.org/basionymYear,http://unknown.org/basionymAuthors,sex
0,3390911310,356a98ac-1526-4045-ae7f-52d08e753dfb,7ce8aef0-9e92-11dc-8738-b8a03c50a862,7ce8aef1-9e92-11dc-8740-b8a03c50a999,CH,DWC_ARCHIVE,2022-12-07T13:04:39.940+00:00,2022-12-07T13:05:47.839+00:00,97,7ce8aef0-9e92-11dc-8738-b8a03c50a862,...,026F1569FFD3FFF7392198CDFB65FB9B.taxon,Halictus swezeyi,"Dandan, on Glochidion flowers, July 17, Swezey",,,,,,,
1,3914035340,8ddfacbb-c42b-4385-b70b-ea5bd759c377,7ce8aef0-9e92-11dc-8738-b8a03c50a862,7ce8aef1-9e92-11dc-8740-b8a03c50a999,CH,DWC_ARCHIVE,2022-12-25T13:51:27.492+00:00,2022-12-25T13:52:01.579+00:00,85,7ce8aef0-9e92-11dc-8738-b8a03c50a862,...,03938792FFDBFFCAFEC2FBF880DCF876.taxon,Dictyophora,"Nov. 14, on Glochidion, Swezey;",,,,,,,
2,3336062316,058b438a-ffe3-452b-a286-9267419b3014,7ce8aef0-9e92-11dc-8738-b8a03c50a862,7ce8aef1-9e92-11dc-8740-b8a03c50a999,CH,DWC_ARCHIVE,2022-12-10T23:38:29.353+00:00,2022-12-10T23:39:20.588+00:00,120,7ce8aef0-9e92-11dc-8738-b8a03c50a862,...,03B387B8E946FFAFFE5185D093B302AA.taxon,Nephropteryx,"Aug. 24, one swept from Glochidion marianum, S...",,,,,,,
3,3331244304,204ef7e8-5ba9-4591-8275-598a53611feb,7ce8aef0-9e92-11dc-8738-b8a03c50a862,7ce8aef1-9e92-11dc-8740-b8a03c50a999,CH,DWC_ARCHIVE,2022-12-07T13:04:40.641+00:00,2022-12-07T13:05:42.825+00:00,90,7ce8aef0-9e92-11dc-8738-b8a03c50a862,...,03DFFE1CBF21FFD3F6B0F87EF9DFF61D.taxon,Trioza guama,"Piti, Aug. 18, on Glochidion marianimi, Swezey...",HOLOTYPE,spec. nov.,"Cadwell, J. S. (1942): Psyllidae from Guam. In...",1942.0,,,
4,3441224333,16f6ef92-619b-417b-946c-22d9b1445e7d,7ce8aef0-9e92-11dc-8738-b8a03c50a862,7ce8aef1-9e92-11dc-8740-b8a03c50a999,CH,DWC_ARCHIVE,2022-12-07T12:54:51.876+00:00,2022-12-07T12:59:59.606+00:00,106,7ce8aef0-9e92-11dc-8738-b8a03c50a862,...,2C76878DFF9CFFF5BEF9FC1FFB97F9C5.taxon,Valanga excavata,"Piti, Oct. 12, on Ipomoea and Glochidion, Swezey",,,,,1861.0,Stal,
5,3994027426,5f279ac2-63c6-4bd8-be9e-0e3d82b73ab2,7ce8aef0-9e92-11dc-8738-b8a03c50a862,7ce8aef1-9e92-11dc-8740-b8a03c50a999,CH,DWC_ARCHIVE,2022-12-20T00:07:47.502+00:00,2022-12-20T00:08:56.474+00:00,99,7ce8aef0-9e92-11dc-8738-b8a03c50a862,...,376B87E5FFAB1B1CFED6FD28013A415A.taxon,Phytorus lineolatus,"Dandan, July 17, on Glochidion and Citrus, Swe...",,,,1884.0,,,
6,3332978341,62345736-dcf6-4c38-a870-36a90992dabb,7ce8aef0-9e92-11dc-8738-b8a03c50a862,7ce8aef1-9e92-11dc-8740-b8a03c50a999,CH,DWC_ARCHIVE,2022-11-15T20:16:06.902+00:00,2022-11-24T22:40:46.733+00:00,98,7ce8aef0-9e92-11dc-8738-b8a03c50a862,...,416187EDEB4EFFE7FE084AC28D80F56C.taxon,Tartessus swezeyi,"two m ~ es and one female, Piti, Aug. 18, on G...",,spec. nov.,"Metcalf, Z. P. (1946): Homoptera, Fulgoroidea ...",1946.0,,,FEMALE
7,3332978379,62345736-dcf6-4c38-a870-36a90992dabb,7ce8aef0-9e92-11dc-8738-b8a03c50a862,7ce8aef1-9e92-11dc-8740-b8a03c50a999,CH,DWC_ARCHIVE,2022-11-15T20:16:06.902+00:00,2022-11-24T22:40:45.052+00:00,98,7ce8aef0-9e92-11dc-8738-b8a03c50a862,...,416187EDEB4FFFE9FE974A368B17F4AA.taxon,Tartessus ochraceus,"Holotype female, Piti, Sept. 21, on Glochidion...",HOLOTYPE,spec. nov.,"Metcalf, Z. P. (1946): Homoptera, Fulgoroidea ...",1946.0,,,FEMALE
8,3334698330,7b960500-f4a3-4b8f-8b56-440fdc9431c9,7ce8aef0-9e92-11dc-8738-b8a03c50a862,7ce8aef1-9e92-11dc-8740-b8a03c50a999,CH,DWC_ARCHIVE,2022-11-30T09:51:59.711+00:00,2022-11-30T09:55:54.166+00:00,76,7ce8aef0-9e92-11dc-8738-b8a03c50a862,...,484B8797FFAAFFB6A528FE45A073FD19.taxon,Notogramma,"Piti, Aug. 18, one specimen, swept from Glochi...",,,,1867.0,1798.0,Fabricius,
9,3404213302,0f4ee0b0-7d0e-443c-b4c9-0b40ecd08854,7ce8aef0-9e92-11dc-8738-b8a03c50a862,7ce8aef1-9e92-11dc-8740-b8a03c50a999,CH,DWC_ARCHIVE,2022-12-07T13:04:40.650+00:00,2022-12-07T13:05:43.868+00:00,98,7ce8aef0-9e92-11dc-8738-b8a03c50a862,...,514087C2EA13FFDFFE478056FE5DF754.taxon,Notioxenus fulgidus,five paratypes from the same locality collecte...,,spec. nov.,"Zimmerman, Elwood C. (1942): Anthribidae Of Gu...",1942.0,,,


In [5]:
df_merged = df_gloch.merge(df_occ)
df_merged.sort_values(by=['order', 'family', 'genus'], inplace=True)
df_merged.reset_index(inplace=True, drop=True)
df_merged[['key','order','family','acceptedScientificName','scientificName', 'collectedFrom']]

Unnamed: 0,key,order,family,acceptedScientificName,scientificName,collectedFrom
0,3404213302,Coleoptera,Anthribidae,"Notioxenus fulgidus Zimmerman, 1942","Notioxenus fulgidus Zimmerman, 1942",Glochidion
1,3994027426,Coleoptera,Chrysomelidae,"Phytorus lineolatus Weise, 1884","Phytorus lineolatus Weise, 1884",on Glochidion and Citrus
2,3990703308,Coleoptera,Curculionidae,"Swezeyella muscosa Zimmerman, 1942","Swezeyella muscosa Zimmerman, 1942",from Glochidion
3,3990703307,Coleoptera,Curculionidae,"Swezeyella muscosa Zimmerman, 1942","Swezeyella muscosa Zimmerman, 1942",from Glochidion
4,3990703303,Coleoptera,Curculionidae,"Swezeyella muscosa Zimmerman, 1942","Swezeyella muscosa Zimmerman, 1942",from Glochidion
5,3333874417,Coleoptera,Curculionidae,"Trigonops inaequalis Zimmerman., 1942","Trigonops inaequalis Zimmerman., 1942",from Glochidion
6,3334698330,Diptera,Ulidiidae,"Notogramma Loew, 1867","Notogramma Loew, 1867",swept from Glochidion marianum
7,3332978341,Hemiptera,Cicadellidae,"Tartessus swezeyi Metcalf, 1946","Tartessus swezeyi Metcalf, 1946",on Glochidion sp.
8,3332978379,Hemiptera,Cicadellidae,"Tartessus ochraceus Metcalf, 1946","Tartessus ochraceus Metcalf, 1946",on Glochidion
9,3914035340,Hemiptera,Dictyopharidae,"Dictyophara Germar, 1833","Dictyophora Burmeister, 1835",on Glochidion


In [12]:
def make_clickable(key):
    # target _blank to open new window
    return f'<a target="_blank" href="https://gbif.org/occurrence/{key}">{key}</a>'

# df_merged.style({'key': make_clickable})

df_merged['link'] = df_merged.apply(lambda x: make_clickable(x['key']), axis=1)
df_merged[['key','link','order','family','acceptedScientificName','scientificName', 'collectedFrom']].style

Unnamed: 0,key,link,order,family,acceptedScientificName,scientificName,collectedFrom
0,3404213302,3404213302,Coleoptera,Anthribidae,"Notioxenus fulgidus Zimmerman, 1942","Notioxenus fulgidus Zimmerman, 1942",Glochidion
1,3994027426,3994027426,Coleoptera,Chrysomelidae,"Phytorus lineolatus Weise, 1884","Phytorus lineolatus Weise, 1884",on Glochidion and Citrus
2,3990703308,3990703308,Coleoptera,Curculionidae,"Swezeyella muscosa Zimmerman, 1942","Swezeyella muscosa Zimmerman, 1942",from Glochidion
3,3990703307,3990703307,Coleoptera,Curculionidae,"Swezeyella muscosa Zimmerman, 1942","Swezeyella muscosa Zimmerman, 1942",from Glochidion
4,3990703303,3990703303,Coleoptera,Curculionidae,"Swezeyella muscosa Zimmerman, 1942","Swezeyella muscosa Zimmerman, 1942",from Glochidion
5,3333874417,3333874417,Coleoptera,Curculionidae,"Trigonops inaequalis Zimmerman., 1942","Trigonops inaequalis Zimmerman., 1942",from Glochidion
6,3334698330,3334698330,Diptera,Ulidiidae,"Notogramma Loew, 1867","Notogramma Loew, 1867",swept from Glochidion marianum
7,3332978341,3332978341,Hemiptera,Cicadellidae,"Tartessus swezeyi Metcalf, 1946","Tartessus swezeyi Metcalf, 1946",on Glochidion sp.
8,3332978379,3332978379,Hemiptera,Cicadellidae,"Tartessus ochraceus Metcalf, 1946","Tartessus ochraceus Metcalf, 1946",on Glochidion
9,3914035340,3914035340,Hemiptera,Dictyopharidae,"Dictyophara Germar, 1833","Dictyophora Burmeister, 1835",on Glochidion


In [8]:
from pygbif import occurrences as occ

occ.get(key = 3331287303)

{'key': 3331287303,
 'datasetKey': '0f4ee0b0-7d0e-443c-b4c9-0b40ecd08854',
 'publishingOrgKey': '7ce8aef0-9e92-11dc-8738-b8a03c50a862',
 'installationKey': '7ce8aef1-9e92-11dc-8740-b8a03c50a999',
 'publishingCountry': 'CH',
 'protocol': 'DWC_ARCHIVE',
 'lastCrawled': '2022-12-07T13:04:40.650+00:00',
 'lastParsed': '2022-12-07T13:05:43.853+00:00',
 'crawlId': 98,
 'hostingOrganizationKey': '7ce8aef0-9e92-11dc-8738-b8a03c50a862',
 'extensions': {},
 'basisOfRecord': 'MATERIAL_CITATION',
 'individualCount': 8,
 'occurrenceStatus': 'PRESENT',
 'taxonKey': 10978288,
 'kingdomKey': 1,
 'phylumKey': 54,
 'classKey': 216,
 'orderKey': 1470,
 'familyKey': 4206,
 'genusKey': 1167566,
 'speciesKey': 10978288,
 'acceptedTaxonKey': 10978288,
 'scientificName': 'Notioxenus fulgidus Zimmerman, 1942',
 'acceptedScientificName': 'Notioxenus fulgidus Zimmerman, 1942',
 'kingdom': 'Animalia',
 'phylum': 'Arthropoda',
 'order': 'Coleoptera',
 'family': 'Anthribidae',
 'genus': 'Notioxenus',
 'species': 'N

In [9]:
from pygbif import species as sp
sp.name_lookup('Phytorus lineolatus')

{'offset': 0,
 'limit': 100,
 'endOfRecords': True,
 'count': 3,
 'results': [{'key': 11195226,
   'nameKey': 58521121,
   'datasetKey': 'd7dddbf4-2cf0-4f39-9b2a-bb099caae36c',
   'constituentKey': '5f279ac2-63c6-4bd8-be9e-0e3d82b73ab2',
   'parentKey': 4718924,
   'parent': 'Phytorus',
   'kingdom': 'Animalia',
   'phylum': 'Arthropoda',
   'order': 'Coleoptera',
   'family': 'Chrysomelidae',
   'genus': 'Phytorus',
   'species': 'Phytorus lineolatus',
   'kingdomKey': 1,
   'phylumKey': 54,
   'classKey': 216,
   'orderKey': 1470,
   'familyKey': 7780,
   'genusKey': 4718924,
   'speciesKey': 11195226,
   'scientificName': 'Phytorus lineolatus Weise, 1884',
   'canonicalName': 'Phytorus lineolatus',
   'authorship': 'Weise, 1884',
   'nameType': 'SCIENTIFIC',
   'taxonomicStatus': 'ACCEPTED',
   'rank': 'SPECIES',
   'origin': 'SOURCE',
   'numDescendants': 0,
   'numOccurrences': 0,
   'habitats': [],
   'nomenclaturalStatus': [],
   'threatStatuses': [],
   'descriptions': [{'descr