In [1]:
from lxml import etree
import pandas as pd
from uuid import uuid4

In [2]:
# Limit number of rows in dataframe display
pd.options.display.max_rows = 10

In [3]:
doc = etree.parse('list.xml')

### Build Taxon table

In [4]:
def getTaxonRank(scientificName):
    word_count = len(scientificName.split(' '))
    if word_count == 1:
        return('genus')
    else:
        return('species')

In [5]:
taxon_list = []
for section in doc.iterfind('section'):
    scientificName = section.findtext('crop/sci')
    taxonRank = getTaxonRank(scientificName)
    taxon_list.append({'kingdom':'Plantae',
                       'scientificName': scientificName,
                       'taxonRank': taxonRank
                      })
    for pest in section.iterfind('pest'):
        scientificName = pest.findtext('sci')
        taxonRank = getTaxonRank(scientificName)
        taxon_list.append({'kingdom':'Animalia',
                           'scientificName': scientificName,
                           'taxonRank': taxonRank
                          })
df_taxon = pd.DataFrame(taxon_list)
df_taxon = df_taxon.drop_duplicates()

In [6]:
if not ('uuid' in df_taxon.columns):
    df_taxon['uuid'] = None
    for index, row in df_taxon.iterrows():
        row['uuid'] = uuid4().hex
else:
    print('uuid column already exists')

df_taxon = df_taxon[['uuid', 'scientificName', 'taxonRank', 'kingdom']]
df_taxon

Unnamed: 0,uuid,scientificName,taxonRank,kingdom
0,ea2d6c59bf1a424bb9b685dec70af400,Agave sisalana,species,Plantae
1,45a08215eaee43639815b155e2c34d6b,Aonidiella orientalis,species,Animalia
2,ebb8f69dba3f4ae0b489c0a6ea902e7c,Dysmicoccus neobrevipes,species,Animalia
3,289954c303b24b89854770770ce7ad9b,Persea americana,species,Plantae
4,5de35f74bc5a43c9a1b3aa8a6d87ece1,Trigonops,genus,Animalia
...,...,...,...,...
981,630d484ac704415eaccb4785896a7c4d,Anua coronata,species,Animalia
983,3b223a621dc9465ab19547a1df3422e9,Psophocarpus tetragonalubus,species,Plantae
984,d4bb59a4b9e144d2897ca0b0fe3c8c79,Tetranychus neocaledonicus,species,Animalia
986,320e647a76e341498d82ff67ade9cb13,Dioscorea,genus,Plantae


In [7]:
def get_ID(scientificName):
    ''' Returns the ID associated with a scintificName in the taxon table.'''
    x = df_taxon[df_taxon['scientificName'] == scientificName]['uuid']
    return(x.values[0])

get_ID('Badamia exclamationis')

'd9c464146fa644379b00e4d8e1f5d825'

### Build Distribution Table

Island records are encoded as follows:

List of insects and mites attacking crops in the Caroline (Belau-B, Yap-Y, Truk-T, Pohnpei-Po, Kosrae-K) and Mariana Islands (Guam-G, Saipan-S, Rota-R, and Commonwealth of Northern Marianas-C). New island records are indicated by n. Y indicates confirmation of a species previously reported as maybe present or needs verification. A ? indicates that the species may be there but its presence has not been verified with absolute certainty.

In [8]:
# This dictionary decodes island codes.

island_dict = {
    'M=?':[{'i':'Mariana Islands','n':False}],
    'M=C':[{'i':'Saipan','n':False},{'i':'Tinian','n':False},{'i':'Rota','n':False}],
    'M=G':[{'i':'Guam','n':False}],
    'M=RG':[{'i':'Guam','n':False},{'i':'Rota','n':False}],
    'M=S':[{'i':'Saipan','n':False}],
    'M=n':[{'i':'Saipan','n':True},{'i':'Tinian','n':True},{'i':'Rota','n':True},{'i':'Guam','n':True}],
    'M=x':[{'i':'Saipan','n':False},{'i':'Tinian','n':False},{'i':'Rota','n':False},{'i':'Guam','n':False}],
    'B=?':[{'i':'Republic of Palau','n':False}],
    'B=n':[{'i':'Republic of Palau','n':True}],
    'B=x':[{'i':'Republic of Palau','n':False}],
    'K=?':[{'i':'Kosrae State','n':False}],
    'K=n':[{'i':'Kosrae State','n':True}],
    'K=x':[{'i':'Kosrae State','n':False}], 
    'P=?':[{'i':'Pohnpei State','n':False}],
    'P=n':[{'i':'Pohnpei State','n':True}],
    'P=x':[{'i':'Pohnpei State','n':False}],
    'T=?':[{'i':'Chuuk State','n':False}],
    'T=Y':[{'i':'Chuuk State','n':False}],
    'T=n':[{'i':'Chuuk State','n':True}],
    'T=x':[{'i':'Chuuk State','n':False}],
    'Y=?':[{'i':'Yap State','n':False}],
    'Y=n':[{'i':'Yap State','n':True}],
    'Y=x':[{'i':'Yap State','n':False}],
} 

island_code = 'M=x'
mylist = island_dict[island_code]
for item in mylist: 
    print('{}: new_island_record={}'.format(item['i'], item['n']))

Saipan: new_island_record=False
Tinian: new_island_record=False
Rota: new_island_record=False
Guam: new_island_record=False


In [9]:
dist_list = []
for section in doc.iterfind('section'):
    for pest in section.iterfind('pest'):
        scientificName = pest.findtext('sci')
        island_codes = pest.findall('island')
        for island_code in island_codes:
            island_list = island_dict[island_code.text]
            for item in island_list:
                if item['n']:
                    new_island_record = 'new island record'
                else:
                    new_island_record = ''
                dist_list.append({'scientificName': scientificName, 
                                  'locality': item['i'],
                                  'occurrenceRemarks': new_island_record,
                                  'uuid': get_ID(scientificName)
                             })
df_dist = pd.DataFrame(dist_list)
df_dist = df_dist.drop_duplicates()
df_dist = df_dist[['uuid', 'locality', 'occurrenceRemarks']]
df_dist

Unnamed: 0,uuid,locality,occurrenceRemarks
0,45a08215eaee43639815b155e2c34d6b,Yap State,
1,45a08215eaee43639815b155e2c34d6b,Saipan,
2,45a08215eaee43639815b155e2c34d6b,Tinian,
3,45a08215eaee43639815b155e2c34d6b,Rota,
4,45a08215eaee43639815b155e2c34d6b,Guam,
...,...,...,...
4575,630d484ac704415eaccb4785896a7c4d,Guam,
4584,d4bb59a4b9e144d2897ca0b0fe3c8c79,Guam,
4634,d505a104a42c4fea879a628f1d6c68f1,Republic of Palau,
4635,d505a104a42c4fea879a628f1d6c68f1,Yap State,


### Build Vernacular table

In [10]:
vernacular_list = []
for section in doc.iterfind('section'):
    scientificName = section.findtext('crop/sci')
    common_names = section.findall('crop/com')
    for common_name in common_names:
        vernacular_list.append({'scientificName': scientificName, 
                                'vernacularName': common_name.text,
                                'uuid': get_ID(scientificName)
                               })    
        for pest in section.iterfind('pest'):
            scientificName = pest.findtext('sci')
            common_names = pest.findall('com')
            for common_name in common_names:
                vernacular_list.append({'scientificName': scientificName,
                                        'vernacularName': common_name.text,
                                        'uuid': get_ID(scientificName)
                                       })    
df_vernacular = pd.DataFrame(vernacular_list)
df_vernacular = df_vernacular.drop_duplicates()
df_vernacular = df_vernacular[['uuid', 'vernacularName']]
df_vernacular

Unnamed: 0,uuid,vernacularName
0,ea2d6c59bf1a424bb9b685dec70af400,Agave
1,45a08215eaee43639815b155e2c34d6b,oriental scale
2,ebb8f69dba3f4ae0b489c0a6ea902e7c,grey pineapple mealybug
3,289954c303b24b89854770770ce7ad9b,Avocado
4,5de35f74bc5a43c9a1b3aa8a6d87ece1,weevil
...,...,...
1122,630d484ac704415eaccb4785896a7c4d,moth
1124,3b223a621dc9465ab19547a1df3422e9,Wing bean
1125,d4bb59a4b9e144d2897ca0b0fe3c8c79,vegetable mite
1127,320e647a76e341498d82ff67ade9cb13,Yams


### Build ecological associates table

Note that this table has to be searched twice to discover all interactions for a taxon.

In [11]:
associates_list = []
for section in doc.iterfind('section'):
    scientificName1 = section.findtext('crop/sci')
    for pest in section.iterfind('pest'):
        scientificName2 = pest.findtext('sci')
        associates_list.append({'uuid': get_ID(scientificName1),
                                'relatedResourceID': get_ID(scientificName2),
                                'relationshipOfAssociate': 'host plant of',
                               })
        associates_list.append({'uuid': get_ID(scientificName2),
                                'relatedResourceID': get_ID(scientificName1),
                                'relationshipOfAssociate': 'herbivore of'
                               })

df_associates = pd.DataFrame(associates_list)
df_associates = df_associates.drop_duplicates()
df_associates = df_associates[['uuid', 'relationshipOfAssociate', 'relatedResourceID']]
df_associates

Unnamed: 0,uuid,relationshipOfAssociate,relatedResourceID
0,ea2d6c59bf1a424bb9b685dec70af400,host plant of,45a08215eaee43639815b155e2c34d6b
1,45a08215eaee43639815b155e2c34d6b,herbivore of,ea2d6c59bf1a424bb9b685dec70af400
2,ea2d6c59bf1a424bb9b685dec70af400,host plant of,ebb8f69dba3f4ae0b489c0a6ea902e7c
3,ebb8f69dba3f4ae0b489c0a6ea902e7c,herbivore of,ea2d6c59bf1a424bb9b685dec70af400
4,289954c303b24b89854770770ce7ad9b,host plant of,5de35f74bc5a43c9a1b3aa8a6d87ece1
...,...,...,...
1893,1b3a62f9f4434a13a702d96a7528d844,herbivore of,320e647a76e341498d82ff67ade9cb13
1894,320e647a76e341498d82ff67ade9cb13,host plant of,1a5da002959a477cba193377ce02bca3
1895,1a5da002959a477cba193377ce02bca3,herbivore of,320e647a76e341498d82ff67ade9cb13
1896,320e647a76e341498d82ff67ade9cb13,host plant of,b92ce6aca29448a7b910b5159851223a


### Save Data Frames as CSV Files

In [12]:
import os

if not os.path.exists('dwca'):
    os.makedirs('dwca')
df_taxon.to_csv('dwca/dwcaTaxon.csv', index=False)
df_dist.to_csv('dwca/dwcaDistribution.csv', index=False)
df_vernacular.to_csv('dwca/dwcaVernacular.csv', index=False)
df_associates.to_csv('dwca/dwcaResourceRelationship.csv', index=False)

### Create Darwin Core Archive
The archive, **dwca**, can be validated at http://tools.gbif.org/dwca-validator/

In [13]:
!zip -r dwca.zip dwca

updating: dwca/ (stored 0%)
updating: dwca/taxon.csv (deflated 54%)
updating: dwca/distribution.csv (deflated 80%)
updating: dwca/associates.csv (deflated 89%)
updating: dwca/vernacular.csv (deflated 46%)
updating: dwca/meta.xml (deflated 76%)
  adding: dwca/dwcaTaxon.csv (deflated 54%)
  adding: dwca/dwcaResourceRelationship.csv (deflated 89%)
  adding: dwca/dwcaVernacular.csv (deflated 46%)
  adding: dwca/dwcaDistribution.csv (deflated 80%)
