In [1]:
from lxml import etree
import pandas as pd
from uuid import uuid4

In [2]:
# Limit number of rows in dataframe display
pd.options.display.max_rows = 10

In [3]:
doc = etree.parse('list.xml')

### Build Taxon table

In [4]:
def getTaxonRank(scientificName):
    word_count = len(scientificName.split(' '))
    if word_count == 1:
        return('genus')
    else:
        return('species')

In [5]:
taxon_list = []
for section in doc.iterfind('section'):
    scientificName = section.findtext('crop/sci')
    taxonRank = getTaxonRank(scientificName)
    taxon_list.append({'kingdom':'Plantae',
                       'scientificName': scientificName,
                       'taxonRank': taxonRank
                      })
    for pest in section.iterfind('pest'):
        scientificName = pest.findtext('sci')
        taxonRank = getTaxonRank(scientificName)
        taxon_list.append({'kingdom':'Animalia',
                           'scientificName': scientificName,
                           'taxonRank': taxonRank
                          })
df_taxon = pd.DataFrame(taxon_list)
df_taxon = df_taxon.drop_duplicates()

In [6]:
'''
kingdom: kingdom - col 0
scientificName: scientificName - col 1
taxonRank: taxonRank - col 2
ID: ID col 3
'''

if not ('ID' in df_taxon.columns):
    df_taxon['ID'] = None
    for index, row in df_taxon.iterrows():
        row['ID'] = uuid4().hex
else:
    print('ID column already exists')

df_taxon = df_taxon[['ID', 'scientificName', 'taxonRank', 'kingdom']]
df_taxon

Unnamed: 0,ID,scientificName,taxonRank,kingdom
0,876f6d22b15d4396be2b760bf415c955,Agave sisalana,species,Plantae
1,17c201c8da1847929a10109237aab0f4,Aonidiella orientalis,species,Animalia
2,311cf11008e54e33a9259ed045b47b23,Dysmicoccus neobrevipes,species,Animalia
3,66bd6c0b2d4c4a5e98eed112a44bc291,Persea americana,species,Plantae
4,1e02c9c91085410092b1dcfa20c2be82,Trigonops,genus,Animalia
...,...,...,...,...
981,d306d95564c148a6ac36c0b50eb41b75,Anua coronata,species,Animalia
983,7e42fff30b2d4ab6bf2109478641ad14,Psophocarpus tetragonalubus,species,Plantae
984,338d8f3eb2404e4390c745a62c6cfbab,Tetranychus neocaledonicus,species,Animalia
986,019b64c3980c443c8bb814d94f6b1857,Dioscorea,genus,Plantae


In [7]:
def get_ID(scientificName):
    ''' Returns the ID associated with a scintificName in the taxon table.'''
    x = df_taxon[df_taxon['scientificName'] == scientificName]['ID']
    return(x.values[0])

get_ID('Badamia exclamationis')

'14c9c8febc92492eb371bfc56491e33b'

### Build Distribution Table

Island records are encoded as follows:

List of insects and mites attacking crops in the Caroline (Belau-B, Yap-Y, Truk-T, Pohnpei-Po, Kosrae-K) and Mariana Islands (Guam-G, Saipan-S, Rota-R, and Commonwealth of Northern Marianas-C). New island records are indicated by n. Y indicates confirmation of a species previously reported as maybe present or needs verification. A ? indicates that the species may be there but its presence has not been verified with absolute certainty.

In [8]:
# This dictionary decodes island codes.

island_dict = {
    'M=?':[{'i':'Mariana Islands','n':False}],
    'M=C':[{'i':'Saipan','n':False},{'i':'Tinian','n':False},{'i':'Rota','n':False}],
    'M=G':[{'i':'Guam','n':False}],
    'M=RG':[{'i':'Guam','n':False},{'i':'Rota','n':False}],
    'M=S':[{'i':'Saipan','n':False}],
    'M=n':[{'i':'Saipan','n':True},{'i':'Tinian','n':True},{'i':'Rota','n':True},{'i':'Guam','n':True}],
    'M=x':[{'i':'Saipan','n':False},{'i':'Tinian','n':False},{'i':'Rota','n':False},{'i':'Guam','n':False}],
    'B=?':[{'i':'Republic of Palau','n':False}],
    'B=n':[{'i':'Republic of Palau','n':True}],
    'B=x':[{'i':'Republic of Palau','n':False}],
    'K=?':[{'i':'Kosrae State','n':False}],
    'K=n':[{'i':'Kosrae State','n':True}],
    'K=x':[{'i':'Kosrae State','n':False}], 
    'P=?':[{'i':'Pohnpei State','n':False}],
    'P=n':[{'i':'Pohnpei State','n':True}],
    'P=x':[{'i':'Pohnpei State','n':False}],
    'T=?':[{'i':'Chuuk State','n':False}],
    'T=Y':[{'i':'Chuuk State','n':False}],
    'T=n':[{'i':'Chuuk State','n':True}],
    'T=x':[{'i':'Chuuk State','n':False}],
    'Y=?':[{'i':'Yap State','n':False}],
    'Y=n':[{'i':'Yap State','n':True}],
    'Y=x':[{'i':'Yap State','n':False}],
} 

island_code = 'M=x'
mylist = island_dict[island_code]
for item in mylist: 
    print('{}: new_island_record={}'.format(item['i'], item['n']))

Saipan: new_island_record=False
Tinian: new_island_record=False
Rota: new_island_record=False
Guam: new_island_record=False


In [9]:
'''
ID: Core ID - col 0
island: locality - col 1
new_island_record: occurrenceRemarks - col 2
scientificName: skip - col 3
'''

dist_list = []
for section in doc.iterfind('section'):
    for pest in section.iterfind('pest'):
        scientificName = pest.findtext('sci')
        island_codes = pest.findall('island')
        for island_code in island_codes:
            island_list = island_dict[island_code.text]
            for item in island_list:
                if item['n']:
                    new_island_record = 'new island record'
                else:
                    new_island_record = ''
                dist_list.append({'scientificName': scientificName, 
                                  'island': item['i'],
                                  'new_island_record': new_island_record,
                                  'ID': get_ID(scientificName)
                             })
df_dist = pd.DataFrame(dist_list)
df_dist = df_dist.drop_duplicates()
df_dist = df_dist[['ID', 'island', 'new_island_record']]
df_dist

Unnamed: 0,ID,island,new_island_record
0,17c201c8da1847929a10109237aab0f4,Yap State,
1,17c201c8da1847929a10109237aab0f4,Saipan,
2,17c201c8da1847929a10109237aab0f4,Tinian,
3,17c201c8da1847929a10109237aab0f4,Rota,
4,17c201c8da1847929a10109237aab0f4,Guam,
...,...,...,...
4575,d306d95564c148a6ac36c0b50eb41b75,Guam,
4584,338d8f3eb2404e4390c745a62c6cfbab,Guam,
4634,22cd5092082b4ba99c8fc7baf1edc54c,Republic of Palau,
4635,22cd5092082b4ba99c8fc7baf1edc54c,Yap State,


### Build Vernacular table

In [10]:
'''
ID: Core ID - col 0
scientificName: skip - col 1
vernacular: vernacularName - col 2
'''

vernacular_list = []
for section in doc.iterfind('section'):
    scientificName = section.findtext('crop/sci')
    common_names = section.findall('crop/com')
    for common_name in common_names:
        vernacular_list.append({'scientificName': scientificName, 
                                'vernacular': common_name.text,
                                'ID': get_ID(scientificName)
                               })    
        for pest in section.iterfind('pest'):
            scientificName = pest.findtext('sci')
            common_names = pest.findall('com')
            for common_name in common_names:
                vernacular_list.append({'scientificName': scientificName,
                                        'vernacular': common_name.text,
                                        'ID': get_ID(scientificName)
                                       })    
df_vernacular = pd.DataFrame(vernacular_list)
df_vernacular = df_vernacular.drop_duplicates()
df_vernacular = df_vernacular[['ID', 'vernacular']]
df_vernacular

Unnamed: 0,ID,vernacular
0,876f6d22b15d4396be2b760bf415c955,Agave
1,17c201c8da1847929a10109237aab0f4,oriental scale
2,311cf11008e54e33a9259ed045b47b23,grey pineapple mealybug
3,66bd6c0b2d4c4a5e98eed112a44bc291,Avocado
4,1e02c9c91085410092b1dcfa20c2be82,weevil
...,...,...
1122,d306d95564c148a6ac36c0b50eb41b75,moth
1124,7e42fff30b2d4ab6bf2109478641ad14,Wing bean
1125,338d8f3eb2404e4390c745a62c6cfbab,vegetable mite
1127,019b64c3980c443c8bb814d94f6b1857,Yams


### Build ecological associates table

Note that this table has to be searched twice to discover all interactions for a taxon.

In [11]:
'''
sciNameID: Core ID - col 4
sciName: skip - col 3
associateSciName: scientificName - col 0
associateSciNameID: relatedResourceID - col 1
relationshipOfAssociate: relationsipOfResource - col 2
'''

associates_list = []
for section in doc.iterfind('section'):
    scientificName1 = section.findtext('crop/sci')
    for pest in section.iterfind('pest'):
        scientificName2 = pest.findtext('sci')
        associates_list.append({'sciNameID': get_ID(scientificName1),
                                'sciName': scientificName1, 
                                'associateSciNameID': get_ID(scientificName2),
                                'associateSciName': scientificName2,
                                'relationshipOfAssociate': 'host plant of',
                               })
        associates_list.append({'sciNameID': get_ID(scientificName2),
                                'sciName': scientificName2, 
                                'associateSciNameID': get_ID(scientificName1),
                                'associateSciName': scientificName1,
                                'relationshipOfAssociate': 'herbivore of'
                               })

df_associates = pd.DataFrame(associates_list)
df_associates = df_associates.drop_duplicates()
df_associates = df_associates[['sciNameID', 'relationshipOfAssociate', 'associateSciNameID']]
df_associates

Unnamed: 0,sciNameID,relationshipOfAssociate,associateSciNameID
0,876f6d22b15d4396be2b760bf415c955,host plant of,17c201c8da1847929a10109237aab0f4
1,17c201c8da1847929a10109237aab0f4,herbivore of,876f6d22b15d4396be2b760bf415c955
2,876f6d22b15d4396be2b760bf415c955,host plant of,311cf11008e54e33a9259ed045b47b23
3,311cf11008e54e33a9259ed045b47b23,herbivore of,876f6d22b15d4396be2b760bf415c955
4,66bd6c0b2d4c4a5e98eed112a44bc291,host plant of,1e02c9c91085410092b1dcfa20c2be82
...,...,...,...
1893,bebd50d2a13e4cd5891a1d3c548449f1,herbivore of,019b64c3980c443c8bb814d94f6b1857
1894,019b64c3980c443c8bb814d94f6b1857,host plant of,57d9164cbfdb43e8835a72e9a7dd4573
1895,57d9164cbfdb43e8835a72e9a7dd4573,herbivore of,019b64c3980c443c8bb814d94f6b1857
1896,019b64c3980c443c8bb814d94f6b1857,host plant of,064cc43b42124fab9eec2aeb2ce20ef1


### Save Data Frames as CSV Files

In [12]:
import os

if not os.path.exists('dwca'):
    os.makedirs('dwca')
df_taxon.to_csv('dwca/taxon.csv', index=False)
df_dist.to_csv('dwca/distribution.csv', index=False)
df_vernacular.to_csv('dwca/vernacular.csv', index=False)
df_associates.to_csv('dwca/associates.csv', index=False)

### Create Darwin Core Archive
The archive, **dwca**, can be validated at http://tools.gbif.org/dwca-validator/

In [13]:
!zip -r dwca.zip dwca

updating: dwca/ (stored 0%)
updating: dwca/taxon.csv (deflated 54%)
updating: dwca/distribution.csv (deflated 80%)
updating: dwca/associates.csv (deflated 89%)
updating: dwca/vernacular.csv (deflated 46%)
updating: dwca/meta.xml (deflated 76%)
