In [1]:
import scispacy 
import spacy  
import pandas as pd
from spacy import displacy
from pathlib import Path
import re
import json
import numpy as np

from spacy.pipeline import EntityRuler
from spacy.tokens import Doc, Span, Token
from enum import Enum
from spacy import displacy
from spacy.tokenizer import Tokenizer
from spacy.util import compile_infix_regex
from spacy.tokens import Span

# 

In [2]:
# Directory that holds the input occurrence.csv and receives the exported taxa.jsonl.
DATA_DIR = Path('./data')

In [3]:

# Columns to load: the taxonomic rank columns, the semicolon-separated
# higherClassification string, and the collection code of each record.
# ('order' used to be listed twice; each column only needs to appear once.)
columns = [
    'subgenus',
    'genus',
    'family',
    'order',
    'kingdom',
    'class',
    'phylum',
    'collectionCode',
    'taxonRank',
    'higherClassification',
]

# Load only those columns and keep the first occurrence of every distinct row.
# reset_index() keeps the original row number as an 'index' column.
df = (
    pd.read_csv(DATA_DIR / 'occurrence.csv', usecols=columns)
    .drop_duplicates(keep='first')
    .reset_index()
)

df.head()



Unnamed: 0,index,class,collectionCode,family,genus,higherClassification,kingdom,order,phylum,subgenus,taxonRank
0,0,Aves,ZOO,Bucerotidae,Tockus,Animalia; Chordata; Vertebrata; Aves; Buceroti...,Animalia,Bucerotiformes,Chordata,,
1,4,Aves,ZOO,Bucerotidae,Tropicranus,Animalia; Chordata; Vertebrata; Aves; Buceroti...,Animalia,Bucerotiformes,Chordata,,
2,11,Aves,ZOO,Phoeniculidae,Phoeniculus,Animalia; Chordata; Vertebrata; Aves; Buceroti...,Animalia,Bucerotiformes,Chordata,,
3,16,Aves,ZOO,Bucerotidae,Rhyticeros,Animalia; Chordata; Vertebrata; Aves; Buceroti...,Animalia,Bucerotiformes,Chordata,,
4,29,Aves,ZOO,Bucerotidae,Rhinoplax,Animalia; Chordata; Vertebrata; Aves; Buceroti...,Animalia,Bucerotiformes,Chordata,,


In [4]:

# Unique (taxon name, collectionCode) pairs, filled in by the loops further down this cell.
taxa = set()


def iter_value_with_code(column_name, frame=None):
    """Yield the distinct ``(value, collectionCode)`` pairs of one column.

    Args:
        column_name: Name of the column whose values are paired with the
            ``collectionCode`` column.
        frame: DataFrame to read from. Defaults to the notebook-level ``df``,
            so existing one-argument calls behave exactly as before.

    Yields:
        ``(value, coll_code)`` tuples, one per distinct combination; rows in
        which either of the two fields is missing are skipped.
    """
    if frame is None:
        # Fall back to the occurrence table loaded earlier in the notebook.
        frame = df
    cols = [column_name, 'collectionCode']
    pairs = frame.drop_duplicates(subset=cols)[cols].dropna()
    # name=None makes itertuples emit plain 2-tuples instead of namedtuples.
    yield from pairs.itertuples(index=False, name=None)

# Split every semicolon-separated higherClassification string into its
# individual taxon names, keeping the collection code alongside each name.
for value, coll_code in iter_value_with_code('higherClassification'):
    # Non-string values cannot be split; skip them instead of relying on an
    # AttributeError being raised and swallowed.
    if not isinstance(value, str):
        continue
    for part in value.split(';'):
        name = part.strip()
        # A trailing or doubled ';' produces an empty segment, which is not a taxon.
        if name:
            taxa.add((name, coll_code))

def normalise(string):
    """Return ``string`` with every character outside a-z / A-Z removed."""
    non_letters = re.compile("[^a-zA-Z]+")
    return non_letters.sub("", string)


# Add the values of the individual rank columns as well, each reduced to its
# ASCII letters by normalise() and paired with its collection code.
for column_name in ('genus', 'family', 'kingdom', 'order', 'phylum', 'subgenus', 'class'):
    taxa.update(
        (normalise(raw_name), code)
        for raw_name, code in iter_value_with_code(column_name)
    )



In [5]:

patterns = []

# Iterate in sorted order: `taxa` is a set, so its raw iteration order (and
# with it the line order of the exported pattern file) would otherwise differ
# from one kernel session to the next.
for taxon, label in sorted(taxa):

    # Names of four characters or fewer are not turned into patterns.
    if len(taxon) <= 4:
        continue

    words = taxon.split()

    if len(words) > 1:
        # Multi-word name: one exact match on the lower-cased text per word.
        token_pattern = [{"LOWER": word.lower()} for word in words]
    else:
        # Single-word name: the lower-cased name minus its last character,
        # followed by at most two lower-case letters, so that alternative
        # endings of the same stem are matched too.
        regex = "^" + re.escape(taxon.lower()[:-1]) + '([a-z]{0,2})$'
        token_pattern = [{"LOWER": {"REGEX": regex}}]

    patterns.append({'label': label, 'pattern': token_pattern})

# Hand-written single-token patterns for common vernacular terms, appended in
# the order listed. Each entry is (label, regex applied to the lower-cased token).
patterns.extend(
    {'label': code, 'pattern': [{"LOWER": {"REGEX": expr}}]}
    for code, expr in (
        ('ZOO', '^fish([e]?)([s]?)$'),
        ('ZOO', '^bird([s]?)$'),
        ('PAL', '^fossil([s]?)$'),
        ('BOT', '^orchid([s]?)$'),
        ('BOT', '^plant([s]?)$'),
        ('PAL', '^pal[a]?eo'),
    )
)

# Discipline names and the label assigned to tokens that start with the
# discipline's stem (the name without its trailing 'y', lower-cased).
# NOTE(review): 'Entomology' and 'Paleontology' previously carried the labels
# 'ENTOM' and 'PALEO'; they now use 'ENT' and 'PAL' like every other
# hand-written pattern in this cell — confirm against the label set expected
# downstream.
# NOTE(review): 'Arachnology' is labelled 'ZOO' here while the separate
# '^arachno' pattern later in this cell uses 'ENT' — confirm which is intended.
ologies = [
    ('Anthropology', 'PAL'),
    ('Arachnology', 'ZOO'),
    ('Bacteriology', 'ZOO'),
    ('Entomology', 'ENT'),
    ('Geology', 'MIN'),
    ('Herpetology', 'ZOO'),
    ('Ichthyology', 'ZOO'),
    ('Mammalogy', 'ZOO'),
    # Was misspelled 'Meterology', whose stem could never match "meteorology".
    ('Meteorology', 'MIN'),
    ('Mineralogy', 'MIN'),
    ('Mycology', 'BOT'),
    ('Ornithology', 'ZOO'),
    ('Petrology', 'PAL'),
    ('Paleontology', 'PAL'),
    ('volcanology', 'MIN'),
    ('Zoology', 'ZOO'),
]

for ology, label in ologies:
    # Dropping the trailing 'y' leaves a stem such as 'zoolog', which as a
    # prefix regex covers e.g. 'zoology', 'zoological' and 'zoologist'.
    regex = ology.rstrip('y').lower()
    patterns.append({
        'label': label,
        'pattern': [{"LOWER": {"REGEX": f'^{regex}'}}]
    })


# Further hand-written prefix patterns, appended in the order listed.
# Each entry is (label, regex applied to the lower-cased token).
patterns.extend(
    {'label': code, 'pattern': [{"LOWER": {"REGEX": expr}}]}
    for code, expr in (
        ('MIN', '^meteor'),
        ('BOT', '^botan'),
        ('ENT', '^arachno'),
        ('ENT', '^beetle'),
        ('ZOO', '^amphibia'),
        ('ZOO', '^reptile'),
        ('ZOO', '^reptilia'),
        ('ZOO', '^inverte[r]?bra'),
    )
)

# patterns = []

# Names of geological time divisions; each becomes an exact (case-insensitive)
# single-token pattern labelled 'MIN'.
geo_periods = [
    'Phanerozoic',
    'Cenozoic',
    'Quaternary',
    'Pleistocene',
    # Was 'Holocene)': the stray parenthesis made the pattern unmatchable.
    'Holocene',
    'Neogene',
    'Miocene',
    'Pliocene',
    'Paleogene',
    'Paleocene',
    'Eocene',
    'Oligocene',
    'Mesozoic',
    'Cretaceous',
    'Jurassic',
    'Triassic',
    'Paleozoic',
    'Permian',
    'Carboniferous',
    'Mississippian',
    'Pennsylvanian',
    'Devonian',
    'Silurian',
    'Ordovician',
    'Cambrian',
    'Proterozoic',
    'Neoproterozoic',
    'Ediacaran',
    'Cryogenian',
    'Tonian',
    'Mesoproterozoic',
    'Stenian',
    'Ectasian',
    'Calymmian',
    'Paleoproterozoic',
    'Statherian',
    'Orosirian',
    'Rhyacian',
    'Siderian',
]

for geo_period in geo_periods:
    patterns.append({
        'label': 'MIN',
        'pattern': [{"LOWER": geo_period.lower()}]
    })

# Archaeological period names; each becomes an exact (case-insensitive)
# single-token pattern labelled 'PAL'.
arch_periods = ['Paleolithic', 'Epipaleolithic', 'Neolithic', 'Chalcolithic', 'Lithic']

patterns.extend(
    {'label': 'PAL', 'pattern': [{"LOWER": period_name.lower()}]}
    for period_name in arch_periods
)



# Export the patterns as JSON Lines: one JSON-encoded pattern per line.
with (DATA_DIR / 'taxa.jsonl').open("w", encoding="UTF-8") as handle:
    handle.writelines(json.dumps(entry) + "\n" for entry in patterns)
