In [2]:
import numpy as np
import pandas as pd

In [175]:
# Genus spellings found in GRooT mapped to the spelling that is looked up in the
# genus -> family table. Each key must appear only once: a repeated key in a
# dict literal is silently overwritten by the later entry.
GROOT_SPELLING_CORRECTIONS = {
    "Anona": "Annona",
    "Braciaria": "Brachiaria",
    "Cyanodon": "Cynodon",
    "Helychrisum": "Helichrysum",
    "Matthiolaria": "Matthiola",  # https://list-ui-wfo-staging.rbge.info/taxon/wfo-0000368988-2018-07?page=1 - synonyms
    "Ripogonum": "Rhipogonum",  # not a spelling mistake but an alternative spelling
    "Rynchosia": "Rhynchosia",
    "Monroa": "Munroa",  # Monroa squarrosa - Munroa squarrosa
}

# Genus names used in GRooT (keys) and the genus name that replaces them (values)
# before the family lookup. Where present, the trailing comment records the
# species-level name pair(s) that motivated the entry.
GROOT_OLD_NAME_REPLACEMENTS = {
    "Pleuraphis": "Hilaria", # Hilaria rigida - Pleuraphis rigida
    "Paragonia": "Tanaecium", # Paragonia pyramidata - Tanaecium pyramidatum
    "Carria": "Polyspora", # Carria speciosa - Polyspora gardneri
    "Dasyochloa": "Erioneuron", # Dasyochloa pulchella - Erioneuron pulchellum
    "Ischyrolepis": "Restio", # Restio ferruginosus, Ischyrolepis subverticillata, Restio microstachys, Restio subverticillatus
    "Austrodanthonia": "Rytidosperma", # Rytidosperma caespitosum - Austrodanthonia caespitosa
    "Biota": "Platycladus", # Biota orientalis
    "Parakmeria": "Magnolia", # Parakmeria yunnanensis - Magnolia yunnanensis
    "Paramichelia": "Magnolia", # Paramichelia baillonii - Magnolia baillonii
    "Lophozonia": "Nothofagus", # Lophozonia menziesii - Nothofagus menziesii
    "Joycea": "Rytidosperma", # Rytidosperma pallidum - Joycea pallida, Danthonia pallida
    "Acanthococos": "Acrocomia",
    "Brauneria": "Echinacea", # Echinacea angustifolia - Brauneria angustifolia
    "Hyeronima": "Hieronyma", # spelling variant rather than an old name
    "Bulbilis": "Bouteloua", # Bulbilis dactyloides - Bouteloua dactyloides
    "Acroptilon": "Rhaponticum", # Rhaponticum repens - Acroptilon repens and Leuzea repens
}

In [132]:
# Genus -> family/order/group lookup table.
lookup = pd.read_csv("./plantlookup.csv", encoding="latin", low_memory=False)
# Full GRooT trait table; its `genus` column is matched against `lookup` below.
groot = pd.read_csv("./GRooTFullVersion.csv", encoding="latin", low_memory=False)

In [109]:
# (rows, columns) of the lookup table and of GRooT.
(lookup.shape, groot.shape)

((23279, 4), (114222, 73))

In [110]:
# Column labels and (rows, columns) of the lookup table.
(lookup.columns, lookup.shape)

(Index(['genus', 'family', 'order', 'group'], dtype='object'), (23279, 4))

In [111]:
# Unique (family, genus) pairs from the lookup table. A genus that still occurs
# more than once in this frame is listed under more than one family; this cell
# only displays the pairs and does not check for such conflicts.
doc = lookup[["family", "genus"]].drop_duplicates()
doc

Unnamed: 0,family,genus
0,Acoraceae,Acorus
1,Alismataceae,Albidella
2,Alismataceae,Alisma
3,Alismataceae,Astonia
4,Alismataceae,Baldellia
...,...,...
23274,Lygodiaceae,Ugena
23275,Schizaeaceae,Actinostachys
23276,Schizaeaceae,Lophidium
23277,Schizaeaceae,Microschizaea


In [156]:
# Genus -> family lookup as a Series: the index is the title-cased,
# whitespace-stripped genus; the values are the title-cased family.
family_by_genera_lookup_table = pd.Series(
    doc["family"].str.title().values,
    index=doc["genus"].str.title().str.strip().values,
)

In [157]:
# Peek at the first five genus -> family entries.
family_by_genera_lookup_table.head()

Acorus          Acoraceae
Albidella    Alismataceae
Alisma       Alismataceae
Astonia      Alismataceae
Baldellia    Alismataceae
dtype: object

In [158]:
# Sanity check: look up every replacement genus name in the lookup table.
# Indexing with a list of labels raises KeyError if any of them is missing.
family_by_genera_lookup_table[list(GROOT_OLD_NAME_REPLACEMENTS.values())]

Hilaria               Poaceae
Tanaecium        Bignoniaceae
Polyspora            Theaceae
Erioneuron            Poaceae
Restio           Restionaceae
Rytidosperma          Poaceae
Platycladus      Cupressaceae
Magnolia         Magnoliaceae
Magnolia         Magnoliaceae
Nothofagus      Nothofagaceae
Rytidosperma          Poaceae
Acrocomia           Arecaceae
dtype: object

In [176]:
# Distinct genus values from GRooT, title-cased and stripped, then passed through
# the spelling corrections and, after that, the old-name replacements. A name
# without a (truthy) entry in a mapping is kept unchanged.
# NOTE: uniqueness is taken before normalisation, so the result may contain
# repeated names once several raw names map to the same genus.
groot_genera = (
    pd.Series(groot["genus"].unique())
    .str.title()
    .str.strip()
    .apply(lambda name: GROOT_SPELLING_CORRECTIONS.get(name) or name)
    .apply(lambda name: GROOT_OLD_NAME_REPLACEMENTS.get(name) or name)
)

In [177]:
# Family for every normalised GRooT genus. Raises KeyError naming the genera
# that are absent from the lookup table.
family_by_genera_lookup_table.loc[groot_genera]

KeyError: "['Halimione', 'Hesperostipa', 'Spirea', 'Tsoongiodendron', 'Schizonepeta', 'Pascopyrum', 'Thinopyrum', 'Cyclachaena', 'Glaux', 'Alajja', 'Macrosciadium', 'Plectrachne', 'Rauwolfia', 'Mycelis', 'Myosoton', 'Othocallis', 'Fourraea'] not in index"

In [169]:
# Inspect the GRooT rows recorded under the genus Bulbilis.
groot.query("genus == 'Bulbilis'")[["family", "genus", "species"]]

Unnamed: 0,family,genus,species
58897,,Bulbilis,dactyloide


In [174]:
# Family recorded in the lookup table for the genus Rhaponticum.
family_by_genera_lookup_table.loc["Rhaponticum"]

'Asteraceae'

In [178]:
# Hand-assigned families (values) for genera (keys) that are not in the lookup table.
# NOTE(review): the KeyError raised by the family lookup earlier in this notebook
# also listed Cyclachaena, Glaux, Alajja, Macrosciadium, Plectrachne, Rauwolfia,
# Mycelis, Myosoton, Othocallis and Fourraea, which have no entry here yet.
MISSING_LOOKUPS = {
    "Halimione": "Amaranthaceae",
    "Hesperostipa": "Poaceae",
    "Spirea": "Rosaceae",
    "Tsoongiodendron": "Magnoliaceae",
    "Schizonepeta": "Lamiaceae",
    "Pascopyrum": "Poaceae",
    "Thinopyrum": "Poaceae",
}

# Show the manual mapping as a Series for inspection.
pd.Series(MISSING_LOOKUPS)

Halimione          Amaranthaceae
Hesperostipa             Poaceae
Spirea                  Rosaceae
Tsoongiodendron     Magnoliaceae
Schizonepeta           Lamiaceae
Pascopyrum               Poaceae
Thinopyrum               Poaceae
dtype: object