In [2]:
import numpy as np
import pandas as pd

In [60]:
GROOT_SPELLING_CORRECTIONS = {
    "Anona" : "Annona",
    "Braciaria" : "Brachiaria",
    "Cyanodon" : "Cynodon",
    "Helychrisum" : "Helichrysum",
    "Matthiolaria" : "Matthiola", # https://list-ui-wfo-staging.rbge.info/taxon/wfo-0000368988-2018-07?page=1 - synonyms
    "Rhipogonum" : "Ripogonum",
    "Rynchosia" : "Rhynchosia",
    "Ripogonum": "Rhipogonum" # NOT A SPELLING MISTAKE BUT AN ALTERNATIVE SPELLING
}

# new genus names to be replace the old names in GRooT
GROOT_OLD_NAME_REPLACEMENTS = {
    "Pleuraphis": "Hilaria", # Hilaria rigida - Pleuraphis rigida
    "Paragonia": "Tanaecium", # Paragonia pyramidata - Tanaecium pyramidatum 
    "Carria": "Polyspora", # Carria speciosa - Polyspora gardneri
    "Dasyochloa": "Erioneuron", # Dasyochloa pulchella - Erioneuron pulchellum
    "Ischyrolepis": "Restio", # Restio ferruginosus, Ischyrolepis subverticillata, Restio microstachys, Restio subverticillatus
    "Austrodanthonia": "Rytidosperma", # Rytidosperma caespitosum - Austrodanthonia caespitosa
}

In [44]:
lookup = pd.read_csv(r"./plantlookup.csv", low_memory=False, encoding="latin")
groot = pd.read_csv(r"./GRooTFullVersion.csv", low_memory=False, encoding="latin")

In [46]:
lookup.shape, groot.shape

((23279, 4), (114222, 73))

In [47]:
lookup.head()

Unnamed: 0,genus,family,order,group
0,Acorus,Acoraceae,Acorales,Angiosperms
1,Albidella,Alismataceae,Alismatales,Angiosperms
2,Alisma,Alismataceae,Alismatales,Angiosperms
3,Astonia,Alismataceae,Alismatales,Angiosperms
4,Baldellia,Alismataceae,Alismatales,Angiosperms


In [48]:
lookup.shape

(23279, 4)

In [49]:
# test whether the lookup table has conflicting families for a given genus
doc = lookup.loc[:, ["family", "genus"]].drop_duplicates()
doc

Unnamed: 0,family,genus
0,Acoraceae,Acorus
1,Alismataceae,Albidella
2,Alismataceae,Alisma
3,Alismataceae,Astonia
4,Alismataceae,Baldellia
...,...,...
23274,Lygodiaceae,Ugena
23275,Schizaeaceae,Actinostachys
23276,Schizaeaceae,Lophidium
23277,Schizaeaceae,Microschizaea


In [50]:
family_by_genera_lookup_table = pd.Series(index=doc.genus.str.title().values, data=doc.family.str.title().values)

In [54]:
family_by_genera_lookup_table.iloc[:5]

Acorus          Acoraceae
Albidella    Alismataceae
Alisma       Alismataceae
Astonia      Alismataceae
Baldellia    Alismataceae
dtype: object

In [58]:
# good :)
family_by_genera_lookup_table[GROOT_OLD_NAME_REPLACEMENTS.values()]

Hilaria              Poaceae
Tanaecium       Bignoniaceae
Polyspora           Theaceae
Erioneuron           Poaceae
Restio          Restionaceae
Rytidosperma         Poaceae
dtype: object

In [15]:
groot_genera = pd.Series(groot.loc[:, "genus"].unique()).str.title().\
                    apply(lambda genus: genus if not GROOT_SPELLING_CORRECTIONS.get(genus, False) else GROOT_SPELLING_CORRECTIONS.get(genus))
groot_genera

0             Betula
1              Picea
2              Pinus
3          Agropyron
4          Artemisia
            ...     
1954    Lamprocapnos
1955       Paederota
1956        Prospero
1957     Rhaponticum
1958        Serapias
Length: 1959, dtype: object

In [13]:
family_by_genera_lookup_table[groot_genera]

KeyError: "['Halimione', 'Pleuraphis', 'Hesperostipa', 'Spirea', 'Tsoongiodendron', 'Schizonepeta', 'Paragonia', 'Carria', 'Dasyochloa', 'Ischyrolepis', 'Austrodanthonia', 'Ripogonum', 'Biota', 'Parakmeria', 'Paramichelia', 'Lophozonia', 'Monroa', 'Pascopyrum', 'Joycea', 'Thinopyrum', 'Acanthococos', 'Brauneria', 'Hyeronima', 'Bulbilis', 'Acroptilon', 'Cyclachaena', 'Glaux', 'Alajja', 'Macrosciadium', 'Plectrachne', 'Rauwolfia', 'Brachypodium ', 'Mycelis', 'Myosoton', 'Othocallis', 'Fourraea'] not in index"

In [40]:
groot.query("genus == 'Erioneuron'").loc[:, ["family", "genus", "species"]]

Unnamed: 0,family,genus,species
32757,,Erioneuron,pulchellum
32758,,Erioneuron,pulchellum
32759,,Erioneuron,pulchellum
32760,,Erioneuron,pulchellum
32761,,Erioneuron,pulchellum
32762,,Erioneuron,pulchellum
32763,,Erioneuron,pulchellum
32764,,Erioneuron,pulchellum
32765,,Erioneuron,pulchellum
32766,,Erioneuron,pulchellum


In [59]:
family_by_genera_lookup_table["Rhipogonum"] 

'Rhipogonaceae'

In [None]:
MISSING_LOOKUPS = {
    "Halimione": "Amaranthaceae",
    "Hesperostipa": "Poaceae",
    "Spirea": "Rosaceae",
    "Tsoongiodendron": "Magnoliaceae",
    "Schizonepeta": "Lamiaceae",
    ""
}