In [1]:
import re
import os
import zipfile

import pandas as pd
import numpy as np

In [11]:
# Archive of World Flora Online (WFO) - https://zenodo.org/records/14538251
# families_dwc.tar.gz file contains zipped archives of classification data for all families
# each family is contained within a separate .zip file and the core data are in a file called `classification.csv'
# harvest the following columns from each of  these csv files
# family, genus, specificEpithet & majorGroup

In [27]:
wfo = pd.DataFrame(columns=["family", "genus", "specificEpithet", "majorGroup"], dtype=str)
wfo

Unnamed: 0,family,genus,specificEpithet,majorGroup


In [13]:
os.getcwd()

'C:\\Users\\90963425\\OneDrive - Western Sydney University\\PhD\\code'

In [30]:
for archive in os.listdir(r"./Families/"):
    with zipfile.ZipFile(file=f"./Families/{archive}", mode="r") as zf:
        with zf.open("classification.csv") as csf:
            df = pd.read_csv(filepath_or_buffer=csf, low_memory=False, encoding="latin")
            wfo = pd.concat([wfo, df.loc[:, ["family", "genus", "specificEpithet", "majorGroup"]]], ignore_index=True)

In [35]:
wfo.shape

(1780273, 4)

In [36]:
# A stands for Angiosperms???
wfo.loc[:, "majorGroup"].unique()

array(['A', 'Marchantiophyta', 'Bryophyta', 'Polypodiophyta',
       'Anthocerotophyta', 'Pinophyta', 'Cycadophyta', 'Ginkgophyta',
       'Lycopodiophyta'], dtype=object)

In [39]:
wfo.memory_usage()

Index                   132
family             14242184
genus              14242184
specificEpithet    14242184
majorGroup         14242184
dtype: int64

In [42]:
wfo_noduplicates = wfo.drop_duplicates()
wfo_noduplicates.shape

(1149343, 4)

In [49]:
wfo_noduplicates.isna().sum()

family                13
genus                726
specificEpithet    43254
majorGroup             0
dtype: int64

In [50]:
# the most important rows are family and genus 
wfo_noduplicates = wfo_noduplicates.dropna(subset=["family", "genus"])

In [51]:
wfo_noduplicates.isna().sum()

family                 0
genus                  0
specificEpithet    42520
majorGroup             0
dtype: int64

In [52]:
# wfo_noduplicates.to_csv(r"./wfo_taxonomy_clean.csv")

In [3]:
wfo_noduplicates = pd.read_csv(r"./wfo_taxonomy_clean.csv", low_memory=False)

In [4]:
groot = pd.read_csv(r"./GRooTFullVersion.csv", low_memory=False, encoding="latin")

In [5]:
groot_tax = groot.loc[:, ["family", "genus", "species"]].drop_duplicates()
groot_tax.isna().sum()

family     6135
genus         0
species     138
dtype: int64

In [6]:
groot_tax.genus.drop_duplicates()

0               Betula
1                Picea
2                Pinus
5            Agropyron
6            Artemisia
              ...     
114132    Lamprocapnos
114149       Paederota
114166        Prospero
114168     Rhaponticum
114184        Serapias
Name: genus, Length: 1959, dtype: object

In [7]:
# we don't have the family infor for all genera :(
np.intersect1d(groot_tax.genus.drop_duplicates(), wfo_noduplicates.genus).size

1932

In [8]:
# this is because of case differences!
groot_tax.genus.drop_duplicates().size - np.intersect1d(groot_tax.genus.drop_duplicates(), wfo_noduplicates.genus).size

27

In [29]:
# we still have some work to do manually
missing = set(groot_tax.genus.str.title().str.strip()).difference(set(wfo_noduplicates.genus.str.strip().str.title()))
missing

{'Anona',
 'Braciaria',
 'Cyanodon',
 'Helychrisum',
 'Matthiolaria',
 'Rhipogonum',
 'Rynchosia'}

In [27]:
# these do contain "anona", but.......
wfo_noduplicates.genus[wfo_noduplicates.genus.str.contains("anona")]

45827     Diclinanona
45828     Diclinanona
45829     Diclinanona
45830     Diclinanona
50994       Stenanona
50996       Stenanona
50997       Stenanona
50998       Stenanona
50999       Stenanona
51000       Stenanona
51001       Stenanona
51002       Stenanona
51006       Stenanona
51007       Stenanona
51008       Stenanona
51009       Stenanona
51015       Stenanona
51018       Stenanona
51028       Stenanona
51029       Stenanona
51030       Stenanona
51031       Stenanona
324046    Canonanthus
324053    Canonanthus
Name: genus, dtype: object

In [33]:
# welp
[wfo_noduplicates.genus[wfo_noduplicates.genus.str.startswith(g)].size for g in missing]

[0, 0, 0, 0, 0, 0, 0]

In [93]:
groot_tax.genus.str.title().str.contains("Anona").sum()

np.int64(1)

In [41]:
# seems like Anona is legitimately missing
wfo_noduplicates.loc[wfo_noduplicates.family.str.contains("Annonaceae"), "genus"].unique()

array(['Cananga', 'Cyathocalyx', 'Drepananthus', 'Lettowianthus',
       'Meiocarpidium', 'Ambavia', 'Cleistopholis', 'Mezzettia',
       'Tetrameranthus', 'Anaxagorea', 'Annona', 'Asimina', 'Diclinanona',
       'Disepalum', 'Goniothalamus', 'Anonidium', 'Neostenanthera',
       'Bocagea', 'Hornschuchia', 'Trigynaea', 'Cardiopetalum',
       'Cymbopetalum', 'Froesiodendron', 'Porcelia', 'Mkilua',
       'Artabotrys', 'Letestudoxa', 'Pseudartabotrys', 'Duckeanthus',
       'Duguetia', 'Fusaea', 'Guatteria', 'Asteranthe', 'Hexalobus',
       'Uvariastrum', 'Isolona', 'Monodora', 'Dennettia', 'Lukea',
       'Mischogyne', 'Monocyclanthus', 'Uvariodendron', 'Uvariopsis',
       'Ophrypetalum', 'Sanrafaelia', 'Afroguatteria', 'Cleistochlamys',
       'Sphaerocoryne', 'Toussaintia', 'Dasymaschalon', 'Desmos',
       'Friesodielsia', 'Monanthotaxis', 'Fissistigma', 'Pyramidanthe',
       'Dielsiothamnus', 'Uvaria', 'Xylopia', 'Annickia',
       'Dendrokingstonia', 'Fenerivia', 'Maasia', 'Cre

In [50]:
groot_tax.genus[groot_tax.genus.str.startswith("Mat")]

10722      Matteuccia
57940      Matteuccia
84276       Matthiola
86499      Matricaria
86501      Matteuccia
88444      Matricaria
89359       Matthiola
89360       Matthiola
93829      Matricaria
94965         Matayba
95564    Matthiolaria
95606       Matthiola
Name: genus, dtype: object

In [52]:
# looks like GRoot has some spelling mistakes in the genus names!!! 
# YIKES

groot_spelling_corrections = {
"Anona" : "Annona",
"Braciaria" : "Brachiaria",
"Cyanodon" : "Cynodon",
"Helychrisum" : "Helichrysum ",
"Matthiolaria" : "Matthiola", # https://list-ui-wfo-staging.rbge.info/taxon/wfo-0000368988-2018-07?page=1 - synonyms
"Rhipogonum" : "Ripogonum",
"Rynchosia" : "Rhynchosia"
}

In [59]:
spelling_corrected_groot_genera = groot_tax.genus.\
        apply(func= lambda g: groot_spelling_corrections.get(g) if g in groot_spelling_corrections.keys() else g)

In [63]:
spelling_corrected_groot_genera[spelling_corrected_groot_genera.str.startswith("Ann")]

16613    Annona
44654    Annona
Name: genus, dtype: object

In [68]:
set(spelling_corrected_groot_genera).difference(set(wfo_noduplicates.genus.str.strip().str.title()))

{'AGROSTIS',
 'ANTENNARIA',
 'ANTHOXANTHUM',
 'Brachypodium ',
 'CAREX',
 'GNAPHALIUM',
 'Helichrysum ',
 'LEONTODON',
 'NARDUS',
 'SIBBALDIA',
 'dichanthelium',
 'hesperostipa',
 'mirabilis',
 'oenothera',
 'packera',
 'penstemon',
 'psoralidium',
 'stenosiphon',
 'tradescantia',
 'verbesina',
 'vulpia'}

In [69]:
set(spelling_corrected_groot_genera.str.title()).difference(set(wfo_noduplicates.genus.str.strip().str.title()))

{'Brachypodium ', 'Helichrysum '}

In [72]:
# finally!
set(spelling_corrected_groot_genera.str.title().str.strip()).difference(set(wfo_noduplicates.genus.str.strip().str.title()))

set()