In [1]:
import re
import os
import zipfile
import gc

import ray
ray.init()
import modin.pandas as pd
import numpy as np

2025-03-05 12:21:19,517	INFO worker.py:1841 -- Started a local Ray instance.


In [64]:
# Archive of World Flora Online (WFO) - https://zenodo.org/records/14538251
# families_dwc.tar.gz file contains zipped archives of classification data for all families
# each family is contained within a separate .zip file and the core data are in a file called `classification.csv'
# harvest the following columns from each of  these csv files
# family, genus, specificEpithet & majorGroup

In [27]:
wfo = pd.DataFrame(columns=["family", "genus", "specificEpithet", "majorGroup"], dtype=str)
wfo

Unnamed: 0,family,genus,specificEpithet,majorGroup


In [13]:
os.getcwd()

'C:\\Users\\90963425\\OneDrive - Western Sydney University\\PhD\\code'

In [30]:
for archive in os.listdir(r"./Families/"):
    with zipfile.ZipFile(file=f"./Families/{archive}", mode="r") as zf:
        with zf.open("classification.csv") as csf:
            df = pd.read_csv(filepath_or_buffer=csf, low_memory=False, encoding="latin")
            wfo = pd.concat([wfo, df.loc[:, ["family", "genus", "specificEpithet", "majorGroup"]]], ignore_index=True)

In [35]:
wfo.shape

(1780273, 4)

In [36]:
# A stands for Angiosperms???
wfo.loc[:, "majorGroup"].unique()

array(['A', 'Marchantiophyta', 'Bryophyta', 'Polypodiophyta',
       'Anthocerotophyta', 'Pinophyta', 'Cycadophyta', 'Ginkgophyta',
       'Lycopodiophyta'], dtype=object)

In [39]:
wfo.memory_usage()

Index                   132
family             14242184
genus              14242184
specificEpithet    14242184
majorGroup         14242184
dtype: int64

In [42]:
wfo_noduplicates = wfo.drop_duplicates()
wfo_noduplicates.shape

(1149343, 4)

In [49]:
wfo_noduplicates.isna().sum()

family                13
genus                726
specificEpithet    43254
majorGroup             0
dtype: int64

In [50]:
# the most important rows are family and genus 
wfo_noduplicates = wfo_noduplicates.dropna(subset=["family", "genus"])

In [51]:
wfo_noduplicates.isna().sum()

family                 0
genus                  0
specificEpithet    42520
majorGroup             0
dtype: int64

In [40]:
# wfo_noduplicates.to_csv(r"./wfo_taxonomy_clean.csv", index=False)

In [3]:
wfo_noduplicates = pd.read_csv(r"./wfo_taxonomy_clean.csv", low_memory=False)

In [4]:
groot = pd.read_csv(r"./GRooTFullVersion.csv", low_memory=False, encoding="latin")

In [5]:
groot_taxonomy = groot.loc[:, ["family", "genus", "species"]].drop_duplicates().reset_index(drop=True)
groot_taxonomy.isna().sum()

family     6135
genus         0
species     138
dtype: int64

In [6]:
# 1959 unique genera in GRoot
groot_taxonomy.genus.drop_duplicates().size

1959

In [107]:
# we don't have the family info for all genera :(
np.intersect1d(groot_taxonomy.genus.drop_duplicates(), wfo_noduplicates.genus).size

1932

In [108]:
# this is because of case differences!
groot_taxonomy.genus.drop_duplicates().size - np.intersect1d(groot_taxonomy.genus.drop_duplicates(), wfo_noduplicates.genus).size

27

In [109]:
# we still have some work to do manually
missing = set(groot_taxonomy.genus.str.title().str.strip()).difference(set(wfo_noduplicates.genus.str.strip().str.title()))
missing

{'Anona',
 'Braciaria',
 'Cyanodon',
 'Helychrisum',
 'Matthiolaria',
 'Rhipogonum',
 'Rynchosia'}

In [110]:
# these do contain "anona", but.......
wfo_noduplicates.genus[wfo_noduplicates.genus.str.contains("anona")]

45827     Diclinanona
45828     Diclinanona
45829     Diclinanona
45830     Diclinanona
50994       Stenanona
50996       Stenanona
50997       Stenanona
50998       Stenanona
50999       Stenanona
51000       Stenanona
51001       Stenanona
51002       Stenanona
51006       Stenanona
51007       Stenanona
51008       Stenanona
51009       Stenanona
51015       Stenanona
51018       Stenanona
51028       Stenanona
51029       Stenanona
51030       Stenanona
51031       Stenanona
324046    Canonanthus
324053    Canonanthus
Name: genus, dtype: object

In [111]:
# welp
[wfo_noduplicates.genus[wfo_noduplicates.genus.str.startswith(g)].size for g in missing]

[0, 0, 0, 0, 0, 0, 0]

In [112]:
groot_taxonomy.genus.str.title().str.contains("Anona").sum()

np.int64(1)

In [113]:
# seems like Anona is legitimately missing
wfo_noduplicates.loc[wfo_noduplicates.family.str.contains("Annonaceae"), "genus"].unique()

array(['Cananga', 'Cyathocalyx', 'Drepananthus', 'Lettowianthus',
       'Meiocarpidium', 'Ambavia', 'Cleistopholis', 'Mezzettia',
       'Tetrameranthus', 'Anaxagorea', 'Annona', 'Asimina', 'Diclinanona',
       'Disepalum', 'Goniothalamus', 'Anonidium', 'Neostenanthera',
       'Bocagea', 'Hornschuchia', 'Trigynaea', 'Cardiopetalum',
       'Cymbopetalum', 'Froesiodendron', 'Porcelia', 'Mkilua',
       'Artabotrys', 'Letestudoxa', 'Pseudartabotrys', 'Duckeanthus',
       'Duguetia', 'Fusaea', 'Guatteria', 'Asteranthe', 'Hexalobus',
       'Uvariastrum', 'Isolona', 'Monodora', 'Dennettia', 'Lukea',
       'Mischogyne', 'Monocyclanthus', 'Uvariodendron', 'Uvariopsis',
       'Ophrypetalum', 'Sanrafaelia', 'Afroguatteria', 'Cleistochlamys',
       'Sphaerocoryne', 'Toussaintia', 'Dasymaschalon', 'Desmos',
       'Friesodielsia', 'Monanthotaxis', 'Fissistigma', 'Pyramidanthe',
       'Dielsiothamnus', 'Uvaria', 'Xylopia', 'Annickia',
       'Dendrokingstonia', 'Fenerivia', 'Maasia', 'Cre

In [114]:
groot_taxonomy.genus[groot_taxonomy.genus.str.startswith("Mat")]

766       Matteuccia
2913      Matteuccia
3544       Matthiola
4061      Matricaria
4062      Matteuccia
4555      Matricaria
4911       Matthiola
4912       Matthiola
5296      Matricaria
5436         Matayba
5496    Matthiolaria
5521       Matthiola
Name: genus, dtype: object

In [7]:
# looks like GRoot has some spelling mistakes in the genus names!!! 
# YIKES

groot_spelling_corrections = {
"Anona" : "Annona",
"Braciaria" : "Brachiaria",
"Cyanodon" : "Cynodon",
"Helychrisum" : "Helichrysum",
"Matthiolaria" : "Matthiola", # https://list-ui-wfo-staging.rbge.info/taxon/wfo-0000368988-2018-07?page=1 - synonyms
"Rhipogonum" : "Ripogonum",
"Rynchosia" : "Rhynchosia"
}

In [8]:
spelling_corrected_groot_genera = groot_taxonomy.genus.\
        apply(func= lambda g: groot_spelling_corrections.get(g) if g in groot_spelling_corrections.keys() else g.strip().title())

In [9]:
spelling_corrected_groot_genera[spelling_corrected_groot_genera.str.startswith("Ann")]

1167    Annona
2444    Annona
Name: genus, dtype: object

In [10]:
set(spelling_corrected_groot_genera).difference(set(wfo_noduplicates.genus.str.strip().str.title()))

set()

In [11]:
set(spelling_corrected_groot_genera.str.title()).difference(set(wfo_noduplicates.genus.str.strip().str.title()))

set()

In [12]:
# finally!
print(set(spelling_corrected_groot_genera.str.title().str.strip()).difference(set(wfo_noduplicates.genus.str.strip().str.title())))

# the problem is GRoot NOT the WFO database!
print(set(spelling_corrected_groot_genera.str.title().str.strip()).difference(set(wfo_noduplicates.genus)))

set()
set()


In [13]:
spelling_corrected_groot_genera #.reset_index(drop=True)

0          Betula
1           Picea
2           Pinus
3       Agropyron
4       Artemisia
          ...    
7229    Valeriana
7230     Veronica
7231        Vicia
7232        Vicia
7233    Centaurea
Name: genus, Length: 7234, dtype: object

In [14]:
groot_taxonomy

Unnamed: 0,family,genus,species
0,,Betula,
1,,Picea,
2,,Pinus,
3,,Agropyron,cristatum
4,,Artemisia,tridentata
...,...,...,...
7229,,Valeriana,excelsa
7230,,Veronica,aphylla
7231,,Vicia,onobrychioides
7232,,Vicia,oroboides


In [15]:
# mhmmmm
groot_taxonomy.genus = spelling_corrected_groot_genera
groot_taxonomy.reset_index(inplace=True, drop=True)

In [16]:
groot_taxonomy

Unnamed: 0,family,genus,species
0,,Betula,
1,,Picea,
2,,Pinus,
3,,Agropyron,cristatum
4,,Artemisia,tridentata
...,...,...,...
7229,,Valeriana,excelsa
7230,,Veronica,aphylla
7231,,Vicia,onobrychioides
7232,,Vicia,oroboides


In [19]:
# groot_taxonomy.dtypes
groot_taxonomy.family.dtype

dtype('O')

In [56]:
temp = wfo_noduplicates.loc[:, ["family", "genus"]].drop_duplicates().reset_index(drop=True)
families = pd.Series(index=temp.genus.values, data=temp.family.values, dtype=str, copy=True)

In [58]:
temp.isna().sum()

family    0
genus     0
dtype: int64

In [59]:
families

Acanthopale          Acanthaceae
Acanthopsis          Acanthaceae
Acanthus             Acanthaceae
Afrofittonia         Acanthaceae
Ambongia             Acanthaceae
                       ...      
Bisluederitzia    Zygophyllaceae
Guaiacon          Zygophyllaceae
Nitrapia          Zygophyllaceae
Quaiacum          Zygophyllaceae
Seezenia          Zygophyllaceae
Length: 50936, dtype: object

In [61]:
# WTF WFO???
families["Cannabis"]

Cannabis    Cannabaceae
Cannabis     Urticaceae
dtype: object

In [84]:
# GOT SO FREAKING MANY CONFLICTS BETWEEN GROOT AND THE WFO DATABASE

for i in range(groot_taxonomy.shape[0]):
    if pd.isnull(groot_taxonomy.loc[i, "family"]):
        assert families.get(groot_taxonomy.loc[i, "genus"], False)
        groot_taxonomy.loc[i, "family"] = families.get(groot_taxonomy.loc[i, "genus"])
    else:
        if groot_taxonomy.loc[i, "family"] != families.get(groot_taxonomy.loc[i, "genus"]):
                print(f"{groot_taxonomy.loc[i, "genus"]} Expected {families.get(groot_taxonomy.loc[i, "genus"])}, got {groot_taxonomy.loc[i, "family"]}")

Cannabis Expected Urticaceae, got Cannabaceae
Papaver Expected Papaveraceae, got Papaverceae
Phacelia Expected Polemoniaceae, got Hydrophyllaceae
Spinacia Expected Amaranthaceae, got Chenopodiaceae
Trifolium Expected Oxalidaceae, got Fabaceae
Pinus Expected Sciadopityaceae, got Piceae
Apodytes Expected Stemonuraceae, got Icacinaceae
Combretum Expected Sapotaceae, got Combretaceae
Dovyalis Expected Salicaceae, got Flacourtiaceae
Podocarpus Expected Taxaceae, got Podocarpaceae
Maytenus Expected Rutaceae, got Celastraceae
Arenaria Expected Selaginellaceae, got Caryophyllaceae
Brachypodium Expected Ptychomitriaceae, got Poaceae
Geranium Expected Neuradaceae, got Geraniaceae
Teucrium Expected Plantaginaceae, got Lamiaceae
Trifolium Expected Oxalidaceae, got Fabaceae
Veronica Expected Tetrachondraceae, got Scrophulariaceae
Acer Expected Vitaceae, got Sapindaceae
Acer Expected Vitaceae, got Sapindaceae
Barringtonia Expected Symplocaceae, got Lecythidaceae
Betula Expected Nothofagaceae, got Be

7234

In [3]:
# 1.28 GiBs!!!!
os.path.getsize(r"./../classification.csv") / np.power(1024, 3)

np.float64(1.2805062485858798)

In [4]:
# let's see the csv (pre)compiled by WFO itself

wfo_full = pd.read_csv(r"./../classification.csv", low_memory=False, encoding="latin")

Data types of partitions are different! Please refer to the troubleshooting section of the Modin documentation to fix this issue.


In [5]:
wfo_full.columns

Index(['taxonID', 'scientificNameID', 'localID', 'scientificName', 'taxonRank',
       'parentNameUsageID', 'scientificNameAuthorship', 'family', 'subfamily',
       'tribe', 'subtribe', 'genus', 'subgenus', 'specificEpithet',
       'infraspecificEpithet', 'verbatimTaxonRank', 'nomenclaturalStatus',
       'namePublishedIn', 'taxonomicStatus', 'acceptedNameUsageID',
       'originalNameUsageID', 'nameAccordingToID', 'taxonRemarks', 'created',
       'modified', 'references', 'source', 'majorGroup', 'tplID',
       'speciesHybridMarker', 'infraspecificRank', 'originalID', 'old_t1id',
       'tropicosId', 'references1.0', 'doNotProcess', 'doNotProcess_reason',
       'OfficialFamily', 'comments', 'deprecated'],
      dtype='object')

In [6]:
wfo_full.shape

(1704312, 40)

In [8]:
wfo_full.loc[:, ["family", "genus", "specificEpithet"]].drop_duplicates().reset_index(drop=True)

Unnamed: 0,family,genus,specificEpithet
0,Cyperaceae,Schoenoxiphium,ecklonii
1,Cyperaceae,Cyperus,violifolia
2,Cyperaceae,Carex,viridula
3,Cyperaceae,Mariscus,phleoides
4,Cyperaceae,Tetraria,compar
...,...,...,...
1161993,Brassicaceae,Cardamine,caesiella
1161994,Brassicaceae,Cardamine,chlorina
1161995,Brassicaceae,Sinalliaria,grandifolia
1161996,Brassicaceae,Ceratocnemum,mitabile


In [69]:
wfo_full.loc[:, ["family", "genus"]].isna().sum()

family    19093
genus      5622
dtype: int64

In [80]:
np.set_printoptions(threshold=np.inf)
np.unique(wfo_full.loc[:, "genus"].dropna(), return_counts=True)

(array(['#NAME?', '#name?', '&', '(?)', '(arabidopsis)', '(gymnanthemum)',
        '(muscari)', '(sinapis)', '(sp.-group)', '(teline)', '53.', '?',
        '?group', '?melampyrum', '?polystichum', '?sphaerocionium', 'Aa',
        'Aakia', 'Aalius', 'Aapaca', 'Aaronsohnia', 'Abacopterella',
        'Abacopteris', 'Abacosa', 'Abalon', 'Abalum', 'Abama', 'Abandium',
        'Abapus', 'Abarema', 'Abasicarpon', 'Abasoloa', 'Abatia',
        'Abauria', 'Abaxianthus', 'Abazicarpus', 'Abbevillea', 'Abbotia',
        'Abbottia', 'Abdiverrucospora', 'Abdominea', 'Abdra',
        'Abdulmajidia', 'Abebaia', 'Abelemis', 'Abelia', 'Abelicea',
        'Abeliophyllum', 'Abelmoschus', 'Abena', 'Aberconwayara',
        'Aberemoa', 'Aberia', 'Aberrantia', 'Abesina', 'Abies', 'Abietia',
        'Abietinella', 'Abiga', 'Abildgaardia', 'Abildgardia',
        'Abilgaardia', 'Abioton', 'Ablania', 'Abobra', 'Abola', 'Abolaria',
        'Abolboda', 'Aboriella', 'Abortopetalum', 'Abrahamia', 'Abramsia',
        

In [76]:
wfo_full.loc[:, ["family", "genus"]].dropna(how="any").drop_duplicates()

Unnamed: 0,family,genus
0,Cyperaceae,Schoenoxiphium
1,Cyperaceae,Cyperus
2,Cyperaceae,Carex
3,Cyperaceae,Mariscus
4,Cyperaceae,Tetraria
...,...,...
1703701,Dicranellaceae,Fuscina
1703702,Ditrichaceae,Fuscina
1703750,Orthotrichaceae,Phragmidiolum
1703831,Brachytheciaceae,Leptodon


In [15]:
temp_wfo_full = wfo_full.loc[:, ["family", "genus"]].dropna().drop_duplicates().reset_index(drop=True)
wfo_full = pd.Series(index=temp_wfo_full.genus.values, data=temp_wfo_full.family.values)

In [27]:
# 
wfo_full["Cynodon"]

Cynodon            Poaceae
Cynodon           Bryaceae
Cynodon    Flexitrichaceae
Cynodon      Distichiaceae
Cynodon         Pottiaceae
Cynodon     Dicranellaceae
dtype: object

In [36]:
# using the WFO backbone dataset
# IT'S IDENTICAL TO CLASSIFICATION.CSV :(
# it is actually a tab delimited file with a .csv extension????
backbone = pd.read_table(r"./../wfo_backbone.csv", low_memory=False, delimiter="\t", encoding="latin")

In [39]:
backbone.columns

Index(['taxonID', 'scientificNameID', 'localID', 'scientificName', 'taxonRank',
       'parentNameUsageID', 'scientificNameAuthorship', 'family', 'subfamily',
       'tribe', 'subtribe', 'genus', 'subgenus', 'specificEpithet',
       'infraspecificEpithet', 'verbatimTaxonRank', 'nomenclaturalStatus',
       'namePublishedIn', 'taxonomicStatus', 'acceptedNameUsageID',
       'originalNameUsageID', 'nameAccordingToID', 'taxonRemarks', 'created',
       'modified', 'references', 'source', 'majorGroup', 'tplID'],
      dtype='object')

In [41]:
backbone.loc[:, ["family", "genus", "specificEpithet"]].isna().sum()

family             10664
genus               3038
specificEpithet    52477
dtype: int64

In [43]:
backbone.loc[:, ["family", "genus"]].isna().sum()

family    10664
genus      3038
dtype: int64

In [50]:
bbone = backbone.loc[:, ["family", "genus"]].drop_duplicates().dropna().reset_index(drop=True)
bbone = pd.Series(index=bbone.genus.values, data=bbone.family.values)

In [52]:
bbone["Cynodon"]

Cynodon            Poaceae
Cynodon           Bryaceae
Cynodon    Flexitrichaceae
Cynodon      Distichiaceae
Cynodon         Pottiaceae
Cynodon     Dicranellaceae
dtype: object

In [61]:
# gc.collect()