In [1]:
! python --version

Python 3.12.9


In [3]:
import os
import re

import numpy as np
from numpy.typing import NDArray
import pandas as pd
import matplotlib.pyplot as plt
import networkx as nx
import torch
from numba import njit

pd.options.display.max_rows = 200
np.set_printoptions(threshold=np.inf)

In [10]:
# @njit()
def groot_find_congeneric_pairs(dframe: pd.DataFrame, trait: str) -> pd.Series:
    """
    """
    g: NDArray[str]
    c: NDArray[np.int64]
    
    genera, counts = np.unique(dframe.loc[dframe.loc[:, "traitName"] == trait, ["genus", "species"]].\
                                                      dropna().drop_duplicates().loc[:, "genus"], return_counts=True)
    temp: pd.Series = pd.Series(index=genera, data=counts)
    return temp[temp.values > 1]

In [11]:
os.getcwd()

'C:\\Users\\90963425\\OneDrive - Western Sydney University\\PhD\\code'

In [5]:
# https://github.com/GRooT-Database/GRooT-Data/blob/master/DataFiles/GRooTAggregateSpeciesVersion.zip
# download and unzip the above .zip file for the .csv database
# %timeit -n 1 -r 1 
groot = pd.read_csv(r"./GRooTFullVersion.csv", encoding="latin", low_memory=False)

In [6]:
# it's not that big though!!!!!
groot.shape

(114222, 73)

In [7]:
groot.columns

Index(['GRooTID', 'source', 'versionSource', 'originalID',
       'referencesAbbreviated', 'references', 'referencesDataset',
       'referencesAdditional', 'family', 'genus', 'species', 'infraspecific',
       'familyTNRS', 'genusTNRS', 'speciesTNRS', 'infraspecificTNRS',
       'taxonomicStatus', 'taxonomicInformation', 'group', 'order',
       'growthForm', 'photosyntheticPathway', 'woodiness',
       'mycorrhizalAssociationType', 'mycorrhizalAssociationTypeFungalRoot',
       'nitrogenFixationNodDB', 'abilityToGrownClonallyCloPla',
       'budBearingOrganCloPla', 'vitality', 'measurementProvenance',
       'measurementTreatments', 'measurementMethod', 'year', 'yearBegin',
       'yearEnd', 'ageStand', 'agePlant', 'locationID', 'location',
       'decimalLatitude', 'decimalLongitud', 'climaticInformation',
       'biomesKoeppen', 'biomesKoeppenGroup', 'temperatureColdestMonth',
       'temperatureWarmestMonth', 'meanAnnualTemperature',
       'meanAnnualPrecipitation', 'elevation', 

In [6]:
# this is a ROOT trait database!!!!
groot.loc[:, "traitName"].unique()

array(['Rooting_depth', 'Root_production', 'Root_turnover_rate',
       'Root_C_concentration', 'Root_C_N_ratio',
       'Root_lignin_concentration', 'Root_N_concentration',
       'Root_total_structural_carbohydrate_concentration',
       'Root_Ca_concentration', 'Root_K_concentration',
       'Root_Mg_concentration', 'Root_P_concentration',
       'Root_length_density_volume', 'Specific_root_respiration',
       'Root_mass_density', 'Root_Mn_concentration',
       'Root_mycorrhizal colonization', 'Root_mass_fraction',
       'Root_litter_mass_loss_rate', 'Coarse_root_fine_root_mass_ratio',
       'Fine_root_mass_leaf_mass_ratio', 'Mean_Root_diameter',
       'Root_tissue_density', 'Specific_root_area',
       'Specific_root_length', 'Root_stele_diameter',
       'Root_stele_fraction', 'Root_branching_density',
       'Root_lifespan_mean', 'Root_lifespan_median',
       'Root_vessel_diameter', 'Root_branching_ratio',
       'Root_dry_matter_content', 'Root_cortex_thickness',
       'N

In [7]:
np.unique(groot.loc[:, "traitName"], return_counts=True)

(array(['Coarse_root_fine_root_mass_ratio',
        'Fine_root_mass_leaf_mass_ratio', 'Lateral_spread',
        'Mean_Root_diameter', 'Net_nitrogen_uptake_rate', 'Root_C_N_ratio',
        'Root_C_concentration', 'Root_Ca_concentration',
        'Root_K_concentration', 'Root_Mg_concentration',
        'Root_Mn_concentration', 'Root_N_P_ratio', 'Root_N_concentration',
        'Root_P_concentration', 'Root_branching_density',
        'Root_branching_ratio', 'Root_cortex_thickness',
        'Root_dry_matter_content', 'Root_length_density_volume',
        'Root_lifespan_mean', 'Root_lifespan_median',
        'Root_lignin_concentration', 'Root_litter_mass_loss_rate',
        'Root_mass_density', 'Root_mass_fraction',
        'Root_mycorrhizal colonization', 'Root_production',
        'Root_stele_diameter', 'Root_stele_fraction',
        'Root_tissue_density',
        'Root_total_structural_carbohydrate_concentration',
        'Root_turnover_rate', 'Root_vessel_diameter',
        'Root_xylem_

Unnamed: 0,family,genus,species,traitValue
598,Piceae,Pinus,taeda,66.344315
600,Piceae,Pinus,taeda,21.468776
606,Piceae,Pinus,taeda,17.728916
608,Piceae,Pinus,taeda,30.100000
610,Piceae,Pinus,taeda,41.562209
...,...,...,...,...
90334,,Parkinsonia,microphylla,11.466667
90339,,Parkinsonia,microphylla,3.685185
90342,,Parkinsonia,microphylla,6.833333
90348,,Parkinsonia,microphylla,5.000000


In [9]:
# CONGENERIC PAIRS!!

groot.loc[groot.traitName == "Coarse_root_fine_root_mass_ratio", ["genus", "species"]].drop_duplicates()

Unnamed: 0,genus,species
598,Pinus,taeda
8770,Betula,alleghaniensis
10053,Acer,saccharum
11529,Agrostis,capillaris
11535,Brachypodium,pinnatum
11541,Brachypodium,sylvaticum
11547,Bromus,erectus
11553,Dactylis,glomerata
11559,Deschampsia,cespitosa
11565,Festuca,rubra


In [33]:
groot_find_congeneric_pairs(groot, "Lateral_spread")

Achillea       14
Aconitum        4
Adenostyles     2
Agrimonia       2
Agrostis       12
               ..
Veronica       17
Vicia           7
Viola          13
Woodsia         3
Zostera         2
Length: 213, dtype: int64

In [36]:
       'Root_C_concentration', 'Root_C_N_ratio',
       'Root_N_concentration',
       'Root_mycorrhizal colonization',
       'Root_litter_mass_loss_rate', 'Coarse_root_fine_root_mass_ratio',
       'Fine_root_mass_leaf_mass_ratio', 'Mean_Root_diameter',
       'Root_tissue_density', 'Specific_root_area',
       'Specific_root_length', 'Root_stele_diameter',
       'Root_branching_density',
       
       'Root_branching_ratio',
       'Root_cortex_thickness',
       
       'Lateral_spread'

map(groot_find_congeneric_pairs, )

Abies               7
Acacia             14
Acer               18
Achillea            2
Aesculus            2
Agropyron           4
Agrostis            4
Alnus               3
Anaphalis           2
Andropogon          3
Antennaria          2
Aporosa             2
Ardisia             2
Arenaria            4
Aristida            3
Artemisia           8
Aster               4
Astilbe             2
Astragalus          2
Athyrium            3
Atriplex            3
Berberis            3
Betula              9
Blastus             2
Bouteloua           3
Brachiaria          3
Bromus              7
Calamagrostis       3
Camellia            4
Campanula           4
Cardamine           2
Carex              18
Carum               4
Carya               2
Castanea            3
Castanopsis        10
Castilleja          2
Celastrus           2
Centaurea           4
Chamaesyce          2
Chenopodium         4
Chionochloa         4
Chrysopogon         2
Cinnamomum          7
Cirsium             2
Coccoloba 

In [34]:
groot_find_congeneric_pairs(groot, "Root_N_concentration")

Abies           8
Acacia         15
Acer           22
Achillea        3
Adenostyles     2
               ..
Viburnum        5
Vicia           9
Viola           7
Vitis           2
Weinmannia      2
Length: 301, dtype: int64

In [41]:
groot_find_congeneric_pairs(groot, "Mean_Root_diameter")

Abies          3
Acacia         3
Acer          20
Aesculus       2
Agathis        2
              ..
Vitis          3
Weinmannia     3
Xanthium       2
Zea            2
penstemon      2
Length: 296, dtype: int64

In [42]:
groot_find_congeneric_pairs(groot, "Root_cortex_thickness")

Acacia              3
Acer                4
Alnus               2
Aporosa             2
Athyrium            3
Barringtonia        2
Betula              2
Castanopsis         4
Cinnamomum          4
Cryptocarya         2
Cyclobalanopsis     2
Elaeocarpus         2
Equisetum           2
Fraxinus            2
Garcinia            2
Litsea              3
Machilus            3
Manglietia          4
Michelia           12
Parakmeria          2
Picea               2
Pinus               4
Pithecellobium      2
Quercus             3
Syzygium            3
Ulmus               2
dtype: int64

Abies               7
Acacia             14
Acer               18
Achillea            2
Aesculus            2
Agropyron           4
Agrostis            4
Alnus               3
Anaphalis           2
Andropogon          3
Antennaria          2
Aporosa             2
Ardisia             2
Arenaria            4
Aristida            3
Artemisia           8
Aster               4
Astilbe             2
Astragalus          2
Athyrium            3
Atriplex            3
Berberis            3
Betula              9
Blastus             2
Bouteloua           3
Brachiaria          3
Bromus              7
Calamagrostis       3
Camellia            4
Campanula           4
Cardamine           2
Carex              18
Carum               4
Carya               2
Castanea            3
Castanopsis        10
Castilleja          2
Celastrus           2
Centaurea           4
Chamaesyce          2
Chenopodium         4
Chionochloa         4
Chrysopogon         2
Cinnamomum          7
Cirsium             2
Coccoloba 

In [8]:
rtraits, freq = np.unique(groot.loc[:, "traitName"], return_counts=True)
groot_traits = pd.Series(index= rtraits, data=freq).sort_values()
groot_traits

Root_xylem_vessel_number                              129
Root_vessel_diameter                                  165
Root_N_P_ratio                                        179
Root_lifespan_median                                  231
Net_nitrogen_uptake_rate                              239
Root_lifespan_mean                                    244
Root_Mn_concentration                                 251
Root_cortex_thickness                                 384
Root_total_structural_carbohydrate_concentration      465
Root_branching_ratio                                  474
Root_turnover_rate                                    523
Root_litter_mass_loss_rate                            621
Root_production                                       734
Root_lignin_concentration                             837
Root_stele_diameter                                   982
Root_stele_fraction                                  1080
Root_Mg_concentration                                1408
Lateral_spread

In [9]:
# no missing values in these!
groot.loc[:, ["traitName", "traitValue"]].isna().sum()

traitName     0
traitValue    0
dtype: int64

In [10]:
# TODO: 
# isolate the root traits associated with mycorrhizal associations
# and filter out only the clades that have information for the mycorrhizal associated traits
# including traits from the conservation gradient may also be helpful in location divergence

In [11]:
# :(
groot.loc[:, ["family", "genus", "species"]].isna().mean()

family     0.791082
genus      0.000000
species    0.014463
dtype: float64

In [12]:
# if we have the genus and species names, we can find the corresponding family by hand or by automation
groot.loc[:, ["genus", "species"]].dropna().apply(lambda col: np.unique(col).size)

genus      1932
species    3941
dtype: int64

In [14]:
genus, freqs = np.unique(groot.genus, return_counts=True)
genus = pd.Series(genus, index=freqs).sort_index(ascending=False)
genus[:200]

6675         Pinus
4075       Quercus
4069        Betula
3775          Acer
1966           Poa
           ...    
102      Stellaria
102          Ajuga
101     Onobrychis
100       Gallesia
100           Geum
Length: 200, dtype: object

In [29]:
# certain columns have tons of missing values
groot.isna().mean(axis=0).sort_values()

GRooTID                                   0.000000
source                                    0.000000
references                                0.000000
referencesAbbreviated                     0.000000
genusTNRS                                 0.000000
genus                                     0.000000
taxonomicInformation                      0.000000
taxonomicStatus                           0.000000
measurementProvenance                     0.000000
vitality                                  0.000000
belowgroundEntities                       0.000000
traitValue                                0.000000
traitName                                 0.000000
familyTNRS                                0.000079
order                                     0.000140
species                                   0.014463
errorRisk                                 0.015645
errorRiskEntries                          0.015645
speciesTNRS                               0.015645
mycorrhizalAssociationTypeFunga

In [16]:
groot.taxonomicInformation.unique()

array(['1.00/tpl/ ', '1.00/tpl;usda/ ', '1.00/tpl;tropicos;usda/ ',
       '1.00/gcc;tropicos;usda/ ', '0.98/tpl/ ',
       '1.00/tpl/ [Ambiguous match] ', '1.00/ildis/ ',
       '1.00/tpl;tropicos/ ',
       '1.00/tpl;tropicos;usda/ [Ambiguous match] ',
       '0.97/gcc;tropicos;usda/ ', '1.00/ildis;tropicos;usda/ ',
       '1.00/ildis;tropicos/ ', '1.00/tropicos/ [Ambiguous match] ',
       '1.00/tropicos/ ', '1.00/usda/ ', '0.99/tpl;tropicos/ ',
       '0.99/gcc/ ', '1.00/gcc;tropicos/ ',
       '0.60/ildis;usda/ [Partial match] [Ambiguous match] ',
       '0.93/tpl;tropicos;usda/ ', '0.96/ildis;tropicos/ ', '0.96/usda/ ',
       '1.00/usda/ [Ambiguous match] ',
       '0.97/tpl;tropicos;usda/ [Ambiguous match] ',
       '1.00/tropicos;usda/ [Ambiguous match] ', '1.00/gcc/ ',
       '1.00/gcc;tropicos;usda/ [Ambiguous match] ',
       '0.98/tpl;tropicos;usda/ ',
       '0.56/tpl;tropicos;usda/ [Partial match] ', '0.96/gcc;tropicos/ ',
       '0.96/gcc/ ', '0.92/tpl;tropicos/ ', '0.9

In [24]:
# info on mycorrhizal association types
ty, freq = np.unique(groot.loc[:, "mycorrhizalAssociationTypeFungalRoot"].dropna(), return_counts=True)
mycorrhizal_associations = pd.Series(index=ty, data=freq).sort_values()
mycorrhizal_associations

NM-AM, rarely EcM                                 14
species-specific: AM or rarely EcM-AM or AM       15
uncertain                                        141
OM                                               315
ErM                                             1177
NM                                              1497
EcM-AM                                          2835
NM-AM                                          12231
EcM                                            22727
AM                                             71043
dtype: int64

In [12]:
groot_gs = groot.loc[:, ["genus", "species"]].dropna()

In [19]:
# proper scientific names for all the unique records in the database
scientific_names_unique = np.unique(np.array([g.title() + " " + s.lower() for (g, s) in zip(groot_gs.genus, groot_gs.species)]))

In [20]:
scientific_names_unique.size

6610

In [21]:
scientific_names_unique[:10]

array(['Abelia biflora', 'Abelmoschus esculentus', 'Abies alba',
       'Abies amabilis', 'Abies arizonica', 'Abies balsamea',
       'Abies balsamifera', 'Abies cephalonica', 'Abies cilicica',
       'Abies concolor'], dtype='<U33')

In [27]:
# uniques = groot_gs.drop_duplicates()
# with open(file=r"./phylomatic_species_list.txt", mode="w")  as fp:
#             for (g, s) in zip(uniques.genus, uniques.species):
#                 fp.write(f"{g}/{s}\n")

In [29]:
# no redundant records there :)
scientific_names_unique.size, np.unique(scientific_names_unique).size

(6610, 6610)

In [30]:
# already searialized

# with open(file=r"./groot_unique_scientific_names.txt", mode="w") as fp:
#     for unqscname in scientific_names_unique:
#         fp.write(f"{unqscname}\n")

In [31]:
# already searialized

# with open(file=r"./groot_unique_scientific_names.csv", mode="w") as fp:
#     for unqscname in scientific_names_unique:
#         fp.write(f"{unqscname},")

### ___NCBI CommonTree___

In [32]:
# list of species not found 

In [33]:
# the complexity of the tree could be reduced by first filtering for the traits that are useful for this investigation and 
# looking specifically at those species

In [34]:
fao_crops = """
Musa textilis 
Medicago sativa 
Medicago sativa 
Prunus dulcis 
Pimpinella anisum 
Malus sylvestris 
Prunus armeniaca 
Areca catechu 
Arracacia xanthorrhiza 
Maranta arundinacea 
Cynara scolymus 
Asparagus officinalis 
Persea americana 
Pennisetum americanum 
Vigna subterranea 
Musa paradisiaca 
Hordeum vulgare 
Phaseolus vulgaris 
Phaseolus and Vigna spp.  
Beta vulgaris 
Beta vulgaris 
Beta vulgaris 
Beta vulgaris 
Beta vulgaris 
Citrus bergamia 
Areca catechu 
Piper nigrum 
Acacia mearnsii 
Rubus spp. 
Vaccinium spp.  
Bertholletia excelsa 
Artocarpus altilis 
Vicia faba 
Vicia faba 
Brassica oleracea var. botrytis 
Sorghum bicolor 
Sorghum bicolor 
Brassica oleracea var. gemmifera 
Fagopyrum esculentum 
Brassica oleracea var. capitata 
Brassica chinensis 
Brassica spp. 
Theobroma cacao 
Cucumis melo 
Carum carvi 
Elettaria cardamomum 
Cynara cardunculus 
Ceratonia siliqua 
Daucus carota ssp. sativa 
Daucus carota ssp. sativa 
Anacardium occidentale 
Manihot esculenta 
Ricinus communis 
Brassica oleracea var. botrytis 
Apium graveolens var. rapaceum 
Apium graveolens 
Sechium edule 
Prunus spp. 
Castanea sativa 
Cicer arietinum 
Cichorium intybus 
Cichorium intybus 
Capsicum spp. (annuum)  
Capsicum spp. (annuum)  
Cinnamomum verum 
Citrus medica 
Cymbopogon citrates/ Cymbopogon nardus  
Citrus reticulata 
Eugenia aromatica (Syzygium aromaticum) 
Trifolium spp. 
Trifolium spp. 
Theobroma cacao 
Cocos nucifera 
Colocasia esculenta 
Coffea spp. 
Cola acuminata 
Brassica napus                                                      
Zea mays 
Zea mays 
Zea mays 
Valerianella locusta 
Gossypium spp. 
Gossypium spp. 
Vigna unguiculata 
Vigna unguiculata 
Vaccinium spp.  
Lepidium sativum 
Cucumis sativus 
Ribes spp. 
Annona reticulate 
Colocasia esculenta 
Phoenix dactylifera 
Moringa oleifera 
Sorghum bicolour 
Triticum durum 
Vigna subterranea 
Xanthosoma spp.; Colocasia spp. 
Solanum melongena 
Cichorium endivia 
Foeniculum vulgare 
Trigonella foenum-graecum 
Ficus carica 
Corylus avellana 
Furcraea macrophylla 
Linum usitatissimum 
Linum usitatissimum 
Phormium tenax 
Allium sativum 
Allium sativum 
Pelargonium spp.; Geranium spp. 
Zingiber officinale 
Ribes spp. 
Lagenaria spp; Cucurbita spp.  
Cicer arietinum 
Vitis vinifera 
Citrus paradisi 
Vitis vinifera 
Vitis vinifera 
Vitis vinifera 
Lygeum spartum 
Dactylis glomerata 
Sorghum bicolor var. sudanense 
Arachis hypogaea 
Psidium guajava 
Sorghum bicolor 
Corylus avellana 
Cannabis sativa ssp. indica 
Musa textilis 
Crotalaria juncea 
Cannabis sativa (marijuana) 
Agave fourcroydes 
Lawsonia inermis 
Humulus lupulus 
Vicia faba 
Armoracia rusticana 
Zea mays 
Indigofera tinctoria 
Jasminum spp.  
Helianthus tuberosus 
Sorghum bicolor 
Corchorus spp. (over 30 sp.) 
Brassica oleracea var. acephala 
Ceiba pentandra 
Hibiscus cannabinus 
Brassica oleracea var. gongylodes 
Lavandula spp. (over 15 sp.) 
Allium ampeloprasum; Allium porrum 
Citrus limon 
Cymbopogon citratus 
Lens culinaris 
Lespedeza spp. 
Lactuca sativa var. capitata 
Citrus aurantifolia 
Citrus limetta 
Linum usitatissimum 
Glycyrrhiza glabra 
Litchi chinensis 
Eriobotrya japonica 
Lupinus spp. 
Macadamia spp. ternifolia 
Myristica fragrans 
Agave atrovirens 
Zea mays 
Zea mays 
Zea mays 
Zea mays 
Citrus reticulata 
Beta vulgaris 
Mangifera indica 
Manihot esculenta 
Mixture of Triticum spp.; Secale cereale 
Mespilus germanica 
Cucumis melo 
Sorghum bicolor 
Pennisetum americanum 
Pennisetum americanum 
Eleusine coracana 
Setaria italica 
Echinochloa esculenta 
Pennisetum americanum 
Panicum miliaceum 
Mentha spp. 
Morus spp. 
Morus alba 
Agaricus spp.; Pleurotus spp.; Volvariella 
Brassica nigra; Sinapis alba 
Prunus persica var. nectarina 
Phormium tenax 
Guizotia abyssinica 
Myristica fragrans 
Avena spp. (about 30 sp.) 
Avena spp. (about 30 sp.) 
Elaeis guineensis 
Abelmoschus esculentus 
Olea europaea 
Allium cepa 
Allium cepa 
Allium cepa 
Papaver somniferum 
Citrus sinensis 
Citrus aurantium 
Borassus flabellifer 
Elaeis guineensis 
Elaeis guineensis 
Metroxylon sagu 
Carica papaya 
Pastinaca sativa 
Pisum sativum 
Pisum sativum 
Prunus persica 
Arachis hypogaea 
Pyrus communis 
Carya illinoensis 
Piper nigrum 
Capsicum spp. (over 30 sp.) 
Diospyros kaki;
Diospyros virginiana 
Cajanus cajan 
Ananas comosus 
Pistacia vera 
Musa sapientum 
Prunus domestica  
Punica granatum 
Citrus grandis 
Papaver somniferum 
Solamum tuberosum 
Ipomoea batatas 
Prunus domestica  
Cucurbita spp. (over 25 sp.) 
Cucurbita spp. (over 25 sp.) 
Chrysanthemum cinerariaefolium 
Aspidosperma spp. (more than 3 sp.) 
See Macadamia 
Cydonia oblonga 
Cinchona spp. (more than 6 sp.) 
Chenopodium quinoa 
Raphanus sativus (inc. Cochlearia armoracia) 
Boehmeria nivea 
Brassica napus   
Rubus spp. (over 360 sp.) 
Beta vulgaris 
Agrostis spp. 
Boehmeria nivea 
Rheum spp. 
Oryza sativa; Oryza glaberrima 
Rose spp. 
Hevea brasiliensis 
Brassica napus var. napobrassica 
Secale cereale 
Lolium spp. (about 20 sp.) 
Carthamus tinctorius 
Onobrychis viciifolia 
Tragopogon porrifolius 
Achras sapota 
Citrus reticulata 
Scorzonera hispanica 
Sesamum indicum 
Vitellaria paradoxa 
Agave sisalana 
Sorghum bicolor 
Sorghum bicolor 
Sorghum bicolor 
Sorghum bicolor 
Sorghum bicolor 
Sorghum bicolor 
Glycine max 
Glycine max 
Triticum spelta 
Spinacia oleracea 
Cucurbita spp. (over 25 sp.) 
Fragaria spp. (over 30 sp.) 
Beta vulgaris 
Beta vulgaris 
Beta vulgaris 
Saccharum officinarum 
Saccharum officinarum 
Saccharum officinarum 
Helianthus annuus 
Helianthus annuus 
Crotalaria juncea 
Brassica napus var. napobrassica 
Brassica napus var. napobrassica 
Zea mays 
Citrus limetta 
Capsicum annuum 
Lopmoea batatas 
Sorghum bicolor 
Citrus reticulata 
Xanthosoma sagittifolium 
Manihot esculenta 
Colocasia esculenta 
Camellia sinensis 
Eragrostis abyssinica  
Phleum pratense 
Nicotiana tabacum 
Lycopersicon esculentum 
Lotus spp. (about 100 sp.) 
Hybrid of Triticum aestivum and Secale cereale 
Aleurites spp.; Fordii  
Brassica rapa 
Brassica rapa 
Urena lobata 
Vanilla planifolia 
Vicia sativa 
Juglans spp. (over 20 sp.), ep. regia  
Citrullus lanatus 
Triticum aestivum 
Dioscorea spp. (over 120 sp.) 
Ilex paraguariensis
"""

In [35]:
fao_crops_list = pd.Series(fao_crops.splitlines()).drop_duplicates() 
crops = pd.Series([row[0] + ' ' + row[1] for row in fao_crops_list.str.split(' ') if len(row) >= 2])

In [36]:
# 222 crops species recognized by FAO
crops

0            Musa textilis
1          Medicago sativa
2            Prunus dulcis
3        Pimpinella anisum
4         Malus sylvestris
              ...         
217           Juglans spp.
218      Citrullus lanatus
219      Triticum aestivum
220         Dioscorea spp.
221    Ilex paraguariensis
Length: 222, dtype: object

In [37]:
# FAO recognized crop species that GRoot has trait data for
crops_in_groot = np.intersect1d(scientific_names_unique, crops)
crops_in_groot

array(['Abelmoschus esculentus', 'Acacia mearnsii', 'Allium cepa',
       'Allium sativum', 'Ananas comosus', 'Arachis hypogaea',
       'Artocarpus altilis', 'Asparagus officinalis', 'Beta vulgaris',
       'Brassica napus', 'Brassica oleracea', 'Brassica rapa',
       'Cannabis sativa', 'Capsicum annuum', 'Carica papaya',
       'Carum carvi', 'Castanea sativa', 'Ceiba pentandra',
       'Ceratonia siliqua', 'Cicer arietinum', 'Cichorium intybus',
       'Citrus aurantium', 'Citrus medica', 'Citrus paradisi',
       'Citrus reticulata', 'Citrus sinensis', 'Cocos nucifera',
       'Corylus avellana', 'Crotalaria juncea', 'Cucumis sativus',
       'Cydonia oblonga', 'Cynara cardunculus', 'Dactylis glomerata',
       'Daucus carota', 'Diospyros virginiana', 'Eriobotrya japonica',
       'Ficus carica', 'Glycine max', 'Glycyrrhiza glabra',
       'Helianthus annuus', 'Helianthus tuberosus', 'Hevea brasiliensis',
       'Hibiscus cannabinus', 'Hordeum vulgare', 'Humulus lupulus',
       '

In [38]:
# that's a decent number
crops_in_groot.size

85

In [39]:
crops_in_groot

array(['Abelmoschus esculentus', 'Acacia mearnsii', 'Allium cepa',
       'Allium sativum', 'Ananas comosus', 'Arachis hypogaea',
       'Artocarpus altilis', 'Asparagus officinalis', 'Beta vulgaris',
       'Brassica napus', 'Brassica oleracea', 'Brassica rapa',
       'Cannabis sativa', 'Capsicum annuum', 'Carica papaya',
       'Carum carvi', 'Castanea sativa', 'Ceiba pentandra',
       'Ceratonia siliqua', 'Cicer arietinum', 'Cichorium intybus',
       'Citrus aurantium', 'Citrus medica', 'Citrus paradisi',
       'Citrus reticulata', 'Citrus sinensis', 'Cocos nucifera',
       'Corylus avellana', 'Crotalaria juncea', 'Cucumis sativus',
       'Cydonia oblonga', 'Cynara cardunculus', 'Dactylis glomerata',
       'Daucus carota', 'Diospyros virginiana', 'Eriobotrya japonica',
       'Ficus carica', 'Glycine max', 'Glycyrrhiza glabra',
       'Helianthus annuus', 'Helianthus tuberosus', 'Hevea brasiliensis',
       'Hibiscus cannabinus', 'Hordeum vulgare', 'Humulus lupulus',
       '

In [44]:
groot.columns

Index(['GRooTID', 'source', 'versionSource', 'originalID',
       'referencesAbbreviated', 'references', 'referencesDataset',
       'referencesAdditional', 'family', 'genus', 'species', 'infraspecific',
       'familyTNRS', 'genusTNRS', 'speciesTNRS', 'infraspecificTNRS',
       'taxonomicStatus', 'taxonomicInformation', 'group', 'order',
       'growthForm', 'photosyntheticPathway', 'woodiness',
       'mycorrhizalAssociationType', 'mycorrhizalAssociationTypeFungalRoot',
       'nitrogenFixationNodDB', 'abilityToGrownClonallyCloPla',
       'budBearingOrganCloPla', 'vitality', 'measurementProvenance',
       'measurementTreatments', 'measurementMethod', 'year', 'yearBegin',
       'yearEnd', 'ageStand', 'agePlant', 'locationID', 'location',
       'decimalLatitude', 'decimalLongitud', 'climaticInformation',
       'biomesKoeppen', 'biomesKoeppenGroup', 'temperatureColdestMonth',
       'temperatureWarmestMonth', 'meanAnnualTemperature',
       'meanAnnualPrecipitation', 'elevation', 

In [47]:
groot["measurementMethod"].unique()

array(['excavation', '131I radio tracer', nan, 'soil cores',
       'in-growth cores', 'minirhizotron', 'trench', 'coring',
       'allometry', 'sequential cores', 'soil monolith', 'rhizotron',
       'monolith or excavation', 'litterbags', 'O-18 and soil pit',
       'trench wall', 'O-18', 'c pool dilution',
       'soil cores, in-growth cores', 'in-growth bags',
       'monolith or excavation, soil cores', 'soil blocks',
       'in-growth envelopes', 'soil cores + excavation',
       'soil cores, soil monolith', 'excavation to 1m', 'O18 isotope',
       'sap flow, stem water potential near root', 'soil cores/pits',
       'H isotope, excavation', 'excavation + coring', 'soil samples',
       'soil pits and cores', 'unclear', 'intact cores', 'cutting',
       'gully bank exposure', 'trench wall + soil cores', 'buried pot',
       'mesh bags', 'trench wall + soil sample',
       'lithium chloride tracer', 'excavation + soil samples',
       'excavation, soil cores', 'road cut', 'growth