In [17]:
# Report the version of the interpreter that backs this kernel. `! python` is
# resolved through PATH and may name a different installation than the one
# actually executing the notebook.
import sys

print(f"Python {sys.version.split()[0]}")

Python 3.12.9


In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import os
import torch
import networkx as nx

# Display settings: show up to 100 DataFrame rows and never summarise NumPy arrays.
pd.set_option("display.max_rows", 100)
np.set_printoptions(threshold=np.inf)

In [43]:
# Show the working directory; the data file is loaded through a relative path.
os.getcwd()

'C:\\Users\\90963425\\phd'

In [16]:
# https://github.com/GRooT-Database/GRooT-Data/blob/master/DataFiles/GRooTAggregateSpeciesVersion.zip
# download and unzip the above .zip file for the .csv database
%timeit -n 1 -r 1 groot = pd.read_csv(r"./GRooTFullVersion.csv", encoding="latin", low_memory=False)

807 ms ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)


In [3]:
# (rows, columns) of the loaded table - it's not that big though
groot.shape

(114222, 73)

In [4]:
# List every column name available in the table.
groot.columns

Index(['GRooTID', 'source', 'versionSource', 'originalID',
       'referencesAbbreviated', 'references', 'referencesDataset',
       'referencesAdditional', 'family', 'genus', 'species', 'infraspecific',
       'familyTNRS', 'genusTNRS', 'speciesTNRS', 'infraspecificTNRS',
       'taxonomicStatus', 'taxonomicInformation', 'group', 'order',
       'growthForm', 'photosyntheticPathway', 'woodiness',
       'mycorrhizalAssociationType', 'mycorrhizalAssociationTypeFungalRoot',
       'nitrogenFixationNodDB', 'abilityToGrownClonallyCloPla',
       'budBearingOrganCloPla', 'vitality', 'measurementProvenance',
       'measurementTreatments', 'measurementMethod', 'year', 'yearBegin',
       'yearEnd', 'ageStand', 'agePlant', 'locationID', 'location',
       'decimalLatitude', 'decimalLongitud', 'climaticInformation',
       'biomesKoeppen', 'biomesKoeppenGroup', 'temperatureColdestMonth',
       'temperatureWarmestMonth', 'meanAnnualTemperature',
       'meanAnnualPrecipitation', 'elevation', 

In [3]:
# this is a ROOT trait database: distinct trait names, in order of first appearance
groot["traitName"].unique()

array(['Rooting_depth', 'Root_production', 'Root_turnover_rate',
       'Root_C_concentration', 'Root_C_N_ratio',
       'Root_lignin_concentration', 'Root_N_concentration',
       'Root_total_structural_carbohydrate_concentration',
       'Root_Ca_concentration', 'Root_K_concentration',
       'Root_Mg_concentration', 'Root_P_concentration',
       'Root_length_density_volume', 'Specific_root_respiration',
       'Root_mass_density', 'Root_Mn_concentration',
       'Root_mycorrhizal colonization', 'Root_mass_fraction',
       'Root_litter_mass_loss_rate', 'Coarse_root_fine_root_mass_ratio',
       'Fine_root_mass_leaf_mass_ratio', 'Mean_Root_diameter',
       'Root_tissue_density', 'Specific_root_area',
       'Specific_root_length', 'Root_stele_diameter',
       'Root_stele_fraction', 'Root_branching_density',
       'Root_lifespan_mean', 'Root_lifespan_median',
       'Root_vessel_diameter', 'Root_branching_ratio',
       'Root_dry_matter_content', 'Root_cortex_thickness',
       'N

In [55]:
# Distinct trait names (sorted) paired with the number of records for each.
np.unique(groot["traitName"], return_counts=True)

(array(['Coarse_root_fine_root_mass_ratio',
        'Fine_root_mass_leaf_mass_ratio', 'Lateral_spread',
        'Mean_Root_diameter', 'Net_nitrogen_uptake_rate', 'Root_C_N_ratio',
        'Root_C_concentration', 'Root_Ca_concentration',
        'Root_K_concentration', 'Root_Mg_concentration',
        'Root_Mn_concentration', 'Root_N_P_ratio', 'Root_N_concentration',
        'Root_P_concentration', 'Root_branching_density',
        'Root_branching_ratio', 'Root_cortex_thickness',
        'Root_dry_matter_content', 'Root_length_density_volume',
        'Root_lifespan_mean', 'Root_lifespan_median',
        'Root_lignin_concentration', 'Root_litter_mass_loss_rate',
        'Root_mass_density', 'Root_mass_fraction',
        'Root_mycorrhizal colonization', 'Root_production',
        'Root_stele_diameter', 'Root_stele_fraction',
        'Root_tissue_density',
        'Root_total_structural_carbohydrate_concentration',
        'Root_turnover_rate', 'Root_vessel_diameter',
        'Root_xylem_

In [9]:
# Tabulate how many records each trait has, rarest trait first.
rtraits, freq = np.unique(groot["traitName"], return_counts=True)
groot_traits = pd.Series(freq, index=rtraits).sort_values()
groot_traits

Root_xylem_vessel_number                              129
Root_vessel_diameter                                  165
Root_N_P_ratio                                        179
Root_lifespan_median                                  231
Net_nitrogen_uptake_rate                              239
Root_lifespan_mean                                    244
Root_Mn_concentration                                 251
Root_cortex_thickness                                 384
Root_total_structural_carbohydrate_concentration      465
Root_branching_ratio                                  474
Root_turnover_rate                                    523
Root_litter_mass_loss_rate                            621
Root_production                                       734
Root_lignin_concentration                             837
Root_stele_diameter                                   982
Root_stele_fraction                                  1080
Root_Mg_concentration                                1408
Lateral_spread

In [19]:
# Fraction of missing entries in the trait name/value columns - none are missing!
groot[["traitName", "traitValue"]].isna().mean()

traitName     0.0
traitValue    0.0
dtype: float64

In [None]:
# TODO: 
# isolate the root traits associated with mycorrhizal associations
# and filter out only the clades that have information for the mycorrhizal associated traits
# including traits from the conservation gradient may also be helpful in location divergence

In [18]:
# Share of missing values per taxonomic column - family is absent in most rows :(
groot[["family", "genus", "species"]].isna().mean()

family     0.791082
genus      0.000000
species    0.014463
dtype: float64

In [24]:
# If we have the genus and species, we can find the corresponding family by
# hand or by automation. Count the distinct genus / species values among the
# rows where both are present.
groot.loc[:, ["genus", "species"]].dropna().nunique()

genus      1932
species    3941
dtype: int64

In [28]:
# Rank genera by number of records (the count becomes the index) and show the
# ten most frequent ones.
genus, freqs = np.unique(groot["genus"], return_counts=True)
genus = pd.Series(data=genus, index=freqs).sort_index(ascending=False)
genus.iloc[:10]

6675       Pinus
4075     Quercus
4069      Betula
3775        Acer
1966         Poa
1957     Festuca
1636       Carex
1589    Fraxinus
1485       Picea
1440    Plantago
dtype: object

In [29]:
# Per-column fraction of missing values, most complete columns first;
# certain columns have tons of missing values.
groot.isna().mean().sort_values()

GRooTID                                   0.000000
source                                    0.000000
references                                0.000000
referencesAbbreviated                     0.000000
genusTNRS                                 0.000000
genus                                     0.000000
taxonomicInformation                      0.000000
taxonomicStatus                           0.000000
measurementProvenance                     0.000000
vitality                                  0.000000
belowgroundEntities                       0.000000
traitValue                                0.000000
traitName                                 0.000000
familyTNRS                                0.000079
order                                     0.000140
species                                   0.014463
errorRisk                                 0.015645
errorRiskEntries                          0.015645
speciesTNRS                               0.015645
mycorrhizalAssociationTypeFunga

In [42]:
# Frequency of each mycorrhizal association type; rows without a type are dropped first.
ty, freq = np.unique(groot["mycorrhizalAssociationTypeFungalRoot"].dropna(), return_counts=True)
mycorrhizal_associations = pd.Series(freq, index=ty).sort_values()
mycorrhizal_associations

NM-AM, rarely EcM                                 14
species-specific: AM or rarely EcM-AM or AM       15
uncertain                                        141
OM                                               315
ErM                                             1177
NM                                              1497
EcM-AM                                          2835
NM-AM                                          12231
EcM                                            22727
AM                                             71043
dtype: int64

In [3]:
# Absolute number of missing entries in each taxonomic column.
groot[["family", "genus", "species"]].isna().sum()

family     90359
genus          0
species     1652
dtype: int64

In [44]:
# Keep only the rows where both genus and species are present.
groot_gs = groot[["genus", "species"]].dropna(how="any")

In [45]:
# Build "Genus species" names (title-cased genus, lower-cased species) for every
# record and keep each distinct name once, in sorted order.
scientific_names_unique = np.unique(
    [f"{g.title()} {s.lower()}" for g, s in zip(groot_gs["genus"], groot_gs["species"])]
)

In [46]:
# Number of distinct scientific names.
scientific_names_unique.size

6610

In [48]:
# Peek at the first ten names.
scientific_names_unique[:10]

array(['Abelia biflora', 'Abelmoschus esculentus', 'Abies alba',
       'Abies amabilis', 'Abies arizonica', 'Abies balsamea',
       'Abies balsamifera', 'Abies cephalonica', 'Abies cilicica',
       'Abies concolor'], dtype='<U33')

In [46]:
# no redundant records there :)
# NOTE(review): scientific_names_unique is already the result of np.unique, so
# these two sizes are equal by construction - this check cannot fail.
scientific_names_unique.size, np.unique(scientific_names_unique).size

(6610, 6610)

In [None]:
# already serialized

# with open(file=r"./groot_unique_scientific_names.txt", mode="w") as fp:
#     for unqscname in scientific_names_unique:
#         fp.write(f"{unqscname}\n")

In [64]:
# already serialized

# with open(file=r"./groot_unique_scientific_names.csv", mode="w") as fp:
#     for unqscname in scientific_names_unique:
#         fp.write(f"{unqscname},")

### ___NCBI CommonTree___

In [1]:
# not found 