Test code to look up how many traits are available in TRY for the plant species with the largest number of observations on iNaturalist:

https://www.inaturalist.org/observations?hrank=species&lrank=species&photos&quality_grade=research&view=species&iconic_taxa=Plantae&term_id=12

Here `term_id=12` means that the photos must carry the plant phenology annotation.
The query could be made more specific by also filtering on the annotation value: 13=Flowering, 14=Fruiting, 15=Flower Budding, 21=No Evidence of Flowering.

In [1]:
import pandas as pd
import numpy as np

In [2]:
# Load the tab-separated TRY species list (one row per consolidated species name).
species = pd.read_table("TryAccSpecies.txt")

In [3]:
# Column glossary:
#   AccSpeciesName = consolidated species name
#   ObsNum         = number of observations
#   ObsGRNum       = number of georeferenced observations
#   MeasNum        = number of measurements
#   MeasGRNum      = number of georeferenced measurements
#   TraitNum       = number of traits
#
# In a table like
#
#             Trait1  Trait2  Trait3
# Species1    tv11    tv12    tv13
# Species2    tv21    tv22    tv23
# Species3    tv31    tv32    tv33
#
# each cell (e.g. tv12) is a measurement: this table has 9 measurements in total, 3 per species;
# each row is an observation: this table has 3 observations, 1 per species.

# Preview the first five rows.
species.head()


Unnamed: 0,AccSpeciesID,AccSpeciesName,ObsNum,ObsGRNum,MeasNum,MeasGRNum,TraitNum
0,271060,Aa achalensis,5,,5.0,,3.0
1,200002,Aa argyrolepis,7,,9.0,,3.0
2,271061,Aa aurantiaca,5,,5.0,,3.0
3,200003,Aa calceata,7,,9.0,,3.0
4,271062,Aa colombiana,13,,13.0,,3.0


In [4]:
# Plants with the largest number of annotated images on iNaturalist.
# The list was compiled by hand; an API call could probably produce it
# automatically and for a larger number of species.
species_list = [
    "Ficaria verna",
    "Asimina triloba",
    "Trillium grandiflorum",
    "Bellis perennis",
    "Hedera helix",
    "Sanguinaria canadensis",
    "Caltha palustris",
    "Phragmites australis",
    "Dactylis glomerata",
    "Silybum marianum",
    "Mitchella repens",
    "Cypripedium acaule",
    "Erodium cicutarium",
    "Diospyros virginiana",
    "Convolvulus arvensis",
    "Geranium robertianum",
    "Achillea millefolium",
    "Phytolacca americana",
    "Rhus glabra",
    "Microstegium vimineum",
    "Lamium purpureum",
    "Alliaria petiolata",
    "Glechoma hederacea",
    "Trillium erectum",
    "Passiflora incarnata",
    "Trifolium pratense",
    "Amphicarpaea bracteata",
    "Ligustrum sinense",
    "Asclepias syriaca",
    "Trillium ovatum",
]

In [5]:
# Restrict the TRY species table to the species named in species_list.
most_images_species = species[species['AccSpeciesName'].isin(species_list)]

most_images_species

Unnamed: 0,AccSpeciesID,AccSpeciesName,ObsNum,ObsGRNum,MeasNum,MeasGRNum,TraitNum
2877,909,Achillea millefolium,3706,1800.0,18278.0,12850.0,446.0
10320,2496,Alliaria petiolata,651,79.0,1790.0,441.0,194.0
14529,3289,Amphicarpaea bracteata,128,80.0,636.0,542.0,32.0
24904,5512,Asclepias syriaca,793,560.0,3706.0,3072.0,181.0
24992,5522,Asimina triloba,342,194.0,1874.0,1537.0,145.0
34855,7173,Bellis perennis,1158,274.0,5330.0,2912.0,290.0
48023,9768,Caltha palustris,1002,85.0,2690.0,765.0,273.0
70530,14244,Convolvulus arvensis,1170,176.0,3780.0,1220.0,261.0
83410,16549,Cypripedium acaule,53,34.0,120.0,93.0,51.0
84592,16700,Dactylis glomerata,3579,1442.0,19628.0,12006.0,420.0


In [6]:
# Uncomment the line below to save the selected species to a CSV file.

# most_images_species.to_csv('most_images_species_TRY.csv', index=False)

In [7]:
# Mean and median of the georeferenced observation / measurement counts
# over the selected species (the medians are used as thresholds later on).
georef_observations = most_images_species['ObsGRNum']
georef_measurements = most_images_species['MeasGRNum']

mean_ObsGRNum, median_ObsGRNum = georef_observations.mean(), georef_observations.median()
mean_MeasGRNum, median_MeasGRNum = georef_measurements.mean(), georef_measurements.median()

print(f"Mean number of georeferenced observations: {mean_ObsGRNum:.1f}")
print(f"Median for georeferenced observations: {median_ObsGRNum:.1f}")
print(f"Mean number of georeferenced measurements: {mean_MeasGRNum:.1f}")
print(f"Median for georeferenced measurements: {median_MeasGRNum:.1f}")

Mean number of georeferenced observations: 292.9
Median for georeferenced observations: 79.5
Mean number of georeferenced measurements: 2164.9
Median for georeferenced measurements: 676.0


In [8]:
# MIS = most_images_species
#
# Keep only the species whose georeferenced observations AND georeferenced
# measurements are both strictly above the respective median.
# The median is used instead of the mean because the distribution is very skewed.
above_median_observations = most_images_species['ObsGRNum'] > median_ObsGRNum
above_median_measurements = most_images_species['MeasGRNum'] > median_MeasGRNum

filtered_MIS = most_images_species[above_median_observations & above_median_measurements]

filtered_MIS

Unnamed: 0,AccSpeciesID,AccSpeciesName,ObsNum,ObsGRNum,MeasNum,MeasGRNum,TraitNum
2877,909,Achillea millefolium,3706,1800.0,18278.0,12850.0,446.0
24904,5512,Asclepias syriaca,793,560.0,3706.0,3072.0,181.0
24992,5522,Asimina triloba,342,194.0,1874.0,1537.0,145.0
34855,7173,Bellis perennis,1158,274.0,5330.0,2912.0,290.0
48023,9768,Caltha palustris,1002,85.0,2690.0,765.0,273.0
70530,14244,Convolvulus arvensis,1170,176.0,3780.0,1220.0,261.0
84592,16700,Dactylis glomerata,3579,1442.0,19628.0,12006.0,420.0
94274,18628,Diospyros virginiana,306,154.0,1809.0,1343.0,182.0
129489,26336,Glechoma hederacea,1042,201.0,4020.0,1448.0,260.0
138351,27988,Hedera helix,1425,660.0,6491.0,4696.0,339.0


In [9]:
# "filtered_MIS_traits.txt" was obtained by requesting the traits for the species in filtered_MIS.
# The first 3 lines of the file hold nothing useful for the analysis, so they are skipped.
filtered_MIS_traits = pd.read_csv(
    'filtered_MIS_traits.txt',
    sep='\t',
    skiprows=[0, 1, 2],
)

# Append a summary row with the column-wise sum of every numeric column,
# i.e. the total number of trait measurements per species column.
# NOTE(review): any other numeric column (TraitID appears to be one) is summed too.
filtered_MIS_traits.loc['TotTraitMeasxSpecies'] = filtered_MIS_traits.sum(axis=0, numeric_only=True)

# Positional indices of the columns that are added up per trait (positions 2..13).
column_list = np.arange(2, 14)

# NOTE(review): this prints the whole frame; the label suggests a row count may have been intended.
print("Total traits before filtering: ", filtered_MIS_traits)

# Total number of measurements per trait (row-wise sum over the selected columns).
filtered_MIS_traits['total'] = filtered_MIS_traits.iloc[:, column_list].sum(axis=1, numeric_only=True)

# Keep only the rows whose total exceeds 100 (arbitrary threshold).
has_enough_measurements = filtered_MIS_traits['total'].gt(100)
filtered_MIS_traits_100 = filtered_MIS_traits.loc[has_enough_measurements]
filtered_MIS_traits_100



Total traits before filtering:                                                                    Trait  \
0                              Bark persistence (deciduous, persistent)   
1                                            Bark sclereids arrangement   
2                     Bark secondary compounds: oil, tannins, latex,...   
3                                             Bark sieve tubes grouping   
4                                                       Bark structure    
...                                                                 ...   
803                                                       Twig diameter   
804                                                       Twig dry mass   
805                                                         Twig length   
806                                      Twig number of leaves per twig   
TotTraitMeasxSpecies                                                NaN   

                        TraitID  .Achillea millefolium.  .Trifolium

Unnamed: 0,Trait,TraitID,.Achillea millefolium.,.Trifolium pratense.,.Dactylis glomerata.,.Hedera helix.,.Asclepias syriaca.,.Phragmites australis.,.Bellis perennis.,.Glechoma hederacea.,.Asimina triloba.,.Convolvulus arvensis.,.Diospyros virginiana.,.Caltha palustris.,Unnamed: 14,total
8,Budbank height distribution,153.0,90.0,83.0,99.0,12.0,34.0,216.0,48.0,120.0,0.0,128.0,0.0,87.0,,917.0
9,Budbank seasonality,865.0,55.0,55.0,65.0,5.0,24.0,152.0,18.0,43.0,0.0,97.0,0.0,50.0,,564.0
13,Dispersal syndrome,28.0,140.0,3895.0,292.0,76.0,33.0,678.0,186.0,50.0,16.0,59.0,19.0,81.0,,5525.0
17,Dispersal unit floating capacity,1263.0,41.0,2111.0,109.0,26.0,0.0,404.0,37.0,19.0,0.0,1.0,0.0,37.0,,2785.0
18,Dispersal unit length,237.0,22.0,15.0,28.0,10.0,0.0,3.0,13.0,5.0,0.0,7.0,0.0,10.0,,113.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
484,Species understory/overstory,762.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,80.0,0.0,54.0,0.0,,134.0
511,Stem internode length,393.0,0.0,133.0,41.0,0.0,0.0,0.0,6.0,0.0,0.0,0.0,0.0,0.0,,180.0
619,Seed releasing height,1107.0,0.0,444.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,445.0
784,Branch number of secondary branches at first b...,861.0,0.0,170.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,170.0


In [None]:
# Trait counts for the three species with the most measurements
# (the three species were picked by inspecting the table manually).
top3_species_columns = ['.Achillea millefolium.', '.Trifolium pratense.', '.Dactylis glomerata.']
top3_traits_df = filtered_MIS_traits_100[['Trait', 'TraitID', *top3_species_columns]]

# The number of rows (traits) was still too high, so the frame is narrowed one
# species column at a time: only rows strictly above that column's median are kept.
# Each median is computed on the frame as already narrowed by the previous columns,
# so the order of the columns matters.
for species_column in top3_species_columns:
    measurement_counts = top3_traits_df[species_column]
    top3_traits_df = top3_traits_df[measurement_counts > measurement_counts.median()]

top3_traits_df



Unnamed: 0,Trait,TraitID,.Achillea millefolium.,.Trifolium pratense.,.Dactylis glomerata.
89,Leaf area per leaf dry mass (specific leaf are...,3117.0,716.0,454.0,678.0
96,Leaf carbon (C) content per leaf dry mass,13.0,232.0,159.0,402.0
109,Leaf dry mass (single leaf),55.0,651.0,281.0,371.0
110,Leaf dry mass per leaf fresh mass (leaf dry ma...,47.0,769.0,488.0,1077.0
112,Leaf fresh mass,163.0,296.0,215.0,326.0
125,Leaf nitrogen (N) content per leaf area,50.0,151.0,81.0,313.0
126,Leaf nitrogen (N) content per leaf dry mass,14.0,349.0,183.0,504.0
209,Plant biomass and allometry: Shoot dry mass (p...,403.0,233.0,1011.0,293.0
222,Plant growth form,42.0,1301.0,1060.0,1002.0
230,Plant height vegetative,3106.0,715.0,520.0,1001.0


In [13]:
# TraitIDs of the selected traits; this list is needed for the data request to TRY.
#
# top3_traits_df may still contain the 'TotTraitMeasxSpecies' summary row, which is
# not a trait. It is removed by label rather than by position ([:-1]), so a real
# trait is never dropped if that row is absent or not last.
trait_list = (
    top3_traits_df
    .drop(index='TotTraitMeasxSpecies', errors='ignore')['TraitID']
    .astype(int)
    .to_list()
)

# One TraitID per line, without a trailing newline after the last one.
# (No loop variable named `id`, which would shadow the builtin for the rest of the kernel session.)
with open('trait_id_list.txt', 'w') as openfile:
    openfile.write('\n'.join(str(trait_id) for trait_id in trait_list))
# the 'with' statement closes the file automatically

In [None]:
# Save the names of the selected traits to a file so they can be reused later in a different file.
#
# top3_traits_df may still contain the 'TotTraitMeasxSpecies' summary row, which is
# not a trait. It is removed by label rather than by position ([:-1]), so a real
# trait is never dropped if that row is absent or not last.
trait_name_list = (
    top3_traits_df
    .drop(index='TotTraitMeasxSpecies', errors='ignore')['Trait']
    .astype(str)
    .to_list()
)

# One trait name per line, without a trailing newline after the last one.
with open('trait_name_list.txt', 'w') as openfile:
    openfile.write('\n'.join(trait_name_list))
# the 'with' statement closes the file automatically

    