# Traits data extraction

## Preliminary code

In [1]:
# Importing useful packages
from pathlib import Path
import pandas as pd
import numpy as np

In [2]:
# Loading species data
species_train = np.load(Path('../species/species_train.npz'))
names_train = species_train['taxon_names']
# Number of animal species appearing in the train (and test) set
print(len(names_train), "species considered")

# Loading EOL data
pagesDF = pd.read_csv('pages.csv', low_memory=False)
trainNames = set(names_train)
# Species of the train (and test) set whose name has no exact match in the EOL pages table
missingNames = trainNames - set(pagesDF['canonical'])
print(len(missingNames), "of which not automatically found in EOL dataset:")
print(missingNames)

# Loading species' ids not automatically found in EOL dataset
# (scientific name prefixed or followed by discoverer's information)
unmatchedDF = pd.read_csv('unmatched.csv')
print("(ids manually retrieved)")

# Building the species' EOL ids DF: automatically matched pages first, manually retrieved ones appended
isTrainSpecies = pagesDF['canonical'].isin(trainNames)
matchedDF = pagesDF.loc[isTrainSpecies, ['page_id', 'canonical']]
animals_trainDF = pd.concat([matchedDF, unmatchedDF], ignore_index=True)
display(animals_trainDF)

# Saving the EOL ids for scraping (one id per line)
with open('ids.txt', 'w') as idsFile:
    idsFile.writelines(f"{page_id}\n" for page_id in animals_trainDF['page_id'])
print("Animals' ids correctly saved for scraping!")

500 species considered
14 of which not automatically found in EOL dataset:
{np.str_('Chrysuronia versicolor'), np.str_('Lanius corvinus'), np.str_('Campocolinus coqui'), np.str_('Curruca hortensis'), np.str_('Dasyprocta variegata'), np.str_('Argya striata'), np.str_('Neotamias rufus'), np.str_('Masticophis lateralis'), np.str_('Argya affinis'), np.str_('Psophocichla litsitsirupa'), np.str_('Neophedina cincta'), np.str_('Urile urile'), np.str_('Neotamias canipes'), np.str_('Riccordia ricordii')}
(ids manually retrieved)


Unnamed: 0,page_id,canonical
0,347435,Otospermophilus variegatus
1,337448,Afrixalus fornasini
2,334471,Breviceps montanus
3,336165,Anaxyrus quercicus
4,332453,Scaphiopus couchii
...,...,...
495,60973259,Chrysuronia versicolor
496,60189671,Neophedina cincta
497,60783392,Curruca hortensis
498,311789,Neotamias rufus


Animals' ids correctly saved for scraping!


### _(scraping run with scrape.py)_

## Organizing traits dataset

In [249]:
# Gauging the scraped data
my_traitsDF = pd.read_csv('my_traits.csv')
# Difference between the ids collected for scraping and the distinct species ids found in the scraped file
scrapedSpeciesCount = len(my_traitsDF['Species ID'].unique())
print(len(animals_trainDF) - scrapedSpeciesCount, "species not correctly scraped (ids in log.txt)")

# Organizing data: one row per species, one column per trait,
# each cell holding the tuple of every value scraped for that (species, trait) pair
my_traitsDF['Species ID'] = my_traitsDF['Species ID'].astype(str)
valuesPerTrait = my_traitsDF.groupby(['Species ID', 'Trait'])['Trait Value'].apply(tuple)
traitsDF = valuesPerTrait.unstack()

def is_floatable(value):
    """Tell whether `value` can be converted with float().

    Returns True when float(value) succeeds and False when the conversion
    raises ValueError or TypeError.
    """
    try:
        float(value)
    except (ValueError, TypeError):
        return False
    else:
        return True

# Finding numerical features
_NUMERIC_FIRST_CHARS = tuple('0123456789-')

def _cell_looks_numeric(cell):
    """Return True when every entry of the tuple `cell` is a string starting with a digit or '-'.

    Empty strings and non-string entries count as non-numeric instead of raising.
    """
    return all(isinstance(entry, str) and entry.startswith(_NUMERIC_FIRST_CHARS) for entry in cell)

def _cell_has_unit(cell, unit):
    """Return True when the text after the first space of every entry of `cell` equals `unit`.

    An entry without any space has an empty unit and therefore does not match.
    """
    return all(entry.partition(' ')[2] == unit for entry in cell)

# Map every numeric-looking trait to a sample value: the first string of its first
# non-missing cell. The sample is used below to detect a measuring unit.
numericalColumns = {}
for column in traitsDF.columns:
    observedCells = traitsDF[column].dropna()
    if not observedCells.empty and all(_cell_looks_numeric(cell) for cell in observedCells):
        numericalColumns[column] = observedCells.iloc[0][0]

# Numeric-looking traits whose sample carries text after the number (a candidate unit)
unitCandidates = {column: sample.split(' ', 1)[1] for column, sample in numericalColumns.items() if ' ' in sample}
# ...among them, the traits where every single value is expressed in that same unit
unitNumericalColumns = {
    column: unit
    for column, unit in unitCandidates.items()
    if all(_cell_has_unit(cell, unit) for cell in traitsDF[column].dropna())
}
# Traits without a unit whose sample parses as a float
# ('litters per year' is excluded here because it is grouped with the mixed-unit traits below)
plainNumericalColumns = [
    column
    for column, sample in numericalColumns.items()
    if column not in unitCandidates and column != 'litters per year' and is_floatable(sample)
]
# Traits whose values mix several units (plus 'litters per year'): they need ad-hoc cleaning
dirtyNumericalColumns = (set(unitCandidates) - set(unitNumericalColumns)) | {'litters per year'}
categoricalColumns = list(set(traitsDF.columns) - set(numericalColumns))

# NOTE(review): the "-1" is kept as is; presumably it discounts a numeric-looking column that ends up
# in none of the groups counted below - confirm.
print(len(traitsDF.columns)-1,"traits found:",len(plainNumericalColumns)+len(unitNumericalColumns)+len(dirtyNumericalColumns),"numerical,",len(categoricalColumns),"categorical")

# Making columns numerical
def _convert_cells(series, convert):
    """Apply `convert` to each entry of every tuple cell of `series`.

    Cells that are not tuples (missing values) are replaced by NaN.
    """
    return series.apply(lambda cell: tuple(convert(entry) for entry in cell) if isinstance(cell, tuple) else np.nan)

def _number_before(text, suffix):
    """Remove `suffix` from `text` and parse the remainder as a float."""
    return float(text.replace(suffix, '').strip())

# Traits with one consistent unit: drop the unit from the values and move it into the column name
for column, measuringUnit in unitNumericalColumns.items():
    unitSuffix = ' ' + measuringUnit
    traitsDF[column] = _convert_cells(traitsDF[column], lambda entry: _number_before(entry, unitSuffix))
traitsDF = traitsDF.rename(columns={column:f"{column} ({measuringUnit})" for column,measuringUnit in unitNumericalColumns.items()})
# Traits without unit: plain float conversion
for column in plainNumericalColumns:
    traitsDF[column] = _convert_cells(traitsDF[column], float)

# Make the measuring units uniform
# Step 1: rewrite imperial values as metric strings so that step 2 only deals with two units per trait
for column, marker, factor, metricSuffix in (('head-body length', 'inches', 25.4, ' mm'), ('body mass', 'oz', 28.35, ' g')):
    traitsDF[column] = _convert_cells(
        traitsDF[column],
        lambda entry: str(_number_before(entry, ' ' + marker) * factor) + metricSuffix if marker in entry else entry,
    )

# Watts produced per (mL O2 / hour): ~20.1 J are released per mL of O2 consumed and an hour has 3600 s.
# The factor 0.335 (= 20.1 / 60) would only be right for values expressed per minute.
# NOTE(review): confirm that the scraped ' ml/hr o2' values are indeed mL of O2 per hour.
WATTS_PER_ML_O2_PER_HOUR = 20.1 / 3600

# Step 2: (trait, target unit suffix, other unit suffix, factor turning the other unit into the target one)
cleaningSet = {
    ('litters per year', ' /year', '', 1),
    ('life span', ' months', ' years', 12),
    ('inter-birth interval', ' days', ' months', 30),
    ('head-body length', ' mm', ' cm', 10),
    ('weaning age', ' days', ' months', 30),
    ('basal metabolic rate', ' watts', ' ml/hr o2', WATTS_PER_ML_O2_PER_HOUR),
    ('body length', ' mm', ' cm', 10),
    ('gestation period duration', ' days', ' months', 30),
    ('age at maturity', ' days', ' months', 30),
    ('prenatal development duration', ' days', ' weeks', 7),
    ('body mass', ' g', ' kg', 1000),
}
for column, suffix, secondarySuffix, ratio in cleaningSet:
    # Values already in the target unit are parsed as they are; any other value is assumed
    # to carry the secondary suffix and is rescaled by `ratio`
    traitsDF[column] = _convert_cells(
        traitsDF[column],
        lambda entry: _number_before(entry, suffix) if suffix in entry else _number_before(entry, secondarySuffix) * ratio,
    )
# Temperatures: keep kelvin values, convert Fahrenheit ones with K = (F + 459.67) * 5/9
traitsDF['body temperature'] = _convert_cells(
    traitsDF['body temperature'],
    lambda entry: _number_before(entry, ' kelvin') if 'kelvin' in entry else (_number_before(entry, ' degrees fahrenheit') + 459.67) * (5/9),
)
# Record the target unit in the column name ('litters per year' already names its unit)
columnsWithUnit = (cleaningSet | {('body temperature', ' kelvin', None, None)}) - {('litters per year', ' /year', '', 1)}
traitsDF = traitsDF.rename(columns={column:f"{column} ({suffix[1:]})" for column,suffix,secondarySuffix,ratio in columnsWithUnit})

# Displaying and saving the resulting DF
display(traitsDF)
traitsDF.to_csv('traits.csv')

5 species not correctly scraped (ids in log.txt)
146 traits found: 77 numerical, 69 categorical


Trait,Body symmetry,actual evapotranspiration rate in geographic range (millimeters per month),adult yearly survival (percent),age at eye opening (days),age at maturity (days),amino acid composition of milk,animal population density (individuals per square kilometer),are commensal with,are eaten by,are host of,...,vocalization behavior,water dissolved o2 concentration (mL/L),water nitrate concentration (µmol/l),water o2 saturation (percent),water phosphate concentration (µmol/l),water salinity (psu),water silicate concentration (µmol/l),water temperature (degrees celsius),weaning age (days),wet body mass (g)
Species ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1018152,(bilaterally symmetricURI:http://purl.obolibra...,,,,,,,,"(Asio otus (Linnaeus 1758) (Long-eared Owl), T...","(Hexamastix batrachorum (Alexeieff) Alexeieff,...",...,,,,,,,,,,
1018724,(bilaterally symmetricURI:http://purl.obolibra...,,,,,,,,"(Uromacer catesbyi (Schlegel 1837), Uromacer c...",,...,,,,,,,,,,
1018728,(bilaterally symmetricURI:http://purl.obolibra...,,,,,,,,,,...,,,,,,,,,,
1018733,(bilaterally symmetricURI:http://purl.obolibra...,,,,,,,,,,...,,,,,,,,,,
1018894,(bilaterally symmetricURI:http://purl.obolibra...,,,,,,,,"(Thamnophis sirtalis sirtalis (Linnaeus 1758),)","(Frog virus 3, Frog virus 3, Allocreadium pseu...",...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
962581,(bilaterally symmetricURI:http://purl.obolibra...,,,,,,,,,"(Thubunaea asymmetrica,)",...,,,,,,,,,,
963127,(bilaterally symmetricURI:http://purl.obolibra...,,,,,,,,,,...,,,,,,,,,,
963761,(bilaterally symmetricURI:http://purl.obolibra...,,,,,,,,,"(Wanaristrongylus ctenoti, Oochoristica piankai)",...,,,,,,,,,,
964195,(bilaterally symmetricURI:http://purl.obolibra...,,,,,,,,,,...,,,,,,,,,,
