## Traits data extraction

For the main 500 species, biological characteristics and relational features are scraped and massively organized and cleaned.

### Preliminary code

In [None]:
# Importing useful packages
from pathlib import Path
import numpy as np
import pandas as pd
import ast
# !pip install -r requirements.txt

In [None]:
# Loading train data
species_train = np.load(Path('../species/species_train.npz'))
names_train = species_train['taxon_names']
print(len(names_train), "species considered") # number of animal species appearing in the train (and test) set

# Gauging relevant EOL data
pagesDF = pd.read_csv('pages.csv', low_memory=False, dtype={'page_id': str})
print(len(set(names_train) - set(pagesDF['canonical'])), "of which not automatically found in EOL dataset:") # number of species in the train (and test) set not appearing into the online DB
print(set(names_train) - set(pagesDF['canonical'])) # names of the species in the train (and test) set not appearing into the online DB

# Loading traits' ids not automatically found in EOL dataset (biological name prefixed or followed by discoverer's information)
unmatchedDF = pd.read_csv('unmatched.csv', dtype={'page_id': str})
print("(ids manually retrieved)")

# Building species' EOL-ids dataframe
animals_trainDF = pagesDF[pagesDF['canonical'].isin(set(names_train))][['page_id', 'canonical']]
animals_trainDF = pd.concat([animals_trainDF, unmatchedDF], ignore_index=True)
display(animals_trainDF)

# Saving EOL-ids for scraping
with open('ids.txt','w') as file:
    for page_id in animals_trainDF['page_id']:
        file.write(f"{page_id}\n")
print("Animals' ids correctly saved for scraping!")

500 species considered
14 of which not automatically found in EOL dataset:
{'Neotamias canipes', 'Neotamias rufus', 'Curruca hortensis', 'Dasyprocta variegata', 'Argya affinis', 'Urile urile', 'Argya striata', 'Masticophis lateralis', 'Psophocichla litsitsirupa', 'Chrysuronia versicolor', 'Campocolinus coqui', 'Neophedina cincta', 'Riccordia ricordii', 'Lanius corvinus'}
(ids manually retrieved)


Unnamed: 0,page_id,canonical
0,347435,Otospermophilus variegatus
1,337448,Afrixalus fornasini
2,334471,Breviceps montanus
3,336165,Anaxyrus quercicus
4,332453,Scaphiopus couchii
...,...,...
495,60973259,Chrysuronia versicolor
496,60189671,Neophedina cincta
497,60783392,Curruca hortensis
498,311789,Neotamias rufus


Animals' ids correctly saved for scraping!


In [None]:
# Loading extra-train data
species_train = np.load(Path('../species/species_train_extra.npz'))
names_train = species_train['taxon_names']
print(len(names_train), "species considered") # number of animal species appearing in the train (and test) set

# Gauging relevant EOL data
pagesDF = pd.read_csv('pages.csv', low_memory=False, dtype={'page_id': str})
print(len(set(names_train) - set(pagesDF['canonical'])), "of which not automatically found in EOL dataset:") # number of species in the train (and test) set not appearing into the online DB
print(set(names_train) - set(pagesDF['canonical'])) # names of the species in the train (and test) set not appearing into the online DB

# Building species' EOL-ids dataframe
animals_trainDF = pagesDF[pagesDF['canonical'].isin(set(names_train))][['page_id', 'canonical']]
animals_trainDF = pd.concat([animals_trainDF, unmatchedDF], ignore_index=True)
display(animals_trainDF)

# Saving EOL-ids for scraping
with open('ids_extra.txt','w') as file:
    for page_id in animals_trainDF['page_id']:
        file.write(f"{page_id}\n")
print("Animals' ids correctly saved for scraping!")

1918 species considered
57 of which not automatically found in EOL dataset:
{'Colobus caudatus', 'Pterorhinus albogularis', 'Pachysylvia aurantiifrons', 'Curruca subcoerulea', 'Masticophis taeniatus', 'Herpestes sanguineus', 'Parahyaena brunnea', 'Chionomesa fimbriata', 'Panthera uncia', 'Ortygornis pondicerianus', 'Bradornis mariquensis', 'Agricola infuscatus', 'Masticophis bilineatus', 'Subdoluseps bowringii', 'Mixornis gularis', 'Lophosaurus spinipes', 'Rhodophoneus cruentus', 'Kalophrynus interlineatus', 'Lanius melanoleucus', 'Ctenophorus slateri', 'Carinascincus metallicus', 'Saucerottia beryllina', 'Pseudacris sierra', 'Saucerottia tobaci', 'Fowlea piscator', 'Chlorestes cyanus', 'Tragelaphus oryx', 'Aneides klamathensis', 'Neotamias sonomae', 'Cincloramphus timoriensis', 'Chrysuronia brevirostris', 'Curruca curruca', 'Herpestes pulverulentus', 'Smutsornis africanus', 'Muntiacus vaginalis', 'Hydrobates melania', 'Helopsaltes certhiola', 'Cynanthus canivetii', 'Philothamnus occid

Unnamed: 0,page_id,canonical
0,337391,Tomopterna natalensis
1,334512,Boophis viridis
2,334690,Phrynomantis annectens
3,336167,Plethodon hoffmani
4,336204,Eupsophus emiliopugini
...,...,...
1870,60973259,Chrysuronia versicolor
1871,60189671,Neophedina cincta
1872,60783392,Curruca hortensis
1873,311789,Neotamias rufus


Animals' ids correctly saved for scraping!


### *Scraping was run with scrape.py and scrape_extra.py on both train and extra-train set*

*The subsequent cleaning was then applied on train data only*

### Organizing data

In [None]:
# Organizing scraped datataset
my_traitsDF = pd.read_csv('my_traits.csv', dtype={'Species ID': str})
print(len(animals_trainDF)-len(my_traitsDF['Species ID'].unique()), "species not correctly scraped (ids in log.txt)") # 495 - missing 5, species IDs can be found in log.txt
traitsDF = my_traitsDF.groupby(['Species ID', 'Trait'])['Trait Value'].apply(lambda x: tuple(x)).unstack()

# Finding numerical features
def is_floatable(value):
    try:
        float(value)
        return True
    except (ValueError, TypeError):
        return False
numericalColumns = {column:traitsDF[column].dropna().reset_index(drop=True).iloc(0)[0][0] for column in traitsDF.columns if len([value for value in list(traitsDF[column].dropna().unique()) if not all([value[i][0] in '0123456789-' for i in range(len(value))])])==0}
dirtyNumericalColumns = {column:measuringUnit.split(' ',1)[1] for column,measuringUnit in numericalColumns.items() if ' ' in measuringUnit}
unitNumericalColumns = {column:measuringUnit for column,measuringUnit in dirtyNumericalColumns.items() if len([value for value in list(traitsDF[column].dropna().unique()) if not all([value[i].split(' ',1)[1]==measuringUnit for i in range(len(value))])])==0}
plainNumericalColumns = [column for column,value in numericalColumns.items() if column not in set(dirtyNumericalColumns.keys())|{'litters per year'} and is_floatable(value)]
dirtyNumericalColumns = (set(dirtyNumericalColumns.keys())-set(unitNumericalColumns.keys())) | {'litters per year'}
categoricalColumns = list(set(traitsDF.columns) - set(numericalColumns.keys()))
print(len(traitsDF.columns)-1,"traits found:",len(plainNumericalColumns)+len(unitNumericalColumns)+len(dirtyNumericalColumns),"numerical,",len(categoricalColumns),"categorical")

# Making columns numerical
for column,measuringUnit in unitNumericalColumns.items():
    traitsDF[column] = traitsDF[column].apply(lambda t: (tuple(float(value.replace(' '+measuringUnit,'').strip()) for value in t) if isinstance(t, tuple) else np.nan))
traitsDF = traitsDF.rename(columns={column:f"{column} ({measuringUnit})" for column,measuringUnit in unitNumericalColumns.items()})
for column in plainNumericalColumns:
    traitsDF[column] = traitsDF[column].apply(lambda t: (tuple(float(value) for value in t) if isinstance(t, tuple) else np.nan))

# Making measuring units uniform
traitsDF['head-body length'] = traitsDF['head-body length'].apply(lambda t: (tuple(str(float(value.replace(' inches','').strip())*25.4)+' mm' if 'inches' in value else value for value in t) if isinstance(t, tuple) else np.nan))
traitsDF['body mass'] = traitsDF['body mass'].apply(lambda t: (tuple(str(float(value.replace(' oz','').strip())*28.35)+' g' if 'oz' in value else value for value in t) if isinstance(t, tuple) else np.nan))
cleaningSet = {('litters per year',' /year','',1),('life span',' months',' years',12),('inter-birth interval',' days',' months',30),('head-body length',' mm',' cm',10),('weaning age',' days',' months',30),('basal metabolic rate',' watts',' ml/hr o2',0.335),('body length',' mm',' cm',10),('gestation period duration',' days',' months',30),('age at maturity',' days',' months',30),('prenatal development duration',' days',' weeks',7),('body mass',' g',' kg',1000)}
for column,suffix,secondarySuffix,ratio in cleaningSet:
    traitsDF[column] = traitsDF[column].apply(lambda t: (tuple(float(value.replace(suffix,'').strip()) if suffix in value else float(value.replace(secondarySuffix,'').strip())*ratio for value in t) if isinstance(t, tuple) else np.nan))
traitsDF['body temperature'] = traitsDF['body temperature'].apply(lambda t: (tuple(float(value.replace(' kelvin','').strip()) if 'kelvin' in value else (float(value.replace(' degrees fahrenheit','').strip())+459.67)*(5/9) for value in t) if isinstance(t, tuple) else np.nan))
traitsDF = traitsDF.rename(columns={column:f"{column} ({suffix[1:]})" for column,suffix,secondarySuffix,ratio in (cleaningSet|{('body temperature',' kelvin',None,None)})-{('litters per year',' /year','',1)}})

# Merging canonical name and species data's indexing
traitsDF = traitsDF.reset_index()
traitsDF.columns.name = None
animals_trainDF['page_id'] = animals_trainDF['page_id'].astype(str)
traitsDF = pd.merge(traitsDF,animals_trainDF,left_on='Species ID',right_on='page_id',how='left').drop(columns='page_id')
traitsDF = pd.merge(pd.DataFrame({'id':species_train['taxon_ids'],'canonical': species_train['taxon_names']}),traitsDF,on='canonical',how='right')

# Displaying and saving the resulting DF
display(traitsDF)
traitsDF.to_csv('traits.csv',index=False)

5 species not correctly scraped (ids in log.txt)
146 traits found: 77 numerical, 69 categorical


Unnamed: 0,id,canonical,Species ID,Body symmetry,actual evapotranspiration rate in geographic range (millimeters per month),adult yearly survival (percent),age at eye opening (days),age at maturity (days),amino acid composition of milk,animal population density (individuals per square kilometer),...,vocalization behavior,water dissolved o2 concentration (mL/L),water nitrate concentration (µmol/l),water o2 saturation (percent),water phosphate concentration (µmol/l),water salinity (psu),water silicate concentration (µmol/l),water temperature (degrees celsius),weaning age (days),wet body mass (g)
0,26745,Ambystoma texanum,1018152,(bilaterally symmetricURI:http://purl.obolibra...,,,,,,,...,,,,,,,,,,
1,24379,Osteopilus dominicensis,1018724,(bilaterally symmetricURI:http://purl.obolibra...,,,,,,,...,,,,,,,,,,
2,25806,Tomopterna delalandii,1018728,(bilaterally symmetricURI:http://purl.obolibra...,,,,,,,...,,,,,,,,,,
3,27696,Rhyacotriton kezeri,1018733,(bilaterally symmetricURI:http://purl.obolibra...,,,,,,,...,,,,,,,,,,
4,27500,Gyrinophilus porphyriticus,1018894,(bilaterally symmetricURI:http://purl.obolibra...,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
490,29073,Philothamnus semivariegatus,962581,(bilaterally symmetricURI:http://purl.obolibra...,,,,,,,...,,,,,,,,,,
491,29550,Lycodon travancoricus,963127,(bilaterally symmetricURI:http://purl.obolibra...,,,,,,,...,,,,,,,,,,
492,31236,Ctenophorus isolepis,963761,(bilaterally symmetricURI:http://purl.obolibra...,,,,,,,...,,,,,,,,,,
493,31299,Bronchocela jubata,964195,(bilaterally symmetricURI:http://purl.obolibra...,,,,,,,...,,,,,,,,,,


### Cleaning data

In [4]:
# Loading full dataset to clean
traitsDF = pd.read_csv('traits.csv', dtype={'id':str,'Species ID':str})

# Defining cleaning functions
def convert_to_tuple(entry):
    '''
    Function for safe string to tuple evaluation
    '''
    
    try:
        return ast.literal_eval(entry)
    except (ValueError, SyntaxError):
        return None

def replace_with_numbers(column):
    '''
    Function to replace the values of a traitsDF's column with numbers
    '''

    unique_strings = {entry for entry in traitsDF[column] if isinstance(entry, tuple)}
    string_to_number = {string: idx for idx, string in enumerate(unique_strings)}
    return lambda tup: [string_to_number[string] for string in tup if string in string_to_number] if isinstance(tup, tuple) else []

# Keeping columns with less than 200 NaN values
nan_counts = traitsDF.isna().sum()
columns_with_few_nan = nan_counts[nan_counts < 200].index.tolist()
traitsDF = traitsDF[columns_with_few_nan]

# Cleaning remaining columns
for column in ['auditory system','conservation status','ecoregion','geographic distribution','habitat','locomotion']:
    traitsDF[column] = traitsDF[column].apply(convert_to_tuple).apply(replace_with_numbers(column))
to_float = ['hearing range (Hz)','hearing threshold (dB)','number of records in gbif','number of records in iNat','number of research grade records in iNat','number of specimens in ggbn']
for column in to_float:
    traitsDF[column] = (traitsDF[column].str.strip("()").str.rstrip(',')).apply(lambda x: [float(i) for i in x] if isinstance(x, list) else [])
traitsDF['population trend'] = (traitsDF['population trend'].str.strip("()").str.rstrip(',').str.replace("'", "", regex=False)).astype(str)
traitsDF['population trend'][78] = 'Decreasing'

# Removing columns with no actual values' variability
to_remove = ['Body symmetry','cellularity','sexual system', 'mineralized skeleton contains', 'visual system']
traitsDF = traitsDF.drop(to_remove, axis=1)

# Displaying and saving cleaned dataframe
display(traitsDF)
traitsDF.to_csv('cleaned_traits.csv',index=False)

You are setting values through chained assignment. Currently this works in certain cases, but when using Copy-on-Write (which will become the default behaviour in pandas 3.0) this will never work to update the original DataFrame or Series, because the intermediate object on which we are setting values will behave as a copy.
A typical example is when you are setting values in a column of a DataFrame, like:

df["col"][row_indexer] = value

Use `df.loc[row_indexer, "col"] = values` instead, to perform the assignment in a single step and ensure this keeps updating the original `df`.

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

  traitsDF['population trend'][78] = 'Decreasing'


Unnamed: 0,id,canonical,Species ID,auditory system,body mass (g),conservation status,ecoregion,geographic distribution,habitat,hearing range (Hz),hearing threshold (dB),locomotion,number of records in gbif,number of records in iNat,number of research grade records in iNat,number of specimens in ggbn,population trend
0,26745,Ambystoma texanum,1018152,[],,[],[],[],[],[],[],[],[],[],[],[],Stable
1,24379,Osteopilus dominicensis,1018724,[],,[],[],[],[],[],[],[],[],[],[],[],
2,25806,Tomopterna delalandii,1018728,[],,[],[],[],[],[],[],[],[],[],[],[],Unknown
3,27696,Rhyacotriton kezeri,1018733,[],,[],[],[],[],[],[],[],[],[],[],[],Stable
4,27500,Gyrinophilus porphyriticus,1018894,[],,[],[],[],[],[],[],[],[],[],[],[],Stable
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
490,29073,Philothamnus semivariegatus,962581,[],"(887.57,)",[],[],[],[],[],[],[],[],[],[],[],Unknown
491,29550,Lycodon travancoricus,963127,[],,[],[],[],[],[],[],[],[],[],[],[],Unknown
492,31236,Ctenophorus isolepis,963761,[],"(6.532, 6.853, 6.685, 5.819, 6.784)",[],[],[],[],[],[],[],[],[],[],[],
493,31299,Bronchocela jubata,964195,[],"(77.56,)",[],[],[],[],[],[],[],[],[],[],[],Stable
