In [1]:
import pandas as pd

In [3]:
"""
Script demonstrating simple data loading and visualization.

Data Format: 
There are two files 'species_train.npz', and 'species_test.npz'
For the train data, we have the geographical coordinates where different 
species have been observed. This data has been collected by citizen scientists 
so it is noisy. 
For the test data, we have a set of locations for all species from the training
set, and for each location we know if a species is present there or not.

You can find out information about each species by appending the taxon_id to this 
URL, e.g. for 22956: 'Leptodactylus mystacinus', the URL is: 
https://www.inaturalist.org/taxa/22956
Note that some species might no longer be on the website.

Possible questions to explore: 
 - train a separate model to predict what locations a species of interest is present 
 - train a single model instead of per species model 
 - how to deal with "positive only data"
 - dealing with noisy/biased training data
 - using other input features e.g. climate data from  WorldClim Bioclimatic 
   variables  https://www.worldclim.org/data/worldclim21.html
 - how to evaluate e.g. what is a good metric to use?
 
Data sources:
 -  train data is from iNaturalist -  www.inaturalist.org
 -  test data is IUCN - https://www.iucnredlist.org/resources/spatial-data-download
"""


import numpy as np
import matplotlib.pyplot as plt

# ---- Training set -----------------------------------------------------------
# `data` is left bound at module level on purpose: a later cell reads
# 'taxon_names' from it again.
data = np.load('../Data/species/species_train.npz')
species = data['taxon_ids']      # species IDs; note these do not necessarily start at 0 (or 1)
train_ids = data['train_ids']    # 1D array: ID of the species present at the
                                 # corresponding row of train_locs
train_locs = data['train_locs']  # 2D array: one row per datapoint, columns are
                                 # "latitude" and "longitude"
species_names = dict(zip(species, data['taxon_names']))  # species ID -> latin name

# ---- Test set ---------------------------------------------------------------
# NOTE(review): allow_pickle=True lets np.load unpickle object arrays, which can
# execute arbitrary code -- only load archives that come from a trusted source.
data_test = np.load('../Data/species/species_test.npz', allow_pickle=True)
test_locs = data_test['test_locs']  # 2D array: one row per datapoint, columns are
                                    # "latitude" and "longitude"
# data_test['test_pos_inds'] holds one list per species with the indices into
# test_locs where that species is present; it can be assumed to be absent at
# every other test location.
test_pos_inds = dict(zip(data_test['taxon_ids'], data_test['test_pos_inds']))

# ---- Summary statistics of the training set ---------------------------------
# Number of training observations recorded for each distinct species ID.
_, species_counts = np.unique(train_ids, return_counts=True)

print('Train Stats:')
print('Number of species in train set:           ', len(species))
print('Number of train locations:                ', train_locs.shape[0])
print('Average number of locations per species:  ', species_counts.mean())
print('Minimum number of locations for a species:', species_counts.min())
print('Maximum number of locations for a species:', species_counts.max())


# plot train and test data for a random species
# (this cell only resets pyplot and opens figure number 0; nothing is drawn on it here)
plt.close('all')
plt.figure(num=0)

Train Stats:
Number of species in train set:            500
Number of train locations:                 272037
Average number of locations per species:   544.074
Minimum number of locations for a species: 50
Maximum number of locations for a species: 2000


<Figure size 640x480 with 0 Axes>

<Figure size 640x480 with 0 Axes>

In [4]:
# Load the pages table (columns shown in the output: page_id, parent_id, rank, canonical).
# low_memory=False stops pandas from parsing the file in chunks, which avoids
# mixed-dtype inference on a file this large at the cost of more memory while parsing.
DFpages = pd.read_csv(
    filepath_or_buffer='../Data/traits/pages.csv',
    low_memory=False,
)
# Last expression of the cell: rendered as a (truncated) table.
DFpages

Unnamed: 0,page_id,parent_id,rank,canonical
0,60022422,45276842,,
1,60022135,45276777,,
2,55638825,35382,,
3,57948243,46548757,subfamily,Keratoisidinae
4,55635166,46559295,subspecies,Tursiops truncatus gephyreus
...,...,...,...,...
2404785,51878547,49862588,species,Mepraia breyeri
2404786,55695889,55695887,subspecies,Zelurus lugubris anduzei
2404787,51878549,51878548,species,Meccus bassolsae
2404788,51878551,51878550,species,Hermanlentia matsunoi


In [5]:
# Species names stored in the test and train archives.
names_test = data_test['taxon_names']
names_train = data['taxon_names']

# Build each lookup set exactly once: DFpages has roughly 2.4M rows, so turning
# its 'canonical' column into a set for every expression is needlessly slow.
train_name_set = set(names_train)
canonical_names = set(DFpages['canonical'])

# Train-set species with no exact match in DFpages['canonical'].
# The count and the listing below both come from this one set, so they cannot
# disagree with each other or with the filter used to build DFanimals_train.
names_not_in_pages = train_name_set - canonical_names

print(len(names_train))         # Number of animal species appearing in the train set
print(len(names_not_in_pages))  # Number of those species not appearing in the online DB
print(names_not_in_pages)       # Names of those species

# ID of the 14 animals in the locations dataset that were not found automatically
# (scientific name prefixed or followed by discoverer's information).
# NOTE(review): presumably this file carries 'page_id' and 'canonical' columns so
# that it lines up with the frame it is concatenated to -- confirm.
DFunmatched = pd.read_csv('unmatched.csv')

# Rows of DFpages whose canonical name is one of the train species, reduced to
# the two columns needed downstream, followed by the manually matched rows.
# NOTE(review): a canonical name that occurs on several DFpages rows yields
# several page_ids for that species -- verify whether that is intended.
matched_pages = DFpages.loc[
    DFpages['canonical'].isin(train_name_set),
    ['page_id', 'canonical'],
]
DFanimals_train = pd.concat([matched_pages, DFunmatched], ignore_index=True)

# Write one page_id per line to ids.txt.
with open("ids.txt", "w") as file:
    file.writelines(f"{page_id}\n" for page_id in DFanimals_train["page_id"])


500
14
{np.str_('Argya affinis'), np.str_('Neotamias rufus'), np.str_('Riccordia ricordii'), np.str_('Masticophis lateralis'), np.str_('Dasyprocta variegata'), np.str_('Argya striata'), np.str_('Lanius corvinus'), np.str_('Psophocichla litsitsirupa'), np.str_('Neophedina cincta'), np.str_('Urile urile'), np.str_('Chrysuronia versicolor'), np.str_('Campocolinus coqui'), np.str_('Neotamias canipes'), np.str_('Curruca hortensis')}


In [12]:
# Trait table: one row per (Species ID, Trait, Trait Value) record.
DFAnimal_traits = pd.read_csv(filepath_or_buffer='my_traits.csv')
# Number of distinct species IDs that have trait rows. The value is computed but
# not displayed, because only the last expression of a cell is rendered.
DFAnimal_traits['Species ID'].nunique(dropna=False)  # 495 - missing 5, species IDs can be found in log.txt
DFAnimal_traits

Unnamed: 0,Species ID,Trait,Trait Value
0,347435,Body symmetry,bilaterally symmetricURI:http://purl.obolibrar...
1,347435,actual evapotranspiration rate in geographic r...,509.99 millimeters per month
2,347435,animal population density,136.99 individuals per square kilometer
3,347435,are eaten by,Lynx rufus (Schreber 1777) (bay lynx)
4,347435,are eaten by,Aquila chrysaetos (Linnaeus 1758) (Golden eagle)
...,...,...,...
26939,311789,testis location,scrotalURI:http://eol.org/schema/terms/scrotal...
26940,311789,trophic guild,herbivoreURI:https://www.wikidata.org/entity/Q...
26941,311789,trophic guild,granivoreURI:http://www.wikidata.org/entity/Q1...
26942,311789,visual system,corneal eyesURI:https://eol.org/schema/terms/c...
