# Taxonomy data extraction
Family: Groups related genera, e.g., Agamidae for some lizards, indicating evolutionary relationships. <br>
Kingdom: Broadest classification, often Animalia for animals. <br>
Phylum: Groups organisms by body structure, e.g., Chordata for animals with a spinal cord. <br>
Parent: Immediate higher rank, typically the genus, showing close lineage. <br>
Class: Broad category like Aves (birds) or Mammalia (mammals), highlighting major traits. <br>
Genus: Closest grouping above species and the first part of the scientific name (e.g., Lophognathus in Lophognathus gilberti). <br>

## Preliminary code

In [None]:
# Importing useful packages
from pathlib import Path
import requests
import numpy as np
import pandas as pd

# Load the species archive and build a taxon id -> taxon name lookup
species_archive_path = Path('../species/species_train.npz')
species_train = np.load(species_archive_path)
species_names = {
    taxon_id: taxon_name
    for taxon_id, taxon_name in zip(
        species_train['taxon_ids'], species_train['taxon_names']
    )
}

# Data scraping

In [None]:
def get_gbif_data(scientific_name, species_id, timeout=10):
    """Look up the taxonomy of one species through the GBIF species API.

    The first entry of the ``results`` list returned by
    ``https://api.gbif.org/v1/species`` is used to fill the taxonomy fields.

    Args:
        scientific_name: Name sent to the API as the ``name`` query parameter.
        species_id: Identifier copied unchanged into the ``'id'`` field.
        timeout: Seconds to wait for the HTTP request before giving up.

    Returns:
        A dict with the keys ``'id'``, ``'scientific_name'``, ``'family'``,
        ``'kingdom'``, ``'phylum'``, ``'parent'``, ``'class'`` and ``'genus'``.
        Taxonomy fields are ``np.nan`` when the request fails, the status code
        is not 200, the payload cannot be decoded, there are no results, or
        the field is absent from the first result.
    """
    rank_fields = ('family', 'kingdom', 'phylum', 'parent', 'class', 'genus')

    # Start from the all-NaN record so every failure path returns the same shape.
    record = {'id': species_id, 'scientific_name': scientific_name}
    record.update(dict.fromkeys(rank_fields, np.nan))

    try:
        # `params` lets requests URL-encode the name (spaces, '&', '#', ...),
        # and the timeout keeps one stalled connection from hanging the run.
        response = requests.get(
            "https://api.gbif.org/v1/species",
            params={'name': scientific_name},
            timeout=timeout,
        )
        if response.status_code != 200:
            return record
        data = response.json()
    except (requests.RequestException, ValueError):
        # Network failure or undecodable body: fall back to the NaN record.
        return record

    matches = data.get('results') if isinstance(data, dict) else None
    if matches:
        first_match = matches[0]
        for field in rank_fields:
            record[field] = first_match.get(field, np.nan)

    return record

# Query GBIF once per species and collect the records into a DataFrame
results = [
    get_gbif_data(taxon_name, taxon_id)
    for taxon_id, taxon_name in species_names.items()
]
animals = pd.DataFrame(results)

# Write the taxonomy table to disk without the index column
animals.to_csv('taxonomy.csv', index=False)