In [1]:
import json
import logging
import time
from datetime import date, datetime
from typing import Dict, List, Optional

import pycountry
import requests
import taxopy
from pydantic import BaseModel, ValidationInfo, field_validator
from pydantic_core import to_jsonable_python
from pydantic_extra_types.coordinate import Coordinate
from pydantic_extra_types.country import CountryShortName
from requests.exceptions import RequestException

In [2]:
# Run accessions whose metadata this notebook retrieves.
RUN_ACCESSIONS = [
    "SRR7091408",
    "SRR3993031",
    "SRR6782230",
    "ERR9632109",
]

# Report columns requested for every run, grouped by theme; the order of the
# resulting list is the order in which the names are sent to the API.
FIELDS = (
    # Accession identifiers
    ["run_accession", "study_accession", "sample_accession"]
    # Sample and library description
    + ["sample_description", "library_strategy", "library_source"]
    # Publication date and sampling place
    + ["first_public", "country", "location"]
    # Sampling time window
    + ["collection_date_start", "collection_date_end"]
    # Taxonomy identifiers
    + ["tax_id", "host_tax_id"]
)

In [3]:
def get_run_metadata(
    run_accession: str,
    fields: List[str],
    n_tries: int = 3,
    wait: int = 10,
    timeout: float = 30,
) -> Dict[str, Optional[str]]:
    """Fetch the ENA file report for one run and return its non-empty fields.

    Args:
        run_accession: Run accession to query.
        fields: Names of the report columns to request.
        n_tries: Maximum number of HTTP attempts; must be at least 1.
        wait: Seconds to sleep between a failed attempt and the next one.
        timeout: Seconds passed to ``requests.get`` as its ``timeout`` so a
            stalled connection fails (and is retried) instead of hanging.

    Returns:
        The first record of the JSON response, without the entries whose
        value is ``None`` or empty.

    Raises:
        ValueError: If ``n_tries`` is smaller than 1.
        RequestException: If every attempt failed; chained to the last error.
        IndexError: If the API answered successfully but returned no record.
    """
    if n_tries < 1:
        # Without this guard the loop would never run and the function would
        # silently return None.
        raise ValueError(f"n_tries must be at least 1, got {n_tries}")

    # Define the base URL for the ENA API
    base_url = "https://www.ebi.ac.uk/ena/portal/api/filereport"
    # Define the parameters for the API request
    params = {
        "accession": run_accession,  # Run accession
        "result": "read_run",  # Specify the type of data you want
        "format": "json",  # Request the data in JSON format
        "fields": ",".join(fields),  # Request specific fields
    }

    for attempt in range(1, n_tries + 1):
        try:
            response = requests.get(base_url, params=params, timeout=timeout)
            # Turn HTTP error statuses into exceptions so they are retried too
            response.raise_for_status()
            records = response.json()
        except RequestException as e:
            logging.warning(f"Attempt {attempt} failed: {str(e)}")
            if attempt == n_tries:
                raise RequestException(f"Failed to retrieve data: {str(e)}") from e
            logging.info(f"Retrying in {wait} seconds…")
            time.sleep(wait)
            continue

        if not records:
            # Same exception type that indexing the empty response would
            # raise, but with a message that names the accession.
            raise IndexError(
                f"No metadata returned for run accession {run_accession!r}"
            )
        # Drop missing values: None (JSON null) as well as empty strings
        return {k: v for k, v in records[0].items() if v is not None and len(v)}

In [4]:
class RunMetadata(BaseModel):
    """Validated metadata record for a single sequencing run.

    The "before" validators below normalise the raw string forms of dates,
    country names and coordinates. Each of them returns non-string input
    (``None``, an existing ``date``, a ``Coordinate``, a dict, ...) unchanged
    so that the field's regular pydantic validation decides whether to accept
    it; this keeps already-parsed values from failing with ``AttributeError``.
    """

    # Identifiers and free-text description (required)
    run_accession: str
    study_accession: str
    sample_accession: str
    sample_description: str
    library_strategy: str
    library_source: str
    # Optional, normalised fields
    first_public: Optional[date] = None
    country: Optional[CountryShortName] = None
    location: Optional[Coordinate] = None
    collection_date_start: Optional[date] = None
    collection_date_end: Optional[date] = None
    tax_id: Optional[int] = None
    host_tax_id: Optional[int] = None

    @field_validator(
        "first_public",
        "collection_date_start",
        "collection_date_end",
        mode="before",
    )
    @classmethod
    def parse_single_timestamp(cls, data: object, info: ValidationInfo) -> object:
        """Parse a date string given at day, month or year precision.

        Accepted string formats are ``%Y-%m-%d``, ``%Y``, ``%Y-%m`` and
        ``%B %Y``; leading/trailing ``Z`` characters are removed first.
        Missing components default to 1 (``datetime.strptime`` behaviour).

        Returns:
            A ``date`` for string input; any other input is returned as is.

        Raises:
            ValueError: If a string matches none of the accepted formats.
        """
        if not isinstance(data, str):
            return data
        data = data.strip("Z")
        for fmt in ("%Y-%m-%d", "%Y", "%Y-%m", "%B %Y"):
            try:
                return datetime.strptime(data, fmt).date()
            except ValueError:
                continue
        raise ValueError(f"Invalid date format for {info.field_name}: {data}")

    @field_validator("country", mode="before")
    @classmethod
    def fix_country_name(cls, data: object, info: ValidationInfo) -> object:
        """Reduce a free-text country value to the name known to pycountry.

        Only the text before the first ``:``, ``,``, ``;`` or ``/`` is kept.
        If that text cannot be looked up, the content of its first pair of
        parentheses is tried instead.

        Returns:
            The ``name`` of the matching pycountry entry for string input;
            any other input is returned as is.

        Raises:
            ValueError: If no candidate can be resolved by pycountry.
        """
        if not isinstance(data, str):
            return data

        country_name = data
        for separator in (":", ",", ";", "/"):
            country_name = country_name.split(separator)[0]
        country_name = country_name.strip()
        if country_name.casefold() == "russia":
            country_name = "Russian Federation"

        candidates = [country_name]
        if "(" in country_name:
            # Fallback: the name given in parentheses, e.g. "X (Y)" -> "Y"
            candidates.append(country_name.split("(")[1].split(")")[0].strip())

        for candidate in candidates:
            try:
                return pycountry.countries.lookup(candidate).name
            except LookupError:
                continue
        # Raised outside the except block so the error is not implicitly
        # chained to the internal lookup failure.
        raise ValueError(f"Invalid country name for {info.field_name}: {data}")

    @field_validator("location", mode="before")
    @classmethod
    def fix_location(cls, data: object) -> object:
        """Convert ``"<lat> <N|S> <lon> <E|W>"`` text into a ``Coordinate``.

        A value is negated when its direction letter is ``S`` or ``W``
        (case-insensitive).

        Returns:
            A ``Coordinate`` for a four-token string. Non-string input and
            strings with a different number of tokens are returned unchanged
            and left to the field's own validation -- NOTE(review): confirm
            which raw forms ``Coordinate`` accepts in the installed
            pydantic-extra-types version.

        Raises:
            ValueError: If the latitude or longitude token is not a number.
        """
        if not isinstance(data, str):
            return data
        tokens = data.split()
        if len(tokens) != 4:
            return data
        lat_str, lat_dir, lon_str, lon_dir = tokens
        lat = float(lat_str) * (-1 if lat_dir.upper() in ("S", "W") else 1)
        lon = float(lon_str) * (-1 if lon_dir.upper() in ("S", "W") else 1)
        return Coordinate(lat, lon)

In [5]:
# Fetch the report of every configured run and validate it, one run at a time.
run_metadata = list(
    map(
        RunMetadata.model_validate,
        (get_run_metadata(r, FIELDS) for r in RUN_ACCESSIONS),
    )
)
# Show each validated record on its own line.
for m in run_metadata:
    print(m)

run_accession='SRR7091408' study_accession='PRJNA441428' sample_accession='SAMN08777777' sample_description='Rhizosphere microbial communities from Vellozia epidendroides in rupestrian grasslands, the National Park of Serra do Cipo, Brazil - RX_R1' library_strategy='WGS' library_source='METAGENOMIC' first_public=datetime.date(2018, 5, 3) country='Brazil' location=Coordinate(latitude=-19.2822, longitude=-43.5936) collection_date_start=datetime.date(2017, 3, 6) collection_date_end=datetime.date(2017, 3, 6) tax_id=939928 host_tax_id=1051480
run_accession='SRR3993031' study_accession='PRJNA328899' sample_accession='SAMN05414960' sample_description='Keywords: GSC:MIxS MIMS:5.0' library_strategy='WGS' library_source='METAGENOMIC' first_public=datetime.date(2016, 8, 8) country='China' location=Coordinate(latitude=43.95, longitude=116.13) collection_date_start=datetime.date(2010, 9, 30) collection_date_end=datetime.date(2010, 10, 31) tax_id=408170 host_tax_id=9606
run_accession='SRR6782230' st

In [6]:
# Persist the validated records as indented JSON in the working directory.
with open("run_metadata.json", mode="w") as fo:
    fo.write(json.dumps(to_jsonable_python(run_metadata), indent=2))

In [7]:
# Create the taxopy taxonomy database object.
taxdb = taxopy.TaxDb()
# Look up the taxon recorded as host of the first run.
host_taxon = taxopy.Taxon(run_metadata[0].host_tax_id, taxdb)
# Print each (rank, name) pair of the host's lineage in the order provided.
for rank, name in host_taxon.ranked_name_lineage:
    print(rank, name, sep=": ")

species: Vellozia epidendroides
genus: Vellozia
family: Velloziaceae
order: Pandanales
subclass: Petrosaviidae
clade: Liliopsida
clade: Mesangiospermae
class: Magnoliopsida
clade: Spermatophyta
clade: Euphyllophyta
clade: Tracheophyta
clade: Embryophyta
subphylum: Streptophytina
phylum: Streptophyta
kingdom: Viridiplantae
superkingdom: Eukaryota
no rank: cellular organisms
no rank: root
