In [1]:
import json
import logging
import time
from datetime import date, datetime
from typing import Dict, List, Optional

import polars as pl
import requests
import taxopy
from pydantic import BaseModel, ValidationInfo, field_validator
from pydantic_core import to_jsonable_python
from requests.exceptions import RequestException

In [2]:
# Accessions of the sequences whose metadata is retrieved from the ENA API
SEQUENCE_ACCESSIONS = [
    "CP025066",
    "CP003099",
    "CP019400",
    "CP080467",
    "CP093911",
]

In [3]:
def get_sequence_metadata(
    sequence_accession: str, n_tries: int = 3, wait: int = 10, timeout: float = 30.0
) -> Dict[str, Optional[str]]:
    """
    Fetch the summary record of a sequence from the ENA Browser API.

    The request is retried when it fails with a `RequestException` (connection
    problems, timeouts, HTTP error statuses), sleeping `wait` seconds between
    attempts.

    Parameters
    ----------
    sequence_accession : str
        Accession of the sequence to look up (e.g. "CP025066").
    n_tries : int
        Maximum number of request attempts. Must be at least 1.
    wait : int
        Seconds to sleep between two consecutive attempts.
    timeout : float
        Seconds to wait for the server before an attempt is considered failed.
        Without a timeout a stalled connection would block forever and the
        retry logic would never get a chance to run.

    Returns
    -------
    Dict[str, Optional[str]]
        The first entry of the "summaries" list in the API response.

    Raises
    ------
    ValueError
        If `n_tries` is smaller than 1, or if the API answered successfully
        but returned no summary for the accession (this is not retried).
    RequestException
        If every attempt failed; chained to the error of the last attempt.
    """
    # Fail loudly instead of silently returning None when no attempt is allowed
    if n_tries < 1:
        raise ValueError(f"n_tries must be at least 1, got {n_tries}")
    # Define the base URL for the ENA API
    base_url = f"https://www.ebi.ac.uk/ena/browser/api/summary/{sequence_accession}"
    # Define the parameters for the API request
    params = {
        "dataType": "sequence",  # Specify the data type
        "format": "json",  # Request the data in JSON format
    }
    for attempt in range(1, n_tries + 1):
        try:
            # Make the GET request to the ENA API, bounded by the timeout
            response = requests.get(base_url, params=params, timeout=timeout)
            # Raise an HTTPError if the server answered with an error status
            response.raise_for_status()
            metadata = response.json()["summaries"]
        except RequestException as e:
            logging.warning(f"Attempt {attempt} failed: {str(e)}")
            # Give up once the last allowed attempt has failed
            if attempt == n_tries:
                raise RequestException(f"Failed to retrieve data: {str(e)}") from e
            logging.info(f"Retrying in {wait} seconds…")
            time.sleep(wait)
            continue
        # An empty result is a definitive answer, so it is not worth retrying
        if not metadata:
            raise ValueError(f"No metadata found for '{sequence_accession}'")
        return metadata[0]

In [4]:
class SequenceMetadata(BaseModel):
    """
    Validated summary record of a single sequence.

    The field names match the keys of the record returned by
    `get_sequence_metadata`, so such a record can be passed directly to
    `SequenceMetadata.model_validate`.
    """

    accession: str
    description: str
    version: int
    project: str
    sample: str
    moleculeType: str
    # Numeric taxon identifier of the source organism
    taxon: int
    dataType: str
    dataClass: str
    # Both dates are normalised by `parse_single_timestamp` before validation
    firstPublic: date
    lastUpdated: date
    status: int
    statusDescription: str
    sequenceLength: int
    # Not every record lists publications, hence optional
    publications: Optional[List[Dict[str, str]]] = None

    @field_validator("firstPublic", "lastUpdated", mode="before")
    @classmethod
    def parse_single_timestamp(cls, data: object, info: ValidationInfo) -> date:
        """
        Convert the raw value of a date field into a `date`.

        Accepted inputs:

        - a string in day-month-year form with an abbreviated month name
          (`%d-%b-%Y`);
        - a string in ISO form (`YYYY-MM-DD`), which is what this model
          produces when serialized to JSON, so dumped records can be
          validated again;
        - an existing `date`/`datetime` instance, returned unchanged and left
          to pydantic's own `date` validation.

        Raises
        ------
        ValueError
            If the value is of any other type or matches neither string
            format. Raising `ValueError` (rather than letting `strptime` fail
            with `TypeError`) lets pydantic report it as a validation error.
        """
        # Already a date object (e.g. when re-validating `model_dump()` output)
        if isinstance(data, date):
            return data
        if isinstance(data, str):
            try:
                return datetime.strptime(data, "%d-%b-%Y").date()
            except ValueError:
                pass
            # Fall back to the ISO form emitted by the model's own serialization
            try:
                return date.fromisoformat(data)
            except ValueError:
                pass
        raise ValueError(f"Invalid date format for {info.field_name}: {data}")

In [5]:
# Fetch the summary of every accession and validate it against SequenceMetadata
sample_metadata = [
    SequenceMetadata.model_validate(raw_summary)
    for raw_summary in map(get_sequence_metadata, SEQUENCE_ACCESSIONS)
]
# Show the validated records, one per line
for m in sample_metadata:
    print(m)

accession='CP025066' description='Halalkaliarchaeum desulfuricum strain AArc-Sl chromosome, complete genome.' version=1 project='PRJNA419617' sample='SAMN08093583' moleculeType='genomic DNA' taxon=2055893 dataType='SEQUENCE' dataClass='STD' firstPublic=datetime.date(2018, 9, 1) lastUpdated=datetime.date(2019, 11, 28) status=4 statusDescription='public' sequenceLength=3313120 publications=[{'source': 'PUBMED', 'pId': '31166158'}]
accession='CP003099' description='Aggregatibacter actinomycetemcomitans ANH9381, complete genome.' version=1 project='PRJNA47337' sample='SAMN02603044' moleculeType='genomic DNA' taxon=754507 dataType='SEQUENCE' dataClass='STD' firstPublic=datetime.date(2011, 12, 25) lastUpdated=datetime.date(2014, 5, 15) status=4 statusDescription='public' sequenceLength=2136808 publications=[{'source': 'DOI', 'pId': '10.1128/JB.06770-11'}, {'source': 'PUBMED', 'pId': '22408240'}]
accession='CP019400' description='Acidipropionibacterium acidipropionici strain WSH1105, complete

In [6]:
# Convert the validated records to JSON-compatible objects and write them to disk
with open("sample_metadata.json", "w") as fo:
    fo.write(json.dumps(to_jsonable_python(sample_metadata), indent=2))

In [7]:
# Build a Polars DataFrame with one row per validated sequence record
pl.DataFrame(data=sample_metadata)

accession,description,version,project,sample,moleculeType,taxon,dataType,dataClass,firstPublic,lastUpdated,status,statusDescription,sequenceLength,publications
str,str,i64,str,str,str,i64,str,str,date,date,i64,str,i64,list[struct[2]]
"""CP025066""","""Halalkaliarchaeum desulfuricum…",1,"""PRJNA419617""","""SAMN08093583""","""genomic DNA""",2055893,"""SEQUENCE""","""STD""",2018-09-01,2019-11-28,4,"""public""",3313120,"[{""PUBMED"",""31166158""}]"
"""CP003099""","""Aggregatibacter actinomycetemc…",1,"""PRJNA47337""","""SAMN02603044""","""genomic DNA""",754507,"""SEQUENCE""","""STD""",2011-12-25,2014-05-15,4,"""public""",2136808,"[{""DOI"",""10.1128/JB.06770-11""}, {""PUBMED"",""22408240""}]"
"""CP019400""","""Acidipropionibacterium acidipr…",1,"""PRJNA301197""","""SAMN04230715""","""genomic DNA""",1748,"""SEQUENCE""","""STD""",2017-02-01,2017-02-01,4,"""public""",3645455,
"""CP080467""","""Alicyclobacillus acidoterrestr…",1,"""PRJNA751022""","""SAMN20503121""","""genomic DNA""",1450,"""SEQUENCE""","""STD""",2022-03-24,2023-06-15,4,"""public""",4222202,"[{""PUBMED"",""36240455""}]"
"""CP093911""","""Aggregatibacter actinomycetemc…",1,"""PRJNA787784""","""SAMN23845304""","""genomic DNA""",714,"""SEQUENCE""","""STD""",2022-08-24,2022-08-27,4,"""public""",2077859,


In [None]:
# Look up the taxon ID of the first record with taxopy and print its ranked lineage
taxdb = taxopy.TaxDb()
host_taxon = taxopy.Taxon(sample_metadata[0].taxon, taxdb)
for rank, name in host_taxon.ranked_name_lineage:
    print(rank, name, sep=": ")

species: Halalkaliarchaeum desulfuricum
genus: Halalkaliarchaeum
family: Haloferacaceae
order: Halobacteriales
class: Halobacteria
clade: Stenosarchaea group
phylum: Methanobacteriota
kingdom: Methanobacteriati
superkingdom: Archaea
no rank: cellular organisms
no rank: root
