Let's loop through the sequences of the SwissProt dataset and scrape their GO term annotations from the UniProt API.

In [None]:
import requests, json

from os import path

from Bio import SeqIO

from time import sleep

# Input: SwissProt sequences in FASTA format.
fasta_path = "./dataset/uniprot_sprot.fasta"

# 1-based index of the first FASTA record to process. Raise this to resume an
# interrupted run without re-fetching records that were already handled.
start_offset = 1

# Output: one JSON object per line, appended to on every run.
dataset_path = "./dataset/dataset.jsonl"

# Restrict the response to the three GO aspects (biological process, cellular
# component, molecular function). The API takes `fields` as a single
# comma-separated value, so the names must not contain stray whitespace.
params = {
    "fields": "go_p,go_c,go_f",
}

headers = {
    "accept": "application/json"
}

# The accession is appended to this URL to address a single entry.
base_url = "https://rest.uniprot.org/uniprotkb"

def _retry_delay(response, default=5):
    """Return how many seconds to wait before retrying *response*.

    The ``Retry-After`` header is used when it holds a whole number of
    seconds. When the header is missing or is not a plain integer (it may
    also carry an HTTP date), *default* is returned instead. The result is
    never negative, so it is always safe to pass to ``sleep``.
    """
    try:
        return max(0, int(response.headers.get("Retry-After", default)))
    except (TypeError, ValueError):
        return default


def _fetch_entry(session, url):
    """GET *url* and return the response, waiting out throttling responses.

    A 429 or 503 status is treated as "slow down": the function sleeps for
    the advertised delay and tries again, for as long as the server keeps
    answering that way. Any other status is returned to the caller.
    Network errors (including timeouts) propagate as ``requests`` exceptions.
    """
    while True:
        # The timeout keeps a stalled connection from hanging the whole run.
        response = session.get(url, headers=headers, params=params, timeout=30)

        if response.status_code not in (429, 503):
            return response

        retry_after = _retry_delay(response)

        print(f"Rate limit exceeded. Retrying in {retry_after} seconds ...")

        sleep(retry_after)


def _extract_go_terms(data):
    """Return the GO annotations found in a decoded entry *data*.

    Each annotation is a dict with the cross-reference ``id`` plus the text
    before the first ``:`` of its ``GoTerm`` property (``aspect``) and of its
    ``GoEvidenceType`` property (``evidence_code``). A property that is not
    present is reported as ``"?"``. Entries without cross-references yield
    an empty list.
    """
    go_terms = []

    for cross_reference in data.get("uniProtKBCrossReferences", []):
        if cross_reference.get("database") != "GO":
            continue

        aspect = "?"
        evidence_code = "?"

        for prop in cross_reference.get("properties", []):
            if prop["key"] == "GoTerm":
                aspect = prop["value"].split(":", 1)[0]
            elif prop["key"] == "GoEvidenceType":
                evidence_code = prop["value"].split(":", 1)[0]

        go_terms.append({
            "id": cross_reference["id"],
            "aspect": aspect,
            "evidence_code": evidence_code,
        })

    return go_terms


# A single session reuses the underlying connection across the many requests.
with open(dataset_path, "a", encoding="utf-8") as dataset_file, \
        open(fasta_path, "r", encoding="utf-8") as fasta_file, \
        requests.Session() as session:
    for index, record in enumerate(SeqIO.parse(fasta_file, "fasta"), start=1):
        if index < start_offset:
            continue

        # Record ids look like "db|ACCESSION|NAME"; the accession is the key
        # used to address the entry in the API.
        sequence_id = record.id.split("|")[1]
        # The taxon id is the token following "OX=" in the FASTA description.
        taxon_id = record.description.split("OX=", 1)[1].split(" ")[0]
        sequence = str(record.seq)

        # URLs always use "/", so build it directly instead of with os.path,
        # which would insert the platform's path separator.
        url = f"{base_url}/{sequence_id}"

        response = _fetch_entry(session, url)

        if response.status_code != 200:
            print(f"Error fetching data for {sequence_id}: {response.status_code}")

            continue

        go_terms = _extract_go_terms(response.json())

        # Only sequences with at least one GO annotation go into the dataset.
        if go_terms:
            dataset_file.write(json.dumps({
                "id": sequence_id,
                "sequence": sequence,
                "go_terms": go_terms,
                "taxon_id": taxon_id,
            }) + "\n")

        # The printed index is the value to use for start_offset when
        # resuming after an interruption.
        print(
            f"Record: #{index:,}, ID: {sequence_id}, "
            f"Length: {len(sequence)}, GO Terms: {len(go_terms):,}"
        )

print("Done!")