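Let's loop through the sequences of the Swiss-Prot dataset, fetch each entry's GO term annotations from the UniProt REST API, and append them to JSONL files: one per GO aspect (molecular function, biological process, cellular component) plus one containing all terms.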

In [None]:
import requests, json

from os import path

from Bio import SeqIO

from time import sleep

# FASTA file whose records are iterated by the scraping loop.
fasta_path = "./dataset/uniprot_sprot.fasta"

# 1-based index of the first FASTA record to process. Records with a smaller
# index are skipped, so set this to the last printed record number + 1 to
# resume an interrupted run (the dataset files are opened in append mode).
start_offset = 144807

# Output files, one JSON object per line.
mf_dataset_path = "./dataset/mf.jsonl"
bp_dataset_path = "./dataset/bp.jsonl"
cc_dataset_path = "./dataset/cc.jsonl"
all_dataset_path = "./dataset/all.jsonl"

# Query parameters sent with every request. UniProt expects `fields` as a
# single comma-separated list; the previous version sent four repeated
# parameters, each with a stray leading space ("fields=+go_p&fields=+go_c...").
params = {
    "fields": "go_p,go_c,go_f,go_id",
}

# Ask the API for a JSON response body.
headers = {
    "accept": "application/json"
}

# Entry endpoint; the accession is appended as the last path segment.
base_url = "https://rest.uniprot.org/uniprotkb"

# (connect, read) timeouts in seconds. Without a timeout `requests` waits
# forever, and a single stalled connection can hang or kill a multi-hour run.
_REQUEST_TIMEOUT = (10, 60)

# How many consecutive network failures to tolerate for one URL before
# giving up and re-raising the last exception.
_MAX_NETWORK_RETRIES = 5

# Status codes that mean "try again later" rather than "this entry failed".
_RETRYABLE_STATUS_CODES = (429, 503)

# Fallback wait (seconds) when the server sends no usable Retry-After header.
_DEFAULT_RETRY_AFTER = 5


def _retry_delay(response):
    """Return the number of seconds to wait before retrying `response`'s request.

    Reads the Retry-After header as an integer number of seconds. A missing
    header, or one that is not an integer (e.g. an HTTP-date), falls back to
    `_DEFAULT_RETRY_AFTER` instead of raising.
    """
    try:
        return max(int(response.headers.get("Retry-After")), 0)
    except (TypeError, ValueError):
        return _DEFAULT_RETRY_AFTER


def _fetch_entry(url):
    """GET `url` and return the response, retrying transient failures.

    Rate-limit style responses (see `_RETRYABLE_STATUS_CODES`) are retried
    indefinitely, honouring Retry-After. Connection errors and timeouts are
    retried with exponential backoff up to `_MAX_NETWORK_RETRIES` consecutive
    times, after which the last exception propagates to the caller.
    """
    network_failures = 0

    while True:
        try:
            response = requests.get(
                url, headers=headers, params=params, timeout=_REQUEST_TIMEOUT
            )
        except (
            requests.exceptions.ConnectionError,
            requests.exceptions.Timeout,
        ) as error:
            network_failures += 1

            if network_failures > _MAX_NETWORK_RETRIES:
                raise

            delay = min(2 ** network_failures, 60)

            print(f"Network error ({error}). Retrying in {delay} seconds ...")

            sleep(delay)

            continue

        # A response arrived, so the connection is healthy again.
        network_failures = 0

        if response.status_code not in _RETRYABLE_STATUS_CODES:
            return response

        retry_after = _retry_delay(response)

        print(f"Rate limit exceeded. Retrying in {retry_after} seconds ...")

        sleep(retry_after)


def _group_go_terms(cross_references):
    """Split the GO cross-references of one entry by aspect.

    Returns `(terms_by_aspect, all_terms)` where `terms_by_aspect` maps the
    aspect letters "P", "C" and "F" to sets of GO IDs and `all_terms` is the
    set of every GO ID seen, whether or not an aspect could be determined.
    """
    terms_by_aspect = {"P": set(), "C": set(), "F": set()}
    all_terms = set()

    for reference in cross_references:
        if reference.get("database") != "GO":
            continue

        go_term = reference["id"]

        all_terms.add(go_term)

        for go_property in reference.get("properties", []):
            # Only the GoTerm property carries the "<aspect>:<name>" value.
            # Classifying on any other property would reuse a stale (or
            # undefined) aspect from an earlier iteration.
            if go_property.get("key") != "GoTerm":
                continue

            aspect = go_property["value"].split(":", 1)[0].upper()

            if aspect in terms_by_aspect:
                terms_by_aspect[aspect].add(go_term)

    return terms_by_aspect, all_terms


def _write_record(dataset_file, sequence_id, sequence, taxon_id, terms):
    """Append one JSON line to `dataset_file`; do nothing if `terms` is empty."""
    if not terms:
        return

    dataset_file.write(json.dumps({
        "id": sequence_id,
        "sequence": sequence,
        # Sorted so that re-running the scrape yields byte-identical lines.
        "terms": sorted(terms),
        "taxon_id": str(taxon_id),
    }) + "\n")

    # Flush per record so that the index printed below always reflects what
    # is on disk, which is what makes resuming via `start_offset` reliable.
    dataset_file.flush()


with open(mf_dataset_path, "a", encoding="utf-8") as mf_dataset_file, \
    open(bp_dataset_path, "a", encoding="utf-8") as bp_dataset_file, \
    open(cc_dataset_path, "a", encoding="utf-8") as cc_dataset_file, \
    open(all_dataset_path, "a", encoding="utf-8") as all_dataset_file, \
    open(fasta_path, "r", encoding="utf-8") as fasta_file:

    for index, record in enumerate(SeqIO.parse(fasta_file, "fasta"), start=1):
        # Fast-forward to the resume point of a previous run.
        if index < start_offset:
            continue

        # The accession is the second "|"-separated field of the record ID
        # and the taxon ID follows "OX=" in the description.
        id_parts = record.id.split("|")
        _, ox_marker, ox_tail = record.description.partition("OX=")

        if len(id_parts) < 2 or not ox_marker:
            print(
                f"Skipping record #{index:,}: "
                f"unexpected FASTA header {record.description!r}"
            )

            continue

        sequence_id = id_parts[1]
        taxon_id = ox_tail.split(" ")[0]
        sequence = str(record.seq)

        # Plain string formatting: os.path.join is for filesystem paths and
        # would insert a backslash on Windows.
        url = f"{base_url}/{sequence_id}"

        response = _fetch_entry(url)

        if response.status_code != 200:
            print(f"Error fetching data for {sequence_id}: {response.status_code}")

            continue

        data = response.json()

        cross_references = data.get("uniProtKBCrossReferences")

        if cross_references is None:
            continue

        terms_by_aspect, all_terms = _group_go_terms(cross_references)

        for dataset_file, terms in (
            (mf_dataset_file, terms_by_aspect["F"]),
            (bp_dataset_file, terms_by_aspect["P"]),
            (cc_dataset_file, terms_by_aspect["C"]),
            (all_dataset_file, all_terms),
        ):
            _write_record(dataset_file, sequence_id, sequence, taxon_id, terms)

        print(
            f"Record: #{index:,}, ID: {sequence_id}, "
            f"Length: {len(sequence)}, Num Terms: {len(all_terms):,}"
        )

print("Done!")

        

Record: #139,554, ID: Q8KA42, Length: 466, Num Terms: 9
Record: #139,555, ID: Q89AZ7, Length: 454, Num Terms: 9
Record: #139,556, ID: P34268, Length: 1257, Num Terms: 9
Record: #139,557, ID: P0CAT8, Length: 444, Num Terms: 10
Record: #139,558, ID: B8H363, Length: 444, Num Terms: 10
Record: #139,559, ID: F8WK50, Length: 1259, Num Terms: 12
Record: #139,560, ID: Q24020, Length: 1256, Num Terms: 14
Record: #139,561, ID: P52612, Length: 457, Num Terms: 14
Record: #139,562, ID: Q9ZJJ3, Length: 434, Num Terms: 9
Record: #139,563, ID: O07025, Length: 434, Num Terms: 9
Record: #139,564, ID: Q13045, Length: 1269, Num Terms: 18
Record: #139,565, ID: Q9JJ28, Length: 1271, Num Terms: 11
Record: #139,566, ID: Q9I4N1, Length: 451, Num Terms: 10
Record: #139,567, ID: O54249, Length: 467, Num Terms: 10
Record: #139,568, ID: P26465, Length: 456, Num Terms: 11
Record: #139,569, ID: O83417, Length: 447, Num Terms: 10
Record: #139,570, ID: P20487, Length: 147, Num Terms: 6
Record: #139,571, ID: P57179, Le

ConnectTimeout: HTTPSConnectionPool(host='rest.uniprot.org', port=443): Max retries exceeded with url: /uniprotkb/A1KS57?fields=+go_p&fields=+go_c&fields=+go_f&fields=+go_id (Caused by ConnectTimeoutError(<urllib3.connection.HTTPSConnection object at 0x7101bb889940>, 'Connection to rest.uniprot.org timed out. (connect timeout=None)'))