Let's start by creating a mapping from Gene Ontology (GO) term IDs to their label indices, starting from 0.

In [5]:
from itertools import islice

obo_file_path = "./dataset/train/go-basic.obo"

# Maps each GO term ID (e.g. "GO:0000001") to a class index. Indices are
# assigned in file order, starting from 0, and are always contiguous.
term_to_label_index = {}

# Decode as UTF-8 explicitly so that parsing does not depend on the
# platform's default text encoding.
with open(obo_file_path, 'r', encoding="utf-8") as file:
    stanzas = file.read().split("[Term]")

for stanza in stanzas[1:]:  # Skip the header that precedes the first [Term].
    for line in stanza.strip().splitlines():
        if not line.startswith("id:"):
            continue

        # Keep everything after the leading "id:" tag, even if the value
        # itself happens to contain the text "id:".
        term_id = line.partition("id:")[2].strip()

        if term_id:
            # Using the current size as the next index keeps the indices
            # gap-free even if an ID were to appear more than once.
            term_to_label_index.setdefault(term_id, len(term_to_label_index))

            # Only the first ID of each stanza is used.
            break

first_10 = dict(islice(term_to_label_index.items(), 10))

# Print the first 10 terms.
for go_id, index in first_10.items():
    print(f"{go_id} => {index}")

GO:0000001 => 0
GO:0000002 => 1
GO:0000003 => 2
GO:0000005 => 3
GO:0000006 => 4
GO:0000007 => 5
GO:0000008 => 6
GO:0000009 => 7
GO:0000010 => 8
GO:0000011 => 9


Next, let's count the number of unique GO terms so we know how many classes we need to represent in the output layer of the Transformer.

In [6]:
# One class per GO term ID collected in the term-to-index mapping.
num_classes = len(term_to_label_index)

# The ":," format spec adds thousands separators to the printed count.
print(f"Total number of classes: {num_classes:,}")

Total number of classes: 47,417


Now we can map the sequence IDs to their GO term label indices.

In [7]:
import pandas as pd

from collections import defaultdict

terms_path = "./dataset/train/train_terms.tsv"

# Maps each sequence ID to the label indices of its GO terms, in the order
# the annotations appear in the TSV file.
sequence_to_label_indices = defaultdict(list)

df = pd.read_csv(terms_path, sep='\t')

# Iterate over the two needed columns directly: iterrows() builds a Series
# object for every row, which is needlessly slow on a large annotation table.
for sequence_id, term in zip(df["EntryID"], df["term"]):
    # Raises KeyError if the term is missing from the term-to-index mapping.
    label_index = term_to_label_index[term]

    sequence_to_label_indices[sequence_id].append(label_index)

first_10 = dict(islice(sequence_to_label_indices.items(), 10))

# Print the first 10 sequences and their label indices.
for sequence_id, label_indices in first_10.items():
    print(f"{sequence_id} => {label_indices}")

A0A009IHW8 => [6130, 17759, 33427, 21998, 5218, 40751, 21979, 40817, 6129, 41016, 41017, 6987, 4668, 22006, 23863, 12080, 23851, 28851, 21986, 12152, 12342, 5265, 12378, 40818, 4687, 24055, 7801, 5194, 5254, 17745, 33425, 32690, 12082, 41027, 33430, 23803, 7033, 33428, 4719, 6928, 21980, 5251, 12375, 2628, 2856, 10504, 10514, 10513, 2743]
A0A021WW32 => [26053, 26041, 13187, 31037, 5667, 5473, 36072, 20451, 25726, 6129, 42410, 5777, 8384, 26872, 25678, 11949, 13523, 241, 25679, 546, 13220, 25921, 38541, 5464, 25889, 7710, 10126, 15704, 29141, 7801, 9891, 32815, 5475, 8407, 12883, 5477, 25528, 5421, 53, 239, 13550, 15703, 8264, 25857, 31227, 27340, 32681, 36345, 191, 4204, 21059, 524, 527, 521, 21064, 16155, 21062, 21065, 4215, 4278, 21061, 15210, 525, 35953, 21066, 4273, 4160, 22479, 4268, 526, 6245, 38223, 4289, 31051, 21060, 515, 15204]
A0A023FFD0 => [29188, 7783, 31037, 8427, 29186, 13372, 6129, 46282, 8265, 27171, 26997, 26872, 8425, 26995, 21852, 39621, 26876, 7781, 25785, 19984, 1

Finally, let's read through the training samples and create a new JSONL dataset that contains the raw protein sequences and a list of their GO term label indices.

In [8]:
import json

from Bio import SeqIO

fasta_path = "./dataset/train/train_sequences.fasta"
dataset_path = "./dataset/dataset.jsonl"

# Stream the FASTA records and write one JSON object per line for every
# sequence that has GO term labels; sequences without labels are skipped.
with open(dataset_path, "w") as dataset_file, open(fasta_path, "r") as fasta_file:
    for record in SeqIO.parse(fasta_file, "fasta"):
        # Test membership instead of indexing: the mapping is a defaultdict,
        # so indexing an unknown ID would insert an empty list for it.
        if record.id not in sequence_to_label_indices:
            continue

        sample = {
            "sequence_id": record.id,
            "sequence": str(record.seq),
            "label_indices": sequence_to_label_indices[record.id],
        }

        dataset_file.write(json.dumps(sample) + "\n")

print(f"Dataset saved to {dataset_path}")


Dataset saved to ./dataset/dataset.jsonl
