# Setup & Imports

In [3]:
# 📦 Install dependencies
# Run this cell once to make sure all required packages are available
# `%pip` (rather than `!pip`) installs into the environment of the running kernel;
# `-q` keeps the installer output quiet.
# NOTE(review): package versions are unpinned, so a later re-run may pull different
# releases — consider pinning (pkg==x.y.z) once the working versions are known.
# If pip reports updated packages, restart the kernel before running the cells below.

%pip install -q pandas numpy biopython matplotlib seaborn tqdm goatools plotly umap-learn scikit-learn torch transformers


Note: you may need to restart the kernel to use updated packages.


In [4]:
# Standard libraries
import os
import pandas as pd
import numpy as np
from tqdm import tqdm

# Visualization
import matplotlib.pyplot as plt
import seaborn as sns

# Bioinformatics
from Bio import SeqIO
from Bio.SeqUtils.ProtParam import ProteinAnalysis

# GO Ontology
from goatools.obo_parser import GODag


In [5]:
# Root folder of the competition data, plus its Train / Test sub-folders
DATA_DIR = "./cafa-6-protein-function-prediction"
TRAIN_DIR, TEST_DIR = (
    os.path.join(DATA_DIR, split_name) for split_name in ("Train", "Test")
)

# List the contents of each folder as a quick sanity check of the paths
print("Files in Train:", os.listdir(TRAIN_DIR))
print("Files in Test:", os.listdir(TEST_DIR))


Files in Train: ['train_taxonomy.tsv', 'train_terms.tsv', 'go-basic.obo', 'train_sequences.fasta']
Files in Test: ['testsuperset.fasta', 'testsuperset-taxon-list.tsv']


# 🧠 Explore Dataset Size and Structure

- Count the number of sequences in:

    - train_sequences.fasta

    - testsuperset.fasta

- Count unique proteins in train_terms.tsv and train_taxonomy.tsv

- Check if all sequence IDs match across files.

👉 Goal: Ensure internal consistency between FASTA, taxonomy, and term mapping files.

In [12]:
# Read the tab-separated tables.
# train_terms.tsv is read with its first row as the header; the other two files
# are read with header=None and are given explicit column names here.
train_terms = pd.read_table(os.path.join(TRAIN_DIR, "train_terms.tsv"))

train_taxonomy = pd.read_table(
    os.path.join(TRAIN_DIR, "train_taxonomy.tsv"),
    header=None,
    names=["EntryID", "Taxon"],
)

IA = pd.read_table(
    os.path.join(DATA_DIR, "IA.tsv"),
    header=None,
    names=["GO_ID", "InformationAccretion"],
)

# Report the (rows, columns) shape of each table
print("train_terms:", train_terms.shape)
print("train_taxonomy:", train_taxonomy.shape)
print("IA:", IA.shape)


train_terms: (537027, 3)
train_taxonomy: (82404, 2)
IA: (40122, 2)


In [13]:
# Show the first rows of each table (one rich display per frame, in order)
display(train_terms.head(), train_taxonomy.head(), IA.head())


Unnamed: 0,EntryID,term,aspect
0,Q5W0B1,GO:0000785,C
1,Q5W0B1,GO:0004842,F
2,Q5W0B1,GO:0051865,P
3,Q5W0B1,GO:0006275,P
4,Q5W0B1,GO:0006513,P


Unnamed: 0,EntryID,Taxon
0,A0A0C5B5G6,9606
1,A0JNW5,9606
2,A0JP26,9606
3,A0PK11,9606
4,A1A4S6,9606


Unnamed: 0,GO_ID,InformationAccretion
0,GO:0000001,0.0
1,GO:0000002,2.849666
2,GO:0000011,0.137504
3,GO:0000012,6.03863
4,GO:0000017,0.514573


### Count and Check Data Consistency

In [None]:
# Unique protein IDs
def extract_uniprot_id(record_id):
    """Return the second pipe-separated field of a FASTA record ID.

    For an ID such as "sp|Q9Y2Q0|ABC1_HUMAN" this yields "Q9Y2Q0".
    An ID that contains no '|' is returned unchanged.
    """
    _, pipe, remainder = record_id.partition('|')
    if not pipe:
        # No pipe at all: the ID is used as-is
        return record_id
    # Keep only the text up to the next pipe (or to the end if there is none)
    return remainder.partition('|')[0]

# Path to the training FASTA. Defined in this cell so it runs on a fresh kernel;
# the file name is the one shown in the Train directory listing.
fasta_path = os.path.join(TRAIN_DIR, "train_sequences.fasta")

# Distinct protein IDs found in each of the three training files.
# FASTA record IDs are passed through extract_uniprot_id before comparison.
train_seq_ids = {extract_uniprot_id(record.id) for record in SeqIO.parse(fasta_path, "fasta")}
term_ids = set(train_terms['EntryID'])
tax_ids = set(train_taxonomy['EntryID'])

print(f"Sequences: {len(train_seq_ids)} | Terms: {len(term_ids)} | Taxonomy: {len(tax_ids)}")
# Set difference: term-file IDs that have no matching sequence ID
print("Unmatched IDs (terms not in sequences):", len(term_ids - train_seq_ids))
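### ✅ Expected Output

Sequences: 82404 | Terms: 82404 | Taxonomy: 82404
Unmatched IDs (terms not in sequences): 82404

> **Note:** an unmatched count equal to the total number of term IDs (82404) means that *no* term ID was found among the sequence IDs. This is presumably what you get when raw FASTA record IDs (e.g. `sp|Q9Y2Q0|ABC1_HUMAN`) are compared without extracting the accession; with `extract_uniprot_id` applied, the count should be 0 if the files are consistent — confirm by re-running the cell.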
