In [10]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input mount and print the full path of every file found.
# os.walk yields nothing if the directory does not exist, so this is a no-op locally.
for root, _subdirs, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [3]:
import os
def get_data_path(filename):
    """
    Build the path to a competition data file for the current environment.

    The presence of '/kaggle/input' is used as the signal that the notebook
    is running on Kaggle; otherwise the data is expected relative to the
    current working directory.

    Args:
        filename: Path relative to the dataset root, e.g.
            'Train/train_terms.tsv' or 'Test/testsuperset.fasta'

    Returns:
        The dataset root joined with ``filename``
    """
    on_kaggle = os.path.exists('/kaggle/input')
    # Kaggle mounts the competition data under its own sub-directory;
    # locally the files are looked up from the working directory.
    base_path = '/kaggle/input/cafa-6-protein-function-prediction' if on_kaggle else '.'
    return os.path.join(base_path, filename)

In [None]:
# Smoke-test the path helper: show where the train and test files resolve to
print("Testing get_data_path:")
for label, relative_name in (
    ("Train file:", 'Train/train_terms.tsv'),
    ("Test file:", 'Test/testsuperset.fasta'),
):
    print(label, get_data_path(relative_name))

Testing get_data_path:
Train file: ./Train/train_terms.tsv
Test file: ./Test/testsuperset.fasta


In [11]:
# Step 1: Read the training annotations (tab-separated) into a DataFrame
train_terms_path = get_data_path('Train/train_terms.tsv')
train_terms = pd.read_csv(train_terms_path, sep='\t')

# Quick look: leading rows, row count, and column names
print("First few rows:", train_terms.head(), sep="\n")
print("\nTotal rows:", train_terms.shape[0])
print("\nColumns:", list(train_terms.columns))

First few rows:
  EntryID        term aspect
0  Q5W0B1  GO:0000785      C
1  Q5W0B1  GO:0004842      F
2  Q5W0B1  GO:0051865      P
3  Q5W0B1  GO:0006275      P
4  Q5W0B1  GO:0006513      P

Total rows: 537027

Columns: ['EntryID', 'term', 'aspect']


In [12]:
# Step 2: Tally how often each GO term occurs in the training labels
# (value_counts returns the tallies ordered from most to least frequent)
term_counts = train_terms['term'].value_counts()

# Preview the 20 most frequent terms and report the number of distinct terms
print("Top 20 most common GO terms:", term_counts.head(20), sep="\n")
n_unique_terms = len(term_counts)
print(f"\nTotal unique GO terms: {n_unique_terms}")

Top 20 most common GO terms:
term
GO:0005515    33713
GO:0005634    13283
GO:0005829    13040
GO:0005886    10150
GO:0005737     9442
GO:0005739     5807
GO:0005654     5065
GO:0016020     3563
GO:0042802     3547
GO:0005576     3241
GO:0005783     2837
GO:0005615     2391
GO:0045944     2319
GO:0070062     2130
GO:0005794     2045
GO:0005730     1789
GO:0042803     1627
GO:0003723     1613
GO:0000122     1551
GO:0009507     1512
Name: count, dtype: int64

Total unique GO terms: 26125


In [13]:
# Step 3: Read test sequences from FASTA file
def read_fasta(filepath):
    """
    Reads a FASTA file and returns a dictionary of protein ID -> sequence.

    The protein ID is the first whitespace-delimited token following '>' on
    a header line. Sequence lines are stripped of surrounding whitespace and
    concatenated; blank lines are skipped. Sequence text that appears before
    the first header is discarded. If the same ID appears more than once,
    the last record replaces the earlier ones.

    Args:
        filepath: Path to the FASTA file

    Returns:
        Dictionary where key = protein_id, value = sequence

    Raises:
        ValueError: If a header line contains no identifier after '>'
    """
    sequences = {}
    current_id = None
    current_seq = []

    with open(filepath, 'r') as f:
        for line_number, line in enumerate(f, start=1):
            line = line.strip()
            if not line:
                continue  # blank lines carry no information

            if line.startswith('>'):
                # A new header closes the record that was being collected
                if current_id is not None:
                    sequences[current_id] = ''.join(current_seq)

                header_fields = line[1:].split()
                if not header_fields:
                    # Fail with a locatable message instead of an opaque IndexError
                    raise ValueError(
                        f"{filepath}: line {line_number}: FASTA header has no protein ID"
                    )
                current_id = header_fields[0]
                current_seq = []
            elif current_id is not None:
                # Sequence data only counts once a header has been seen
                current_seq.append(line)

    # The final record has no following header to trigger the save
    if current_id is not None:
        sequences[current_id] = ''.join(current_seq)

    return sequences

# Read the test sequences
test_sequences = read_fasta(get_data_path('Test/testsuperset.fasta'))

print(f"Loaded {len(test_sequences)} test proteins")

# Peek at the first record. next(iter(...)) avoids copying every key into a
# list just to read one, and the default keeps an empty file from raising.
first_id = next(iter(test_sequences), None)
if first_id is None:
    print("\nNo sequences were found in the FASTA file.")
else:
    print("\nFirst protein example:")
    print(f"ID: {first_id}")
    print(f"Sequence (first 50 chars): {test_sequences[first_id][:50]}...")

Loaded 224309 test proteins

First protein example:
ID: A0A0C5B5G6
Sequence (first 50 chars): MRWQEMGYIFYPRKLR...


In [14]:
# Step 4: Make baseline predictions
# Every test protein receives the same TOP_N most frequent training GO terms.
TOP_N = 50
top_n_terms = term_counts.head(TOP_N).index.tolist()  # GO term IDs, most frequent first

print(f"We'll predict {len(top_n_terms)} GO terms for each protein")
print(f"Top 5 terms: {top_n_terms[:5]}")

# The (term, confidence) pairs do not depend on the protein, so build them once
# instead of recomputing them inside the per-protein loop.
# Confidence starts at 0.9, drops by 0.01 per rank, and is floored at 0.1.
# round(..., 2) keeps binary floating-point subtraction from leaving values
# that would be written to the TSV with a long decimal expansion.
term_confidences = [
    (go_term, round(max(0.1, 0.9 - rank * 0.01), 2))
    for rank, go_term in enumerate(top_n_terms)
]

# One record per (protein, term) pair, in protein order then rank order
predictions = [
    {'protein_id': protein_id, 'GO_term': go_term, 'confidence': confidence}
    for protein_id in test_sequences
    for go_term, confidence in term_confidences
]

print(f"\nTotal predictions: {len(predictions)}")
# Guard the division so an empty test set reports 0 instead of raising
per_protein = len(predictions) // len(test_sequences) if test_sequences else 0
print(f"Predictions per protein: {per_protein}")

We'll predict 50 GO terms for each protein
Top 5 terms: ['GO:0005515', 'GO:0005634', 'GO:0005829', 'GO:0005886', 'GO:0005737']

Total predictions: 11215450
Predictions per protein: 50


In [15]:
# Step 5: Turn the list of prediction records into a DataFrame
submission_df = pd.DataFrame(predictions)

# Inspect the leading rows, the overall shape, and the column names
print("First few rows of submission:", submission_df.head(10), sep="\n")
frame_shape = submission_df.shape
print(f"\nShape: {frame_shape} (rows, columns)")
print(f"\nColumns: {list(submission_df.columns)}")

First few rows of submission:
   protein_id     GO_term  confidence
0  A0A0C5B5G6  GO:0005515        0.90
1  A0A0C5B5G6  GO:0005634        0.89
2  A0A0C5B5G6  GO:0005829        0.88
3  A0A0C5B5G6  GO:0005886        0.87
4  A0A0C5B5G6  GO:0005737        0.86
5  A0A0C5B5G6  GO:0005739        0.85
6  A0A0C5B5G6  GO:0005654        0.84
7  A0A0C5B5G6  GO:0016020        0.83
8  A0A0C5B5G6  GO:0042802        0.82
9  A0A0C5B5G6  GO:0005576        0.81

Shape: (11215450, 3) (rows, columns)

Columns: ['protein_id', 'GO_term', 'confidence']


In [16]:
# Step 6: Write the predictions to a tab-separated file
output_file = 'submission.tsv'

# Tab-delimited columns, with neither a header row nor the index column
submission_df.to_csv(output_file, sep='\t', index=False, header=False)

print(f"✅ Submission file saved: {output_file}")
size_mb = os.path.getsize(output_file) / (1024*1024)
print(f"File size: {size_mb:.2f} MB")

# Echo the first five lines back as a sanity check; zip() stops after five
# lines (or at end of file, whichever comes first) without reading further.
print("\nFirst 5 lines of submission file:")
with open(output_file, 'r') as written:
    for line_no, row_text in zip(range(5), written):
        print(row_text.strip())

✅ Submission file saved: submission.tsv
File size: 285.79 MB

First 5 lines of submission file:
A0A0C5B5G6	GO:0005515	0.9
A0A0C5B5G6	GO:0005634	0.89
A0A0C5B5G6	GO:0005829	0.88
A0A0C5B5G6	GO:0005886	0.87
A0A0C5B5G6	GO:0005737	0.86


In [19]:
# Step 7: Put our rows next to rows from the provided sample submission
print("Your submission format (first 3 rows):")
print(submission_df.head(3).to_string(index=False))

print("\n" + "="*50)
print("Sample submission format (first 3 GO term lines):")

# Scan the sample file and keep the first three lines that have exactly
# three tab-separated fields with a second field starting with "GO:".
sample_path = get_data_path('sample_submission.tsv')
go_term_lines = []
with open(sample_path, 'r') as sample_file:
    for raw_line in sample_file:
        stripped = raw_line.strip()
        fields = stripped.split('\t')
        if len(fields) != 3 or not fields[1].startswith('GO:'):
            continue  # not a GO term line
        go_term_lines.append(stripped)
        if len(go_term_lines) >= 3:
            break

for sample_line in go_term_lines:
    print(sample_line)

Your submission format (first 3 rows):
protein_id    GO_term  confidence
A0A0C5B5G6 GO:0005515        0.90
A0A0C5B5G6 GO:0005634        0.89
A0A0C5B5G6 GO:0005829        0.88

Sample submission format (first 3 GO term lines):
A0A0C5B5G6	GO:0000001	0.123
A0A0C5B5G6	GO:0000002	0.456
A0A1B0GTW7	GO:0000001	0.123
