# Importing and cleaning data

Data source: https://www.kaggle.com/competitions/cafa-5-protein-function-prediction/data 

Before running this notebook, start Jupyter from the conda environment with the command below and then reopen the kernel. It raises the IOPub data rate limit so that loading the FASTA files does not exceed it:

`jupyter notebook --NotebookApp.iopub_data_rate_limit=1.0e8`

Also make sure the required packages are installed:
- `pip install biopython`
- `pip install pandas`
- `pip install obonet`


In [49]:
# load packages
from Bio import SeqIO
import pandas as pd
import obonet as obo

In [65]:
# Load the ontology graph from go-basic.obo
graph = obo.read_obo('cafa-5-protein-function-prediction/Train/go-basic.obo')

# Flatten the node attributes into a table: the node key becomes the
# 'go_term' column, and only the descriptive columns are retained
go = (
    pd.DataFrame.from_dict(graph.nodes, orient='index')
    .rename_axis('go_term')
    .reset_index()
    .loc[:, ['go_term', 'name', 'namespace']]
)
go.head()


Unnamed: 0,go_term,name,namespace
0,GO:0000001,mitochondrion inheritance,biological_process
1,GO:0000002,mitochondrial genome maintenance,biological_process
2,GO:0000003,reproduction,biological_process
3,GO:0000006,high-affinity zinc transmembrane transporter a...,molecular_function
4,GO:0000007,low-affinity zinc ion transmembrane transporte...,molecular_function


In [68]:
# Read train_sequences.fasta into a two-column table (record ID, sequence)
fasta_train = 'cafa-5-protein-function-prediction/Train/train_sequences.fasta'

# One (seq_id, sequence) tuple per FASTA record; the Seq object is converted
# to a plain str so the DataFrame holds ordinary Python strings
train_sequences = [
    (entry.id, str(entry.seq))
    for entry in SeqIO.parse(fasta_train, "fasta")
]

# Build the DataFrame once from the collected records
trainset = pd.DataFrame(train_sequences, columns=['seq_id', 'sequence'])
trainset.head()

Unnamed: 0,seq_id,sequence
0,P20536,MNSVTVSHAPYTITYHDDWEPVMSQLVEFYNEVASWLLRDETSPIP...
1,O73864,MTEYRNFLLLFITSLSVIYPCTGISWLGLTINGSSVGWNQTHHCKL...
2,O95231,MRLSSSPPRGPQQLSSFGSVDWLSQSSCSGPTHTPRPADFSLGSLP...
3,A0A0B4J1F4,MGGEAGADGPRGRVKSLGLVFEDESKGCYSSGETVAGHVLLEAAEP...
4,P54366,MVETNSPPAGYTLKRSPSDLGEQQQPPRQISRSPGNTAAYHLTTAM...


In [76]:
# Load train_taxonomy.tsv (tab-separated)
train_taxon_ID = pd.read_csv(
    "cafa-5-protein-function-prediction/Train/train_taxonomy.tsv", sep="\t"
)

# Preview five random rows; the fixed random_state keeps the displayed
# sample identical on every Restart & Run All
train_taxon_ID.sample(5, random_state=0)

Unnamed: 0,EntryID,taxonomyID
136848,P92004,6239
66841,Q382P0,185431
56861,B4DPX9,9606
57577,Q9D2G5,10090
135254,P42126,9606


In [77]:
# Load train_terms.tsv (tab-separated)
train_terms = pd.read_csv(
    "cafa-5-protein-function-prediction/Train/train_terms.tsv", sep="\t"
)

# Preview five random rows; the fixed random_state keeps the displayed
# sample identical on every Restart & Run All
train_terms.sample(5, random_state=0)

Unnamed: 0,EntryID,term,aspect
2555612,Q8CG19,GO:0007275,BPO
2698906,Q8T059,GO:0070828,BPO
257515,A8JRC7,GO:1905952,BPO
2076285,Q4VQ11,GO:0090066,BPO
596241,M9PHP8,GO:0008582,BPO


In [84]:
# Load IA.txt: the file is read as headerless and tab-separated, so pandas
# labels the columns 0 and 1; these are then given descriptive names
IA = (
    pd.read_csv("cafa-5-protein-function-prediction/IA.txt", sep='\t', header=None)
    .rename(columns={0: 'go_term', 1: 'ia_score'})
)
IA.head()

Unnamed: 0,go_term,ia_score
0,GO:0000001,0.0
1,GO:0000002,3.103836
2,GO:0000003,3.439404
3,GO:0000011,0.056584
4,GO:0000012,6.400377


In [67]:
# Read testsuperset.fasta into a two-column table (record ID, sequence)
fasta_test = 'cafa-5-protein-function-prediction/Test (Targets)/testsuperset.fasta'

# One (seq_id, sequence) tuple per FASTA record; the Seq object is converted
# to a plain str so the DataFrame holds ordinary Python strings
testsuperset = [
    (entry.id, str(entry.seq))
    for entry in SeqIO.parse(fasta_test, "fasta")
]

# Build the DataFrame once from the collected records
testset = pd.DataFrame(testsuperset, columns=['seq_id', 'sequence'])
testset.head()

Unnamed: 0,seq_id,sequence
0,Q9CQV8,MTMDKSELVQKAKLAEQAERYDDMAAAMKAVTEQGHELSNEERNLL...
1,P62259,MDDREDLVYQAKLAEQAERYDEMVESMKKVAGMDVELTVEERNLLS...
2,P68510,MGDREQLLQRARLAEQAERYDDMASAMKAVTELNEPLSNEDRNLLS...
3,P61982,MVDREQLVQKARLAEQAERYDDMAAAMKNVTELNEPLSNEERNLLS...
4,O70456,MERASLIQKAKLAEQAERYEDMAAFMKSAVEKGEELSCEERNLLSV...


In [74]:
# Load testsuperset-taxon-list.tsv (tab-separated, decoded as latin-1)
test_taxon = pd.read_csv(
    "cafa-5-protein-function-prediction/Test (Targets)/testsuperset-taxon-list.tsv",
    sep="\t",
    encoding="latin-1",
)

# Preview five random rows; the fixed random_state keeps the displayed
# sample identical on every Restart & Run All
test_taxon.sample(5, random_state=0)

Unnamed: 0,ID,Species
43,508771,Toxoplasma gondii ME49
64,193080,Rhabdophis tigrinus tigrinus (snakes)
16,85962,Helicobacter pylori 26695
34,284812,Schizosaccharomyces pombe 972h-
40,426428,Fusarium oxysporum f. sp. lycopersici 4287 [as...


In [91]:
# Report the number of missing values per column for every loaded table.
tables = {
    'go': go,
    'trainset': trainset,
    'train_taxon_ID': train_taxon_ID,
    'train_terms': train_terms,
    'IA': IA,
    'testset': testset,
    'test_taxon': test_taxon,
}

for label, table in tables.items():
    # Format each report as one string: handing the Series to print() as a
    # separate argument would put a separator space in front of its first
    # row and misalign it with the rows below.
    print(f"{label}:\n{table.isna().sum()}\n")


go: 
 go_term      0
name         0
namespace    0
dtype: int64 

trainset: 
 seq_id      0
sequence    0
dtype: int64 

train_taxon_ID: 
 EntryID       0
taxonomyID    0
dtype: int64 

train_terms: 
 EntryID    0
term       0
aspect     0
dtype: int64 

IA: 
 go_term     0
ia_score    0
dtype: int64 

testset: 
 seq_id      0
sequence    0
dtype: int64 

test_taxon: 
 ID         0
Species    0
dtype: int64 

