# Importing and cleaning data

Data source: https://www.kaggle.com/competitions/cafa-5-protein-function-prediction/data 

First, launch Jupyter from the conda environment with the command below, which raises the IOPub data rate limit so that loading the FASTA files does not exceed it, and then reopen the kernel:

jupyter notebook --NotebookApp.iopub_data_rate_limit=1.0e8

Also make sure the following packages are installed:
- pip install biopython
- pip install pandas
- pip install obonet


In [2]:
pip install biopython

Collecting biopython
  Downloading biopython-1.81-cp37-cp37m-win_amd64.whl (2.7 MB)
Installing collected packages: biopython
Successfully installed biopython-1.81
Note: you may need to restart the kernel to use updated packages.


In [4]:
pip install obonet

Collecting obonet
  Downloading obonet-1.0.0-py3-none-any.whl (9.2 kB)
Installing collected packages: obonet
Successfully installed obonet-1.0.0
Note: you may need to restart the kernel to use updated packages.


In [5]:
# load packages
from Bio import SeqIO
import pandas as pd
import obonet as obo

In [7]:
# Read go-basic.obo into a graph object
graph = obo.read_obo('cafa-5-protein-function-prediction/Train/go-basic.obo')

# Full node table: one row per graph node, with the node id exposed as the
# leading 'go_term' column and every node attribute as a further column
go1 = (
    pd.DataFrame.from_dict(graph.nodes, orient='index')
    .rename_axis('go_term')
    .reset_index()
)

# Slim table holding only the identifier, name and namespace of each term
go = go1[['go_term', 'name', 'namespace']]
go.head()


Unnamed: 0,go_term,name,namespace
0,GO:0000001,mitochondrion inheritance,biological_process
1,GO:0000002,mitochondrial genome maintenance,biological_process
2,GO:0000003,reproduction,biological_process
3,GO:0000006,high-affinity zinc transmembrane transporter a...,molecular_function
4,GO:0000007,low-affinity zinc ion transmembrane transporte...,molecular_function


In [8]:
# Preview the full node table, including the attribute columns that `go` leaves out
go1.head()

Unnamed: 0,go_term,name,namespace,def,synonym,is_a,alt_id,subset,xref,relationship,comment
0,GO:0000001,mitochondrion inheritance,biological_process,"""The distribution of mitochondria, including t...","[""mitochondrial inheritance"" EXACT []]","[GO:0048308, GO:0048311]",,,,,
1,GO:0000002,mitochondrial genome maintenance,biological_process,"""The maintenance of the structure and integrit...",,[GO:0007005],,,,,
2,GO:0000003,reproduction,biological_process,"""The production of new individuals that contai...","[""reproductive physiological process"" EXACT []]",[GO:0008150],"[GO:0019952, GO:0050876]","[goslim_agr, goslim_chembl, goslim_flybase_rib...",[Wikipedia:Reproduction],,
3,GO:0000006,high-affinity zinc transmembrane transporter a...,molecular_function,"""Enables the transfer of zinc ions (Zn2+) from...","[""high affinity zinc uptake transmembrane tran...",[GO:0005385],,,,,
4,GO:0000007,low-affinity zinc ion transmembrane transporte...,molecular_function,"""Enables the transfer of a solute or solutes f...",,[GO:0005385],,,,,


In [9]:
# import train_sequences.fasta
fasta_train = 'cafa-5-protein-function-prediction/Train/train_sequences.fasta'

# Turn every FASTA record into an (id, sequence-as-string) pair
train_sequences = [
    (record.id, str(record.seq))
    for record in SeqIO.parse(fasta_train, "fasta")
]

# Tabular view: one row per sequence
trainset = pd.DataFrame(train_sequences, columns=['seq_id', 'sequence'])
trainset.head()

Unnamed: 0,seq_id,sequence
0,P20536,MNSVTVSHAPYTITYHDDWEPVMSQLVEFYNEVASWLLRDETSPIP...
1,O73864,MTEYRNFLLLFITSLSVIYPCTGISWLGLTINGSSVGWNQTHHCKL...
2,O95231,MRLSSSPPRGPQQLSSFGSVDWLSQSSCSGPTHTPRPADFSLGSLP...
3,A0A0B4J1F4,MGGEAGADGPRGRVKSLGLVFEDESKGCYSSGETVAGHVLLEAAEP...
4,P54366,MVETNSPPAGYTLKRSPSDLGEQQQPPRQISRSPGNTAAYHLTTAM...


In [10]:
# import train_taxonomy.tsv (tab-separated, one taxonomy id per EntryID)
train_taxon_ID = pd.read_csv("cafa-5-protein-function-prediction/Train/train_taxonomy.tsv", sep="\t")

# Fixed seed so the previewed rows are the same on every Restart & Run All
train_taxon_ID.sample(5, random_state=0)

Unnamed: 0,EntryID,taxonomyID
55474,Q77CC7,79889
53657,Q802Y8,7955
80442,P69739,83333
26040,A2RSY6,10090
42271,Q8BHC1,10090


In [14]:
# import train_terms.tsv (tab-separated: EntryID, GO term, aspect)
train_terms = pd.read_csv("cafa-5-protein-function-prediction/Train/train_terms.tsv", sep="\t")

# Fixed seed so the previewed rows are the same on every Restart & Run All
train_terms.sample(5, random_state=0)

Unnamed: 0,EntryID,term,aspect
1055189,P14137,GO:0007584,BPO
2327802,Q6NUY5,GO:0045595,BPO
173300,A0A8I6B0P9,GO:0050789,BPO
1785759,Q07152,GO:0120036,BPO
4138287,Q18194,GO:0043229,CCO


In [31]:
# Number of rows (sequences) in the training set
len(trainset)

142246

In [44]:
# train_terms holds one row per (EntryID, term) pair, so its row count is much larger
# than the number of proteins; it is the number of *distinct* EntryIDs that matches
# len(trainset['seq_id']) = 142246
print('rows_train_terms', len(train_terms))
print('different_id_train_terms', train_terms['EntryID'].nunique(dropna=False))

# rows per EntryID, largest first (some EntryIDs carry many GO terms)
(
    train_terms['EntryID']
    .value_counts()
    .sort_values(ascending=False)
)

rows_train_terms 5363863
different_id_train_terms 142246


Q02248    815
Q62226    736
Q01705    721
P22725    709
P01137    668
         ... 
P22179      2
C1BFM5      2
Q9NI45      2
O39491      2
F1R8A4      2
Name: EntryID, Length: 142246, dtype: int64

In [47]:
# New frame (train_terms itself is left untouched) whose 'term' column has the
# first three characters removed, i.e. the 'GO:' prefix; values remain strings
train_term = train_terms.assign(term=train_terms['term'].str[3:])
train_term.head()

Unnamed: 0,EntryID,term,aspect
0,A0A009IHW8,8152,BPO
1,A0A009IHW8,34655,BPO
2,A0A009IHW8,72523,BPO
3,A0A009IHW8,44270,BPO
4,A0A009IHW8,6753,BPO


In [42]:
# Python type of the first 'term' value (row label 0)
type(train_terms.loc[0, 'term'])

str

In [15]:
# Row count per aspect value, largest first
train_terms['aspect'].value_counts().sort_values(ascending=False)

BPO    3497732
CCO    1196017
MFO     670114
Name: aspect, dtype: int64

In [16]:
# Row count per GO term, most frequent first
train_terms['term'].value_counts().sort_values(ascending=False)

GO:0005575    92912
GO:0008150    92210
GO:0110165    91286
GO:0003674    78637
GO:0005622    70785
              ...  
GO:0050439        1
GO:0047470        1
GO:0033942        1
GO:0047921        1
GO:0102628        1
Name: term, Length: 31466, dtype: int64

In [21]:
# Distinct GO terms that occur in the training annotations
set1 = set(train_terms['term'].unique())
len(set1)

31466

In [18]:
# import IA.txt: tab-separated with no header row, so the two columns are named
# at read time (GO term id, then its score)
IA = pd.read_csv(
    "cafa-5-protein-function-prediction/IA.txt",
    sep='\t',
    header=None,
    names=['go_term', 'ia_score'],
)
IA.head()

Unnamed: 0,go_term,ia_score
0,GO:0000001,0.0
1,GO:0000002,3.103836
2,GO:0000003,3.439404
3,GO:0000011,0.056584
4,GO:0000012,6.400377


In [20]:
# Distinct GO terms listed in IA.txt
set2 = set(IA['go_term'].unique())
len(set2)

43248

In [23]:
# How many IA terms never appear in the training annotations
len(set2.difference(set1))

11782

In [24]:
# IA terms that also appear in the training annotations, computed from the sets
# rather than from numbers copied out of earlier outputs. Equal to len(set1)
# exactly when every training term is present in IA.
len(set2) - len(set2 - set1)

31466

In [67]:
# import testsuperset.fasta
fasta_test = 'cafa-5-protein-function-prediction/Test (Targets)/testsuperset.fasta'

# Turn every FASTA record into an (id, sequence-as-string) pair
testsuperset = [
    (record.id, str(record.seq))
    for record in SeqIO.parse(fasta_test, "fasta")
]

# Tabular view: one row per sequence
testset = pd.DataFrame(testsuperset, columns=['seq_id', 'sequence'])
testset.head()

Unnamed: 0,seq_id,sequence
0,Q9CQV8,MTMDKSELVQKAKLAEQAERYDDMAAAMKAVTEQGHELSNEERNLL...
1,P62259,MDDREDLVYQAKLAEQAERYDEMVESMKKVAGMDVELTVEERNLLS...
2,P68510,MGDREQLLQRARLAEQAERYDDMASAMKAVTELNEPLSNEDRNLLS...
3,P61982,MVDREQLVQKARLAEQAERYDDMAAAMKNVTELNEPLSNEERNLLS...
4,O70456,MERASLIQKAKLAEQAERYEDMAAFMKSAVEKGEELSCEERNLLSV...


In [74]:
# import testsuperset-taxon-list.tsv (tab-separated, read as latin-1)
test_taxon = pd.read_csv("cafa-5-protein-function-prediction/Test (Targets)/testsuperset-taxon-list.tsv", sep="\t", encoding="latin-1")

# Fixed seed so the previewed rows are the same on every Restart & Run All
test_taxon.sample(5, random_state=0)

Unnamed: 0,ID,Species
43,508771,Toxoplasma gondii ME49
64,193080,Rhabdophis tigrinus tigrinus (snakes)
16,85962,Helicobacter pylori 26695
34,284812,Schizosaccharomyces pombe 972h-
40,426428,Fusarium oxysporum f. sp. lycopersici 4287 [as...


In [49]:
# Join every training sequence with its annotations from train_term (GO terms with the
# 'GO:' prefix already stripped). The inner join keeps only ids present in both tables.
merge = (
    pd.merge(trainset, train_term, left_on='seq_id', right_on='EntryID', how='inner')
    .drop(columns='EntryID')  # same values as seq_id after the join
)

# 'term' stays a zero-padded digit string (e.g. '0008152'). The former
# `merge['term'].apply(lambda x: [int(digit) for digit in x])` call was removed:
# its result was never assigned, so it only cost a pass over every row.
merge.head()

Unnamed: 0,seq_id,sequence,term,aspect
0,P20536,MNSVTVSHAPYTITYHDDWEPVMSQLVEFYNEVASWLLRDETSPIP...,8152,BPO
1,P20536,MNSVTVSHAPYTITYHDDWEPVMSQLVEFYNEVASWLLRDETSPIP...,71897,BPO
2,P20536,MNSVTVSHAPYTITYHDDWEPVMSQLVEFYNEVASWLLRDETSPIP...,44249,BPO
3,P20536,MNSVTVSHAPYTITYHDDWEPVMSQLVEFYNEVASWLLRDETSPIP...,6259,BPO
4,P20536,MNSVTVSHAPYTITYHDDWEPVMSQLVEFYNEVASWLLRDETSPIP...,9059,BPO


In [54]:
# Python type of the first merged 'term' value (row label 0)
type(merge.loc[0, 'term'])

str

In [51]:
# First merged 'term' value (row label 0)
merge.loc[0, 'term']

'0008152'

In [53]:
# Iterating over the term string yields its characters one at a time;
# print each on its own line
for digit in merge['term'][0]:
    print(digit)


0
0
0
8
1
5
2


In [39]:
# Row count of the merged frame
merge.shape[0]

5363863

In [91]:
# Per-column count of missing values for every loaded table, printed under
# the table's name
for header, frame in (
    ('go: \n', go),
    ('trainset: \n', trainset),
    ('train_taxon_ID: \n', train_taxon_ID),
    ('train_terms: \n', train_terms),
    ('IA: \n', IA),
    ('testset: \n', testset),
    ('test_taxon: \n', test_taxon),
):
    print(header, frame.isna().sum(), '\n')


go: 
 go_term      0
name         0
namespace    0
dtype: int64 

trainset: 
 seq_id      0
sequence    0
dtype: int64 

train_taxon_ID: 
 EntryID       0
taxonomyID    0
dtype: int64 

train_terms: 
 EntryID    0
term       0
aspect     0
dtype: int64 

IA: 
 go_term     0
ia_score    0
dtype: int64 

testset: 
 seq_id      0
sequence    0
dtype: int64 

test_taxon: 
 ID         0
Species    0
dtype: int64 

