# Download required files

In [None]:
!mkdir data
#Gene ontology
!wget -nc -O data/go.obo "http://purl.obolibrary.org/obo/go/go-basic.obo"
#Gene Ontology Annotations (2021 April) (LARGE FILE >60 GB uncompressed). 
!wget -nc -O data/goa.gaf.gz "ftp://ftp.ebi.ac.uk/pub/databases/GO/goa/old//UNIPROT/goa_uniprot_all.gaf.203.gz"
!wget -nc -O data/swissprot.tar.gz "https://ftp.uniprot.org/pub/databases/uniprot/previous_major_releases/release-2021_02/knowledgebase/uniprot_sprot-only2021_02.tar.gz"
!mkdir data/swissprot && cd data/swissprot && tar -zxvf ../swissprot.tar.gz && cd ..

# Filter GOA for Swissprot Proteins

In [None]:
import gzip
import pandas as pd
from tqdm import tqdm
from Bio import SeqIO
from Bio.UniProt.GOA import GAF20FIELDS

# Filter GOA annotations down to SwissProt proteins.
#
# Step 1: collect the identifier of every SwissProt sequence. Each FASTA record
# id is split on '|' and the second field is kept (UniProt ids are expected to
# look like `sp|ACCESSION|ENTRY_NAME`). The path is relative to the notebook's
# working directory, matching where the download cell extracts the archive.
with gzip.open('data/swissprot/uniprot_sprot.fasta.gz', 'rt') as fp:
    seq_ids = {rec.id.split('|')[1] for rec in SeqIO.parse(fp, 'fasta')}

# Step 2: stream the GOA file in chunks of one million rows so the full table
# never has to fit in memory. Lines starting with '!' are GAF header/comment
# lines and are skipped; all columns are read as strings.
df_iter = pd.read_csv('data/goa.gaf.gz', dtype=str,
                      sep='\t',
                      comment='!',
                      names=GAF20FIELDS,
                      chunksize=int(1e6))

# Step 3: write the matching rows. The output file is opened once in write mode
# so that re-running the cell replaces the file instead of appending duplicate
# rows to a previous run's output. newline='' lets pandas control line endings.
# NOTE(review): the DataFrame index is still written as a leading column (the
# pandas default), exactly as before; confirm the downstream reader expects it.
with open('data/swissprot_goa.gaf', 'w', newline='', encoding='utf-8') as out_fp:
    for zdf in tqdm(df_iter, desc='Parsing GOA for SwissProt'):
        # Keep only annotations whose DB_Object_ID is a SwissProt identifier.
        zdf = zdf[zdf.DB_Object_ID.isin(seq_ids)]
        zdf.to_csv(out_fp, sep='\t', na_rep='', header=False)

# Run GOBench Pipeline

In [None]:
import os

from goatools.obo_parser import GODag
from go_bench.pipeline import pipeline
from go_bench.utils import exp_group, non_iea_group, all_group, namespaces

# Load the Gene Ontology graph from the OBO file fetched by the download cell.
godag = GODag('data/go.obo')

# Inputs for building the training, validation, and testing sets.
# NOTE(review): goa_path points at a gzipped file under ../data, whereas the
# filtering cell writes data/swissprot_goa.gaf; confirm the intended working
# directory and whether the file must be gzipped before this cell is run.
goa_path = "../data/swissprot_goa.gaf.gz"
split_path = "../data_splits/cluster50"

# Output directory for the generated dataset files; created if missing.
save_dir = "./"
os.makedirs(save_dir, exist_ok=True)

# Build the splits from annotations in `exp_group`, passing a
# ('min_samples', 100) filter to the pipeline.
pipeline(
    goa_path,
    split_path,
    save_dir,
    godag,
    codes=exp_group,
    filter_type=('min_samples', 100),
)