## Protein-GO

In [1]:
import os
import json
import csv
import pandas as pd
import json

In [2]:
def map_go_to_go_type(obo_path):
    '''
    Function:
    - Map each stanza id in an OBO file to its namespace, and group the ids
      by namespace. The namespace is stored exactly as written in the file
      (e.g. 'biological_process'); it is NOT abbreviated to BP/MF/CC.
    - Every stanza that has both an `id:` and a `namespace:` line is
      recorded, whatever its stanza type.
    Params:
    - obo_path (str): Path to the GO OBO file.
    Returns:
    - go_type_to_go (dict): namespace -> list of ids, in file order.
    - go_to_go_type (dict): id -> namespace.
    '''
    go_to_go_type = dict()
    go_type_to_go = dict()

    # Id of the stanza currently being read; None until an `id:` line is seen.
    go_id = None

    with open(obo_path) as fin:
        for line in fin:

            # A stanza header such as [Term] starts a new record, so forget
            # the previous id instead of letting it leak into this stanza.
            if line.startswith('['):
                go_id = None

            # GO Term
            elif line.startswith('id: '):
                go_id = line.split(' ')[1].strip()

            # Ontology namespace (molecular_function, biological_process, ...)
            elif line.startswith('namespace: '):
                # A namespace with no preceding id cannot be attributed to
                # any term; skip it rather than fail on an unbound id.
                if go_id is None:
                    continue
                ont_type = line.split(' ')[1].strip()
                go_to_go_type[go_id] = ont_type
                go_type_to_go.setdefault(ont_type, list()).append(go_id)

    return go_type_to_go, go_to_go_type

# Location where wget stores the ontology file (the -P target directory
# plus the remote file name).
obo_path = 'input/go-basic.obo'

# -N: only re-download when the server copy is newer than the local one.
os.system('wget -N -P input/ http://purl.obolibrary.org/obo/go/go-basic.obo')

# Build both lookup directions: namespace -> ids and id -> namespace.
go_type_to_go, go_to_go_type = map_go_to_go_type(obo_path)

--2023-06-18 23:35:53--  http://purl.obolibrary.org/obo/go/go-basic.obo
Resolving purl.obolibrary.org (purl.obolibrary.org)... 104.18.13.166, 104.18.12.166, 2606:4700::6812:da6, ...
Connecting to purl.obolibrary.org (purl.obolibrary.org)|104.18.13.166|:80... connected.
HTTP request sent, awaiting response... 302 Found
Location: http://current.geneontology.org/ontology/go-basic.obo [following]
--2023-06-18 23:35:53--  http://current.geneontology.org/ontology/go-basic.obo
Resolving current.geneontology.org (current.geneontology.org)... 204.246.191.83, 204.246.191.71, 204.246.191.49, ...
Connecting to current.geneontology.org (current.geneontology.org)|204.246.191.83|:80... connected.
HTTP request sent, awaiting response... 304 Not Modified
File ‘input/go-basic.obo’ not modified on server. Omitting download.



In [3]:
# Download and decompress the human GO annotation file (GAF), unless a
# decompressed copy is already on disk.
go_gaf_url = 'http://geneontology.org/gene-associations/goa_human.gaf.gz'
# NOTE(review): this is named `input_folder` but points at 'output/', while
# the annotation-parsing cell reads 'input/goa_human.gaf' — confirm which
# directory is intended.
input_folder = 'output/'

# Derive the paths up front so the existence check never touches an
# undefined name on a fresh kernel.
go_gaf_file = go_gaf_url.split('/')[-1]
go_gaf_path = os.path.join(input_folder, go_gaf_file)

# gunzip removes the .gz archive, so the decompressed file is the only
# reliable marker that the download has already happened.
go_gaf_unzipped_path = go_gaf_path[:-len('.gz')]

if not os.path.exists(go_gaf_unzipped_path):
    os.system(f'wget -N -P {input_folder} {go_gaf_url}')
    os.system(f'gunzip {go_gaf_path}')

--2023-06-18 23:35:53--  http://geneontology.org/gene-associations/goa_human.gaf.gz
Resolving geneontology.org (geneontology.org)... 34.233.67.155
Connecting to geneontology.org (geneontology.org)|34.233.67.155|:80... connected.
HTTP request sent, awaiting response... 302 Found
Location: http://current.geneontology.org/annotations/goa_human.gaf.gz [following]
--2023-06-18 23:35:54--  http://current.geneontology.org/annotations/goa_human.gaf.gz
Resolving current.geneontology.org (current.geneontology.org)... 204.246.191.83, 204.246.191.71, 204.246.191.49, ...
Connecting to current.geneontology.org (current.geneontology.org)|204.246.191.83|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 11574816 (11M) [application/gzip]
Saving to: ‘output/goa_human.gaf.gz’

     0K .......... .......... .......... .......... ..........  0% 6.59M 2s
    50K .......... .......... .......... .......... ..........  0% 8.16M 2s
   100K .......... .......... .......... .......... .....

  5000K .......... .......... .......... .......... .......... 44% 6.56M 1s
  5050K .......... .......... .......... .......... .......... 45% 8.02M 1s
  5100K .......... .......... .......... .......... .......... 45% 6.29M 1s
  5150K .......... .......... .......... .......... .......... 46% 8.06M 1s
  5200K .......... .......... .......... .......... .......... 46% 6.24M 1s
  5250K .......... .......... .......... .......... .......... 46% 8.83M 1s
  5300K .......... .......... .......... .......... .......... 47% 8.81M 1s
  5350K .......... .......... .......... .......... .......... 47% 8.20M 1s
  5400K .......... .......... .......... .......... .......... 48% 5.94M 1s
  5450K .......... .......... .......... .......... .......... 48% 8.03M 1s
  5500K .......... .......... .......... .......... .......... 49% 6.63M 1s
  5550K .......... .......... .......... .......... .......... 49% 7.85M 1s
  5600K .......... .......... .......... .......... .......... 49% 6.18M 1s
  5650K ....

In [4]:
# Fetch the human GO annotations (GAF) and decompress them.
os.system('wget -N -P input/ http://geneontology.org/gene-associations/goa_human.gaf.gz')
# -f: overwrite an existing input/goa_human.gaf. Without it gunzip refuses to
# replace the file on a re-run and the stale annotations are silently reused.
os.system('gunzip -f input/goa_human.gaf.gz')
file = 'Protein_(UniProt)_2_GO_(GO).csv'

# GO ids seen in the GAF that cannot be mapped to a namespace.
bad_gos = set()

# open(..., 'w') does not create missing parent directories.
os.makedirs('output/protein2go', exist_ok=True)

with open(os.path.join('output/protein2go',file), 'w', newline='') as fin:
    writer = csv.writer(fin)
    writer.writerow(['Protein (UniProt)','GO (GO)','Relationship'])

    # Summary statistics: distinct proteins, count per relation, distinct
    # (namespace-prefixed) GO terms, and number of edges written.
    proteins_in_go, relations, go_terms, total = set(), dict(), set(), 0

    with open('input/goa_human.gaf') as f:
        for line in f:
            # GAF header/comment lines start with '!'. Detecting them by
            # marker keeps this correct when the header length changes.
            if line.startswith('!'):
                continue

            fields = line.rstrip('\n').split('\t')
            # Blank or truncated rows lack the columns read below.
            if len(fields) < 5:
                continue

            protein = fields[1]
            relation = fields[3]
            go_term = fields[4]

            # Skip ids that are missing from the ontology mapping (or have
            # no 'prefix:number' shape) and remember them for inspection.
            go_type = go_to_go_type.get(go_term)
            if go_type is None or ':' not in go_term:
                bad_gos.add(go_term)
                continue

            # Replace the 'GO' prefix with the term's namespace.
            go_term = go_type+':'+go_term.split(':')[1]

            proteins_in_go.add(protein)
            relations[relation] = relations.get(relation,0) + 1
            go_terms.add(go_term)
            total += 1

            writer.writerow(['UniProt:'+protein, go_term, relation])

print('Total Protein-GO Relationship:', total)
print('Proteins w/GO Term:', len(proteins_in_go))
print('GO Terms:', len(go_terms))
print('Protein-GO Relationships:', len(relations))
relations

--2023-06-18 23:35:54--  http://geneontology.org/gene-associations/goa_human.gaf.gz
Resolving geneontology.org (geneontology.org)... 34.233.67.155
Connecting to geneontology.org (geneontology.org)|34.233.67.155|:80... connected.
HTTP request sent, awaiting response... 302 Found
Location: http://current.geneontology.org/annotations/goa_human.gaf.gz [following]
--2023-06-18 23:35:55--  http://current.geneontology.org/annotations/goa_human.gaf.gz
Resolving current.geneontology.org (current.geneontology.org)... 204.246.191.83, 204.246.191.71, 204.246.191.49, ...
Connecting to current.geneontology.org (current.geneontology.org)|204.246.191.83|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 11574816 (11M) [application/gzip]
Saving to: ‘input/goa_human.gaf.gz’

     0K .......... .......... .......... .......... ..........  0% 8.71M 1s
    50K .......... .......... .......... .......... ..........  0% 9.16M 1s
   100K .......... .......... .......... .......... ......

Total Protein-GO Relationship: 632318
Proteins w/GO Term: 19647
GO Terms: 18815
Protein-GO Relationships: 22


{'enables': 289589,
 'located_in': 143673,
 'involved_in': 158762,
 'part_of': 16050,
 'NOT|enables': 463,
 'NOT|involved_in': 602,
 'is_active_in': 17897,
 'NOT|colocalizes_with': 8,
 'colocalizes_with': 958,
 'acts_upstream_of_or_within': 2418,
 'contributes_to': 1151,
 'NOT|located_in': 176,
 'NOT|part_of': 17,
 'acts_upstream_of_positive_effect': 78,
 'NOT|acts_upstream_of_or_within': 4,
 'acts_upstream_of': 401,
 'acts_upstream_of_negative_effect': 26,
 'acts_upstream_of_or_within_positive_effect': 23,
 'acts_upstream_of_or_within_negative_effect': 9,
 'NOT|contributes_to': 7,
 'NOT|acts_upstream_of_or_within_negative_effect': 1,
 'NOT|is_active_in': 5}

In [5]:
# Reload the edge list that was just written and publish a copy of it to
# each of the edge folders, without the pandas index column.
df = pd.read_csv(os.path.join('output/protein2go',file))
for edge_dir in ('output/edges', 'output/edges_to_use/'):
    df.to_csv(os.path.join(edge_dir, file), index=False)

# Show the last rows as a sanity check.
df.tail()

Unnamed: 0,Protein (UniProt),GO (GO),Relationship
632313,UniProt:P08069,biological_process:0071333,involved_in
632314,UniProt:P78545,biological_process:0006357,involved_in
632315,UniProt:Q8N6T3,biological_process:0032012,involved_in
632316,UniProt:Q8WV99,cellular_component:0005783,is_active_in
632317,UniProt:P11802,biological_process:0010033,involved_in


In [8]:
# Save the proteins that have at least one GO annotation.
# The context manager guarantees the file is flushed and closed; sorting
# makes the output identical across runs (set iteration order is not stable).
with open('output/protein2go/proteins_in_go.json','w') as fout:
    json.dump(sorted(proteins_in_go), fout)