## Protein-GO

In [5]:
import os
import json
import csv
import pandas as pd
import json

### pre-PPI Proteins

In [2]:
# Read in proteins from various text files
def prots_from_txt(fname):
    '''
    Function:
    - Load the entries listed one per line in a pre-PPI protein text file
    Params:
    - fname (str): File name suffix. It is appended directly (no path
      separator) to the 'output/protein2protein/ppi_input_fromprotein' prefix.
    Returns:
    - set of str: The distinct lines of the file, with newline characters
      removed from both ends of each line.
    '''
    prefix = 'output/protein2protein/ppi_input_fromprotein'
    with open(prefix + fname) as handle:
        # Iterating the handle yields the same lines readlines() would
        return {row.strip('\n') for row in handle}

# Protein from all relationships pre-PPI (protein-protein interaction)
prot1 = prots_from_txt('drug.txt')
prot2 = prots_from_txt('genedisease.txt')
prot3 = prots_from_txt('smpdbpw.txt')
# NOTE(review): this reads the same file as prot2, so it adds nothing to the
# union below. Possibly a copy-paste slip for a different source; confirm.
prot4 = prots_from_txt('genedisease.txt')
prot5 = prots_from_txt('genedrug.txt')
prot6 = prots_from_txt('reactomepw.txt')
# Use a context manager so the JSON file handle is closed once parsed
with open('output/protein2gene/all_uniprot2entrez.json','r') as fin:
    prot7 = list(json.load(fin).keys())
# NOTE(review): prot7 is loaded but not merged into `proteins`; confirm whether
# that is intentional before adding it, since it would change downstream counts.
proteins = set().union(prot1, prot2, prot3, prot4, prot5, prot6)
# Set iteration order is arbitrary; sort so the written row is reproducible
# between runs. `proteins` itself stays a set for the later cells.
with open('output/protein2protein/ppi_input_mergedproteins.csv','w',newline='') as fout:
    csv.writer(fout).writerow(sorted(proteins))

## Protein-GO

In [6]:
def map_go_to_go_type(obo_path):
    '''
    Function:
    - Map each GO id to the namespace string written in its OBO stanza
      (e.g. 'biological_process', 'molecular_function', 'cellular_component'),
      and build the reverse lookup. The namespace is kept verbatim; it is not
      abbreviated to BP/MF/CC.
    Params:
    - obo_path (str): Path to the GO OBO file.
    Returns:
    - go_type_to_go (dict): namespace -> list of ids, in file order
    - go_to_go_type (dict): id -> namespace
    '''
    go_to_go_type = dict()
    go_type_to_go = dict()
    # Most recent 'id: ' value seen; None until the first stanza id is read
    go_id = None

    with open(obo_path) as fin:
        for line in fin:

            # GO Term
            if line.startswith('id: '):
                go_id = line.split(' ')[1].strip()

            # Ontology namespace of the most recently seen id
            elif line.startswith('namespace: '):
                # A namespace line with no preceding id cannot be attributed
                # to any term, so skip it instead of failing on an unset go_id
                if go_id is None:
                    continue
                ont_type = line.split(' ')[1].strip()
                go_to_go_type[go_id] = ont_type
                go_type_to_go.setdefault(ont_type, list()).append(go_id)

    return go_type_to_go, go_to_go_type

# Fetch the GO ontology file (wget -N only re-downloads when the remote copy
# is newer) and build the id <-> namespace lookups from it.
obo_path = 'input/go-basic.obo'
wget_status = os.system('wget -N -P input/ http://purl.obolibrary.org/obo/go/go-basic.obo')
if wget_status != 0:
    # os.system does not raise on failure, so check the exit status here.
    # A previously downloaded copy is still usable; only stop when none exists.
    if not os.path.exists(obo_path):
        raise RuntimeError(
            f'Download of go-basic.obo failed (wget exit status {wget_status}) '
            f'and no local copy exists at {obo_path}'
        )
    print(f'wget exited with status {wget_status}; using the existing copy at {obo_path}')
go_type_to_go, go_to_go_type = map_go_to_go_type(obo_path)

--2023-03-22 22:44:28--  http://purl.obolibrary.org/obo/go/go-basic.obo
Resolving purl.obolibrary.org (purl.obolibrary.org)... 3.223.180.112
Connecting to purl.obolibrary.org (purl.obolibrary.org)|3.223.180.112|:80... connected.
HTTP request sent, awaiting response... 302 Found
Location: http://current.geneontology.org/ontology/go-basic.obo [following]
--2023-03-22 22:44:28--  http://current.geneontology.org/ontology/go-basic.obo
Resolving current.geneontology.org (current.geneontology.org)... 18.161.6.53, 18.161.6.46, 18.161.6.21, ...
Connecting to current.geneontology.org (current.geneontology.org)|18.161.6.53|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 31004635 (30M) [text/obo]
Saving to: ‘input/go-basic.obo’

     0K .......... .......... .......... .......... ..........  0% 9.19M 3s
    50K .......... .......... .......... .......... ..........  0% 8.78M 3s
   100K .......... .......... .......... .......... ..........  0% 7.95M 3s
   150K .......... .

 11200K .......... .......... .......... .......... .......... 37%  323M 0s
 11250K .......... .......... .......... .......... .......... 37%  142M 0s
 11300K .......... .......... .......... .......... .......... 37% 44.4M 0s
 11350K .......... .......... .......... .......... .......... 37%  237M 0s
 11400K .......... .......... .......... .......... .......... 37%  178M 0s
 11450K .......... .......... .......... .......... .......... 37%  306M 0s
 11500K .......... .......... .......... .......... .......... 38%  140M 0s
 11550K .......... .......... .......... .......... .......... 38%  130M 0s
 11600K .......... .......... .......... .......... .......... 38%  133M 0s
 11650K .......... .......... .......... .......... .......... 38%  146M 0s
 11700K .......... .......... .......... .......... .......... 38%  342M 0s
 11750K .......... .......... .......... .......... .......... 38% 10.2M 0s
 11800K .......... .......... .......... .......... .......... 39%  311M 0s
 11850K ....

In [7]:
# Define the path explicitly so this cell also runs on a fresh kernel; no
# earlier cell assigns `go_gaf_path`.
go_gaf_path = 'output/goa_human.gaf.gz'
go_gaf_path

'output/goa_human.gaf.gz'

In [9]:
go_gaf_url = 'http://geneontology.org/gene-associations/goa_human.gaf.gz'
input_folder = 'output/'

# Derive the local paths up front so this cell does not depend on
# `go_gaf_path` lingering from an earlier kernel session.
go_gaf_file = go_gaf_url.split('/')[-1]
go_gaf_path = os.path.join(input_folder, go_gaf_file)
# Name gunzip gives the extracted file (archive name without '.gz')
go_gaf_unzipped = go_gaf_path[:-len('.gz')]

# Download only when neither the archive nor its extracted copy is present.
# os.path.exists avoids opening (and leaking) a file handle just to probe.
if not (os.path.exists(go_gaf_path) or os.path.exists(go_gaf_unzipped)):
    os.system(f'wget -N -P {input_folder} {go_gaf_url}')
    os.system(f'gunzip {go_gaf_path}')

--2023-03-29 23:06:05--  http://geneontology.org/gene-associations/goa_human.gaf.gz
Resolving geneontology.org (geneontology.org)... 34.233.67.155
Connecting to geneontology.org (geneontology.org)|34.233.67.155|:80... connected.
HTTP request sent, awaiting response... 302 Found
Location: http://current.geneontology.org/annotations/goa_human.gaf.gz [following]
--2023-03-29 23:06:05--  http://current.geneontology.org/annotations/goa_human.gaf.gz
Resolving current.geneontology.org (current.geneontology.org)... 18.161.6.101, 18.161.6.53, 18.161.6.46, ...
Connecting to current.geneontology.org (current.geneontology.org)|18.161.6.101|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 11323587 (11M) [application/gzip]
Saving to: ‘output/goa_human.gaf.gz’

     0K .......... .......... .......... .......... ..........  0% 5.00M 2s
    50K .......... .......... .......... .......... ..........  0% 4.84M 2s
   100K .......... .......... .......... .......... ..........  1% 

In [7]:
# Download the human GO annotation file (GAF) and unpack it in place.
os.system('wget -N -P input/ http://geneontology.org/gene-associations/goa_human.gaf.gz')
# -f: replace an existing input/goa_human.gaf. Without it gunzip does not
# overwrite the file on a re-run, so the old annotations would be parsed again.
os.system('gunzip -f input/goa_human.gaf.gz')
file = 'Protein_(UniProt)_2_GO_(GO).csv'
# GO ids found in the GAF that have no entry in go_to_go_type
bad_gos = set()

with open(os.path.join('output/protein2go',file), 'w', newline='') as fout:
    writer = csv.writer(fout)
    writer.writerow(['Protein (UniProt)','GO (GO)','Relationship'])

    # proteins_in_go: proteins with >= 1 written edge; relations: qualifier -> count;
    # go_terms: distinct prefixed GO terms written; total: number of rows written
    proteins_in_go, relations, go_terms, total = set(), dict(), set(), 0

    with open('input/goa_human.gaf') as f:
        for line in f:
            # GAF header/comment lines start with '!'. Checking the marker is
            # robust to header-length changes, unlike a fixed line count.
            if line.startswith('!'):
                continue

            line = line.split('\t')
            # Skip rows too short to hold the columns read below
            if len(line) < 5:
                continue
            protein = line[1]
            relation = line[3]
            go_term = line[4]

            # Explicit lookup instead of a bare except, so only a missing
            # (or malformed) GO id lands in bad_gos and real errors surface.
            go_type = go_to_go_type.get(go_term)
            if go_type is None or ':' not in go_term:
                bad_gos.add(go_term)
                continue

            # Swap the 'GO' prefix for the term's namespace,
            # e.g. 'GO:0006958' -> 'biological_process:0006958'
            go_term = go_type+':'+go_term.split(':')[1]

            proteins_in_go.add(protein)
            relations[relation] = relations.get(relation,0) + 1
            go_terms.add(go_term)
            total += 1

            writer.writerow(['UniProt:'+protein, go_term, relation])

print('Total Protein-GO Relationship:', total)
print('Proteins w/GO Term:', len(proteins_in_go))
print('GO Terms:', len(go_terms))
# len(relations) is the number of distinct relationship (qualifier) types
print('Protein-GO Relationships:', len(relations))
relations

bad_gos

--2023-03-22 22:44:38--  http://geneontology.org/gene-associations/goa_human.gaf.gz
Resolving geneontology.org (geneontology.org)... 34.233.67.155
Connecting to geneontology.org (geneontology.org)|34.233.67.155|:80... connected.
HTTP request sent, awaiting response... 302 Found
Location: http://current.geneontology.org/annotations/goa_human.gaf.gz [following]
--2023-03-22 22:44:38--  http://current.geneontology.org/annotations/goa_human.gaf.gz
Resolving current.geneontology.org (current.geneontology.org)... 18.161.6.53, 18.161.6.46, 18.161.6.21, ...
Connecting to current.geneontology.org (current.geneontology.org)|18.161.6.53|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 11323587 (11M) [application/gzip]
Saving to: ‘input/goa_human.gaf.gz’

     0K .......... .......... .......... .......... ..........  0%  395K 28s
    50K .......... .......... .......... .......... ..........  0% 3.37M 15s
   100K .......... .......... .......... .......... ..........  1% 1

  6400K .......... .......... .......... .......... .......... 58%  226M 0s
  6450K .......... .......... .......... .......... .......... 58% 26.1M 0s
  6500K .......... .......... .......... .......... .......... 59%  227M 0s
  6550K .......... .......... .......... .......... .......... 59%  189M 0s
  6600K .......... .......... .......... .......... .......... 60%  189M 0s
  6650K .......... .......... .......... .......... .......... 60% 34.0M 0s
  6700K .......... .......... .......... .......... .......... 61%  223M 0s
  6750K .......... .......... .......... .......... .......... 61%  158M 0s
  6800K .......... .......... .......... .......... .......... 61% 8.14M 0s
  6850K .......... .......... .......... .......... .......... 62%  250M 0s
  6900K .......... .......... .......... .......... .......... 62%  209M 0s
  6950K .......... .......... .......... .......... .......... 63%  204M 0s
  7000K .......... .......... .......... .......... .......... 63%  198M 0s
  7050K ....

Total Protein-GO Relationship: 629182
Proteins w/GO Term: 19631
GO Terms: 18871
Protein-GO Relationships: 22


{'GO:0000818',
 'GO:0004367',
 'GO:0005887',
 'GO:0006336',
 'GO:0006975',
 'GO:0016021',
 'GO:0016437',
 'GO:0031497',
 'GO:0031617',
 'GO:0032268',
 'GO:0034724',
 'GO:0034990',
 'GO:0034991',
 'GO:0035093',
 'GO:0039528',
 'GO:0050828',
 'GO:0052928',
 'GO:0102092',
 'GO:0102113',
 'GO:0106424',
 'GO:1990511'}

In [8]:
# Reload the Protein-GO edge list and publish a copy to each edge folder
df = pd.read_csv(os.path.join('output/protein2go',file))
for edge_dir in ('output/edges', 'output/edges to use/'):
    df.to_csv(os.path.join(edge_dir,file), index=False)
df.tail()

Unnamed: 0,Protein (UniProt),GO (GO),Relationship
629177,UniProt:P01860,biological_process:0006958,involved_in
629178,UniProt:Q9ULI4,cellular_component:0005874,is_active_in
629179,UniProt:Q6P1N0,cellular_component:0005634,is_active_in
629180,UniProt:Q9UQB3,cellular_component:0005912,is_active_in
629181,UniProt:Q8IVH8,molecular_function:0008349,enables


In [9]:
# Count the pre-PPI proteins that have at least one GO annotation
count = sum(1 for protein in proteins if protein in proteins_in_go)
# Scale to a percentage before rounding: rounding the fraction first and then
# multiplying by 100 re-introduces float noise (e.g. 92.49000000000001).
# An empty protein set reports 0.0 instead of dividing by zero.
percent = round(count / len(proteins) * 100, 2) if proteins else 0.0
print('Proteins with GO Term:', count, '/', len(proteins), '=', percent,'%')
# Size of the combined protein universe (pre-PPI proteins plus GO-annotated proteins)
print('Proteins', len(set(proteins).union(proteins_in_go)))

Proteins with GO Term: 17724 / 19164 = 92.49000000000001 %
Proteins 21071
