# GO Function Hierarchy/Ontology
Includes molecular function, biological process, and cellular component ontologies

In [1]:
import csv
import pandas as pd
import json
import os
from myutils import *
os.system('wget -N -P input/ http://purl.obolibrary.org/obo/go/go-basic.obo')

--2023-05-24 21:42:29--  http://purl.obolibrary.org/obo/go/go-basic.obo
Resolving purl.obolibrary.org (purl.obolibrary.org)... 3.223.180.112
Connecting to purl.obolibrary.org (purl.obolibrary.org)|3.223.180.112|:80... connected.
HTTP request sent, awaiting response... 302 Found
Location: http://current.geneontology.org/ontology/go-basic.obo [following]
--2023-05-24 21:42:29--  http://current.geneontology.org/ontology/go-basic.obo
Resolving current.geneontology.org (current.geneontology.org)... 204.246.191.49, 204.246.191.18, 204.246.191.83, ...
Connecting to current.geneontology.org (current.geneontology.org)|204.246.191.49|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 31087589 (30M) [text/obo]
Saving to: ‘input/go-basic.obo’

     0K .......... .......... .......... .......... ..........  0% 8.09M 4s
    50K .......... .......... .......... .......... ..........  0% 7.27M 4s
   100K .......... .......... .......... .......... ..........  0% 10.1M 4s
   150K 

0

In [2]:
# Properties of GO Terms in the original file
starts = set()
with open('input/go-basic.obo') as fin:
    for line in fin:
        start = line.split(': ')[0].strip('\n')
        starts.add(start)

In [3]:
# Convert GO obo file to dict
ID = ''
go_dict = dict()
with open('input/go-basic.obo') as fin:
    for line in fin:
        if line.startswith('id: '):
            ID = line.split('id: ')[1].strip('\n')
            continue
        if ': ' in line and ID != '':
            k = line.split(': ')[0]
            v = line.split(': ')[1].strip('\n')
            go_dict.setdefault(ID,dict()).setdefault(k,[]).append(v)

{'GO:0000001': {'name': ['mitochondrion inheritance'],
  'namespace': ['biological_process'],
  'def': ['"The distribution of mitochondria, including the mitochondrial genome, into daughter cells after mitosis or meiosis, mediated by interactions between mitochondria and the cytoskeleton." [GOC:mcc, PMID:10873824, PMID:11389764]'],
  'synonym': ['"mitochondrial inheritance" EXACT []'],
  'is_a': ['GO:0048308 ! organelle inheritance',
   'GO:0048311 ! mitochondrion distribution']},
 'GO:0000002': {'name': ['mitochondrial genome maintenance'],
  'namespace': ['biological_process'],
  'def': ['"The maintenance of the structure and integrity of the mitochondrial genome; includes replication and segregation of the mitochondrial chromosome." [GOC:ai, GOC:vw]'],
  'is_a': ['GO:0007005 ! mitochondrion organization']},
 'GO:0000003': {'name': ['reproduction'],
  'namespace': ['biological_process'],
  'alt_id': ['GO:0019952', 'GO:0050876'],
  'def': ['"The production of new individuals that cont

In [7]:
go_id_to_go_ont = dict()
for go_id, values in go_dict.items():
    go_ont_name = values['namespace'][0]
    go_id_to_go_ont[go_id] = go_ont_name
    
go_ontology_names = set(go_id_to_go_ont.values())
go_ontology_names.remove('external')

{'biological_process', 'cellular_component', 'molecular_function'}

In [69]:
file_name = 'CC_MF_BP_(GO)_2_CC_MF_BP_(GO).csv'
with open(f'output/go2go/{file_name}', 'w') as fout:  
    writer = csv.writer(fout)
    writer.writerow(['CC_MF_BP_(GO)', 'CC_MF_BP_(GO)', 'Relationship'])
    
    for go_id, values in go_dict.items():
        if not go_id.startswith('GO:'):
            continue
        go_id = go_id_to_go_ont[go_id]+':'+go_id.split('GO:')[1]
            
        # 'is_a' relationships
        try:
            is_a_gos = [go.split(' !')[0] for go in values['is_a']]
            for other_go in is_a_gos:
                ont_type = go_id_to_go_ont[other_go]
                other_go = ont_type+':'+other_go.split('GO:')[1]
            writer.writerow([go_id, other_go, '-is_a-'])
        except:
            pass

        # 'part_of', 'regulates', etc. relationships
        try:
            rel_gos = [r_g.split(' !')[0].split(' ') for r_g in values['relationship']]
            for rel, other_go in rel_gos:
                other_go = ont_type+':'+other_go.split('GO:')[1]
                writer.writerow([go_id, other_go, f'-{rel}->'])
        except:
            pass

In [None]:
for first, second in zip(df['CC_MF_BP_(GO)'], df['CC_MF_BP_(GO).1']):
    if first.split(':')[0] != second.split(':')[0]:
        print(first, second)

In [72]:
df = pd.read_csv(f'output/go2go/{file_name}')
df.value_counts('Relationship')

Relationship
-is_a-                     43005
-part_of->                  6802
-regulates->                3127
-negatively_regulates->     2708
-positively_regulates->     2695
dtype: int64

In [75]:
!grep -r "pharmgkb" *.ipynb

compound_to_compound_alignment.ipynb:    "db2keggdrug, db2keggcompound, db2oldttd, db2uniprot, db2pharmgkb, db2wikipedia = dict(), dict(), dict(), dict(), dict(), dict()\n",
compound_to_compound_alignment.ipynb:    "keggdrug2db, keggcompound2db, oldttd2db, uniprot2db, pharmgkb2db, wikipedia2db = dict(), dict(), dict(), dict(), dict(), dict()\n",
compound_to_compound_alignment.ipynb:    "        db2pharmgkb[db] = onts['PharmGKB']  \n",
compound_to_compound_alignment.ipynb:    "            pharmgkb2db.setdefault(pharm,set()).add(db)\n",
compound_to_compound_alignment.ipynb:    "pharmgkb2db = switch_dictset_to_dictlist(pharmgkb2db)\n",
compound_to_compound_alignment.ipynb:    "         db2keggcompound, db2oldttd, db2uniprot, db2pharmgkb,\n",
compound_to_compound_alignment.ipynb:    "         uniprot2db, pharmgkb2db, wikipedia2db]\n",
gene_to_disease-old_5-24-23.ipynb:    "with open('output/gene2disease/edges_gene-ASSOCIATED_WITH->disease_pharmgkb.csv','w') as fout:\n",
gene_to_dis

In [31]:
df = pd.read_csv(f'output/go2go/{file_name}')
df.to_csv(f'output/edges/{file_name}', index=False)
df.to_csv(f'output/edges to use/{file_name}', index=False)

In [71]:
for first, second in zip(df['CC_MF_BP_(GO)'], df['CC_MF_BP_(GO).1']):
    if first.split(':')[0] != second.split(':')[0]:
        print(first, second)

In [6]:
for go2go_type in go_ontology_names:
    cap_go2go_type = '_'.join([w.capitalize() for w in go2go_type.split('_')])
    
    # Output GO-[rel]->GO
    with open(f'output/go2go/{cap_go2go_type}_(GO)_2_{cap_go2go_type}_(GO).csv','w') as fout:
        writer = csv.writer(fout)
        writer.writerow([f'{cap_go2go_type} (GO)', f'{cap_go2go_type} (GO)','Relationship'])
        go2go = dict()
        rel_counts = dict()

        for go, values in go_dict.items():
            if values['namespace'][0] == go2go_type:
                
                for value in values:
                    ### Obsolete? ###
                    if value == 'is_obsolete' and go_dict[go][value] == ['true']:
                        continue
                    if value == 'replaced_by':
                        continue
                    if 'GO' not in go:
                        continue

                        
                    ### Relationships ###
                    if value == 'is_a':
                        rel_type = '-'+value+'->'
                        start_node = go.split('GO:')[1]
                        start_node = go2go_type+':'+start_node
                        end_nodes = go_dict[go][value]
                        for end_node in end_nodes:
                            end_node = end_node.split('GO:')[1]
                            end_node = go2go_type+':'+end_node
                            go2go.setdefault(start_node, dict()).setdefault(rel_type,[]).append(end_node.split(' !')[0])
                            writer.writerow([start_node, end_node.split(' !')[0], rel_type])
                            rel_counts[rel_type] = rel_counts.setdefault(rel_type,0) + 1

                    elif value == 'relationship':
                        rels = go_dict[go][value]
                        for rel in rels:
                            rel_type = '-'+rel.split(' ')[0]+'->'
                            start_node = go.split('GO:')[1]
                            start_node = go2go_type+':'+start_node
                            end_node = rel.split(' ')[1]
                            end_node = end_node.split('GO:')[1]
                            end_node = go2go_type+':'+end_node
                            go2go.setdefault(start_node, dict()).setdefault(rel_type,[]).append(end_node)
                            writer.writerow([start_node, end_node, rel_type])
                            rel_counts[rel_type] = rel_counts.setdefault(rel_type,0) + 1    

    df = pd.read_csv(f'output/go2go/{cap_go2go_type}_(GO)_2_{cap_go2go_type}_(GO).csv')
    df.to_csv(f'output/go2go/{cap_go2go_type}_(GO)_2_{cap_go2go_type}_(GO).csv', index=False)
    
    df.to_csv(f'output/go2go/{cap_go2go_type}_(GO)_2_{cap_go2go_type}_(GO).csv', index=False)
    df.to_csv(f'output/edges/{cap_go2go_type}_(GO)_2_{cap_go2go_type}_(GO).csv', index=False)
    df.to_csv(f'output/edges to use/{cap_go2go_type}_(GO)_2_{cap_go2go_type}_(GO).csv', index=False)
    
    
    print(go2go_type)
    display(rel_counts) # Relationship counts
    display(df.head(3))

molecular_function


{'-is_a->': 13736, '-part_of->': 11}

Unnamed: 0,Molecular_Function (GO),Molecular_Function (GO).1,Relationship
0,molecular_function:0000006,molecular_function:0005385,-is_a->
1,molecular_function:0000007,molecular_function:0005385,-is_a->
2,molecular_function:0000009,molecular_function:0000030,-is_a->


biological_process


{'-is_a->': 50938,
 '-regulates->': 3157,
 '-part_of->': 5018,
 '-negatively_regulates->': 2729,
 '-positively_regulates->': 2718}

Unnamed: 0,Biological_Process (GO),Biological_Process (GO).1,Relationship
0,biological_process:0000001,biological_process:0048308,-is_a->
1,biological_process:0000001,biological_process:0048311,-is_a->
2,biological_process:0000002,biological_process:0007005,-is_a->


cellular_component


{'-is_a->': 4676, '-part_of->': 1822}

Unnamed: 0,Cellular_Component (GO),Cellular_Component (GO).1,Relationship
0,cellular_component:0000015,cellular_component:1902494,-is_a->
1,cellular_component:0000015,cellular_component:0005829,-part_of->
2,cellular_component:0000109,cellular_component:0140513,-is_a->


In [61]:
is_a_gos

['GO:0018130',
 'GO:0034309',
 'GO:0042181',
 'GO:0120255',
 'GO:1901362',
 'GO:2001316']

## Node to Text

In [41]:
go2text = dict()

for the_go_term in go_dict:
    try:
        namespace = go_dict[the_go_term]['namespace'][0]
        if namespace in go_ontology_names:
            go_term = namespace+':'+the_go_term.split('GO:')[1]
            text_description = go_dict[the_go_term]['def'][0].split('\"')[1]
            go2text[go_term] = text_description
    except:
        continue
        
json.dump(go2text, open('output/nodes/node features/node2text/go2text_description.json','w'))