In [1]:
from gobp import *

In [2]:
import pandas as pd
import re

In [3]:
df = pd.read_csv("top1000pairs_corr_positive.ver2.tsv", delimiter="\t", header=None)
df.columns=['Ensembl_ID', 'Corr', 'GOBP']

In [6]:
df['Ensembl_ID'].to_list()[:10]

["('ENSG00000001629', 'ENSG00000140718')",
 "('ENSG00000001629', 'ENSG00000110075')",
 "('ENSG00000002016', 'ENSG00000179532')",
 "('ENSG00000004534', 'ENSG00000115464')",
 "('ENSG00000001629', 'ENSG00000077254')",
 "('ENSG00000001629', 'ENSG00000036549')",
 "('ENSG00000001629', 'ENSG00000197323')",
 "('ENSG00000001629', 'ENSG00000122741')",
 "('ENSG00000001629', 'ENSG00000138081')",
 "('ENSG00000001629', 'ENSG00000186153')"]

In [13]:
def get_unique_ensembl_ids(df, colname='Ensembl_ID'):
    
    genes = []
    column_list = df[colname].to_list()
    for item in column_list:
        pair = item.strip("()'").split(", ")
        genes.extend([pair[0].strip("'"), pair[1].strip("'")])
    
    # Remove duplicates
    unique_genes = list(set(genes))
    
    return unique_genes

In [15]:
ens_ids = get_unique_ensembl_ids(df)

In [17]:
uniprot_ids = []
for id in ens_ids:
    uniprot_id = ensembl_id_to_uniprot_id(id)
    print(uniprot_id)
    uniprot_ids.append(uniprot_id)

F8VV11
Q92833
C9JEQ8
A0A8V8TNF6
Q8N3R9
F5H744
G3V3R1
A2A2N6
A0A1B0GWE0
O94864
Q9NZM6
A0A0C4DG35
Q7L5Y6
C9J250
A0A024R8C8
Q8N5S9
E7EQ64
Q96Q42
C9JZR9
M0QYA8
A0A140VKF2
E9PK08
F8WA85
A0A384MDP4
O43739
A6NN97
J3QRE5
H0YMP1
A0A8V8TN57
J3QTH6
Q14527
E9PLM6
H0Y5D2
Q75T13
K7EMH5
B4DI05
C9JJN9
C9J2I1
F5GWF6
A0A7P0Z419
H0Y6Y8
A0A804HJC7
H7BZ89
Q9C0I9
F8W7U5
P50991
Q96PY5
Q96T51
E9PC54
Q5QPN5
F2Z3M9
D6RH21
A0A3B3ISV4
F8VSM8
H3BRS3
A0A7I2V657
Q7L8W6
A0A7P0T9I0
E7EVN4
Q96S59
B5MDU6
K7EIM2
E5RFI6
A0A590UK14
S4R329
Q7Z6K3
M0QYR3
H0Y8U3
A0A087WZ14
Q6P088
Q08945
E9PEP7
D6RGZ3
Q8NFH8
B0QZZ3
Q2M3G0
Q8TEA1
A0A0A0MRQ6
Q9UJJ7
G3V5K2
A0A8V8TRI2
S4R461
Q96QS3
P23511
Q549M8
M0R0Q7
O75879
Q9H8H0
Q9Y216
Q8N6R0
E9PQI2
Q9BYT3
C9J3D7
Q9BX95
O75175
F5GX32
A0A384MED8
A0A8I5KWA9
E9PHV4
Q9ULU4
Q8NEM1
H0Y5T4
Q96PZ0
Q5QPJ9
Q8WU76
Q86V24
Q9P2D3
A0A8V8TQL1
K7ENU6
E9PIT6
O00453
Q5JT61
B7ZAA0
Q9Y4W2
P08579
Q5EG05
Q9H7T9
P40227
P43034
H0Y4J3
P08514
Q8WU10
A0AAA9YHD0
K7EKN9
E9PL67
O75132
Q502X0
C9JWV5
A0A1W2PP08
K7ENP2
J3QLR7

In [21]:
assert len(uniprot_ids) == len(ens_ids)

In [33]:
unidict = dict()

In [23]:
test_id = uniprot_ids[0]

In [29]:
gobp=get_gene_go_terms(test_id)

Querying GO terms for gene: F8VV11


In [31]:
gobp

['GO:0019752']

In [34]:
counts = get_term_gene_counts(gobp)

Count for GO:0019752: 18912482


In [35]:
unidict[test_id] = counts

In [36]:
unidict

{'F8VV11': {'GO:0019752': 18912482}}

In [42]:
uni_small = uniprot_ids[:10]

In [39]:
def create_count_dict(ids) -> dict:
    unidict=dict()
    for id in ids:
        print(id)
        gobp=get_gene_go_terms(id)
        print(gobp)
        counts = get_term_gene_counts(gobp)
        print(counts)
        unidict[id]=counts

    return unidict

In [44]:
count_dict = create_count_dict(uni_small)

F8VV11
Querying GO terms for gene: F8VV11
['GO:0019752']
Count for GO:0019752: 18912482
{'GO:0019752': 18912482}
Q92833
Querying GO terms for gene: Q92833
['GO:0000122', 'GO:0001889', 'GO:0006325', 'GO:0006338', 'GO:0007417', 'GO:0008283', 'GO:0010468', 'GO:0010614', 'GO:0030154', 'GO:0042127', 'GO:0045892', 'GO:0048536', 'GO:0048538', 'GO:0048863', 'GO:0048863', 'GO:0060038', 'GO:0060044', 'GO:0060816', 'GO:0140718', 'GO:1902682', 'GO:1990830']
Count for GO:0000122: 351311
Count for GO:0001889: 18722
Count for GO:0006325: 8542087
Count for GO:0006338: 7602520
Count for GO:0007417: 297532
Count for GO:0008283: 445941
Count for GO:0010468: 29256018
Count for GO:0010614: 1397
Attempt 1 failed for GO:0030154: 500 Server Error:  for url: https://www.ebi.ac.uk/QuickGO/services/annotation/search?goId=GO%3A0030154&limit=0&facetField=geneProductId
Attempt 2 failed for GO:0030154: 500 Server Error:  for url: https://www.ebi.ac.uk/QuickGO/services/annotation/search?goId=GO%3A0030154&limit=0&face

In [45]:
count_dict

{'F8VV11': {'GO:0019752': 18912482},
 'Q92833': {'GO:0000122': 351311,
  'GO:0001889': 18722,
  'GO:0006325': 8542087,
  'GO:0006338': 7602520,
  'GO:0007417': 297532,
  'GO:0008283': 445941,
  'GO:0010468': 29256018,
  'GO:0010614': 1397,
  'GO:0030154': 0,
  'GO:0042127': 341630,
  'GO:0045892': 1838033,
  'GO:0048536': 3398,
  'GO:0048538': 4778,
  'GO:0048863': 65656,
  'GO:0060038': 5817,
  'GO:0060044': 725,
  'GO:0060816': 1317,
  'GO:0140718': 8463,
  'GO:1902682': 83,
  'GO:1990830': 9250},
 'C9JEQ8': {'GO:0042254': 4694918},
 'A0A8V8TNF6': {'GO:0015031': 5296683, 'GO:0016192': 3352605},
 'Q8N3R9': {'GO:0002011': 10757,
  'GO:0007009': 175705,
  'GO:0008104': 7111886,
  'GO:0010467': 0,
  'GO:0016332': 130,
  'GO:0017015': 20200,
  'GO:0021954': 12419,
  'GO:0021987': 16128,
  'GO:0032287': 1483,
  'GO:0032288': 1858,
  'GO:0035750': 102,
  'GO:0045197': 45000,
  'GO:0048699': 967347,
  'GO:0072659': 140694},
 'F5H744': {},
 'G3V3R1': {},
 'A2A2N6': {},
 'A0A1B0GWE0': {'GO:000