In [1]:
import pandas as pd
import numpy as np
# Read the original dataset (no header row); the values in its first column
# are kept as the set of proteins of interest.
with open('Original_dataset.txt', 'r') as original_file:
    original_dataset = pd.read_csv(original_file, header=None)
original_proteins = set(original_dataset[0])

# Read the GO table, using its first column as the index and loading every
# remaining column as a string.
with open('GO_human_dataset', 'r') as go_file:
    human_GO_ds = pd.read_csv(go_file, index_col=0, dtype=str)
human_GO_ds.head()

Unnamed: 0,Entry,GO_id,Go_description,Length,Pfam_domains
0,Q8TE23,16021,integral component of membrane,839,PF00003;PF01094;PF07562;
1,Q8TE23,5887,integral component of plasma membrane,839,PF00003;PF01094;PF07562;
2,Q8TE23,5886,plasma membrane,839,PF00003;PF01094;PF07562;
3,Q8TE23,43235,receptor complex,839,PF00003;PF01094;PF07562;
4,Q8TE23,1903767,sweet taste receptor complex,839,PF00003;PF01094;PF07562;


In [56]:
def get_GO_ids(human_GO_ds, proteins):
    """Return the set of GO ids found on the rows belonging to `proteins`.

    Parameters
    ----------
    human_GO_ds : pandas.DataFrame
        Table with at least the columns 'Entry' and 'GO_id'.
    proteins : iterable
        Values of 'Entry' whose rows should be considered.

    Returns
    -------
    set
        Distinct, non-missing 'GO_id' values of the selected rows. The set is
        empty when no row matches.
    """
    # Filter on the `proteins` argument itself, not on a notebook-level global,
    # so the function works for any protein collection it is given.
    selected_rows = human_GO_ds[human_GO_ds['Entry'].isin(proteins)]
    # dropna() discards missing ids whether or not any are present; a bare
    # set.remove(np.nan) would raise KeyError when there is nothing to remove.
    return set(selected_rows['GO_id'].dropna().tolist())

def get_frequencies(human_GO_ds, GO_ids, proteins):
    """Count, for each GO id, the rows of the selected proteins carrying it.

    Parameters
    ----------
    human_GO_ds : pandas.DataFrame
        Table with at least the columns 'Entry' and 'GO_id'.
    GO_ids : iterable
        GO ids to report on.
    proteins : iterable
        Values of 'Entry' whose rows should be counted.

    Returns
    -------
    dict
        Maps every element of `GO_ids` to the number of rows (an int) among
        the selected proteins whose 'GO_id' equals it; 0 when there is none.
    """
    sliced_GO_ds = human_GO_ds[human_GO_ds['Entry'].isin(proteins)]
    # Tally every GO id in one pass instead of re-scanning the frame with a
    # boolean mask for each requested id.
    counts = sliced_GO_ds['GO_id'].value_counts().to_dict()
    return {GO_id: int(counts.get(GO_id, 0)) for GO_id in GO_ids}

def get_all_proteins(human_GO_ds):
    """Return the distinct values of the 'Entry' column as a set."""
    entry_values = human_GO_ds['Entry'].values
    return {entry for entry in entry_values}

def remove_blank_space(string):
    """Return `string` without its leading and trailing space characters.

    Only the space character ' ' is removed; tabs, newlines and inner spaces
    are left untouched. Empty and all-space inputs yield '' instead of raising
    IndexError.
    """
    return string.strip(' ')

def get_description(GO_id, human_GO_ds):
    """Return the description recorded for `GO_id`, stripped of outer spaces.

    Parameters
    ----------
    GO_id : object
        Value to look up in the 'GO_id' column.
    human_GO_ds : pandas.DataFrame
        Table with at least the columns 'GO_id' and 'Go_description'.

    Returns
    -------
    str
        The first non-missing 'Go_description' (in row order) among the rows
        whose 'GO_id' equals `GO_id`.

    Raises
    ------
    IndexError
        If no row with this GO id has a description.
    """
    # Take the first match in row order: going through a set would make the
    # choice depend on hash ordering whenever several spellings exist.
    # Missing descriptions are dropped because they are not strings.
    descriptions = human_GO_ds.loc[human_GO_ds['GO_id'] == GO_id, 'Go_description'].dropna()
    if descriptions.empty:
        raise IndexError('no description found for GO id %r' % (GO_id,))
    return remove_blank_space(descriptions.iloc[0])
    

In [15]:
# Prepare the inputs of the enrichment test: the GO ids found for the original
# proteins, every protein present in the GO table, and the per-id row counts.
original_GO_ids = get_GO_ids(human_GO_ds, original_proteins)
human_proteins = get_all_proteins(human_GO_ds)
original_frequences = get_frequencies(human_GO_ds, original_GO_ids, original_proteins)
# The human counts use the same GO ids, tallied over all proteins of the table,
# so an id appearing on several different proteins is counted once per row.
human_frequences = get_frequencies(human_GO_ds, original_GO_ids, human_proteins)

In [17]:
# Sanity check: both frequency tables are keyed by the same GO ids, so their sizes must match.
print(len(original_frequences) == len(human_frequences))

True


In [58]:
from scipy.stats import fisher_exact

# Group sizes used to fill the "remainder" column of every 2x2 table.
n_original_proteins = len(original_proteins)
n_human_proteins = len(human_proteins)

# Column-oriented accumulator; the key order fixes the column order of the
# resulting DataFrame.
Fisher_test_stats = {
    'GO_id': [],
    'Original_freq': [],
    'Human_freq': [],
    'Odds_ratio': [],
    'p-value': [],
    'GO_description': [],
}

for GO_id in original_GO_ids:
    original_hits = original_frequences[GO_id]
    human_hits = human_frequences[GO_id]
    # Rows: original set, then whole human set. Columns: count for this GO id,
    # then group size minus that count.
    # NOTE(review): the second row is built from all human proteins, which
    # presumably include the original ones -- confirm that testing against the
    # whole set rather than its complement is intended.
    mat = np.array([
        [original_hits, n_original_proteins - original_hits],
        [human_hits, n_human_proteins - human_hits],
    ])
    GO_desc = get_description(GO_id, human_GO_ds)
    odds_ratio, pvalue = fisher_exact(mat)
    row = {
        'GO_description': GO_desc,
        'GO_id': GO_id,
        'Original_freq': original_hits,
        'Human_freq': human_hits,
        'Odds_ratio': odds_ratio,
        'p-value': pvalue,
    }
    for column, value in row.items():
        Fisher_test_stats[column].append(value)

Fisher_results = pd.DataFrame(Fisher_test_stats)

In [60]:
# Write the Fisher test table to disk as CSV (default settings, index included).
Fisher_results.to_csv(path_or_buf='HMM_original_ds_fresults.txt')