In [30]:
import pandas as pd
import numpy as np
import pickle as pkl
from scipy.stats import fisher_exact
# NOTE(review): absolute, machine-specific data directory -- the notebook only
# runs on a machine where this folder exists. Consider a configurable base dir.
path = 'C:/Users/alber/OneDrive/Documenti/universita/DATA SCIENCE/Biological Data/PROJECT/'

# Original dataset: headerless file; column 0 is collected into a set.
with open(path + 'Original_dataset.txt', 'r') as original_file:
    original_dataset = pd.read_csv(original_file, header=None)
original_proteins = set(original_dataset[0])

# Human GO annotation table: first column is the index, every value is read as str.
with open(path + 'GO_human_dataset', 'r') as go_file:
    human_GO_ds = pd.read_csv(go_file, index_col=0, dtype=str)

# Architecture datasets stored as a pickle.
# NOTE(review): pickle.load can execute arbitrary code; only load trusted files.
with open(path + 'ARCHITECTURE/architecture_datasets.pkl', 'rb') as arch_file:
    arch_datasets = pkl.load(arch_file)

# PDB dataset: headerless, no index column, every value is read as str.
with open(path + 'PDB_dataset.txt', 'r') as pdb_file:
    pdb_dataset = pd.read_csv(pdb_file, index_col=None, dtype=str, header=None)

# STRING dataset: same layout options as the PDB dataset.
with open(path + 'String_dataset.txt', 'r') as string_file:
    string_dataset = pd.read_csv(string_file, index_col=None, dtype=str, header=None)

human_GO_ds.head()

Unnamed: 0,Entry,GO_id,Go_description,Length,Pfam_domains
0,Q8TE23,16021,integral component of membrane,839,PF00003;PF01094;PF07562;
1,Q8TE23,5887,integral component of plasma membrane,839,PF00003;PF01094;PF07562;
2,Q8TE23,5886,plasma membrane,839,PF00003;PF01094;PF07562;
3,Q8TE23,43235,receptor complex,839,PF00003;PF01094;PF07562;
4,Q8TE23,1903767,sweet taste receptor complex,839,PF00003;PF01094;PF07562;


In [2]:
def get_GO_ids(ds, proteins):
    """Return the set of GO ids annotated to the given proteins.

    Parameters
    ----------
    ds : pandas.DataFrame
        Annotation table with at least the columns 'Entry' and 'GO_id'.
    proteins : iterable
        'Entry' values to keep; rows with any other 'Entry' are ignored.

    Returns
    -------
    set
        The distinct, non-missing 'GO_id' values of the selected rows.
    """
    # Keep only the GO ids of rows belonging to the requested proteins.
    selected_ids = ds.loc[ds['Entry'].isin(proteins), 'GO_id']
    # dropna() removes every missing value. A plain `np.nan in some_set` test
    # relies on object identity (NaN != NaN), so it can miss NaNs that are not
    # the np.nan singleton itself.
    return set(selected_ids.dropna())

def get_frequencies(human_GO_ds, GO_ids, proteins):
    """Count how many annotation rows each GO id has among the given proteins.

    Parameters
    ----------
    human_GO_ds : pandas.DataFrame
        Annotation table with at least the columns 'Entry' and 'GO_id'.
    GO_ids : iterable
        GO ids to count; every one of them becomes a key of the result.
    proteins : iterable
        'Entry' values that define the group being counted.

    Returns
    -------
    dict
        Maps each GO id to the number of rows with that 'GO_id' whose 'Entry'
        is in `proteins` (0 when there is none). Rows are counted, so a
        repeated (Entry, GO_id) pair contributes more than once.
    """
    in_group = human_GO_ds['Entry'].isin(proteins)
    # Count all GO ids in a single pass instead of re-filtering the table once
    # per GO id; a plain dict avoids any positional fallback in Series lookups.
    counts = human_GO_ds.loc[in_group, 'GO_id'].value_counts().to_dict()
    return {GO_id: int(counts.get(GO_id, 0)) for GO_id in GO_ids}

def get_all_proteins(human_GO_ds):
    """Return the distinct values of the 'Entry' column as a set."""
    entries = human_GO_ds['Entry'].values
    return {entry for entry in entries}

def remove_blank_space(string):
    """Return `string` without leading and trailing space characters.

    Only the space character ' ' is removed (tabs and newlines are kept).
    Empty and all-space strings yield '' instead of raising IndexError, which
    the previous index-based loops did.
    """
    return string.strip(' ')

def get_description(GO_id, human_GO_ds):
    """Return the description of a GO id, stripped of surrounding spaces.

    Parameters
    ----------
    GO_id : object
        Value compared against the 'GO_id' column.
    human_GO_ds : pandas.DataFrame
        Annotation table with at least the columns 'GO_id' and 'Go_description'.

    Returns
    -------
    str or None
        The first non-missing 'Go_description' (in table order) among the rows
        with this GO id, with leading/trailing spaces removed; None when the GO
        id has no non-missing description.
    """
    matches = human_GO_ds.loc[human_GO_ds['GO_id'] == GO_id, 'Go_description'].dropna()
    if matches.empty:
        # Unknown GO id, or only missing descriptions: nothing to report.
        return None
    # Taking the first row makes the choice deterministic when a GO id carries
    # more than one distinct description.
    return str(matches.iloc[0]).strip(' ')



def Fisher_test(n_a, n_b, freq_a, freq_b, GO_ids, human_ds,
                freq_a_label='Original_freq', freq_b_label='Human_freq'):
    """Run Fisher's exact test for every GO id, comparing group a with group b.

    For each GO id the 2x2 table passed to `fisher_exact` is::

        [[freq_a[GO_id], n_a - freq_a[GO_id]],
         [freq_b[GO_id], n_b - freq_b[GO_id]]]

    Parameters
    ----------
    n_a, n_b : int
        Sizes of group a and group b.
    freq_a, freq_b : mapping
        GO id -> count in group a / group b; must contain every id in `GO_ids`.
    GO_ids : iterable
        GO ids to test; the result has one row per id, in iteration order.
    human_ds : pandas.DataFrame
        Table with the columns 'GO_id' and 'Go_description', used to label rows.
    freq_a_label, freq_b_label : str, optional
        Column names for the two count columns. The defaults keep the original
        names; pass other labels when the groups are not "Original" / "Human".

    Returns
    -------
    pandas.DataFrame
        Columns: 'GO_id', `freq_a_label`, `freq_b_label`, 'Odds_ratio',
        'p-value', 'GO_description' (None when no description is available).

    Raises
    ------
    KeyError
        If a GO id is missing from `freq_a` or `freq_b`.
    ValueError
        Raised by `fisher_exact` when a table cell is negative, i.e. when a
        count exceeds its group size.

    NOTE(review): the notebook calls this with group b = all human proteins,
    which also contain the rows counted for group a, so the two table rows are
    not disjoint groups -- confirm this is the intended design.
    """
    # Build the GO id -> description lookup once, instead of filtering the
    # whole table for every GO id. The first non-missing description wins.
    described = (human_ds
                 .dropna(subset=['GO_id', 'Go_description'])
                 .drop_duplicates(subset='GO_id'))
    descriptions = {
        go_id: str(desc).strip(' ')
        for go_id, desc in zip(described['GO_id'], described['Go_description'])
    }

    records = []
    for GO_id in GO_ids:
        count_a = freq_a[GO_id]
        count_b = freq_b[GO_id]
        table = np.array([[count_a, n_a - count_a],
                          [count_b, n_b - count_b]])
        odds_ratio, pvalue = fisher_exact(table)
        records.append({
            'GO_id': GO_id,
            freq_a_label: count_a,
            freq_b_label: count_b,
            'Odds_ratio': odds_ratio,
            'p-value': pvalue,
            'GO_description': descriptions.get(GO_id),
        })

    # Explicit column list keeps the layout stable even when GO_ids is empty.
    columns = ['GO_id', freq_a_label, freq_b_label,
               'Odds_ratio', 'p-value', 'GO_description']
    return pd.DataFrame(records, columns=columns)

### FISHER TEST: ORIGINAL VS HUMAN

In [3]:
# Background group: every protein ('Entry') present in the human GO table.
human_proteins = get_all_proteins(human_GO_ds)

# GO ids annotated to the original proteins.
original_GO_ids = get_GO_ids(human_GO_ds, original_proteins)

# Occurrences of those GO ids among the original proteins ...
original_frequences = get_frequencies(human_GO_ds, original_GO_ids, original_proteins)
# ... and among all human proteins: for the human frequencies we count the
# appearances of the same GO ids across every protein of the table.
human_frequences = get_frequencies(human_GO_ds, original_GO_ids, human_proteins)

In [4]:
# Sanity check: both dictionaries were built from the same GO id set
# (original_GO_ids), so they are expected to have the same number of keys.
print(len(human_frequences) == len(original_frequences))

True


In [7]:
# Fisher's exact test per GO id: original proteins (group a) vs all human
# proteins (group b). The resulting table is displayed but not stored.
Fisher_test(len(original_proteins), len(human_proteins),original_frequences,human_frequences, original_GO_ids, human_GO_ds)

Unnamed: 0,GO_id,Original_freq,Human_freq,Odds_ratio,p-value,GO_description
0,0001736,2,14,57.011204,0.000775,establishment of planar polarity
1,0060122,1,17,23.020362,0.045721,inner ear receptor cell stereocilium organization
2,0010754,1,6,65.259615,0.018030,negative regulation of cGMP-mediated signaling
3,0006974,1,244,1.585987,0.473004,cellular response to DNA damage stimulus
4,0000785,1,101,3.858720,0.233364,chromatin
...,...,...,...,...,...,...
532,0002091,1,11,35.587413,0.030713,negative regulation of receptor internalization
533,0010596,1,22,17.784091,0.058053,negative regulation of endothelial cell migration
534,0060173,1,31,12.615385,0.079858,limb development
535,0016525,1,86,4.535107,0.202744,negative regulation of angiogenesis


### FISHER TEST: ARCHITECTURE VS ORIGINAL

In [12]:
# In this case the proteins come from each architecture dataset, where every
# dataset is defined by a different combination of Pfam entries (its key).
# The per-architecture GO ids and frequencies are computed in the next cell.
arch_proteins = arch_datasets
pfam_keys = arch_proteins.keys()


In [13]:
# Per-architecture inputs, all keyed by pfam_key:
#   arch_GO_ids         -> GO ids found for that architecture's proteins
#   arch_frequences     -> counts of those GO ids within the architecture
#   original_frequences -> counts of the same GO ids within the original proteins
# NOTE(review): this rebinds `original_frequences`, which was a flat
# {GO_id: count} dict in the ORIGINAL VS HUMAN section, to a dict of dicts;
# re-running that earlier Fisher_test cell after this one will fail.
arch_GO_ids = {
    pfam_key: get_GO_ids(human_GO_ds, arch_proteins[pfam_key])
    for pfam_key in pfam_keys
}
arch_frequences = {
    pfam_key: get_frequencies(human_GO_ds, arch_GO_ids[pfam_key], arch_proteins[pfam_key])
    for pfam_key in pfam_keys
}
original_frequences = {
    pfam_key: get_frequencies(human_GO_ds, arch_GO_ids[pfam_key], original_proteins)
    for pfam_key in pfam_keys
}

# One Fisher result table per architecture: architecture (group a) vs original
# dataset (group b). The count columns still carry Fisher_test's default labels
# 'Original_freq' / 'Human_freq', i.e. architecture / original counts here.
Architecture_f_tests = {
    pfam_key: Fisher_test(
        len(arch_datasets[pfam_key]),
        len(original_proteins),
        arch_frequences[pfam_key],
        original_frequences[pfam_key],
        arch_GO_ids[pfam_key],
        human_GO_ds,
    )
    for pfam_key in pfam_keys
}

In [90]:
#with open('Architecture_Fisher_tests.pkl','wb') as save_file:
    #pkl.dump(Architecture_f_tests,save_file)

### FISHER TEST: PDB VS HUMAN

In [25]:
# Proteins of the PDB dataset: distinct values of its first column.
pdb_proteins = set(pdb_dataset[0].values)

# GO ids annotated to those proteins, then their counts within the PDB group
# and within all human proteins (this rebinds `human_frequences`).
pdb_GO_ids = get_GO_ids(human_GO_ds, pdb_proteins)
pdb_frequences = get_frequencies(human_GO_ds, pdb_GO_ids, pdb_proteins)
human_frequences = get_frequencies(human_GO_ds, pdb_GO_ids, human_proteins)

In [27]:
# Fisher's exact test per GO id: PDB proteins (group a) vs all human proteins
# (group b). The 'Original_freq' column of the result holds the PDB counts.
pdb_fishert_results = Fisher_test(len(pdb_proteins), len(human_proteins),pdb_frequences,human_frequences, pdb_GO_ids, human_GO_ds)

In [28]:
# Preview the first rows of the PDB-vs-human results.
pdb_fishert_results.head()

Unnamed: 0,GO_id,Original_freq,Human_freq,Odds_ratio,p-value,GO_description
0,7589,1,9,29.0,0.037982,body fluid secretion
1,1736,1,14,18.638278,0.056435,establishment of planar polarity
2,51904,1,1,261.102564,0.007713,pigment granule transport
3,60122,3,17,47.252322,6e-05,inner ear receptor cell stereocilium organization
4,60956,1,8,32.626603,0.034248,endocardial cell differentiation


In [29]:
# Save the PDB results as CSV. The file name is relative, so it is written to
# the current working directory rather than under `path`.
pdb_fishert_results.to_csv('PDB_ds_fresults.txt')

### FISHER TEST: STRING VS HUMAN

In [33]:
# Proteins of the STRING dataset: distinct values of its first column.
string_proteins = set(string_dataset[0].values)

# GO ids annotated to those proteins, then their counts within the STRING group
# and within all human proteins (this rebinds `human_frequences`).
string_GO_ids = get_GO_ids(human_GO_ds, string_proteins)
string_frequences = get_frequencies(human_GO_ds, string_GO_ids, string_proteins)
human_frequences = get_frequencies(human_GO_ds, string_GO_ids, human_proteins)

In [34]:
# Fisher's exact test per GO id: STRING proteins (group a) vs all human proteins
# (group b). The 'Original_freq' column of the result holds the STRING counts.
string_fishert_results = Fisher_test(len(string_proteins), len(human_proteins),string_frequences,human_frequences, string_GO_ids, human_GO_ds)

In [35]:
# Preview the first rows of the STRING-vs-human results.
string_fishert_results.head()

Unnamed: 0,GO_id,Original_freq,Human_freq,Odds_ratio,p-value,GO_description
0,45616,2,17,26.308985,0.003324,regulation of keratinocyte differentiation
1,72686,1,88,2.504817,0.333919,mitotic spindle
2,1736,2,14,31.951334,0.002353,establishment of planar polarity
3,7275,2,440,0.995355,1.0,multicellular organism development
4,60122,3,17,39.901961,9.8e-05,inner ear receptor cell stereocilium organization


In [36]:
# Number of rows in the STRING results (one row per tested GO id).
print(len(string_fishert_results))

1156


In [37]:
# Save the STRING results as CSV. The file name is relative, so it is written
# to the current working directory rather than under `path`.
string_fishert_results.to_csv('String_ds_fresults.txt')