# This notebook is used to evaluate different methods [Discover, fishers, MEGSA]
# This is the main code for network centric epistasis evaluation

Based on sum of pvalues of neighbors - random sum for cg-all, cg-intact,cg-cg 

## The output is different from #3 because here we look at all posible cases between LHS, RHS + TP, FP, TN, FN

| Row  |  LHS      |  RHS      |  Which one is more signif?  |   Which one would be chosen?  | Result                     |
|------|-----------|-----------|-----------------------------|-------------------------------|----------------------------|
| 1    |  Sig      |  non-sig  |  LHS                        |  LHS                          |  Positive result           |
|  2   |  Sig      |  Sig      |  LHS                        |  LHS                          |  Positive result           |
|  3   |  Sig      |  Sig      |  RHS                        |  RHS                          |  Negative result           |
|  4   |  Non-sig  |  Non-sig  |  LHS                        |  None                         |  ? less neg compared to 6  |
|  5   |  Non-sig  |  Non-sig  |  RHS                        |  None                         |  ? less neg compared to 6  |
|  6   |  Non-sig  |  Sig      |  RHS                        |  RHS                          |  Negative result           |

In [1]:
%matplotlib inline 
import numpy as np 
import pandas as pd
import matplotlib.pyplot as plt
import os
import warnings
import gc
from tqdm.notebook import tqdm
from scipy.stats import pearsonr,spearmanr
from random import sample,choice
import random 
random.seed(1234)

pd.set_option('display.max_columns', None)

In [2]:
t = 20 # mutation threshold
c = 'COADREAD' # cancer type
coadread_strat_498 = True # if sample size is 498 or 559. the latter is the original patient count 
#where the former is the number of patients for whom we have the stratification data

# We have discover_strat versions for COADREAD and BRCA. For t20 all methods are available (memo unavailable for SKCM, STAD and UCEC)
# for t5 we have only discover, fishers and wext
methods = ['discover','fishers','megsa','memo','wext']#, 'megsa', 'memo']#'discover_strat','fishers', 'megsa']


inpath_mla = '../mutex_data/version 11 - ep data MLA logit unfiltered/' #
cosmic_infile = '../mutex_data/Census_allFri_Apr_26_12_49_57_2019.tsv'


if coadread_strat_498==True or c!='COADREAD':
## assign method paths to a dict
    dict_infile = {}
    dict_infile_intact = {}
    # setting file paths
    for m in methods:
        if m == 'discover':
            suffix = '{}_mutation_filtered_ep_data/{}_{}_result_mutations_all_genes_q1.0_normal_{}.txt'.format(m,c,m,t)
            suffix_intact = '{}_mutation_filtered_ep_data/{}_pairs_q1.0_normal_intact_filtered_subset{}.txt'.format(m,c,t)
        elif m == 'discover_strat':
            suffix = '{}_mutation_filtered_ep_data/{}_{}_result_mutations_all_genes_q1.0_stratified_{}.txt'.format('discover',c,'discover',t)
            suffix_intact = '{}_mutation_filtered_ep_data/{}_pairs_q1.0_stratified_intact_filtered_subset{}.txt'.format('discover',c,t)    
        else:
            suffix = '{}_mutation_filtered_ep_data/{}_{}_result_mutations_all_genes_{}.txt'.format(m,c,m,t)
            suffix_intact = '{}_mutation_filtered_ep_data/{}_{}_pairs_intact_filtered_subset{}.txt'.format(m,c,m,t)

        dict_infile[m] = '../mutex_data/' + suffix
        dict_infile_intact[m] = '../mutex_data/' + suffix_intact

else:
    ## assign method paths to a dict
    dict_infile = {}
    dict_infile_intact = {}
    for m in methods:
        if m == 'discover':
            suffix = '{}_mutation_filtered_ep_data/COADREAD_ORIG/{}_{}_result_mutations_all_genes_q1.0_normal_{}.txt'.format(m,c,m,t)
            suffix_intact = '{}_mutation_filtered_ep_data/COADREAD_ORIG/{}_pairs_q1.0_normal_intact_filtered_subset{}.txt'.format(m,c,t)
        elif m == 'discover_strat':
            suffix = '{}_mutation_filtered_ep_data/{}_{}_result_mutations_all_genes_q1.0_stratified_{}.txt'.format('discover',c,'discover',t)
            suffix_intact = '{}_mutation_filtered_ep_data/{}_pairs_q1.0_stratified_intact_filtered_subset{}.txt'.format('discover',c,t)    
        else:
            suffix = '{}_mutation_filtered_ep_data/COADREAD_ORIG/{}_{}_result_mutations_all_genes_{}.txt'.format(m,c,m,t)
            suffix_intact = '{}_mutation_filtered_ep_data/COADREAD_ORIG/{}_{}_pairs_intact_filtered_subset{}.txt'.format(m,c,m,t)

        dict_infile[m] = '../mutex_data/' + suffix
        dict_infile_intact[m] = '../mutex_data/' + suffix_intact



### Intact

In [3]:
intact_edge_file = '../mutex_data/intact_nodupl_edge_file.txt'
intact_index_file = '../mutex_data/intact_nodupl_index_file.txt'

with open(intact_index_file, 'r') as f:
    indices = {line.split()[0]:line.split()[1] for line in f.readlines()}

with open(intact_edge_file, 'r') as f:
    edges = [(indices[line.split()[0]].upper(),indices[line.split()[1]].upper()) for line in f.readlines()]
len(edges)

intact_genes_list = list(indices.values())
intact_genes_list[:5]

['""CHEBI', '100147744', '1B', '1EFV', '1KLA']

## Rare Genes

In [4]:
# consider investigating rare genes based on percentage
rare_genes_percentage = 2
infile_rare_genes = '../mutex_data/rare_genes/{}_t5_rarely_mutated_genes_{}_perc.txt'.format(c,rare_genes_percentage)

with open(infile_rare_genes) as f:
    rare_genes = [line.split()[0] for line in f.readlines()[1:]]

len(rare_genes)

6330

### COSMIC

In [5]:
with open(cosmic_infile,'r') as f:
    cosmic_genes = [line.split()[0].upper() for line in f.readlines()[1:]]
print(len(cosmic_genes))
cosmic_genes

723


['A1CF',
 'ABI1',
 'ABL1',
 'ABL2',
 'ACKR3',
 'ACSL3',
 'ACSL6',
 'ACVR1',
 'ACVR2A',
 'AFF1',
 'AFF3',
 'AFF4',
 'AKAP9',
 'AKT1',
 'AKT2',
 'AKT3',
 'ALDH2',
 'ALK',
 'AMER1',
 'ANK1',
 'APC',
 'APOBEC3B',
 'AR',
 'ARAF',
 'ARHGAP26',
 'ARHGAP5',
 'ARHGEF10',
 'ARHGEF10L',
 'ARHGEF12',
 'ARID1A',
 'ARID1B',
 'ARID2',
 'ARNT',
 'ASPSCR1',
 'ASXL1',
 'ASXL2',
 'ATF1',
 'ATIC',
 'ATM',
 'ATP1A1',
 'ATP2B3',
 'ATR',
 'ATRX',
 'AXIN1',
 'AXIN2',
 'B2M',
 'BAP1',
 'BARD1',
 'BAX',
 'BAZ1A',
 'BCL10',
 'BCL11A',
 'BCL11B',
 'BCL2',
 'BCL2L12',
 'BCL3',
 'BCL6',
 'BCL7A',
 'BCL9',
 'BCL9L',
 'BCLAF1',
 'BCOR',
 'BCORL1',
 'BCR',
 'BIRC3',
 'BIRC6',
 'BLM',
 'BMP5',
 'BMPR1A',
 'BRAF',
 'BRCA1',
 'BRCA2',
 'BRD3',
 'BRD4',
 'BRIP1',
 'BTG1',
 'BTK',
 'BUB1B',
 'C15ORF65',
 'C2ORF44',
 'CACNA1D',
 'CALR',
 'CAMTA1',
 'CANT1',
 'CARD11',
 'CARS',
 'CASC5',
 'CASP3',
 'CASP8',
 'CASP9',
 'CBFA2T3',
 'CBFB',
 'CBL',
 'CBLB',
 'CBLC',
 'CCDC6',
 'CCNB1IP1',
 'CCNC',
 'CCND1',
 'CCND2',
 'CCND3',


## MLA

In [6]:
# load MLA
if coadread_strat_498==True or c!='COADREAD':
    MLA_infile = '../mutex_data/MLA_ep_mutation_filtered_all_genes/{}_MLA_standardized.txt'.format(c)
else:
    MLA_infile = '../mutex_data/MLA_ep_mutation_filtered_all_genes/{}_MLA_standardized_559.txt'.format(c)
    
with open(MLA_infile, 'r') as f:
    MLA = {line.split()[0]: float(line.split()[1]) for line in f.readlines()}
MLA

{'A1BG': 4.261253658699028,
 'A1CF': 5.095042391780406,
 'A2M': 5.539871662596874,
 'A2ML1': 6.227034038924631,
 'A3GALT2': 2.9816064323348526,
 'A4GALT': 2.4760175602042125,
 'A4GNT': 4.84331572832302,
 'AAAS': 3.8960855861975774,
 'AACS': 5.043901737923377,
 'AADAC': 5.03358562235896,
 'AADACL2': 4.532736869473897,
 'AADACL3': 5.1002959015462705,
 'AADACL4': 4.290582726088209,
 'AADAT': -1.0668391382704936,
 'AAED1': 4.196797278762107,
 'AAGAB': 2.1675017166388217,
 'AAK1': 4.6752232377079235,
 'AAMDC': 0.7725909064700299,
 'AAMP': 1.5490260103495654,
 'AANAT': 1.4365307257245812,
 'AAR2': 1.3732952167625578,
 'AARD': 4.8171629815844526,
 'AARS': 5.574031588112374,
 'AARS2': 4.6664416631331385,
 'AARSD1': 0.670390127330707,
 'AASDH': 5.249347844256076,
 'AASDHPPT': 3.2230897103876734,
 'AASS': 5.228128901713833,
 'AATF': 4.494594513632189,
 'AATK': 4.277019451865018,
 'ABAT': 5.081292805626867,
 'ABCA1': 6.012651714665176,
 'ABCA10': 5.957033109633376,
 'ABCA12': 6.5712151043401645,


### NOTE: 35 COSMIC gENES HAVE LARGER OR EQUAL COSMIC NEIGHBORS THAN NON-COSMIC NEIGHBORS.
SOME DONT HAVE ANY NON-COSMIC NEIGHBORS

In [21]:
### FUNCTIONS

def chunks(list_of_genes,n=1000):
    """Seperate total genes into chunks for memory management"""
    for i in range(0,len(list_of_genes),n):
        yield list_of_genes[i:i+n]
        
def get_genes(filename):
    """get all genes in cohort, return a list"""
    with open(filename, 'r') as f:
        genes = set()
        for line in tqdm(f.readlines()[1:],desc='Counting total Genes'):
            genes.update(line.strip().split('\t')[1:3])

    return list(genes)   

def get_neighbors(genes, ref_edges):
    """create a dictionary of neighbors of genes. for each gene in the dict, all its neighbors
    will be present in the subdictionary.
    ref_edges = g1,g2 must be a part of the reference edges (ppi network edges)"""
    dict_neighbors = {}
    for g1,g2 in ref_edges:
        if g1 in genes and g2 in genes:
            if g1 not in dict_neighbors:
                dict_neighbors[g1] = set()
            if g2 not in dict_neighbors:
                dict_neighbors[g2] = set()

            dict_neighbors[g1].update([g2])
            dict_neighbors[g2].update([g1])

    return dict_neighbors

def get_cg_cg_genes(cohort_specific_genes, dict_neighbor,ref_genes=cosmic_genes):
    """get (cosmic gene --- cosmic gene) pairs"""
    set_cg_cg = set()
    for g in set.intersection(set(cohort_specific_genes),set(dict_neighbor), set(ref_genes)):
        if len(set.intersection(set(dict_neighbor[g]), set(ref_genes)))>0:
               set_cg_cg.update([g])
                
    return set_cg_cg

    
def count_sig_cosmic_pairs(filename, reference_genes=cosmic_genes,sig_threshold = 0.05):
    '''get the count of significant cosmic --- cosmic pairs'''
    count=0
    with open(filename, 'r') as f:
        for line in tqdm(f.readlines()[1:]):
            line=line.split()
            g1,g2,p = line[1],line[2],float(line[3])
            
            if (g1 in reference_genes or g2 in reference_genes) and p<sig_threshold:
                count+=1
    del line
    gc.collect()
    
    return count
    
###################################################################################################
## 1
## cosmic neighbors minus random non cosmic neighbors | int: interacting pairs
   

def get_sig_logpval_counts_cgcg_minus_cgnnb_single(d, neighbor_set,reference_genes=cosmic_genes,randiter=100,sig_threshold=-np.log(0.05),zero_threshold=0):
    '''
    First evaluation: for each CGC gene g, check all its neighbors. 
    If more CGC-CGC pairs are present, get the necessary statistics by comparing
    CGC-CGC pairs vs CGC-non neighbor pairs. 
    d: dictionary for single CGC g containing all its neighbors
    neighbor_set: all neighbors of g
    randiter: inner iteration
    
    '''
    d_cg_cg = {k:v for k,v in d.items() if k in reference_genes and k in neighbor_set}
    d_cg_nnb = {k:v for k,v in d.items() if k in reference_genes and k not in neighbor_set}
    
    if len(d_cg_cg)==0 or len(d_cg_cg)>len(d_cg_nnb):
        return (np.nan,np.nan,np.nan,np.nan,np.nan,np.nan,np.nan,np.nan,np.nan,np.nan,np.nan,np.nan,np.nan,np.nan,np.nan,np.nan,np.nan)

    else:
        count_sigLHS_nonsigRHS = [] #1
        count_sigLHS_sigRHS_LHS = [] #2
        count_sigLHS_sigRHS_RHS = [] #3
        count_nonsigLHS_sigRHS = [] #6
        count_nonsigLHS_nonsigRHS_LHS = [] #4
        count_nonsigLHS_nonsigRHS_RHS = [] #5
        sig_sum_RHS = []
        sum_RHS = []
        
        sig_sum_LHS = np.sum([v for v in d_cg_cg.values() if v>sig_threshold])
        sum_LHS = np.sum([v for v in d_cg_cg.values()])
        
        
        ##outer loop for 100
        for i in range(randiter):
            count_sigLHS_nonsigRHS_temp = 0 #1
            count_sigLHS_sigRHS_LHS_temp = 0 #2
            count_sigLHS_sigRHS_RHS_temp = 0 #3
            count_nonsigLHS_sigRHS_temp = 0 #6
            count_nonsigLHS_nonsigRHS_LHS_temp = 0 #4
            count_nonsigLHS_nonsigRHS_RHS_temp = 0 #5
            sig_sum_RHS_temp = 0
            sum_RHS_temp = 0
            
            #inner loop for all cgcg
            for cg in d_cg_cg:
                
                pval_cgcg = d_cg_cg[cg]
                
                rand_cg_nnb =choice(list(d_cg_nnb))
                pval_rand_cg_nnb = d_cg_nnb[rand_cg_nnb]
                sum_RHS_temp+=pval_rand_cg_nnb  #add rand pvals to sum_RHS_temp
                
                #count pos if rand non neighbor is smaller tahn significant LHS>RHS
                if pval_cgcg > sig_threshold:
                    
                    if pval_rand_cg_nnb < sig_threshold:
                        count_sigLHS_nonsigRHS_temp+=1
                    else:
                        sig_sum_RHS_temp+=pval_rand_cg_nnb
                        
                        if pval_cgcg > pval_rand_cg_nnb:
                            count_sigLHS_sigRHS_LHS_temp+=1
                        else:
                            count_sigLHS_sigRHS_RHS_temp+=1
                            
                else: #LHS not sig
                    if pval_rand_cg_nnb > sig_threshold:
                        sig_sum_RHS_temp+=pval_rand_cg_nnb
                        count_nonsigLHS_sigRHS_temp+=1
                    else:
                        
                        if pval_cgcg > pval_rand_cg_nnb:
                            count_nonsigLHS_nonsigRHS_LHS_temp+=1
                        else:
                            count_nonsigLHS_nonsigRHS_RHS_temp+=1         
                        
            count_sigLHS_nonsigRHS.append(count_sigLHS_nonsigRHS_temp) #1
            count_sigLHS_sigRHS_LHS.append(count_sigLHS_sigRHS_LHS_temp) #2
            count_sigLHS_sigRHS_RHS.append(count_sigLHS_sigRHS_RHS_temp) #3
            count_nonsigLHS_sigRHS.append(count_nonsigLHS_sigRHS_temp) #6
            count_nonsigLHS_nonsigRHS_LHS.append(count_nonsigLHS_nonsigRHS_LHS_temp) #4
            count_nonsigLHS_nonsigRHS_RHS.append(count_nonsigLHS_nonsigRHS_RHS_temp)#5
            sig_sum_RHS.append(sig_sum_RHS_temp)
            sum_RHS.append(sum_RHS_temp)
                        
        
        
#         med_idx = np.argsort(count_pos_sig)[len(count_pos_sig)//2]
        
        return len(d_cg_cg),np.median(count_sigLHS_nonsigRHS), np.median(count_sigLHS_sigRHS_LHS), np.median(count_sigLHS_sigRHS_RHS),\
        np.median(count_nonsigLHS_nonsigRHS_LHS),np.median(count_nonsigLHS_nonsigRHS_RHS), np.median(count_nonsigLHS_sigRHS),\
        sum_LHS,sig_sum_LHS, np.median(sum_RHS),np.median(sig_sum_RHS),\
        np.median(count_sigLHS_nonsigRHS)/float(len(neighbor_set)),\
        np.median(count_sigLHS_sigRHS_LHS)/float(len(neighbor_set)),\
        np.median(count_sigLHS_sigRHS_RHS)/float(len(neighbor_set)),\
        np.median(count_nonsigLHS_nonsigRHS_LHS)/float(len(neighbor_set)),\
        np.median(count_nonsigLHS_nonsigRHS_RHS)/float(len(neighbor_set)),\
        np.median(count_nonsigLHS_sigRHS)/float(len(neighbor_set))
    

       
        

###################################################################################################

## 2
## cosmic neighbors minus random non cosmic non neighbors | int: interacting pairs


def get_sig_logpval_counts_cgcg_minus_cgncgnb_single(d, neighbor_set,reference_genes=cosmic_genes,randiter=100,sig_threshold=-np.log(0.05),zero_threshold=0):
    '''
    Second evaluation: for each CGC gene g, check all its neighbors. 
    If more CGC-CGC pairs are present, get the necessary statistics by comparing
    CGC-CGC pairs vs CGC-non CGC neighbor pairs. 
    d: dictionary for single CGC g containing all its neighbors
    neighbor_set: all neighbors of g
    randiter: inner iteration
    sig_threshold: significance threshold for MEX values, commonly 0.05
    '''
    d_cg_cg = {k:v for k,v in d.items() if k in reference_genes and k in neighbor_set}
    d_cg_ncgnb = {k:v for k,v in d.items() if k not in reference_genes and k in neighbor_set}
    
    if len(d_cg_cg)==0 or len(d_cg_cg)>len(d_cg_ncgnb):
        return (np.nan,np.nan,np.nan,np.nan,np.nan,np.nan,np.nan,np.nan,np.nan,np.nan,np.nan,np.nan,np.nan,np.nan,np.nan,np.nan,np.nan)

    else:
        count_sigLHS_nonsigRHS = [] #1
        count_sigLHS_sigRHS_LHS = [] #2
        count_sigLHS_sigRHS_RHS = [] #3
        count_nonsigLHS_sigRHS = [] #6
        count_nonsigLHS_nonsigRHS_LHS = [] #4
        count_nonsigLHS_nonsigRHS_RHS = [] #5
        sig_sum_RHS = []
        sum_RHS = []
        
        sig_sum_LHS = np.sum([v for v in d_cg_cg.values() if v>sig_threshold])
        sum_LHS = np.sum([v for v in d_cg_cg.values()])
        
        ##outer loop for 100
        for i in range(randiter):
            count_sigLHS_nonsigRHS_temp = 0 #1
            count_sigLHS_sigRHS_LHS_temp = 0 #2
            count_sigLHS_sigRHS_RHS_temp = 0 #3
            count_nonsigLHS_sigRHS_temp = 0 #6
            count_nonsigLHS_nonsigRHS_LHS_temp = 0 #4
            count_nonsigLHS_nonsigRHS_RHS_temp = 0 #5
            sig_sum_RHS_temp = 0
            sum_RHS_temp = 0
            
            #inner loop for all cgcg
            for cg in d_cg_cg:
                
                pval_cgcg = d_cg_cg[cg]
                
                rand_cg_ncgnb =choice(list(d_cg_ncgnb))
                pval_rand_cg_ncgnb = d_cg_ncgnb[rand_cg_ncgnb]
                sum_RHS_temp+=pval_rand_cg_ncgnb
                
                #count pos if rand non neighbor is smaller than significant LHS>RHS
                if pval_cgcg > sig_threshold:
                    
                    if pval_rand_cg_ncgnb < sig_threshold:
                        count_sigLHS_nonsigRHS_temp+=1
                    else:
                        sig_sum_RHS_temp+=pval_rand_cg_ncgnb
                        
                        if pval_cgcg > pval_rand_cg_ncgnb:
                            count_sigLHS_sigRHS_LHS_temp+=1
                        else:
                            count_sigLHS_sigRHS_RHS_temp+=1
                            
                else: #RHS not sig
                    if pval_rand_cg_ncgnb > sig_threshold:
                        sig_sum_RHS_temp+=pval_rand_cg_ncgnb
                        count_nonsigLHS_sigRHS_temp+=1
                    else:
                        
                        if pval_cgcg > pval_rand_cg_ncgnb:
                            count_nonsigLHS_nonsigRHS_LHS_temp+=1
                        else:
                            count_nonsigLHS_nonsigRHS_RHS_temp+=1         
                        
            count_sigLHS_nonsigRHS.append(count_sigLHS_nonsigRHS_temp) #1
            count_sigLHS_sigRHS_LHS.append(count_sigLHS_sigRHS_LHS_temp) #2
            count_sigLHS_sigRHS_RHS.append(count_sigLHS_sigRHS_RHS_temp) #3
            count_nonsigLHS_sigRHS.append(count_nonsigLHS_sigRHS_temp) #6
            count_nonsigLHS_nonsigRHS_LHS.append(count_nonsigLHS_nonsigRHS_LHS_temp) #4
            count_nonsigLHS_nonsigRHS_RHS.append(count_nonsigLHS_nonsigRHS_RHS_temp)#5
            sig_sum_RHS.append(sig_sum_RHS_temp)
            sum_RHS.append(sum_RHS_temp)
        
        
#         med_idx = np.argsort(count_pos_sig)[len(count_pos_sig)//2]
        
        return len(d_cg_cg),np.median(count_sigLHS_nonsigRHS), np.median(count_sigLHS_sigRHS_LHS), np.median(count_sigLHS_sigRHS_RHS),\
        np.median(count_nonsigLHS_nonsigRHS_LHS),np.median(count_nonsigLHS_nonsigRHS_RHS), np.median(count_nonsigLHS_sigRHS),\
        sum_LHS,sig_sum_LHS, np.median(sum_RHS),np.median(sig_sum_RHS),\
        np.median(count_sigLHS_nonsigRHS)/float(len(neighbor_set)),\
        np.median(count_sigLHS_sigRHS_LHS)/float(len(neighbor_set)),\
        np.median(count_sigLHS_sigRHS_RHS)/float(len(neighbor_set)),\
        np.median(count_nonsigLHS_nonsigRHS_LHS)/float(len(neighbor_set)),\
        np.median(count_nonsigLHS_nonsigRHS_RHS)/float(len(neighbor_set)),\
        np.median(count_nonsigLHS_sigRHS)/float(len(neighbor_set))
    

###################################################################################################

# main function to run both evaluations
    
###################################################################################################      

def get_pvalues_single(filename, n=2000, reference_genes=cosmic_genes,pvalue_threshold=0.05,pvalue_position=3, ref_edges=edges,randiter=100):
    """main function to evaluate methods based on their pvalues.
    n: chunk size (genes to consider for each iteration)
    reference_genes: known driver genes (default is CGC)
    pvalue_threhsold: significance threhsold
    pvalue_position: column number from mutex result file where p-values are stored. Default is 3.
    ref_edges: PPI network edges.
    randiter: amount of iterations to account for randomization
    """

    #get genes
    genes = get_genes(filename=filename) 
    print('Total Genes:',len(genes))
    
    ##test
#     dict_cg_cg_test = {}
#     dict_ncg_ncg_test = {}
    
    ## dictionaries for pos/neg
    #1

    dict_pairs_cgcg_for_nnb = {}
    dict_nnb_sigLHS_nonsigRHS = {} #1
    dict_nnb_sigLHS_sigRHS_LHS = {} #2
    dict_nnb_sigLHS_sigRHS_RHS = {} #3
    dict_nnb_nonsigLHS_sigRHS = {} #6
    dict_nnb_nonsigLHS_nonsigRHS_LHS = {} #4
    dict_nnb_nonsigLHS_nonsigRHS_RHS = {} #5
    dict_nnb_sum_LHS = {}
    dict_nnb_sum_sig_LHS = {}
    dict_nnb_sum_RHS = {}
    dict_nnb_sum_sig_RHS = {}
    
    dict_norm_nnb_sigLHS_nonsigRHS = {} #1
    dict_norm_nnb_sigLHS_sigRHS_LHS = {} #2
    dict_norm_nnb_sigLHS_sigRHS_RHS = {} #3
    dict_norm_nnb_nonsigLHS_sigRHS = {} #6
    dict_norm_nnb_nonsigLHS_nonsigRHS_LHS = {} #4
    dict_norm_nnb_nonsigLHS_nonsigRHS_RHS = {} #5
    
    
    
    #2

    
    dict_pairs_cgcg_for_ncgnb = {}
    dict_ncgnb_sigLHS_nonsigRHS = {} #1
    dict_ncgnb_sigLHS_sigRHS_LHS = {} #2
    dict_ncgnb_sigLHS_sigRHS_RHS = {} #3
    dict_ncgnb_nonsigLHS_sigRHS = {} #6
    dict_ncgnb_nonsigLHS_nonsigRHS_LHS = {} #4
    dict_ncgnb_nonsigLHS_nonsigRHS_RHS = {} #5
    dict_ncgnb_sum_sig_LHS = {}
    dict_ncgnb_sum_LHS = {}
    dict_ncgnb_sum_sig_RHS = {} 
    dict_ncgnb_sum_RHS = {}    
    
    dict_norm_ncgnb_sigLHS_nonsigRHS = {} #1
    dict_norm_ncgnb_sigLHS_sigRHS_LHS = {} #2
    dict_norm_ncgnb_sigLHS_sigRHS_RHS = {} #3
    dict_norm_ncgnb_nonsigLHS_sigRHS = {} #6
    dict_norm_ncgnb_nonsigLHS_nonsigRHS_LHS = {} #4
    dict_norm_ncgnb_nonsigLHS_nonsigRHS_RHS = {} #5
    
    
    ## dictionaries for cg-ncg neighbor degrees
    dict_neighbors_degree_all = {}
    dict_neighbors_cg = {}
    dict_neighbors_ncg = {}
    
    ## filter intact to contain only these genes
    dict_neighbors = get_neighbors(genes,ref_edges=ref_edges)
    
    ## Cohort specific ref (COSMIC) genes
    cohort_ref_genes = set.intersection(set(reference_genes),set(genes),set(dict_neighbors))
    cg_cg_genes = get_cg_cg_genes(cohort_specific_genes=genes, dict_neighbor=dict_neighbors) 
    print('Cosmic Genes:',len(cohort_ref_genes))
    print('CG-CG:',len(cg_cg_genes))
    
    #group gene into chunks
    group_of_genes = list(chunks(genes, n=n))
    
    #read file and get min nonzero pvalue
    with open(filename, 'r') as f:
        lines = f.readlines()[1:]
        
        list_pvals = []
        for line in tqdm(lines):
            val = float(line.split()[pvalue_position])
            if val!=0:
                list_pvals.append(val)
        min_pval = min(list_pvals)
        del list_pvals
    print('min pval:',min_pval)

    ## groupwise computations
    count_g = 0
    
#     cp_sig_old=0
#     cn_sig_old=0
#     cp_sig_new=0
#     cn_sig_new=0

    for group in tqdm(group_of_genes,desc='group'):
        
        dict_temp = {g:{} for g in group}
        
        for line in tqdm(lines):
            line = line.strip().split('\t')
        
            g1 = line[1]
            g2 = line[2]
            
            ## added post memo run
            p_temp = float(line[pvalue_position])
#             p = -np.log2(float(line[pvalue_position]))
            
            if p_temp==0:
                if m=='wext':
                    p=0.0
                else:
                    p = -np.log(min_pval)
            else:
                p = -np.log(p_temp)
            
            if g1 in dict_temp and g2 not in dict_temp[g1]:
                dict_temp[g1][g2] = p
            if g2 in dict_temp and g1 not in dict_temp[g2]:
                dict_temp[g2][g1] = p
                

        count_s = []
        for g in tqdm(dict_temp,desc='genes and neighbors'):
            
            ## -logp values for the pairs
#             dict_temp_log2p = {k:-np.log2(v) for k,v in dict_temp[g].items()}
            
            ## g is in cosmic and g has neighbors in PPI
            if g in reference_genes and g in dict_neighbors:
                
 
                ### 1 with non neighbors
                temp_pairs,temp_one, temp_two,temp_three,temp_four,temp_five,\
                temp_six, temp_sumLHS, temp_sumsigLHS, temp_sumRHS,temp_sumsigRHS,\
                temp_one_norm, temp_two_norm,temp_three_norm,temp_four_norm,temp_five_norm, temp_six_norm= get_sig_logpval_counts_cgcg_minus_cgnnb_single(dict_temp[g], neighbor_set=dict_neighbors[g],randiter=randiter)
                if not np.isnan(temp_one):
                    dict_pairs_cgcg_for_nnb[g],dict_nnb_sigLHS_nonsigRHS[g], dict_nnb_sigLHS_sigRHS_LHS[g],dict_nnb_sigLHS_sigRHS_RHS[g],\
                    dict_nnb_nonsigLHS_nonsigRHS_LHS[g], dict_nnb_nonsigLHS_nonsigRHS_RHS[g],dict_nnb_nonsigLHS_sigRHS[g],\
                    dict_nnb_sum_LHS[g],dict_nnb_sum_sig_LHS[g],dict_nnb_sum_RHS[g],\
                    dict_nnb_sum_sig_RHS[g]= temp_pairs,temp_one, temp_two,temp_three,temp_four,temp_five,temp_six, temp_sumLHS,temp_sumsigLHS, temp_sumRHS,temp_sumsigRHS
                
                    dict_norm_nnb_sigLHS_nonsigRHS[g], dict_norm_nnb_sigLHS_sigRHS_LHS[g],dict_norm_nnb_sigLHS_sigRHS_RHS[g],\
                    dict_norm_nnb_nonsigLHS_nonsigRHS_LHS[g], dict_norm_nnb_nonsigLHS_nonsigRHS_RHS[g],\
                    dict_norm_nnb_nonsigLHS_sigRHS[g]= temp_one_norm, temp_two_norm,temp_three_norm,temp_four_norm,temp_five_norm,temp_six_norm
                
                ## 2 with non cosmic neighbors
                temp_pairs,temp_one, temp_two,temp_three,temp_four,temp_five,\
                temp_six, temp_sumLHS, temp_sumsigLHS, temp_sumRHS,temp_sumsigRHS,\
                temp_one_norm, temp_two_norm,temp_three_norm,temp_four_norm,temp_five_norm,\
                temp_six_norm = get_sig_logpval_counts_cgcg_minus_cgncgnb_single(dict_temp[g], neighbor_set=dict_neighbors[g],randiter=randiter)
                if not np.isnan(temp_one):
                
                    dict_pairs_cgcg_for_ncgnb[g],dict_ncgnb_sigLHS_nonsigRHS[g], dict_ncgnb_sigLHS_sigRHS_LHS[g],dict_ncgnb_sigLHS_sigRHS_RHS[g],\
                    dict_ncgnb_nonsigLHS_nonsigRHS_LHS[g], dict_ncgnb_nonsigLHS_nonsigRHS_RHS[g],dict_ncgnb_nonsigLHS_sigRHS[g],\
                    dict_ncgnb_sum_LHS[g],dict_ncgnb_sum_sig_LHS[g],dict_ncgnb_sum_RHS[g],\
                    dict_ncgnb_sum_sig_RHS[g]= temp_pairs,temp_one, temp_two,temp_three,temp_four,temp_five,temp_six, temp_sumLHS, temp_sumsigLHS, temp_sumRHS,temp_sumsigRHS
        
                    dict_norm_ncgnb_sigLHS_nonsigRHS[g], dict_norm_ncgnb_sigLHS_sigRHS_LHS[g],dict_norm_ncgnb_sigLHS_sigRHS_RHS[g],\
                    dict_norm_ncgnb_nonsigLHS_nonsigRHS_LHS[g], dict_norm_ncgnb_nonsigLHS_nonsigRHS_RHS[g],\
                    dict_norm_ncgnb_nonsigLHS_sigRHS[g]= temp_one_norm, temp_two_norm,temp_three_norm,temp_four_norm,temp_five_norm,temp_six_norm
        
                    
        for g in tqdm(dict_neighbors,desc='degrees'):
            dict_neighbors_degree_all[g] = len([v for v in dict_neighbors[g]])
            dict_neighbors_cg[g] = len([v for v in dict_neighbors[g] if v in reference_genes])
            dict_neighbors_ncg[g] = len([v for v in dict_neighbors[g] if v not in reference_genes])
            
    sample_count_2_4 = len(cg_cg_genes)-count_g
    print('sample count',sample_count_2_4)
    
#     print('COUNTS', cp_sig_old, cn_sig_old, cp_sig_new, cn_sig_new, cp_sig_old+ cn_sig_old, cp_sig_new+ cn_sig_new )
#     print(len(dict_cgcg_cgnnb_sum2),len(dict_cgcg_cgncgnb_sum2))
    return len(cohort_ref_genes), sample_count_2_4,\
    dict_neighbors_degree_all, dict_neighbors_cg, dict_neighbors_ncg, \
    dict_pairs_cgcg_for_nnb,dict_pairs_cgcg_for_ncgnb,\
    dict_nnb_sigLHS_nonsigRHS, dict_nnb_sigLHS_sigRHS_LHS,dict_nnb_sigLHS_sigRHS_RHS,\
    dict_nnb_nonsigLHS_nonsigRHS_LHS, dict_nnb_nonsigLHS_nonsigRHS_RHS,dict_nnb_nonsigLHS_sigRHS,\
    dict_nnb_sum_LHS,dict_nnb_sum_sig_LHS,dict_nnb_sum_RHS, dict_nnb_sum_sig_RHS,\
    dict_norm_nnb_sigLHS_nonsigRHS, dict_norm_nnb_sigLHS_sigRHS_LHS,dict_norm_nnb_sigLHS_sigRHS_RHS,\
    dict_norm_nnb_nonsigLHS_nonsigRHS_LHS, dict_norm_nnb_nonsigLHS_nonsigRHS_RHS,dict_norm_nnb_nonsigLHS_sigRHS,\
    dict_ncgnb_sigLHS_nonsigRHS, dict_ncgnb_sigLHS_sigRHS_LHS,dict_ncgnb_sigLHS_sigRHS_RHS,\
    dict_ncgnb_nonsigLHS_nonsigRHS_LHS, dict_ncgnb_nonsigLHS_nonsigRHS_RHS,dict_ncgnb_nonsigLHS_sigRHS,\
    dict_ncgnb_sum_LHS,dict_ncgnb_sum_sig_LHS,dict_ncgnb_sum_RHS, dict_ncgnb_sum_sig_RHS,\
    dict_norm_ncgnb_sigLHS_nonsigRHS, dict_norm_ncgnb_sigLHS_sigRHS_LHS,dict_norm_ncgnb_sigLHS_sigRHS_RHS,\
    dict_norm_ncgnb_nonsigLHS_nonsigRHS_LHS, dict_norm_ncgnb_nonsigLHS_nonsigRHS_RHS,dict_norm_ncgnb_nonsigLHS_sigRHS

## for single neighbor analysis

In [22]:
# The algorithms are run and the results are stored in dictionaries

rare = False

list_vals_cgcg_cgnnb = []
list_vals_cgcg_cgnnb_subset = []
list_vals_cgcg_cgncgnb = []
list_vals_cgcg_cgnnb_rare = []
list_vals_cgcg_cgncgnb_rare = []


cg_size = {}
cg_cg_size = {} #for getting cg-cg minus cg-ncg size
common_genes = {}
dict_neighbor_degree_all = {}
dict_neighbor_cg={}
dict_neighbor_ncg={}

## 1

dict_pairs_cg_for_cgnnb = {}
dict_cgnnb_sigLHS_nonsigRHS = {} #1
dict_cgnnb_sigLHS_sigRHS_LHS = {} #2
dict_cgnnb_sigLHS_sigRHS_RHS = {} #3
dict_cgnnb_nonsigLHS_sigRHS = {} #6
dict_cgnnb_nonsigLHS_nonsigRHS_LHS = {} #4
dict_cgnnb_nonsigLHS_nonsigRHS_RHS = {} #5
dict_cgnnb_sum_LHS = {}
dict_cgnnb_sumsig_LHS = {}
dict_cgnnb_sum_RHS = {}
dict_cgnnb_sumsig_RHS = {}

dict_norm_cgnnb_sigLHS_nonsigRHS = {} #1
dict_norm_cgnnb_sigLHS_sigRHS_LHS = {} #2
dict_norm_cgnnb_sigLHS_sigRHS_RHS = {} #3
dict_norm_cgnnb_nonsigLHS_sigRHS = {} #6
dict_norm_cgnnb_nonsigLHS_nonsigRHS_LHS = {} #4
dict_norm_cgnnb_nonsigLHS_nonsigRHS_RHS = {} #5



## 2

dict_pairs_cg_for_cgncgnb = {}
dict_cgncgnb_sigLHS_nonsigRHS = {} #1
dict_cgncgnb_sigLHS_sigRHS_LHS = {} #2
dict_cgncgnb_sigLHS_sigRHS_RHS = {} #3
dict_cgncgnb_nonsigLHS_sigRHS = {} #6
dict_cgncgnb_nonsigLHS_nonsigRHS_LHS = {} #4
dict_cgncgnb_nonsigLHS_nonsigRHS_RHS = {} #5
dict_cgncgnb_sum_LHS = {}
dict_cgncgnb_sumsig_LHS = {}
dict_cgncgnb_sum_RHS = {}
dict_cgncgnb_sumsig_RHS = {}

dict_norm_cgncgnb_sigLHS_nonsigRHS = {} #1
dict_norm_cgncgnb_sigLHS_sigRHS_LHS = {} #2
dict_norm_cgncgnb_sigLHS_sigRHS_RHS = {} #3
dict_norm_cgncgnb_nonsigLHS_sigRHS = {} #6
dict_norm_cgncgnb_nonsigLHS_nonsigRHS_LHS = {} #4
dict_norm_cgncgnb_nonsigLHS_nonsigRHS_RHS = {} #5

randiter = 100
randiter_outer = 100
zero_threshold = 0

# cols_sum = ['method','genesize','sum_sum', 'med_sum','sum_med', 'med_med','sum_pos', 'med_pos','sum_neg','med_neg',\
#             'sum_+cts','med_+cts','sum_--cts','med_--cts']

cols_sum = ['method', 'pairs','case1', 'case2', 'case3', 'case4', 'case5', 'case6', '(1+2)','(3+6)', 'sgm_allCGNB','sgm_sigCGNB','avg_allCGNB','avg_CGNNB','sgm_allCGNNB','sgm_sigCGNNB',\
            'TP', 'FP', 'TN', 'FN', 'PR','SN(TPR)', 'SP(TNR)', 'F1']
cols_sum_ncgnb = ['method', 'pairs','case1', 'case2', 'case3', 'case4', 'case5', 'case6', '(1+2)','(3+6)', 'sgm_allCGNB','sgm_sigCGNB','avg_allCGNB','avg_sigCGNCGB','sgm_allNCGNB','sgm_sigNCGNB',\
                  'TP', 'FP', 'TN', 'FN', 'PR','SN(TPR)', 'SP(TNR)', 'F1']
for m in tqdm(methods):#['discover', 'fishers']:
    print(m)
    filename = dict_infile[m]
    if m=='wext':
        pval_position = 3
    else:
        pval_position=3

    cg_size[m],cg_cg_size[m], dict_neighbor_degree_all[m],dict_neighbor_cg[m], dict_neighbor_ncg[m], \
    dict_pairs_cg_for_cgnnb[m],dict_pairs_cg_for_cgncgnb[m],\
    dict_cgnnb_sigLHS_nonsigRHS[m], dict_cgnnb_sigLHS_sigRHS_LHS[m],dict_cgnnb_sigLHS_sigRHS_RHS[m],\
    dict_cgnnb_nonsigLHS_nonsigRHS_LHS[m], dict_cgnnb_nonsigLHS_nonsigRHS_RHS[m],dict_cgnnb_nonsigLHS_sigRHS[m],\
    dict_cgnnb_sum_LHS[m],dict_cgnnb_sumsig_LHS[m],dict_cgnnb_sum_RHS[m],dict_cgnnb_sumsig_RHS[m],\
    dict_norm_cgnnb_sigLHS_nonsigRHS[m], dict_norm_cgnnb_sigLHS_sigRHS_LHS[m],dict_norm_cgnnb_sigLHS_sigRHS_RHS[m],\
    dict_norm_cgnnb_nonsigLHS_nonsigRHS_LHS[m], dict_norm_cgnnb_nonsigLHS_nonsigRHS_RHS[m],dict_norm_cgnnb_nonsigLHS_sigRHS[m],\
    dict_cgncgnb_sigLHS_nonsigRHS[m], dict_cgncgnb_sigLHS_sigRHS_LHS[m],dict_cgncgnb_sigLHS_sigRHS_RHS[m],\
    dict_cgncgnb_nonsigLHS_nonsigRHS_LHS[m], dict_cgncgnb_nonsigLHS_nonsigRHS_RHS[m],dict_cgncgnb_nonsigLHS_sigRHS[m],\
    dict_cgncgnb_sum_LHS[m],dict_cgncgnb_sumsig_LHS[m],dict_cgncgnb_sum_RHS[m],dict_cgncgnb_sumsig_RHS[m],\
    dict_norm_cgncgnb_sigLHS_nonsigRHS[m], dict_norm_cgncgnb_sigLHS_sigRHS_LHS[m],dict_norm_cgncgnb_sigLHS_sigRHS_RHS[m],\
    dict_norm_cgncgnb_nonsigLHS_nonsigRHS_LHS[m], dict_norm_cgncgnb_nonsigLHS_nonsigRHS_RHS[m],\
    dict_norm_cgncgnb_nonsigLHS_sigRHS[m] = get_pvalues_single(filename,randiter=randiter,n=3000,pvalue_position=pval_position)
    
    print()
    ########################################################################################################
    ## for CGNB and CG NNB
    case1_cgnnb = sum(dict_cgnnb_sigLHS_nonsigRHS[m].values())
    case2_cgnnb = sum(dict_cgnnb_sigLHS_sigRHS_LHS[m].values())
    case3_cgnnb = sum(dict_cgnnb_sigLHS_sigRHS_RHS[m].values())
    case4_cgnnb = sum(dict_cgnnb_nonsigLHS_nonsigRHS_LHS[m].values())
    case5_cgnnb = sum(dict_cgnnb_nonsigLHS_nonsigRHS_RHS[m].values())
    case6_cgnnb = sum(dict_cgnnb_nonsigLHS_sigRHS[m].values())
    sumsig_LHS_cgnnb = sum(dict_cgnnb_sumsig_LHS[m].values())
    sum_LHS_cgnnb = sum(dict_cgnnb_sum_LHS[m].values())
    medsig_LHS_cgnnb = np.median(list(dict_cgnnb_sumsig_LHS[m].values())) #median of summed LHS across all CG
    sumsig_RHS_cgnnb = sum(dict_cgnnb_sumsig_RHS[m].values())
    sum_RHS_cgnnb = sum(dict_cgnnb_sum_RHS[m].values()) 
    medsig_RHS_cgnnb = np.median(list(dict_cgnnb_sumsig_RHS[m].values()))
    
    TP_cgnnb = case1_cgnnb + case2_cgnnb +case3_cgnnb 
    FP_cgnnb = case2_cgnnb + case3_cgnnb +case6_cgnnb
    TN_cgnnb = case1_cgnnb + case4_cgnnb +case5_cgnnb 
    FN_cgnnb = case4_cgnnb + case5_cgnnb +case6_cgnnb
    sensitivity_cgnnb = TP_cgnnb/(TP_cgnnb+FN_cgnnb) #recall
    specificity_cgnnb = TN_cgnnb/(TN_cgnnb+FP_cgnnb)
    precision_cgnnb = TP_cgnnb/(TP_cgnnb+FP_cgnnb)
    f1_score_cgnnb = 2*precision_cgnnb*sensitivity_cgnnb/(precision_cgnnb+sensitivity_cgnnb)

    total_cg_pairs_for_nnb = np.sum(list(dict_pairs_cg_for_cgnnb[m].values()))
    list_vals_cgcg_cgnnb.append([m,total_cg_pairs_for_nnb]#, total_count_cgcg_cgnnb]
        + [case1_cgnnb, case2_cgnnb,case3_cgnnb, case4_cgnnb, case5_cgnnb, case6_cgnnb,\
           case1_cgnnb + case2_cgnnb, case3_cgnnb + case6_cgnnb,\
           sum_LHS_cgnnb,sumsig_LHS_cgnnb,sum_LHS_cgnnb/total_cg_pairs_for_nnb,sum_RHS_cgnnb/total_cg_pairs_for_nnb,\
           sum_RHS_cgnnb,sumsig_RHS_cgnnb,\
          TP_cgnnb,FP_cgnnb,TN_cgnnb,FN_cgnnb, precision_cgnnb,sensitivity_cgnnb, specificity_cgnnb, f1_score_cgnnb])
    
    ##### CGNNB rare cosmic mutations
    if rare==True:
        case1_cgnnb_rare = sum([v for k,v in dict_cgnnb_sigLHS_nonsigRHS[m].items() if k in rare_genes])
        case2_cgnnb_rare = sum([v for k,v in dict_cgnnb_sigLHS_sigRHS_LHS[m].items() if k in rare_genes])
        case3_cgnnb_rare = sum([v for k,v in dict_cgnnb_sigLHS_sigRHS_RHS[m].items() if k in rare_genes])
        case4_cgnnb_rare = sum([v for k,v in dict_cgnnb_nonsigLHS_nonsigRHS_LHS[m].items() if k in rare_genes])
        case5_cgnnb_rare = sum([v for k,v in dict_cgnnb_nonsigLHS_nonsigRHS_RHS[m].items() if k in rare_genes])
        case6_cgnnb_rare = sum([v for k,v in dict_cgnnb_nonsigLHS_sigRHS[m].items() if k in rare_genes])
        sumsig_LHS_cgnnb_rare = sum([v for k,v in dict_cgnnb_sumsig_LHS[m].items() if k in rare_genes])
        sum_LHS_cgnnb_rare = sum([v for k,v in dict_cgnnb_sum_LHS[m].items() if k in rare_genes])
        medsig_LHS_cgnnb_rare = np.median([v for k,v in dict_cgnnb_sumsig_LHS[m].items() if k in rare_genes])
        sumsig_RHS_cgnnb_rare = sum([v for k,v in dict_cgnnb_sumsig_RHS[m].items() if k in rare_genes])
        sum_RHS_cgnnb_rare = sum([v for k,v in dict_cgnnb_sum_RHS[m].items() if k in rare_genes]) 
        medsig_RHS_cgnnb_rare = np.median([v for k,v in dict_cgnnb_sumsig_RHS[m].items() if k in rare_genes])
        
        TP_cgnnb_rare = case1_cgnnb_rare + case2_cgnnb_rare +case3_cgnnb_rare 
        FP_cgnnb_rare = case2_cgnnb_rare + case3_cgnnb_rare +case6_cgnnb_rare
        TN_cgnnb_rare = case1_cgnnb_rare + case4_cgnnb_rare +case5_cgnnb_rare 
        FN_cgnnb_rare = case4_cgnnb_rare + case5_cgnnb_rare +case6_cgnnb_rare
        
        sensitivity_cgnnb_rare = TP_cgnnb_rare/(TP_cgnnb_rare+FN_cgnnb_rare)
        specificity_cgnnb_rare = TN_cgnnb_rare/(TN_cgnnb_rare+FP_cgnnb_rare)
        precision_cgnnb_rare = TP_cgnnb_rare/(TP_cgnnb_rare+FP_cgnnb_rare)
        f1_score_cgnnb_rare = 2*precision_cgnnb_rare*sensitivity_cgnnb_rare/(precision_cgnnb_rare+sensitivity_cgnnb_rare)



        list_vals_cgcg_cgnnb_rare.append([m]#, total_count_cgcg_cgnnb]
            + [case1_cgnnb_rare, case2_cgnnb_rare,case3_cgnnb_rare, case4_cgnnb_rare, case5_cgnnb_rare, case6_cgnnb_rare,\
               case1_cgnnb_rare + case2_cgnnb_rare, case3_cgnnb_rare + case6_cgnnb_rare,\
               sum_LHS_cgnnb_rare,sumsig_LHS_cgnnb_rare,sum_RHS_cgnnb_rare, sumsig_RHS_cgnnb_rare,\
              TP_cgnnb_rare,FP_cgnnb_rare,TN_cgnnb_rare,FN_cgnnb_rare,precision_cgnnb_rare, sensitivity_cgnnb_rare, specificity_cgnnb_rare,\
              f1_score_cgnnb_rare])
    
    ############################################################################################################
    ## for CGNB and NCG NB

    case1_cgncgnb = sum(dict_cgncgnb_sigLHS_nonsigRHS[m].values())
    case2_cgncgnb = sum(dict_cgncgnb_sigLHS_sigRHS_LHS[m].values())
    case3_cgncgnb = sum(dict_cgncgnb_sigLHS_sigRHS_RHS[m].values())
    case4_cgncgnb = sum(dict_cgncgnb_nonsigLHS_nonsigRHS_LHS[m].values())
    case5_cgncgnb = sum(dict_cgncgnb_nonsigLHS_nonsigRHS_RHS[m].values())
    case6_cgncgnb = sum(dict_cgncgnb_nonsigLHS_sigRHS[m].values())
    sumsig_LHS_cgncgnb = sum(dict_cgncgnb_sumsig_LHS[m].values())
    sum_LHS_cgncgnb = sum(dict_cgncgnb_sum_LHS[m].values())
    medsig_LHS_cgncgnb = np.median(list(dict_cgncgnb_sumsig_LHS[m].values()))
    sumsig_RHS_cgncgnb = sum(dict_cgncgnb_sumsig_RHS[m].values())
    sum_RHS_cgncgnb = sum(dict_cgncgnb_sum_RHS[m].values())
    medsig_RHS_cgncgnb = np.median(list(dict_cgncgnb_sumsig_RHS[m].values()))
    
    TP_cgncgnb = case1_cgncgnb + case2_cgncgnb +case3_cgncgnb 
    FP_cgncgnb = case2_cgncgnb + case3_cgncgnb +case6_cgncgnb
    TN_cgncgnb = case1_cgncgnb + case4_cgncgnb +case5_cgncgnb 
    FN_cgncgnb = case4_cgncgnb + case5_cgncgnb +case6_cgncgnb
    sensitivity_cgncgnb = TP_cgncgnb/(TP_cgncgnb+FN_cgncgnb)
    specificity_cgncgnb = TN_cgncgnb/(TN_cgncgnb+FP_cgncgnb)
    precision_cgncgnb = TP_cgncgnb/(TP_cgncgnb+FP_cgncgnb)
    f1_score_cgncgnb = 2*precision_cgncgnb*sensitivity_cgncgnb/(precision_cgncgnb+sensitivity_cgncgnb)
 
    total_cg_pairs_for_ncgnb = np.sum(list(dict_pairs_cg_for_cgncgnb[m].values()))

    list_vals_cgcg_cgncgnb.append([m,total_cg_pairs_for_ncgnb]#, total_count_cgcg_cgncgnb] 
        + [case1_cgncgnb, case2_cgncgnb,case3_cgncgnb, case4_cgncgnb, case5_cgncgnb, case6_cgncgnb,\
           case1_cgncgnb + case2_cgncgnb, case3_cgncgnb + case6_cgncgnb,\
           sum_LHS_cgncgnb,sumsig_LHS_cgncgnb,sum_LHS_cgncgnb/total_cg_pairs_for_ncgnb,sum_RHS_cgncgnb/total_cg_pairs_for_ncgnb,\
           sum_RHS_cgncgnb, sumsig_RHS_cgncgnb,\
          TP_cgncgnb,FP_cgncgnb,TN_cgncgnb,FN_cgncgnb,precision_cgncgnb,sensitivity_cgncgnb, specificity_cgncgnb,f1_score_cgncgnb])

    
    ## CGNNB rare cosmic mutations
    if rare==True:
        case1_cgncgnb_rare = sum([v for k,v in dict_cgncgnb_sigLHS_nonsigRHS[m].items() if k in rare_genes])
        case2_cgncgnb_rare = sum([v for k,v in dict_cgncgnb_sigLHS_sigRHS_LHS[m].items() if k in rare_genes])
        case3_cgncgnb_rare = sum([v for k,v in dict_cgncgnb_sigLHS_sigRHS_RHS[m].items() if k in rare_genes])
        case4_cgncgnb_rare = sum([v for k,v in dict_cgncgnb_nonsigLHS_nonsigRHS_LHS[m].items() if k in rare_genes])
        case5_cgncgnb_rare = sum([v for k,v in dict_cgncgnb_nonsigLHS_nonsigRHS_RHS[m].items() if k in rare_genes])
        case6_cgncgnb_rare = sum([v for k,v in dict_cgncgnb_nonsigLHS_sigRHS[m].items() if k in rare_genes])
        sumsig_LHS_cgncgnb_rare = sum([v for k,v in dict_cgncgnb_sumsig_LHS[m].items() if k in rare_genes])
        sum_LHS_cgncgnb_rare = sum([v for k,v in dict_cgncgnb_sum_LHS[m].items() if k in rare_genes])
        medsig_LHS_cgncgnb_rare = np.median([v for k,v in dict_cgncgnb_sumsig_LHS[m].items() if k in rare_genes])
        sumsig_RHS_cgncgnb_rare = sum([v for k,v in dict_cgncgnb_sumsig_RHS[m].items() if k in rare_genes])
        sum_RHS_cgncgnb_rare = sum([v for k,v in dict_cgncgnb_sum_RHS[m].items() if k in rare_genes])
        medsig_RHS_cgncgnb_rare = np.median([v for k,v in dict_cgncgnb_sumsig_RHS[m].items() if k in rare_genes])
        
        TP_cgncgnb_rare = case1_cgncgnb_rare + case2_cgncgnb_rare +case3_cgncgnb_rare 
        FP_cgncgnb_rare = case2_cgncgnb_rare + case3_cgncgnb_rare +case6_cgncgnb_rare
        TN_cgncgnb_rare = case1_cgncgnb_rare + case4_cgncgnb_rare +case5_cgncgnb_rare 
        FN_cgncgnb_rare = case4_cgncgnb_rare + case5_cgncgnb_rare +case6_cgncgnb_rare
        sensitivity_cgncgnb_rare = TP_cgncgnb_rare/(TP_cgncgnb_rare+FN_cgncgnb_rare)
        specificity_cgncgnb_rare = TN_cgncgnb_rare/(TN_cgncgnb_rare+FP_cgncgnb_rare)
        precision_cgncgnb_rare = TP_cgncgnb_rare/(TP_cgncgnb_rare+FP_cgncgnb_rare)
        f1_score_cgncgnb_rare = 2*precision_cgncgnb_rare*sensitivity_cgncgnb_rare/(precision_cgncgnb_rare+sensitivity_cgncgnb_rare)


        list_vals_cgcg_cgncgnb_rare.append([m]#, total_count_cgcg_cgnnb]
            + [case1_cgncgnb_rare, case2_cgncgnb_rare,case3_cgncgnb_rare, case4_cgncgnb_rare, case5_cgncgnb_rare, case6_cgncgnb_rare,\
               case1_cgncgnb_rare + case2_cgncgnb_rare, case3_cgncgnb_rare + case6_cgncgnb_rare,\
               sum_LHS_cgncgnb_rare,sumsig_LHS_cgncgnb_rare,sum_RHS_cgncgnb_rare, sumsig_RHS_cgncgnb_rare,\
              TP_cgncgnb_rare,FP_cgncgnb_rare,TN_cgncgnb_rare,FN_cgncgnb_rare,precision_cgncgnb_rare,sensitivity_cgncgnb_rare, specificity_cgncgnb_rare, f1_score_cgncgnb_rare])


    
df_summary_cgcg_cgnnb = pd.DataFrame(list_vals_cgcg_cgnnb, columns = cols_sum)
df_summary_cgcg_cgncgnb = pd.DataFrame(list_vals_cgcg_cgncgnb, columns = cols_sum_ncgnb)

if rare==True:
    df_summary_cgcg_cgnnb_rare = pd.DataFrame(list_vals_cgcg_cgnnb_rare, columns = cols_sum)
    df_summary_cgcg_cgncgnb_rare = pd.DataFrame(list_vals_cgcg_cgncgnb_rare, columns = cols_sum_ncgnb)





HBox(children=(FloatProgress(value=0.0, max=5.0), HTML(value='')))

discover


HBox(children=(FloatProgress(value=0.0, description='Counting total Genes', max=959805.0, style=ProgressStyle(…


Total Genes: 1386
Cosmic Genes: 110
CG-CG: 74


HBox(children=(FloatProgress(value=0.0, max=959805.0), HTML(value='')))


min pval: 8.567836990636648e-17


HBox(children=(FloatProgress(value=0.0, description='group', max=1.0, style=ProgressStyle(description_width='i…

HBox(children=(FloatProgress(value=0.0, max=959805.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, description='genes and neighbors', max=1386.0, style=ProgressStyle(des…




HBox(children=(FloatProgress(value=0.0, description='degrees', max=496.0, style=ProgressStyle(description_widt…



sample count 74

fishers


HBox(children=(FloatProgress(value=0.0, description='Counting total Genes', max=959805.0, style=ProgressStyle(…


Total Genes: 1386
Cosmic Genes: 110
CG-CG: 74


HBox(children=(FloatProgress(value=0.0, max=959805.0), HTML(value='')))


min pval: 8.691293396637518e-10


HBox(children=(FloatProgress(value=0.0, description='group', max=1.0, style=ProgressStyle(description_width='i…

HBox(children=(FloatProgress(value=0.0, max=959805.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, description='genes and neighbors', max=1386.0, style=ProgressStyle(des…




HBox(children=(FloatProgress(value=0.0, description='degrees', max=496.0, style=ProgressStyle(description_widt…



sample count 74

megsa


HBox(children=(FloatProgress(value=0.0, description='Counting total Genes', max=959805.0, style=ProgressStyle(…


Total Genes: 1386
Cosmic Genes: 110
CG-CG: 74


HBox(children=(FloatProgress(value=0.0, max=959805.0), HTML(value='')))


min pval: 3.78407122792655e-10


HBox(children=(FloatProgress(value=0.0, description='group', max=1.0, style=ProgressStyle(description_width='i…

HBox(children=(FloatProgress(value=0.0, max=959805.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, description='genes and neighbors', max=1386.0, style=ProgressStyle(des…




HBox(children=(FloatProgress(value=0.0, description='degrees', max=496.0, style=ProgressStyle(description_widt…



sample count 74

memo


HBox(children=(FloatProgress(value=0.0, description='Counting total Genes', max=959805.0, style=ProgressStyle(…


Total Genes: 1386
Cosmic Genes: 110
CG-CG: 74


HBox(children=(FloatProgress(value=0.0, max=959805.0), HTML(value='')))


min pval: 0.0001


HBox(children=(FloatProgress(value=0.0, description='group', max=1.0, style=ProgressStyle(description_width='i…

HBox(children=(FloatProgress(value=0.0, max=959805.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, description='genes and neighbors', max=1386.0, style=ProgressStyle(des…




HBox(children=(FloatProgress(value=0.0, description='degrees', max=496.0, style=ProgressStyle(description_widt…



sample count 74

wext


HBox(children=(FloatProgress(value=0.0, description='Counting total Genes', max=959805.0, style=ProgressStyle(…


Total Genes: 1386
Cosmic Genes: 110
CG-CG: 74


HBox(children=(FloatProgress(value=0.0, max=959805.0), HTML(value='')))


min pval: 5.713604576218321e-26


HBox(children=(FloatProgress(value=0.0, description='group', max=1.0, style=ProgressStyle(description_width='i…

HBox(children=(FloatProgress(value=0.0, max=959805.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, description='genes and neighbors', max=1386.0, style=ProgressStyle(des…




HBox(children=(FloatProgress(value=0.0, description='degrees', max=496.0, style=ProgressStyle(description_widt…



sample count 74




In [23]:
%%html
<style>
table {float:left}
</style>

In [24]:
df_summary_cgcg_cgnnb

Unnamed: 0,method,pairs,case1,case2,case3,case4,case5,case6,(1+2),(3+6),sgm_allCGNB,sgm_sigCGNB,avg_allCGNB,avg_CGNNB,sgm_allCGNNB,sgm_sigCGNNB,TP,FP,TN,FN,PR,SN(TPR),SP(TNR),F1
0,discover,196,28.0,7.0,7.0,83.5,56.5,8.0,35.0,15.0,403.675323,235.25814,2.059568,1.444171,283.057471,123.568007,42.0,22.0,168.0,148.0,0.65625,0.221053,0.884211,0.330709
1,fishers,196,6.5,0.0,0.0,115.5,66.0,5.0,6.5,5.0,83.705742,27.975866,0.42707,0.260016,50.963144,31.043321,6.5,5.0,188.0,186.5,0.565217,0.033679,0.974093,0.06357
2,megsa,196,10.0,0.0,1.0,16.0,162.0,5.0,10.0,6.0,202.956297,44.552877,1.035491,0.898732,176.151531,35.134317,11.0,6.0,188.0,183.0,0.647059,0.056701,0.969072,0.104265
3,memo,196,37.0,9.0,17.0,71.0,47.0,8.5,46.0,25.5,593.086443,451.63974,3.025951,1.947262,381.663257,242.431097,63.0,34.5,155.0,126.5,0.646154,0.332454,0.817942,0.439024
4,wext,196,50.0,12.0,15.0,57.0,45.5,9.0,62.0,24.0,782.272945,667.602684,3.991188,2.607681,511.105542,359.092595,77.0,36.0,152.5,111.5,0.681416,0.408488,0.809019,0.510779


In [25]:
df_summary_cgcg_cgncgnb

Unnamed: 0,method,pairs,case1,case2,case3,case4,case5,case6,(1+2),(3+6),sgm_allCGNB,sgm_sigCGNB,avg_allCGNB,avg_sigCGNCGB,sgm_allNCGNB,sgm_sigNCGNB,TP,FP,TN,FN,PR,SN(TPR),SP(TNR),F1
0,discover,107,13.0,9.0,7.0,40.0,27.0,10.0,22.0,17.0,242.968233,161.377919,2.270731,1.833995,196.237497,128.830988,29.0,26.0,80.0,77.0,0.527273,0.273585,0.754717,0.360248
1,fishers,107,3.0,0.0,0.0,57.0,41.0,4.0,3.0,4.0,60.613845,13.987933,0.566485,0.463176,49.55986,25.146374,3.0,4.0,101.0,102.0,0.428571,0.028571,0.961905,0.053571
2,megsa,107,7.0,0.0,1.0,10.0,83.0,6.0,7.0,7.0,123.664974,29.01223,1.155747,1.097813,117.466023,30.898741,8.0,7.0,100.0,99.0,0.533333,0.074766,0.934579,0.131148
3,memo,107,16.0,8.5,18.0,34.0,25.0,6.0,24.5,24.0,367.497472,308.403178,3.434556,2.698748,288.766056,235.097216,42.5,32.5,75.0,65.0,0.566667,0.395349,0.697674,0.465753
4,wext,107,18.0,14.0,14.0,29.0,24.0,6.0,32.0,20.0,493.394359,439.53569,4.611162,3.730939,399.210483,342.453502,46.0,34.0,71.0,59.0,0.575,0.438095,0.67619,0.497297


In [13]:
np.sum(list(dict_pairs_cg_for_cgnnb['discover'].values())),np.sum(list(dict_pairs_cg_for_cgncgnb['discover'].values()))

(1048, 895)

In [14]:
# df_summary_cgcg_cgncgnb_rare

LEGEND:

| Row  |  LHS      |  RHS      |  Which one is more signif?  |   Which one would be chosen?  | Result                     |
|------|-----------|-----------|-----------------------------|-------------------------------|----------------------------|
| Case1    |  Sig      |  non-sig  |  LHS                        |  LHS                          |  Positive result           |
|  Case2   |  Sig      |  Sig      |  LHS                        |  LHS                          |  Positive result           |
|  Case3   |  Sig      |  Sig      |  RHS                        |  RHS                          |  Negative result           |
|  Case4   |  Non-sig  |  Non-sig  |  LHS                        |  None                         |  ? less neg compared to 6  |
|  Case5   |  Non-sig  |  Non-sig  |  RHS                        |  None                         |  ? less neg compared to 6  |
|  Case6   |  Non-sig  |  Sig      |  RHS                        |  RHS                          |  Negative result           |


| Term  |  Meaning      |  Description      |
|------|-----------|-----------|
|  sgm_allCGNB   |  Sum of -log(pvalue) for all COSMIC neighbors across all COSMIC genes  |
|  sgm_sigCGNB   |  Sum of -log(pvalue) for significant COSMIC neighbors across all COSMIC genes  |
|  sgm_allCGNNB   |  Sum of -log(pvalue) for all COSMIC non-neighbors across all COSMIC genes  |
|  sgm_sigCGNNB   |  Sum of -log(pvalue) for significant COSMIC non-neighbors across all COSMIC genes  |
|  sgm_allCGNCGNB   | Sum of -log(pvalue) for all non-COSMIC neighbors across all COSMIC genes |
|  sgm_sigCGNCGNB   | Sum of -log(pvalue) for significant non-COSMIC neighbors across all COSMIC genes  | |
|  PR   |  Precision  | TP/(TP+FP)|
|  SN(TPR)   |  Sensitivity (True Positive Rate)  | TP/(TP+FN)|
|  SP(TNR)   |  Specificity (True Negative Rate)  | TN/(TN+FP)|
|  F1   |  F1 Score  | 2\*PR\* SN/(PR+SN)|

In [15]:
from IPython.core.display import display, HTML
display(HTML("<style>.container { width:80% !important; }</style>"))

In [16]:
m = 'discover'
gg = 'SMARCD1'
dict_cgnnb_sum_LHS[m][gg]

KeyError: 'SMARCD1'

In [None]:
dict_cgnnb_sum_RHS[m][gg]

In [None]:
dict_cgnnb_sigLHS_nonsigRHS[m][gg],dict_cgnnb_sigLHS_sigRHS_LHS[m][gg],dict_cgnnb_sigLHS_sigRHS_RHS[m][gg],  dict_cgnnb_nonsigLHS_nonsigRHS_LHS[m][gg],dict_cgnnb_nonsigLHS_nonsigRHS_RHS[m][gg],dict_cgnnb_nonsigLHS_sigRHS[m][gg]

In [26]:
#output tables into files
outfile_cgcg_cgnnb = '../results_main/evaluation_results/intact/results_counts_eval1_cgcg_cgnnb_tpfp/{}_t{}_{}.txt'.format(c,t, '_'.join(methods))
outfile_cgcg_cgncgnb = '../results_main/evaluation_results/intact/results_counts_eval2_cgcg_cgncgnb_tpfp/{}_t{}_{}.txt'.format(c,t, '_'.join(methods))
df_summary_cgcg_cgnnb.to_csv(outfile_cgcg_cgnnb, index=False, sep='\t')
df_summary_cgcg_cgncgnb.to_csv(outfile_cgcg_cgncgnb, index=False, sep='\t')

if rare==True:
    outfile_cgcg_cgnnb_rare = '../results_main/evaluation_results/intact/results_counts_eval1_cgcg_cgnnb_tpfp_rare/{}_t{}_rare_{}.txt'.format(c,t, '_'.join(methods))
    outfile_cgcg_cgncgnb_rare = '../results_main/evaluation_results/intact/results_counts_eval2_cgcg_cgncgnb_tpfp_rare/{}_t{}_rare_{}.txt'.format(c,t, '_'.join(methods))
    df_summary_cgcg_cgnnb_rare.to_csv(outfile_cgcg_cgnnb_rare, index=False, sep='\t')
    df_summary_cgcg_cgncgnb_rare.to_csv(outfile_cgcg_cgncgnb_rare, index=False, sep='\t')

## Write normalized counts to file
#### A normalized count is taken for each case. Irrelevant to the main evaluation methods.

In [56]:
dict_cases_eval1 = {1:('case1_merged_eval1',dict_cgnnb_sigLHS_nonsigRHS, dict_norm_cgnnb_sigLHS_nonsigRHS),
                    2: ('case2_merged_eval1',dict_cgnnb_sigLHS_sigRHS_LHS, dict_norm_cgnnb_sigLHS_sigRHS_LHS),
                    3: ('case3_merged_eval1',dict_cgnnb_sigLHS_sigRHS_RHS, dict_norm_cgnnb_sigLHS_sigRHS_RHS),
                    4: ('case4_merged_eval1',dict_cgnnb_nonsigLHS_nonsigRHS_LHS, dict_norm_cgnnb_nonsigLHS_nonsigRHS_LHS),
                    5: ('case5_merged_eval1',dict_cgnnb_nonsigLHS_nonsigRHS_RHS, dict_norm_cgnnb_nonsigLHS_nonsigRHS_RHS),
                    6: ('case6_merged_eval1',dict_cgnnb_nonsigLHS_sigRHS, dict_norm_cgnnb_nonsigLHS_sigRHS)}

dict_cases_eval2 = {1:('case1_merged_eval2',dict_cgncgnb_sigLHS_nonsigRHS, dict_norm_cgncgnb_sigLHS_nonsigRHS),
                    2: ('case2_merged_eval2',dict_cgncgnb_sigLHS_sigRHS_LHS, dict_norm_cgncgnb_sigLHS_sigRHS_LHS),
                    3: ('case3_merged_eval2',dict_cgncgnb_sigLHS_sigRHS_RHS, dict_norm_cgncgnb_sigLHS_sigRHS_RHS),
                    4: ('case4_merged_eval2',dict_cgncgnb_nonsigLHS_nonsigRHS_LHS, dict_norm_cgncgnb_nonsigLHS_nonsigRHS_LHS),
                    5: ('case5_merged_eval2',dict_cgncgnb_nonsigLHS_nonsigRHS_RHS, dict_norm_cgncgnb_nonsigLHS_nonsigRHS_RHS),
                    6: ('case6_merged_eval2',dict_cgncgnb_nonsigLHS_sigRHS, dict_norm_cgncgnb_nonsigLHS_sigRHS)}

In [50]:
def get_merged_norm_results(d,d_norm, d_nb):

    df_norm = pd.DataFrame(d_norm).merge(pd.DataFrame(d_nb)[methods[0]].rename('Neighbors'), left_index=True, right_index=True)
    cols = df_norm.columns.to_list()
    df_norm = df_norm[[cols[-1]]+cols[:-1]]
    df_merged=df_norm.merge(pd.DataFrame(d), left_index=True, right_index=True, suffixes=('_norm','')).sort_values('Neighbors', ascending=False)
    return df_merged

In [57]:
### EVAL1

if coadread_strat_498==True or c!='COADREAD':
    outpath = '../results_main/evaluation_results/intact/merged/cases_merged_norm_eval1_cgcg_cgnnb/{}_t{}/'.format(c,t)
else:
    outpath = '../results_main/evaluation_results/intact/merged/cases_merged_norm_eval1_cgcg_cgnnb/{}_t{}_559/'.format(c,t)
    
if not os.path.exists(outpath):
    os.makedirs(outpath)
    
for i in range(1,7):
    filename, d, d_norm = dict_cases_eval1[i]

    df_out = get_merged_norm_results(d, d_norm,d_nb=dict_neighbor_degree_all)
    df_out.to_csv(outpath+'{}_t{}_{}.txt'.format(c,t,filename), sep='\t')

In [58]:
### EVAL2

if coadread_strat_498==True or c!='COADREAD':
    outpath = '../results_main/evaluation_results/intact/merged/cases_merged_norm_eval2_cgcg_cgncgnb/{}_t{}/'.format(c,t)
else:
    outpath = '../results_main/evaluation_results/intact/merged/cases_merged_norm_eval2_cgcg_cgncgnb/{}_t{}_559/'.format(c,t)
    
if not os.path.exists(outpath):
    os.makedirs(outpath)
    
for i in range(1,7):
    filename, d, d_norm = dict_cases_eval2[i]

    df_out = get_merged_norm_results(d, d_norm,d_nb=dict_neighbor_degree_all)
    df_out.to_csv(outpath+'{}_t{}_{}.txt'.format(c,t,filename), sep='\t')