In [11]:
import matplotlib
matplotlib.use('agg')
import matplotlib.pyplot as plt

import os
import pandas as pd
import numpy as np
from scipy.stats import hypergeom
import random
import sys
import glob


def calc_phred_score(pval):

    if pval == 0:
        return 500
    else:
        return -10 * np.log10(pval)



def read_full_ranking(full_ranking_file):
    
    full_ranked_df = pd.read_csv(full_ranking_file)
#     full_ranked_df = full_ranked_df.loc[0:top_genes, :]
    print(full_ranked_df.shape)
    print("full_ranked_df: ", full_ranked_df.head())
    full_ranked_df = full_ranked_df[full_ranked_df.known_gene==1]
    print("\n external_top ranking:", full_ranked_df.head())
    print(full_ranked_df.shape)
    full_ranked_df.sort_values("mantis_ml_proba", ascending=False, inplace=True)
    r = list(range(1, full_ranked_df.shape[0]+1))
    full_ranked_df['full_rank'] = r
    cols = ['Gene_Name'] + ["mantis_ml_proba"] + ['full_rank']
    full_ranked_df = full_ranked_df[cols]
    print('full ranking with rank', full_ranked_df.head())
    return full_ranked_df


def read_external_ranking(disease, top_genes_ratio = 0.05):
    
    synonyms = pd.read_csv('synonyms.csv', index_col=0)
    disease_synonyms = synonyms.synonym[synonyms.disease==disease].tolist()
    external_top_ranking = pd.read_csv('predictions_synonyms.csv', index_col=0)
    # external_top top predictions
    
    external_top_ranking = external_top_ranking.reset_index()
    external_top_ranking = external_top_ranking.rename(columns = {'index':'Gene_Name'})

    external_rankings = []
    for synonym in disease_synonyms:
        external_top_ranking.sort_values(synonym, ascending=False, inplace=True)
        cols = ['Gene_Name'] + [synonym]
        synonym_ranking = external_top_ranking[cols]
        cutoff = int(synonym_ranking.shape[0] * top_genes_ratio)
        synonym_ranking = synonym_ranking.loc[0:cutoff, :]
        synonym_ranking = synonym_ranking['Gene_Name']
        print(synonym_ranking.shape)
        print("external_top_ranking: ", synonym_ranking.head())
        external_rankings.append(synonym_ranking.tolist())
    return external_rankings

In [12]:
def calc_hypergeom_pvals(external_top_ranking, full_rank_file, top_full_ranking=0.1):

    
    full_ranked_df = read_full_ranking(full_rank_file)
    M = full_ranked_df.shape[0]
    print('\n Population Size:', M)
    print('\n Full ranking:\n', full_ranked_df.head())

    full_rank_cutoff = int(full_ranked_df.shape[0] * top_full_ranking) # 1050 genes (21015 in total)
    print('full_rank_cutoff:', full_rank_cutoff)
    print('full columns', full_ranked_df.columns)
    
#     external_top_genes = read_external_ranking(external_top_ranking)

    full_ranked_df = full_ranked_df.loc[full_ranked_df.Gene_Name.isin(external_top_ranking)]
    full_ranked_df.reset_index(drop=True, inplace=True)
    print('\n', full_ranked_df.head())
    print('\n', full_ranked_df.tail())
    print(full_ranked_df.shape)
    
    
    n = full_ranked_df.shape[0]
    #print('Total number of Successes:', n)



    # ************* Hypergeometric Test *************
    hypergeom_pvals = []
    hypergeom_ordered_genes = []

    for x in range(full_ranked_df.shape[0]):

        print(full_ranked_df.iloc[x, 0])

        full_rank = full_ranked_df.loc[x, 'full_rank']
        print(full_rank)

        N = full_ranked_df.iloc[x, full_ranked_df.shape[1]-1]
        print(x, N)


        cur_pval = hypergeom.sf(x - 1, M, n, N)
        print('cur_pval: ', cur_pval)

        hypergeom_pvals = hypergeom_pvals + [cur_pval]

        cur_gene = full_ranked_df.loc[x, 'Gene_Name']
        hypergeom_ordered_genes = hypergeom_ordered_genes + [cur_gene]


        if full_rank >= full_rank_cutoff:
            break
    # ***********************************************

    
    min_pval = min(hypergeom_pvals)
    hypergeom_pvals = [calc_phred_score(pval) for pval in hypergeom_pvals]
    print('Max:', min(hypergeom_pvals))
    print('Avg:', np.mean(hypergeom_pvals))

    return hypergeom_pvals, hypergeom_ordered_genes




def plot_hypergeom_stepwise(hypergeom_pvals, ax, label, color='#33a02c', signif_thres = 0.05,\
                            plot_pval_thres=True):

    if plot_pval_thres:
        signif_thres = calc_phred_score(signif_thres)
        ax.axhline(y=signif_thres, linestyle='--', color='red', label='p-val: 0.05')


    linewidth = 1
    ax.plot(hypergeom_pvals, color=color,
                label=label,
                linewidth=linewidth)

    ax.set_xlim(left=-0.5)

    y_label = 'Phred score from hypergeometric test\n(significance increasing in positive direction)'
    ax.set_ylabel(y_label, fontsize=14)

    ax.legend(bbox_to_anchor=(1.32, 1), fontsize=12, loc='upper right', framealpha =0.6)
    #ax.set_xlim(0, max_x_lim * 1.5)
    #ax.set_ylim(0, 500)




def run_hypergeom_test(disease, folder='kidney', top=0.1):

    max_x_lim = 50

    # Provide multiple full ranking files
    external_top_files = read_external_ranking(disease, top_genes_ratio = top)
    full_ranking_labels_folders = {'chem': glob.glob(folder + "/chem"),
                                 'clin': glob.glob(folder + "/clin"),
                                  'bio': glob.glob(folder + "/bio")}
    
    
    synonyms = pd.read_csv('synonyms.csv', index_col=0)
    disease_synonyms = synonyms.synonym[synonyms.disease==disease].tolist()
    
    for level in full_ranking_labels_folders.keys():
        
        full_ranking_files = glob.glob(folder + '/' + level + '/*.csv')
        print(full_ranking_files)

        for full_ranking_file in full_ranking_files:

            hypergeom_results = {}
            hypergeom_ordered_genes = {}


            for i, external_file in enumerate(external_top_files):
#                 print("external file: ", external_file)
                full_ranking_name = full_ranking_file
                print(full_ranking_name)
                external_file_name = disease_synonyms[i]

                import ntpath
                def rename_file(file_path):
                    _, file = ntpath.split(file_path)
                    file = file.split('/')[-1]
                    file = file.replace('.mantis-ml_predictions.csv', '')
                    if 'All' not in file:
                        file = file.replace('Classifier', '')
                    return file

                full_rank = rename_file(full_ranking_file)
#                     external_file = rename_file(external_file)
                name = full_rank + "_vs_" + external_file_name
                
                hypergeom_results[name], hypergeom_ordered_genes[name] = calc_hypergeom_pvals(external_file,\
                                                                        full_ranking_name, top)
                print(hypergeom_results[name])


        # random gene ranking
        #random_external_top_ranking = full_ranked_df.sample(frac=1)
        #random_external_top_ranking.drop('full_rank', axis=1, inplace=True)
        #random_hypergeom_pvals = calc_hypergeom_pvals(random_external_top_ranking, full_ranked_df)


        # ------------------------- Start plotting ----------------------------
            fig, ax = plt.subplots(figsize=(18, 13))

            colors = ['#2171b5', '#9ecae1', '#238b45', '#a1d99b', '#cb181d', '#fc9272',
                      '#6a51a3', '#bcbddc', '#fe9929', '#fff7bc']
            cnt = 0

            for external_top_dataset, hypergeom_pvals in hypergeom_results.items():
                print("external_top_dataset: ", external_top_dataset)

                plot_hypergeom_stepwise(hypergeom_pvals, ax, label=external_top_dataset, color=colors[cnt],\
                                        plot_pval_thres=False)
                cnt += 1

            #plot_hypergeom_stepwise(random_hypergeom_pvals, ax, label='random', color='black')

            signif_thres = 0.05
            signif_thres = calc_phred_score(signif_thres)
            ax.axhline(y=signif_thres, linestyle='--', color='red', label='p-val: 0.05')

            fig.savefig(os.path.join(folder, level, full_rank + ".pdf") , bbox_inches='tight')
            plt.close()


    #         print(hypergeom_ordered_genes)
            intersection_genes_fh = os.path.join(folder, level, full_rank + '-hits.intersection.txt')
            print(intersection_genes_fh)
            out_fh =  open(intersection_genes_fh, 'w')
            for clf, genes in hypergeom_ordered_genes.items():
                out_fh.write(clf + '\t' + ','.join(genes) + '\n')

            out_fh.close()

In [14]:
run_hypergeom_test(disease='Chronic obstructive pulmonary disease', folder='pulmonary_drugnomefull', top=0.15)
# run_hypergeom_test(label='pharos', folder='all-pharos')

(11095,)
external_top_ranking:  0        PDGFRA
7453     CEP104
9899     EPSTI1
7802       MICA
8259    SLC22A5
Name: Gene_Name, dtype: object
(9492,)
external_top_ranking:  0       PDGFRA
552       NOS3
336       IRS1
4389    ATP2B4
3147       TNC
Name: Gene_Name, dtype: object
(3992,)
external_top_ranking:  0      PDGFRA
935     MEIS1
24       JAG1
57      ITGA3
8      PTPN11
Name: Gene_Name, dtype: object
(7749,)
external_top_ranking:  0       PDGFRA
2164     KCNJ2
2702      RBM6
455       TPP1
24        JAG1
Name: Gene_Name, dtype: object
(3094,)
external_top_ranking:  0      PDGFRA
86      FGFR1
144     UBE4B
702      TCF4
24       JAG1
Name: Gene_Name, dtype: object
(1972,)
external_top_ranking:  0         PDGFRA
676         EPRS
453         FLNA
141        LRRK2
11344    RAPGEF3
Name: Gene_Name, dtype: object
(6430,)
external_top_ranking:  0          PDGFRA
4123     TRAPPC11
18251      OR51B4
14416      CARD18
14299        SYBU
Name: Gene_Name, dtype: object
(7153,)
external_top

In [15]:
run_hypergeom_test(disease='Coronary artery disease', folder='coronary_drugnomefull', top=0.1)

(2205,)
external_top_ranking:  0       PDGFRA
4         TBX3
135     EFEMP1
9        LTBP1
1367      LDLR
Name: Gene_Name, dtype: object
(2038,)
external_top_ranking:  0         PDGFRA
24          JAG1
120         PTK2
1074       ATXN2
2363    MAPK8IP3
Name: Gene_Name, dtype: object
(3142,)
external_top_ranking:  0       PDGFRA
1306     KAT2B
122      PSMC3
567       TYK2
25       STAT1
Name: Gene_Name, dtype: object
(1967,)
external_top_ranking:  0       PDGFRA
641       ZEB2
6378    PIK3CG
334         C3
1172     HMGCR
Name: Gene_Name, dtype: object
(802,)
external_top_ranking:  0       PDGFRA
1367      LDLR
858         TF
1634    RAD51C
1462     ERCC4
Name: Gene_Name, dtype: object
(961,)
external_top_ranking:  0      PDGFRA
160      FBN1
742    STXBP1
702      TCF4
232     STAT3
Name: Gene_Name, dtype: object
(638,)
external_top_ranking:  0      PDGFRA
458       TEK
236    TGFBR2
615       RET
172      IL6R
Name: Gene_Name, dtype: object
(1066,)
external_top_ranking:  0       PDGFR

In [16]:
run_hypergeom_test(disease='Kidney disease', folder='kidney_drugnomefull', top=0.1)

(3092,)
external_top_ranking:  0       PDGFRA
935      MEIS1
114     CREBBP
1487     PRMT5
1939     CASZ1
Name: Gene_Name, dtype: object
(1793,)
external_top_ranking:  0         PDGFRA
676         EPRS
453         FLNA
141        LRRK2
11344    RAPGEF3
Name: Gene_Name, dtype: object
(881,)
external_top_ranking:  0      PDGFRA
314      SPTB
731     TRRAP
338      MDM2
490      HIRA
Name: Gene_Name, dtype: object
(3304,)
external_top_ranking:  0       PDGFRA
4836    TAPBPL
4880     PMPCA
1143     DNAH5
678      EXOC4
Name: Gene_Name, dtype: object
(1499,)
external_top_ranking:  0        PDGFRA
4138    LAPTM4B
899         ATM
766         WRN
2612     RNF213
Name: Gene_Name, dtype: object
(824,)
external_top_ranking:  0        PDGFRA
328       ERBB3
470       SYNE2
57        ITGA3
3454    RALGAPB
Name: Gene_Name, dtype: object
(2582,)
external_top_ranking:  0       PDGFRA
369       TJP1
1387    SEMA5A
1895     FGFR3
4144    CEP192
Name: Gene_Name, dtype: object
(3781,)
external_top_ranking

In [19]:
run_hypergeom_test(disease='Hypertension', folder='hypertension_drugnomefull', top=0.20)

(838,)
external_top_ranking:  0       PDGFRA
1099       CBL
275       BMP4
2588      FRS2
1160     KNTC1
Name: Gene_Name, dtype: object
(4840,)
external_top_ranking:  0        PDGFRA
401       ITGA5
704      NOTCH2
232       STAT3
256    SERPINH1
Name: Gene_Name, dtype: object
(3604,)
external_top_ranking:  0       PDGFRA
458        TEK
6873     PHTF1
1         TBX5
5237     CEBPE
Name: Gene_Name, dtype: object
(3088,)
external_top_ranking:  0      PDGFRA
458       TEK
236    TGFBR2
615       RET
172      IL6R
Name: Gene_Name, dtype: object
(3406,)
external_top_ranking:  0       PDGFRA
363      FGFR2
2846     HAND1
4343     PEX13
237      ADCY3
Name: Gene_Name, dtype: object
(2764,)
external_top_ranking:  0       PDGFRA
296        ERG
1058       NBN
1161     PROCR
598       LRP6
Name: Gene_Name, dtype: object
(4940,)
external_top_ranking:  0       PDGFRA
583      IL2RA
304      GATA3
1568    MAP2K5
8       PTPN11
Name: Gene_Name, dtype: object
(7183,)
external_top_ranking:  0        PD

In [22]:
run_hypergeom_test(disease='Epilepsy', folder='epilepsy_drugnomefull', top=0.20)

(766,)
external_top_ranking:  0       PDGFRA
3747    PABPC4
836      ACTG1
8316      SDC1
1638      CTSA
Name: Gene_Name, dtype: object
(5496,)
external_top_ranking:  0       PDGFRA
4305      RFX3
6259     BRSK1
296        ERG
5022     WASF1
Name: Gene_Name, dtype: object
(2738,)
external_top_ranking:  0        PDGFRA
2255      KMT2E
3230       MYRF
499      MAP2K2
8906    HSPA12A
Name: Gene_Name, dtype: object
(3585,)
external_top_ranking:  0       PDGFRA
521      KPNB1
7394      FGD1
2046    CALCRL
1159      ANK3
Name: Gene_Name, dtype: object
(2725,)
external_top_ranking:  0       PDGFRA
5837      MTO1
928      APAF1
1316     HBEGF
6040     TRPM3
Name: Gene_Name, dtype: object
(4429,)
external_top_ranking:  0          PDGFRA
7236         GNL1
13983    ANKRD34A
652          DDX6
578          CCT5
Name: Gene_Name, dtype: object
(5774,)
external_top_ranking:  0        PDGFRA
9468      HCFC2
7529      USP11
5136     U2SURP
10401    FBXL19
Name: Gene_Name, dtype: object
(9918,)
external_

In [23]:
run_hypergeom_test(disease='Polycystic kidney disease', folder='polycystic_kidney_drugnomefull', top=0.1)

(824,)
external_top_ranking:  0        PDGFRA
328       ERBB3
470       SYNE2
57        ITGA3
3454    RALGAPB
Name: Gene_Name, dtype: object
(881,)
external_top_ranking:  0      PDGFRA
314      SPTB
731     TRRAP
338      MDM2
490      HIRA
Name: Gene_Name, dtype: object
(1793,)
external_top_ranking:  0         PDGFRA
676         EPRS
453         FLNA
141        LRRK2
11344    RAPGEF3
Name: Gene_Name, dtype: object
(1499,)
external_top_ranking:  0        PDGFRA
4138    LAPTM4B
899         ATM
766         WRN
2612     RNF213
Name: Gene_Name, dtype: object
(2582,)
external_top_ranking:  0       PDGFRA
369       TJP1
1387    SEMA5A
1895     FGFR3
4144    CEP192
Name: Gene_Name, dtype: object
(1974,)
external_top_ranking:  0       PDGFRA
201       TSC2
1013      BRD1
1321      ATRX
418     PLXND1
Name: Gene_Name, dtype: object
(3781,)
external_top_ranking:  0        PDGFRA
10078    LANCL1
8479      FARP2
3357      STAP2
1465       IGF2
Name: Gene_Name, dtype: object
(1405,)
external_top_ra

In [26]:
# f = read_full_ranking('Polycystic kidney disease')
s = pd.read_csv('synonyms.csv', index_col=0)
s.disease.unique()

array(['Polycystic kidney disease', 'Epilepsy', 'Hypertension',
       'Kidney disease', 'Coronary artery disease',
       'Chronic obstructive pulmonary disease'], dtype=object)

In [27]:
s.synonym.unique()

array(['Polycystic_kidney_dysplasia', 'Ectopic_kidney',
       'Chronic_kidney_disease', 'Stage_5_chronic_kidney_disease',
       'Renal_insufficiency', 'Multiple_renal_cysts',
       'Abnormality_of_renal_excretion',
       'Abnormal_renal_insterstitial_morphology',
       'Abnormality_of_the_kidney', 'Horseshoe_kidney', 'Seizure',
       'Myoclonic_seizure', 'Psychogenic_non-epileptic_seizure',
       'Generalized_myoclonic_seizure', 'Dialeptic_seizure',
       'Epileptic_encephalopathy', 'Atonic_seizure',
       'Focal_motor_seizure', 'Focal-onset_seizure',
       'Symptomatic_seizures', 'Hypertension',
       'Pulmonary_arterial_hypertension', 'Portal_hypertension',
       'Elevated_pulmonary_artery_pressure', 'Diabetes_mellitus',
       'Abnormal_systemic_blood_pressure',
       'Abnormality_of_intracranial_pressure',
       'Type_II_diabetes_mellitus', 'Insulin-resistant_diabetes_mellitus',
       'Hyperlipidemia', 'Renal_tubular_dysfunction',
       'Abnormality_of_renal_glomeru