# Validation using DIGEST

Notebook to recreate the results presented in the paper. Keep in mind that the results can differ slightly, as the random background model will not generate exactly the same random sets as before.

## Setup

In [2]:
import sys
import json
import pandas as pd
from IPython.display import IFrame
# ==== import single validation script ====
from pathlib import Path
sys.path.append(str(Path.cwd().parent))
from single_validation import single_validation

## Cluster itself

Evaluate a target clustering of diseases or genes using the Dunn index, the silhouette score or the Davies–Bouldin index. The random runs are simply cluster-size-preserving perturbations of the cluster assignments.

## Gene Cluster

### Set input parameters

In [4]:
# ==== define required input ====
tar_set = "input/target_gene_cluster.txt"
tar_id_type = "symbol"
mode = "cluster"
# ==== define optional input ====
# Keep the trailing "/": the cells below build file paths as out_dir + prefix + suffix,
# so without it the directory name would be fused with the file prefix.
out_dir = "results/gene_cluster/"
background_model = "complete"  # the only background model for clustering
runs = 1000  # how many random runs for empirical p-value estimation
perc = 100  # how many % of the original input should be perturbed for the background model
plot = True  # create plots for visualization from results
verbose = True  # printing additional information during the run
prefix = "cluster_gene_symbol_complete"

### Run script

In [5]:
# Validate the gene clustering with the parameters defined in the cell above.
# The cells below read the saved files back via out_dir and prefix.
single_validation(
    tar=tar_set,
    tar_id=tar_id_type,
    mode=mode,
    background_model=background_model,
    runs=runs,
    replace=perc,
    out_dir=out_dir,
    prefix=prefix,
    plot=plot,
    verbose=verbose,
)

[00:00:00|2396.57MB] Starting validation ...
[00:00:00|2396.57MB] Load mappings for input into cache ...
[00:00:01|2397.34MB] Load distances for input into cache ...
[00:00:10|2457.59MB] Load input data ...
[00:00:10|2457.83MB] Validation of input ...
Missing values for pathway.kegg :9/20
[00:00:11|2457.83MB] Validation of random runs ...
[00:03:46|2459.87MB] Save files
[00:03:46|2459.87MB] Finished validation


### Inspect results

In [6]:
# Load the JSON summary saved by the validation run of this section.
result_path = out_dir + prefix + '_result.json'
with open(result_path, 'r') as result_file:
    results = json.load(result_file)

Check all P-values

In [7]:
# Empirical p-values: one row per annotation type, one column per cluster
# quality measure (di, ss, dbi).
p_value_table = pd.DataFrame(data=results["p_values"])
p_value_table

Unnamed: 0,di,ss,dbi
go.BP,0.051948,0.35964,0.135864
go.CC,0.372627,1.0,0.81019
go.MF,0.425574,1.0,0.764236
pathway.kegg,0.37962,1.0,0.664336


Check validation values of input cluster

In [8]:
# Scores of the unperturbed input clustering, plus the ids that could be mapped
# for each annotation type (ss_inter presumably holds the per-cluster silhouette
# scores -- confirm against the DIGEST documentation).
input_value_table = pd.DataFrame(data=results["input_values"])
input_value_table

Unnamed: 0,di,ss,ss_inter,dbi,mapped_ids
go.BP,0.097523,-0.7,"{'1': -0.6, '0': -0.8}",0.180988,"[AGR2, AR, CLSTN2, COL9A3, DNAH7, EGFR, ERBB4,..."
go.CC,0.181698,-1.0,"{'1': -1.0, '0': -1.0}",1.835614,"[AGR2, AR, CLSTN2, COL9A3, DNAH7, EGFR, ERBB4,..."
go.MF,0.184877,-0.9,"{'1': -0.8, '0': -1.0}",1.962328,"[AGR2, AR, CLSTN2, COL9A3, DNAH7, EGFR, ERBB4,..."
pathway.kegg,0.279293,-0.909091,"{'0': -1.0, '1': -0.8}",0.275175,"[AR, COL9A3, DNAH7, EGFR, ERBB4, ESR1, KRT16, ..."


#### Show visualization

If the plot flag was set, you can view a plot of the p-values for every attribute and a mappability plot showing how many of the input IDs had assigned values per attribute.

In [18]:
# Embed the p-value plot for the "di" measure written by the run above.
di_plot_file = out_dir + prefix + '_di_p-value.pdf'
IFrame(di_plot_file, width=600, height=400)

In [20]:
# Embed the mappability plot written by the run above.
mappability_plot_file = out_dir + prefix + '_mappability.pdf'
IFrame(mappability_plot_file, width=600, height=400)

## Disease Cluster

### Set input parameters

In [3]:
# ==== required input ====
tar_set = "input/target_disease_cluster.txt"
tar_id_type = "ICD-10"
mode = "cluster"

# ==== optional input ====
# Output location; the trailing "/" matters because later cells concatenate
# out_dir + prefix + suffix to locate the result files.
out_dir = "results/"
prefix = "cluster_disease_ICD-10_complete"

# The only background model for clustering.
background_model = "complete"
# Number of random runs for the empirical p-value estimation.
runs = 1000
# Percentage of the original input that is perturbed for the background model.
perc = 100

# Create plots for visualization from the results.
plot = True
# Print additional information during the run.
verbose = True

### Run script

In [4]:
# Validate the disease clustering with the parameters defined in the cell above.
# The cells below read the saved files back via out_dir and prefix.
single_validation(
    tar=tar_set,
    tar_id=tar_id_type,
    mode=mode,
    background_model=background_model,
    runs=runs,
    replace=perc,
    out_dir=out_dir,
    prefix=prefix,
    plot=plot,
    verbose=verbose,
)

[00:00:00|152.57MB] Starting validation ...
[00:00:00|152.57MB] Load mappings for input into cache ...
[00:00:01|590.84MB] Load distances for input into cache ...
[00:00:02|803.79MB] Load input data ...
[00:00:02|804.43MB] Validation of input ...
Missing values for disgenet.genes_related_to_disease :2/64
Missing values for disgenet.variants_related_to_disease :7/64
Missing values for ctd.pathway_related_to_disease :18/64
[00:00:03|809.26MB] Validation of random runs ...
[00:22:18|817.12MB] Save files
[00:22:18|817.12MB] Finished validation


### Inspect results

In [9]:
# Load the JSON summary saved by the validation run of this section.
result_path = out_dir + prefix + '_result.json'
with open(result_path, 'r') as result_file:
    results = json.load(result_file)

Check all P-values

In [10]:
# Empirical p-values: one row per annotation type, one column per cluster
# quality measure (di, ss, dbi).
p_value_table = pd.DataFrame(data=results["p_values"])
p_value_table

Unnamed: 0,di,ss,dbi
disgenet.genes_related_to_disease,0.080919,0.346653,0.144855
disgenet.variants_related_to_disease,0.133866,0.06993,0.845155
ctd.pathway_related_to_disease,0.060939,0.225774,0.252747


Check validation values of input cluster

In [11]:
# Scores of the unperturbed input clustering, plus the ids that could be mapped
# for each annotation type (ss_inter presumably holds the per-cluster silhouette
# scores -- confirm against the DIGEST documentation).
input_value_table = pd.DataFrame(data=results["input_values"])
input_value_table

Unnamed: 0,di,ss,ss_inter,dbi,mapped_ids
disgenet.genes_related_to_disease,0.000348,-0.990557,"{'8': -1.0, '4': -0.986219324225794, '7': -1.0...",780.645289,"[I00, I01, I02, I05, I06, I09, I10, I11, I12, ..."
disgenet.variants_related_to_disease,2e-05,-0.664652,"{'8': -0.19666292222898646, '7': -0.3540285312...",6075.650277,"[I00, I01, I02, I05, I06, I09, I10, I11, I12, ..."
ctd.pathway_related_to_disease,0.001232,-0.881844,"{'4': -1.0, '5': -1.0, '3': -1.0, '1': 0.0, '6...",140.398352,"[I00, I06, I10, I11, I12, I13, I14, I15, I20, ..."


#### Show visualization

If the plot flag was set, you can view a plot of the p-values for every attribute and a mappability plot showing how many of the input IDs had assigned values per attribute.

In [12]:
# Embed the p-value plot for the "di" measure written by the run above.
di_plot_file = out_dir + prefix + '_di_p-value.pdf'
IFrame(di_plot_file, width=600, height=400)

In [13]:
# Embed the mappability plot written by the run above.
mappability_plot_file = out_dir + prefix + '_mappability.pdf'
IFrame(mappability_plot_file, width=600, height=400)

## Set itself

Validate a set either against a reference (a reference set or a reference ID) or reference-free. Two background models can be used.

## Disease Set

### Set input parameters

In [15]:
# ==== required input ====
tar_set = "input/target_disease_set.txt"
tar_id_type = "mesh"
mode = "set"

# ==== optional input ====
# Output location; the trailing "/" matters because later cells concatenate
# out_dir + prefix + suffix to locate the result files.
out_dir = "results/"
prefix = "set_disease_mesh_term-pres"

# Background model used to draw the random sets (set validation supports two
# models; the cluster runs above use "complete").
background_model = "term-pres"
# Number of random runs for the empirical p-value estimation.
runs = 1000
# Percentage of the original input that is perturbed for the background model.
perc = 100

# Create plots for visualization from the results.
plot = True
# Print additional information during the run.
verbose = True

### Run script

In [17]:
# Validate the disease set (reference-free) with the parameters defined in the
# cell above. The cells below read the saved files back via out_dir and prefix.
single_validation(
    tar=tar_set,
    tar_id=tar_id_type,
    mode=mode,
    background_model=background_model,
    runs=runs,
    replace=perc,
    out_dir=out_dir,
    prefix=prefix,
    plot=plot,
    verbose=verbose,
)

[00:00:00|857.46MB] Starting validation ...
[00:00:00|857.46MB] Load mappings for input into cache ...
[00:00:01|856.39MB] Load distances for input into cache ...
[00:00:02|906.50MB] Validation of input ...
[00:00:02|906.50MB] Validation of random runs ...
[00:00:27|907.19MB] Calculating p-values ...
[00:00:27|907.19MB] Save files
[00:00:27|907.19MB] Finished validation


### Inspect results

In [20]:
# Load the JSON summary saved by the validation run of this section.
result_path = out_dir + prefix + '_result.json'
with open(result_path, 'r') as result_file:
    results = json.load(result_file)

Check all P-values

In [23]:
# In set mode there is a single empirical p-value per annotation type,
# so the plain dict is displayed as is.
p_values = results["p_values"]
p_values

{'disgenet.genes_related_to_disease': 0.027972027972027972,
 'disgenet.variants_related_to_disease': 0.00999000999000999,
 'ctd.pathway_related_to_disease': 0.026973026973026972}

Check validation values of input set

In [24]:
# Validation value of the input set and the ids that could be mapped,
# one row per annotation type.
input_value_table = pd.DataFrame(data=results["input_values"])
input_value_table

Unnamed: 0,value,mapped_ids
disgenet.genes_related_to_disease,1.188217,"[D000544, D001249, D003704, D003924, D006333, ..."
disgenet.variants_related_to_disease,0.200375,"[D000544, D001249, D003704, D003924, D006333, ..."
ctd.pathway_related_to_disease,2.738386,"[D000544, D001249, D003704, D003924, D006333, ..."


#### Show visualization

If the plot flag was set, you can view a plot of the p-values for every attribute and a mappability plot showing how many of the input IDs had assigned values per attribute.

In [25]:
# Embed the p-value plot written by the run above.
p_value_plot_file = out_dir + prefix + '_p-value.pdf'
IFrame(p_value_plot_file, width=600, height=400)

In [26]:
# Embed the mappability plot written by the run above.
mappability_plot_file = out_dir + prefix + '_mappability.pdf'
IFrame(mappability_plot_file, width=600, height=400)

## Gene Set

### Set input parameters

In [28]:
# ==== required input ====
tar_set = "input/target_gene_set.txt"
tar_id_type = "symbol"
ref_set = "input/reference_gene_set.txt"
ref_id_type = "symbol"
mode = "set-set"

# ==== optional input ====
# Output location; the trailing "/" matters because later cells concatenate
# out_dir + prefix + suffix to locate the result files.
out_dir = "results/"
prefix = "set-set_gene_symbol_term-pres"

# Only enriched values of the reference set are used.
enriched = True
# Background model used to draw the random sets (set validation supports two
# models; the cluster runs above use "complete").
background_model = "term-pres"
# Number of random runs for the empirical p-value estimation.
runs = 1000
# Percentage of the original input that is perturbed for the background model.
perc = 100

# Create plots for visualization from the results.
plot = True
# Print additional information during the run.
verbose = True

### Run script

In [36]:
# Validate the target gene set against the reference gene set with the
# parameters defined in the cell above.
# replace=perc is passed explicitly, as in the other runs of this notebook, so
# that the configured perturbation percentage is actually honoured instead of
# silently relying on the function's default.
single_validation(
    tar=tar_set,
    tar_id=tar_id_type,
    ref=ref_set,
    ref_id=ref_id_type,
    mode=mode,
    enriched=enriched,
    background_model=background_model,
    runs=runs,
    replace=perc,
    out_dir=out_dir,
    prefix=prefix,
    plot=plot,
    verbose=verbose,
)

[00:00:00|933.12MB] Starting validation ...
[00:00:00|933.12MB] Load mappings for input into cache ...
[00:00:17|932.88MB] Validation of input ...
[00:00:17|932.88MB] Validation of random runs ...
[00:02:03|933.33MB] Calculating p-values ...
[00:02:03|933.33MB] Save files
[00:02:03|933.33MB] Finished validation


### Inspect results

In [37]:
# Load the JSON summary saved by the validation run of this section.
result_path = out_dir + prefix + '_result.json'
with open(result_path, 'r') as result_file:
    results = json.load(result_file)

Check all P-values

In [38]:
# In set-set mode there is a single empirical p-value per annotation type,
# so the plain dict is displayed as is.
p_values = results["p_values"]
p_values

{'go.BP': 0.000999000999000999,
 'go.CC': 0.000999000999000999,
 'go.MF': 0.004995004995004995,
 'pathway.kegg': 1.0}

Check validation values of input set

In [39]:
# Validation value of the input set and the ids that could be mapped,
# one row per annotation type.
input_value_table = pd.DataFrame(data=results["input_values"])
input_value_table

Unnamed: 0,value,mapped_ids
go.BP,0.625,"[A2M, ABCA7, ACE, AD10, AD11, AD12, AD13, AD14..."
go.CC,0.65,"[A2M, ABCA7, ACE, AD10, AD11, AD12, AD13, AD14..."
go.MF,0.35,"[A2M, ABCA7, ACE, AD10, AD11, AD12, AD13, AD14..."
pathway.kegg,0.0,"[A2M, ABCA7, ACE, AD10, AD11, AD12, AD13, AD14..."


#### Show visualization

If the plot flag was set, you can view a plot of the p-values for every attribute and a mappability plot showing how many of the input IDs had assigned values per attribute.

In [40]:
# Embed the p-value plot written by the run above.
p_value_plot_file = out_dir + prefix + '_p-value.pdf'
IFrame(p_value_plot_file, width=600, height=400)

In [41]:
# Embed the mappability plot written by the run above.
mappability_plot_file = out_dir + prefix + '_mappability.pdf'
IFrame(mappability_plot_file, width=600, height=400)