In [3]:
import pandas as pd
import gseapy as gp
from scipy.stats import rankdata
from itertools import combinations
import numpy as np
import warnings
from core_functions import *
warnings.filterwarnings('ignore')

Note on parameters: 
<p> All file names should be written in quotes (e.g. 'filename').</p>
<p> By default, GSEA will generate the enrichment plots of the top-20 enriched pathways. If you want to change this number, you can set it in "output_plots".</p>
<p> By default, it will run 1,000 random permutations, which means the lowest P-value you can obtain is 10<sup>-3</sup>. You may lower this number to speed up the enrichment, or increase it to gain statistical power (although this may cause a memory error). You can change it in "permutations_number".</p>
<p> If you input a ranked file, it is assumed that lower values mean down-regulation and higher values mean up-regulation.</p>

In [4]:
# Input file: either a ranked list of genes or an expression matrix (it is read from ../data/).
# If it is an expression matrix, the first column, named GENE, should contain the gene symbols.
# Genes with zero or negative expression values will be discarded from the analysis.
filename = 'example_expression_matrix.xls'

# Is the input file an expression matrix? True or False (False = ranked list of genes)
exp_mat = True

# Do you want to run paired combinations? (set to True)
# Or each condition against all the others? (set to False)
paired_combi = False

# Gene-set database (.gmt file) used to perform the GSEA
db_name = '../data/MSigDB/hallmarks.gmt'

# Advanced GSEA parameters
# permutations_number: number of random permutations
# output_plots: number of top enriched pathways for which enrichment plots are generated
# NOTE(review): the parameter notes in this notebook mention 1,000 permutations and
# top-20 plots as defaults; confirm that the smaller values below are intended.
permutations_number = 10
output_plots = 5

# Name of the analysis; it is used for the output folder and the summary file in the results/ folder
analysis_name = 'example_ranked'

In [5]:
# Load the input file from ../data/ and launch the GSEA workflow selected by the
# configuration flags `exp_mat` and `paired_combi`.
if exp_mat:
    # The input is an expression matrix.
    # read_excel has no `sep` parameter (an Excel workbook has no field delimiter);
    # passing one raises a TypeError on current pandas, so it is not passed here.
    exp = pd.read_excel('../data/'+filename,index_col=False,header=0)
    print('Expression data loaded')
    if paired_combi:
        print('>>>>>>>>Running GSEA with paired combinations<<<<<<<<<')
        paired_combinations(exp,db_name,analysis_name,permutations_number,output_plots)
    else:
        print('>>>>>>>>Running GSEA with one against all combinations<<<<<<<<<')
        one_againts_all_combinations(exp,db_name,analysis_name,permutations_number,output_plots)
else:
    # The input is a ranked list of genes; its two columns are labelled GENE and FC.
    genes_rankval = pd.read_excel('../data/'+filename,names=['GENE','FC'])
    # Rank the genes by their FC value (ascending; rankdata averages tied ranks by default).
    genes_rankval['RANK'] = rankdata(genes_rankval['FC'].tolist())
    # NOTE(review): unlike the expression-matrix branch, permutations_number and
    # output_plots are not forwarded here - confirm against run_GSEA_function's signature.
    run_GSEA_function(genes_rankval,db_name,analysis_name)

expression data loaded
>>>>>>>>Running GSEA with one against all combinations<<<<<<<<<
---> working on combination: condition1 vs the rest
	 3847  genes discarded for having null (0 or <0) values in one or both conditions
	running GSEA for condition1  vs the rest, this might take hours
---> working on combination: condition2 vs the rest
	 3490  genes discarded for having null (0 or <0) values in one or both conditions
	running GSEA for condition2  vs the rest, this might take hours
---> working on combination: condition3 vs the rest
	 3640  genes discarded for having null (0 or <0) values in one or both conditions
	running GSEA for condition3  vs the rest, this might take hours


Explanation of the columns in the Excel summary file
<ul>
<li>Term: name of the pathway enriched</li>
<li>es: enrichment score</li>
<li>nes: normalized enrichment score (the one to look at!)</li>
<li>pval: P-value of the enrichment</li>
<li>fdr: FDR-corrected P-value of the enrichment (the one to look at!). Recommended threshold: fdr &lt; 0.25</li>
<li>gene_set_size: number of genes in the pathway</li>
<li>matched_size: number of genes in the pathway and in your input file</li>
<li>genes: genes in the pathway and in your input file</li>
</ul>