# Perform GSEA using GSEAPY  

Following the protocol defined here: https://gseapy.readthedocs.io/en/latest/gseapy_tutorial.html#use-gsea-command-or-gsea


In [4]:
%matplotlib inline
%config InlineBackend.figure_format='retina' # mac
import pandas as pd
import gseapy as gp
import numpy as np
import matplotlib.pyplot as plt

In [5]:
# Display the installed GSEApy version so it is recorded in the cell output.
gp.__version__

'0.9.9'

## alpha

### 1. Prepare prerank file

* eg: https://github.com/zqfang/GSEApy/blob/master/tests/data/temp.rnk



In [6]:
# Load the per-gene test results. Despite its name, this frame holds the rows
# for every cell type; the alpha subset is selected on the next line.
gene_exp_alpha = pd.read_csv("../dat/figdata/fig2_prom_ttest_res.csv",index_col=0)

# Rank metric for alpha cells: log2 of the odds, largest first.
df = gene_exp_alpha.loc[gene_exp_alpha["celltype"]=="alpha"].sort_values(by='odds', ascending=False)["odds"]
df = np.log2(df)

# Write the ranking in .rnk layout (gene <tab> score, no header row) and read
# it back as a two-column frame with integer column labels 0 and 1.
# header=False is passed explicitly because the default of Series.to_csv is
# not the same across pandas releases, and a header row would be parsed as a
# data row by header=None below (turning the score column into strings).
df.to_csv('../dat/figdata/res.genes.a.rnk', sep='\t', header=False)
rnk = pd.read_csv("../dat/figdata/res.genes.a.rnk", sep='\t', header=None)

# log2 produces +/-inf for infinite / zero odds. Replace those entries with
# values just beyond the finite extremes, each pushed outwards by a random
# offset of less than 1% of the extreme (presumably to avoid exact ties).
# A dedicated, seeded generator keeps the ranking identical between runs.
jitter_rng = np.random.RandomState(1000)
is_pos_inf = rnk[1] == np.inf
is_neg_inf = rnk[1] == -np.inf
finite_scores = rnk.loc[~(is_pos_inf | is_neg_inf), 1]
finite_max = finite_scores.max()
finite_min = finite_scores.min()

# abs() keeps the offset pointing away from the finite range whatever the
# sign of the extreme; for a positive max / negative min this equals the
# multiplicative form extreme * (1 + u / 100).
if is_pos_inf.any():
    rnk.loc[is_pos_inf, 1] = finite_max + abs(finite_max) * jitter_rng.uniform(size=is_pos_inf.sum()) / 100
if is_neg_inf.any():
    rnk.loc[is_neg_inf, 1] = finite_min - abs(finite_min) * jitter_rng.uniform(size=is_neg_inf.sum()) / 100

rnk.head()

  after removing the cwd from sys.path.
  if __name__ == '__main__':
  # Remove the CWD from sys.path while we load stuff.


Unnamed: 0,0,1
0,DOT1L_64253,4.837272
1,MATN4_84320,4.850401
2,TPM4_66944,4.865813
3,KRTAP10-10_86424,4.848259
4,FUBP1_5220,4.860186


### 2. Define gene sets

In our case, we will use the beta gene sets from the three literature sources, saved in [gmt](http://software.broadinstitute.org/cancer/software/gsea/wiki/index.php/Data_formats#GMT:_Gene_Matrix_Transposed_file_format_.28.2A.gmt.29) format.

It was done in another [notebook](./compare_with_previous_glists.ipynb)

### 3. Run GSEA

```python
gseapy.prerank(rnk='gsea_data.rnk', gene_sets='gene_sets.gmt', outdir='gseapy_out', min_size=15,
               max_size=1000, permutation_num=1000, weighted_score_type=1, ascending=False,
               figsize=(6.5,6), format='png')
```

In [None]:
# Pre-ranked GSEA on the current ranking table `rnk` against the gene sets in
# the local GMT file. Results stay in memory (outdir=None) and plotting is
# skipped; the seed is fixed at 1000.
gs_res = gp.prerank(
    rnk=rnk,
    gene_sets='../dat/glists/gsea_new.gmt',
    min_size=10,            # gene-set size bounds passed to prerank
    max_size=2100,
    permutation_num=1000,
    seed=1000,
    processes=4,
    outdir=None,
    no_plot=True,
)

# Show the result table as the cell output.
gs_res.res2d

In [None]:
# Write the result table of the current `gs_res` to CSV.
gs_res.res2d.to_csv('../dat/figdata/GSEA_alpha_res_seed1000.csv')

### show the results 
The **gsea** module will generate a heatmap for the genes in each gene set in the background.
But if you need to do it yourself, use the code below.

In [None]:
from gseapy.plot import gseaplot, heatmap

# Draw one enrichment plot per term in the result table; each call receives
# that term's stored result fields and an output name of <term>_a_seed1000.pdf.
terms = gs_res.res2d.index
for term in terms:
    term_result = gs_res.results[term]
    gseaplot(gs_res.ranking, term=term, **term_result, ofname=term+'_a_seed1000.pdf')


## beta

In [9]:
# Load the per-gene test results. Despite its name, this frame holds the rows
# for every cell type; the beta subset is selected on the next line.
gene_exp_alpha = pd.read_csv("../dat/figdata/fig2_prom_ttest_res.csv",index_col=0)

# Rank metric for beta cells: log2 of the odds, largest first.
df = gene_exp_alpha.loc[gene_exp_alpha["celltype"]=="beta"].sort_values(by='odds', ascending=False)["odds"]
df = np.log2(df)

# Write the ranking in .rnk layout (gene <tab> score, no header row) and read
# it back as a two-column frame with integer column labels 0 and 1.
# The beta ranking gets its own file so it no longer overwrites the alpha
# ranking stored in res.genes.a.rnk.
# header=False is passed explicitly because the default of Series.to_csv is
# not the same across pandas releases, and a header row would be parsed as a
# data row by header=None below (turning the score column into strings).
df.to_csv('../dat/figdata/res.genes.b.rnk', sep='\t', header=False)
rnk = pd.read_csv("../dat/figdata/res.genes.b.rnk", sep='\t', header=None)

# log2 produces +/-inf for infinite / zero odds. Replace those entries with
# values just beyond the finite extremes, each pushed outwards by a random
# offset of less than 1% of the extreme (presumably to avoid exact ties).
# A dedicated, seeded generator keeps the ranking identical between runs.
jitter_rng = np.random.RandomState(1000)
is_pos_inf = rnk[1] == np.inf
is_neg_inf = rnk[1] == -np.inf
finite_scores = rnk.loc[~(is_pos_inf | is_neg_inf), 1]
finite_max = finite_scores.max()
finite_min = finite_scores.min()

# abs() keeps the offset pointing away from the finite range whatever the
# sign of the extreme; for a positive max / negative min this equals the
# multiplicative form extreme * (1 + u / 100).
if is_pos_inf.any():
    rnk.loc[is_pos_inf, 1] = finite_max + abs(finite_max) * jitter_rng.uniform(size=is_pos_inf.sum()) / 100
if is_neg_inf.any():
    rnk.loc[is_neg_inf, 1] = finite_min - abs(finite_min) * jitter_rng.uniform(size=is_neg_inf.sum()) / 100

rnk.head()



  after removing the cwd from sys.path.
  if __name__ == '__main__':
  # Remove the CWD from sys.path while we load stuff.


Unnamed: 0,0,1
0,PI4KA_870521,5.049474
1,NELL1_18204,5.048925
2,PRKCH_37496,5.024383
3,KLKB1_102789,5.04668
4,UXS1_77396,5.049096


In [11]:
# Sanity check: (rows, columns) of the current ranking table.
rnk.shape

(21825, 2)

In [13]:
# Pre-ranked GSEA on the current ranking table `rnk` against the gene sets in
# the local GMT file. Results stay in memory (outdir=None) and plotting is
# skipped; the seed is fixed at 1000.
gs_res = gp.prerank(
    rnk=rnk,
    gene_sets='../dat/glists/gsea_new.gmt',
    min_size=10,            # gene-set size bounds passed to prerank
    max_size=4000,
    permutation_num=1000,
    seed=1000,
    processes=8,
    outdir=None,
    no_plot=True,
)

# Print the NES reported for the "Alpha2" gene set.
alpha2_row = gs_res.res2d.loc["Alpha2"]
print(alpha2_row.nes)

2019-03-11 17:33:56,968 No gene sets passed through filtering condition!!!, try new parameters again!
Note: check gene name, gmt file format, or filtering size.


SystemExit: 0

In [9]:
import math

# Try seeds 0..999 in turn and stop at the first one for which the "Alpha2"
# gene set gets a non-NaN NES. After the loop, `gs_res` holds the result of
# the last prerank call that was made.
for i in range(1000):
    print(i)
    gs_res = gp.prerank(rnk=rnk,
                        gene_sets='../dat/glists/gsea_new.gmt',
                        permutation_num=1000,
                        outdir=None,   # keep results in memory only
                        no_plot=True,  # skip plotting
                        seed=i,
                        min_size=10,
                        max_size=4000,
                        processes=8)
    # Look the NES up once and reuse it for both the log line and the check.
    alpha2_nes = gs_res.res2d.loc["Alpha2"].nes
    print(alpha2_nes)
    if not math.isnan(alpha2_nes):
        print(i)
        break
else:
    # Reached only when no seed triggered the break. Say so explicitly, because
    # `gs_res` would otherwise silently carry the (NaN) result of seed 999 into
    # the cells that save and plot it.
    print("No seed in range(1000) gave a non-NaN NES for Alpha2; gs_res holds the seed=999 result.")


0


2019-03-11 16:10:55,107 Input gene rankings contains duplicated IDs, Only use the duplicated ID with highest value!


nan
1


2019-03-11 16:10:57,334 Input gene rankings contains duplicated IDs, Only use the duplicated ID with highest value!


nan
2


2019-03-11 16:10:59,563 Input gene rankings contains duplicated IDs, Only use the duplicated ID with highest value!


nan
3


2019-03-11 16:11:01,794 Input gene rankings contains duplicated IDs, Only use the duplicated ID with highest value!


nan
4


2019-03-11 16:11:04,021 Input gene rankings contains duplicated IDs, Only use the duplicated ID with highest value!


nan
5


2019-03-11 16:11:06,251 Input gene rankings contains duplicated IDs, Only use the duplicated ID with highest value!


nan
6


2019-03-11 16:11:08,478 Input gene rankings contains duplicated IDs, Only use the duplicated ID with highest value!


nan
7


2019-03-11 16:11:10,699 Input gene rankings contains duplicated IDs, Only use the duplicated ID with highest value!


nan
8


2019-03-11 16:11:12,927 Input gene rankings contains duplicated IDs, Only use the duplicated ID with highest value!


nan
9


2019-03-11 16:11:15,135 Input gene rankings contains duplicated IDs, Only use the duplicated ID with highest value!


nan
10


2019-03-11 16:11:17,362 Input gene rankings contains duplicated IDs, Only use the duplicated ID with highest value!


nan
11


2019-03-11 16:11:19,596 Input gene rankings contains duplicated IDs, Only use the duplicated ID with highest value!


nan
12


2019-03-11 16:11:21,838 Input gene rankings contains duplicated IDs, Only use the duplicated ID with highest value!


nan
13


2019-03-11 16:11:24,050 Input gene rankings contains duplicated IDs, Only use the duplicated ID with highest value!


nan
14


2019-03-11 16:11:26,285 Input gene rankings contains duplicated IDs, Only use the duplicated ID with highest value!


nan
15


2019-03-11 16:11:28,519 Input gene rankings contains duplicated IDs, Only use the duplicated ID with highest value!


nan
16


2019-03-11 16:11:30,753 Input gene rankings contains duplicated IDs, Only use the duplicated ID with highest value!


KeyboardInterrupt: 

In [None]:
# Inspect the Python type of the result table.
type(gs_res.res2d)

In [None]:
# Write the result table of the current `gs_res` to CSV.
# NOTE(review): `gs_res` is whatever the most recent gp.prerank call returned. If the
# seed-search loop (seed=i) ran last, the seed is the loop index rather than 1000 —
# confirm that the "seed1000" file name matches the run being saved.
gs_res.res2d.to_csv('../dat/figdata/GSEA_beta_res_seed1000.csv')

In [None]:
from gseapy.plot import gseaplot, heatmap

# Draw one enrichment plot per term in the result table; each call receives
# that term's stored result fields and an output name of <term>_b_seed1000.pdf.
terms = gs_res.res2d.index
for term in terms:
    term_result = gs_res.results[term]
    gseaplot(gs_res.ranking, term=term, **term_result, ofname=term+'_b_seed1000.pdf')
