# Perform GSEA using GSEAPY  

Following the protocol defined here: https://gseapy.readthedocs.io/en/latest/gseapy_tutorial.html#use-gsea-command-or-gsea


In [2]:
%matplotlib inline
%config InlineBackend.figure_format='retina' # mac
import pandas as pd
import gseapy as gp
import numpy as np
import matplotlib.pyplot as plt

In [3]:
# Display the version string of the imported gseapy package.
gp.__version__

'0.9.15'

## Create gene lists

In [21]:
import csv 
df = pd.read_csv('~/Dropbox (UCSD_Epigenomics)/workReports/2019-09_islet_rev/Mawla_2019_tabS2a.csv')
gmt = '../dat/1908/20191001_Mawla_tabS2A.gmt'

# Write one tab-separated GMT row per gene group.
# newline='' is what the csv module requires for files handed to csv.writer;
# without it rows end in '\r\r\n' on platforms that translate newlines.
with open(gmt, 'w', newline='') as f:
    tsv_writer = csv.writer(f, delimiter='\t')
    for i in range(1, 3):
        s = 'Mawla_Beta_' + str(i)
        print(s)
        # NOTE(review): rows are selected where logFC is exactly 1 or 2, so the
        # column presumably holds a group code in this table -- confirm.
        genes = df[df.logFC == i].gene.tolist()
        print(len(genes))
        # A GMT row is: set name, description, then the member genes.
        # The set name doubles as the description.
        tsv_writer.writerow([s, s] + genes)



Mawla_Beta_1
14
Mawla_Beta_2
38


## alpha

In [17]:
def _scaled_log2_odds(rnk, rseed):
    '''Return a copy of ``rnk`` whose ``odds`` column is log2-transformed,
    free of +/-inf, and divided by its largest absolute value.

    Infinite log2 odds are replaced by the finite extreme (max for +inf,
    min for -inf) multiplied by ``1 + u/100`` with ``u`` drawn uniformly from
    [0, 1), so the replaced entries do not all share one value.  Draws are
    taken for the +inf rows first, then the -inf rows, in row order.
    '''
    rnk = rnk.reset_index(drop=True)
    # log2(0) == -inf is expected here and repaired below; silence the warning.
    with np.errstate(divide='ignore'):
        log_odds = np.log2(rnk['odds'].astype(float))

    finite = log_odds[np.isfinite(log_odds)]
    if finite.empty:
        raise ValueError('no finite log2 odds available to rank on')

    # A private generator yields the same stream as np.random.seed(rseed)
    # followed by np.random.uniform(), but leaves the global RNG untouched.
    rng = np.random.RandomState(rseed)
    for sentinel, anchor in ((np.inf, finite.max()), (-np.inf, finite.min())):
        hits = log_odds == sentinel
        if hits.any():
            log_odds.loc[hits] = np.array(
                [anchor * (1 + rng.uniform() / 100) for _ in range(int(hits.sum()))]
            )

    rnk['odds'] = log_odds / log_odds.abs().max()
    return rnk


def run_GSEA_gene(celltype,nperm=1000,rseed=1000,
             rnkfile="../dat/figdata/fig2_prom_ttest_res.csv",
             glist='../dat/glists/gsea.gmt',col=1,
             rank_metrics='odds',zth=3,nTop=500,**kwargs):
    '''
    Run ``gp.prerank`` for one cell type against the gene sets in ``glist``.

    The rank table is read from ``rnkfile`` (columns ``celltype``, ``gene``
    and ``rank_metrics``), restricted to ``celltype`` and sorted ascending by
    the metric.  When ``rank_metrics == 'odds'`` the metric is converted to
    log2 odds, infinities are replaced by slightly jittered finite extremes,
    and the column is scaled by its maximum absolute value.

    Parameters
    ----------
    celltype : value matched against the ``celltype`` column.
    nperm : passed to ``gp.prerank`` as ``permutation_num``.
    rseed : seed for the jitter draws and for ``gp.prerank`` (``seed``).
    rnkfile : CSV path of the rank table.
    glist : passed to ``gp.prerank`` as ``gene_sets``.
    col : accepted for backward compatibility; not used by this function
        (original note: "col=1 - Z, 0 - logFC").
    rank_metrics : name of the column to rank on.
    zth : only echoed to stdout; not used in the computation.
    nTop : accepted for backward compatibility; not used by this function.
    **kwargs : forwarded unchanged to ``gp.prerank``.

    Returns
    -------
    The object returned by ``gp.prerank``.

    Raises
    ------
    ValueError
        If no row matches ``celltype``, or (for ``'odds'``) no finite log2
        odds exist.
    '''
    table = pd.read_csv(rnkfile)[['celltype','gene',rank_metrics]]
    selected = table.loc[table['celltype']==celltype]
    if selected.empty:
        raise ValueError('no rows in {0} for celltype {1!r}'.format(rnkfile, celltype))

    rnk = selected.drop(columns='celltype').sort_values(by=rank_metrics)
    if rank_metrics=='odds':
        rnk = _scaled_log2_odds(rnk, rseed)

    print(rnk.shape)
    print(rnk.head(1))
    print(rnk.tail(1))
    print('zth:{0}'.format(zth))
    gs_res_a = gp.prerank(rnk=rnk,
                          gene_sets=glist,        # enrichr library names or gmt file
                          permutation_num=nperm,  # reduce number to speed up test
                          outdir=None,            # do not write output to disk
                          no_plot=True,           # skip plotting
                          seed=rseed,
                          min_size=1,
                          max_size=5000,
                          processes=8,
                          **kwargs)
    return(gs_res_a)


#### On Z value

In [22]:
# Pre-ranked GSEA against the gene sets written to `gmt`, once per cell type.
# Alpha uses 6000 permutations with seed 1000; beta uses 4000 with seed 2000.
gs_res_a = run_GSEA_gene(celltype='alpha', nperm=6000, rseed=1000, glist=gmt)

gs_res_b = run_GSEA_gene(celltype='beta', nperm=4000, rseed=2000, glist=gmt)

# Show the alpha table explicitly; the beta table is the cell's own output.
display(gs_res_a.res2d)
gs_res_b.res2d

(21715, 2)
     gene      odds
0  ZNF106 -0.994725
          gene      odds
21714  PLEKHS1  0.806376
zth:3


2019-10-01 11:31:48,251 Input gene rankings contains duplicated IDs, Only use the duplicated ID with highest value!


(21825, 2)
   gene      odds
0  PDHX -0.993998
        gene      odds
21824  TIGIT  0.968675
zth:3


2019-10-01 11:31:54,971 Input gene rankings contains duplicated IDs, Only use the duplicated ID with highest value!


Unnamed: 0_level_0,es,nes,pval,fdr,geneset_size,matched_size,genes,ledge_genes
Term,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
Mawla_Beta_2,-0.365183,-4.186711,0.153548,0.070662,38,32,CPA1;SERPINA1;NPY;SYNGR4;C10orf10;S100A11;PRG4...,ACTG1;HSPA6;PPY;GADD45B;IL11;HS3ST2
Mawla_Beta_1,0.494201,2.0236,0.278441,0.167012,14,13,IGSF1;G6PC2;IAPP;SORL1;CECR1;ASB9;EDN3;TSPAN1;...,IGSF1;G6PC2;IAPP;SORL1;CECR1


Unnamed: 0_level_0,es,nes,pval,fdr,geneset_size,matched_size,genes,ledge_genes
Term,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
Mawla_Beta_1,0.621505,2.119198,0.072469,0.035847,14,14,SORL1;ASB9;HADH;G6PC2;IGSF1;IAPP;CECR1;PCSK1;T...,SORL1;ASB9;HADH;G6PC2;IGSF1;IAPP;CECR1;PCSK1
Mawla_Beta_2,-0.332739,-5.767171,0.232587,0.095238,38,31,CPA1;SYNGR4;C10orf10;CTRB2;NPY;TMSB4X;SOD2;S10...,SAT1;SST;TIMP1;NAMPT;PTGS2;BIRC3;SPP1;GADD45B;...


#### On ODDS

In [24]:
# Same analysis with col=0 and seed 2000 for both cell types.
# NOTE: run_GSEA_gene as defined in this notebook never reads `col`, so the
# ranking metric is unchanged; only the alpha seed differs from the run above.
gs_res_a = run_GSEA_gene(celltype='alpha', nperm=6000, col=0, rseed=2000, glist=gmt)

gs_res_b = run_GSEA_gene(celltype='beta', nperm=4000, col=0, rseed=2000, glist=gmt)

# Show the alpha table explicitly; the beta table is the cell's own output.
display(gs_res_a.res2d)
gs_res_b.res2d

(21715, 2)
     gene      odds
0  ZNF106 -0.997682
          gene      odds
21714  PLEKHS1  0.806516
zth:3


2019-10-01 11:34:51,244 Input gene rankings contains duplicated IDs, Only use the duplicated ID with highest value!


(21825, 2)
   gene      odds
0  PDHX -0.993998
        gene      odds
21824  TIGIT  0.968675
zth:3


2019-10-01 11:34:57,601 Input gene rankings contains duplicated IDs, Only use the duplicated ID with highest value!


Unnamed: 0_level_0,es,nes,pval,fdr,geneset_size,matched_size,genes,ledge_genes
Term,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
Mawla_Beta_2,-0.365183,-4.133133,0.148168,0.074644,38,32,CPA1;SERPINA1;NPY;SYNGR4;C10orf10;S100A11;PRG4...,ACTG1;HSPA6;PPY;GADD45B;IL11;HS3ST2
Mawla_Beta_1,0.494201,2.010933,0.269904,0.170621,14,13,IGSF1;G6PC2;IAPP;SORL1;CECR1;ASB9;EDN3;TSPAN1;...,IGSF1;G6PC2;IAPP;SORL1;CECR1


Unnamed: 0_level_0,es,nes,pval,fdr,geneset_size,matched_size,genes,ledge_genes
Term,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
Mawla_Beta_1,0.621505,2.119198,0.072469,0.035847,14,14,SORL1;ASB9;HADH;G6PC2;IGSF1;IAPP;CECR1;PCSK1;T...,SORL1;ASB9;HADH;G6PC2;IGSF1;IAPP;CECR1;PCSK1
Mawla_Beta_2,-0.332739,-5.767171,0.232587,0.095238,38,31,CPA1;SYNGR4;C10orf10;CTRB2;NPY;TMSB4X;SOD2;S10...,SAT1;SST;TIMP1;NAMPT;PTGS2;BIRC3;SPP1;GADD45B;...


### Save results

In [26]:
#gs_res.res2d[['es','nes','pval','fdr','geneset_size','matched_size']].to_csv('../dat/figdata/GSEA_beta_arrogo_f_seed2000.csv')
#gs_res_a.res2d[['es','nes','pval','fdr','geneset_size','matched_size']].to_csv('../dat/figdata/GSEA_alpha_arrogo_f_seed2000.csv')

from gseapy.plot import gseaplot, heatmap

# Write one enrichment plot (PDF) per gene-set term: beta results first,
# then alpha, each with its own file-name suffix.
for gs_res, suffix in ((gs_res_b, '_Mawla_b_seed2000_filtered.pdf'),
                       (gs_res_a, '_Mawla_a_seed2000_filtered.pdf')):
    terms = gs_res.res2d.index
    for term in terms:
        gseaplot(gs_res.ranking, term=term, **gs_res.results[term], ofname=term + suffix)