# Perform GSEA using GSEAPY  

Following the protocol defined here: https://gseapy.readthedocs.io/en/latest/gseapy_tutorial.html#use-gsea-command-or-gsea


In [1]:
%matplotlib inline
%config InlineBackend.figure_format='retina' # mac
import pandas as pd
import gseapy as gp
import numpy as np
import matplotlib.pyplot as plt

In [2]:
# Display the installed gseapy version so the results below can be tied to it.
gp.__version__

'0.9.9'

## Create gene lists

In [9]:
import csv
from openpyxl import load_workbook

# Build the alpha-state part of the GMT gene-set file: one row per worksheet
# ("Alpha-state 1" .. "Alpha-state 6"), keeping only the genes that pass both
# numeric cut-offs applied below.
wb = load_workbook(filename='../dat/1908/Arrojo_e_Drigo_biorxiv.alpha_states.xlsx')

# Mode 'w' starts the .gmt file from scratch. newline='' is what the csv module
# asks for on files handed to csv.writer; without it every row ends in
# '\r\r\n' on Windows.
with open('../dat/1908/20190812_Arrojo_genesets_filtered.gmt', 'w', newline='') as f:
    tsv_writer = csv.writer(f, delimiter='\t')
    for sheet_name in ['Alpha-state ' + str(i) for i in range(1, 7)]:
        print(sheet_name)
        sheet_rows = pd.DataFrame(wb[sheet_name].values)

        # The first worksheet row carries the column names; the rest is data.
        table = sheet_rows.iloc[1:].copy()
        table.columns = sheet_rows.iloc[0].tolist()
        table = table.set_index('gene_name')

        # Keep genes whose first data column is > 1 and second data column is > 3.
        # NOTE(review): what these two columns measure is not visible from this
        # notebook -- confirm against the workbook before changing the cut-offs.
        first_col = table.columns[0]
        second_col = table.columns[1]
        passes_filter = (table[first_col] > 1) & (table[second_col] > 3)
        genes = table[passes_filter].index.tolist()
        print(len(genes))

        # A GMT row is: set name, description, then the member genes. The sheet
        # name is used for both leading fields.
        tsv_writer.writerow([sheet_name, sheet_name] + genes)

Alpha-state 1
2045
Alpha-state 2
665
Alpha-state 3
54
Alpha-state 4
121
Alpha-state 5
1163
Alpha-state 6
637


In [10]:
# Append the beta-state gene sets ("Beta-state1 genes" .. "Beta-state4 genes")
# to the GMT file started by the alpha-state cell. Relies on `csv`,
# `load_workbook` and `pd` already being imported earlier in the notebook.
wb = load_workbook(filename='../dat/1908/Arrojo_e_Drigo_biorxiv.beta_states.xlsx')

# Mode 'a+' appends, so re-running this cell on its own adds the beta rows a
# second time; regenerate the file from the alpha cell first. newline='' is
# what the csv module asks for on files handed to csv.writer; without it every
# row ends in '\r\r\n' on Windows.
with open('../dat/1908/20190812_Arrojo_genesets_filtered.gmt', 'a+', newline='') as f:
    tsv_writer = csv.writer(f, delimiter='\t')
    for sheet_name in ['Beta-state{0} genes'.format(str(i)) for i in range(1, 5)]:
        print(sheet_name)
        sheet_rows = pd.DataFrame(wb[sheet_name].values)

        # The first worksheet row carries the column names; the rest is data.
        table = sheet_rows.iloc[1:].copy()
        table.columns = sheet_rows.iloc[0].tolist()
        table = table.set_index('gene_name')

        # Keep genes whose first data column is > 1 and second data column is > 3.
        # NOTE(review): what these two columns measure is not visible from this
        # notebook -- confirm against the workbook before changing the cut-offs.
        first_col = table.columns[0]
        second_col = table.columns[1]
        passes_filter = (table[first_col] > 1) & (table[second_col] > 3)
        genes = table[passes_filter].index.tolist()
        print(len(genes))

        # A GMT row is: set name, description, then the member genes. The sheet
        # name is used for both leading fields.
        tsv_writer.writerow([sheet_name, sheet_name] + genes)

Beta-state1 genes
1674
Beta-state2 genes
665
Beta-state3 genes
158
Beta-state4 genes
973


## alpha

In [25]:
# Rank the rows with celltype == "alpha" by log2(odds) and run pre-ranked GSEA
# against the filtered gene sets written above.
gene_exp_alpha = pd.read_csv("../dat/figdata/fig2_prom_ttest_res.csv", index_col=1)
df = (
    gene_exp_alpha.loc[gene_exp_alpha["celltype"] == "alpha"]
    .sort_values(by='odds', ascending=False)["odds"]
)
# log2 turns odds of 0 into -inf and infinite odds into +inf; both are handled below.
df = np.log2(df)

# Round-trip through the .rnk file so the ranking is also kept on disk.
# header=False is explicit because the default for Series.to_csv differs between
# pandas versions, and the read-back below expects a header-less two-column file.
df.to_csv('../dat/figdata/res.genes.a.rnk', sep='\t', header=False)
rnk = pd.read_table("../dat/figdata/res.genes.a.rnk", header=None)

# Replace +/-inf scores by the most extreme finite score, scaled by a random
# factor in [1, 1.01) so the replaced entries are not all tied. A dedicated
# seeded generator (same seed as the prerank call) keeps the ranking
# reproducible across kernel restarts.
jitter_rng = np.random.RandomState(1000)
finite_max = df[~df.isin([np.inf])].max()
finite_min = df[~df.isin([-np.inf])].min()

is_pos_inf = rnk[1] == np.inf
if is_pos_inf.any():
    rnk.loc[is_pos_inf, 1] = finite_max * (1 + jitter_rng.uniform(size=int(is_pos_inf.sum())) / 100)

is_neg_inf = rnk[1] == -np.inf
if is_neg_inf.any():
    rnk.loc[is_neg_inf, 1] = finite_min * (1 + jitter_rng.uniform(size=int(is_neg_inf.sum())) / 100)

# run gsea
# enrichr libraries are supported by gsea module. Just provide the name
gs_res_a = gp.prerank(
    rnk=rnk,
    gene_sets='../dat/1908/20190812_Arrojo_genesets_filtered.gmt',  # enrichr library names or gmt file
    permutation_num=6000,  # reduce number to speed up test
    outdir=None,  # do not write output to disk
    no_plot=True,  # Skip plotting
    seed=1000,
    min_size=1,
    max_size=5000,
    processes=6,
)

gs_res_a.res2d.sort_index()

Unnamed: 0_level_0,es,nes,pval,fdr,geneset_size,matched_size,genes,ledge_genes
Term,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
Alpha-state 1,-0.232409,,,1.0,2045,1993,MIIP;LHPP;KMT2D;ERI3;WNK2;SLC27A5;CANT1;MEGF8;...,HLA-A;NT5C;PLA2G15;RNF187;PRR14;OTUD5;TMEM63B;...
Alpha-state 2,0.257668,1.027081,0.416331,1.0,665,641,STON2;ZNF587;NFIB;CSNK2A2;DDAH1;MYSM1;EYA3;LIM...,STON2;ZNF587;NFIB;CSNK2A2;DDAH1;MYSM1;EYA3;LIM...
Alpha-state 3,0.321503,1.023287,0.429026,1.0,54,51,GSTZ1;SIL1;AGT;CMC1;RCN3;G6PC2;PEMT;ANGPTL4;TM...,GSTZ1;SIL1;AGT;CMC1;RCN3;G6PC2;PEMT;ANGPTL4
Alpha-state 4,-0.211246,-0.954905,0.58998,0.573824,121,113,TPM4;ATP2B4;C1orf168;CDHR3;NEDD9;TIPARP;PDLIM5...,CACNA1C;RPL5;RPL39;ERRFI1;RPL9;BAG3;RPS2;FOS;R...
Alpha-state 5,0.159779,0.642807,1.0,0.992933,1163,1134,KIAA1429;STT3A;PRLR;ST8SIA3;TMEM117;RALGAPA1;P...,KIAA1429;STT3A;PRLR;ST8SIA3;TMEM117;RALGAPA1;P...
Alpha-state 6,0.190015,0.755172,0.977002,1.0,637,612,MOK;DOT1L;TNS3;KIAA0430;MAML3;RAB30;DTNA;ILF3;...,MOK;DOT1L;TNS3;KIAA0430;MAML3;RAB30;DTNA;ILF3;...
Beta-state1 genes,-0.223725,-1.260108,0.0,0.172941,1674,1636,MIIP;LHPP;KMT2D;TMEM108;WWC1;WNK2;SLC27A5;CANT...,UAP1L1;TBC1D10B;NME1-NME2;REEP4;AGPAT2;TIMM17B...
Beta-state2 genes,0.234298,0.933637,0.70258,1.0,665,645,FAM193A;PPP2R2C;ZNF587;CSNK2A2;DDAH1;EYA3;LIMC...,FAM193A;PPP2R2C;ZNF587;CSNK2A2;DDAH1;EYA3;LIMC...
Beta-state3 genes,-0.222374,-1.035473,0.364154,0.508627,158,145,MOK;C22orf42;SPAG1;SGMS2;SLC37A4;KRT8;BAIAP3;U...,RFX3;SYVN1;SYT5;U2AF1;CCDC173;LZTFL1;MAK;PPIB;...
Beta-state4 genes,0.178106,0.712791,0.997497,1.0,973,948,GMDS;INPP4B;SLC7A8;ST8SIA3;RALGAPA1;PRUNE;TMEM...,GMDS;INPP4B;SLC7A8;ST8SIA3;RALGAPA1;PRUNE;TMEM...


## beta

In [24]:
# Rank the rows with celltype == "beta" by log2(odds) and run pre-ranked GSEA
# against the filtered gene sets written above.
# NOTE: despite its name, `gene_exp_alpha` holds the full results table here;
# the name is kept so anything else in the notebook that reads it still works.
gene_exp_alpha = pd.read_csv("../dat/figdata/fig2_prom_ttest_res.csv", index_col=1)
df = (
    gene_exp_alpha.loc[gene_exp_alpha["celltype"] == "beta"]
    .sort_values(by='odds', ascending=False)["odds"]
)
# log2 turns odds of 0 into -inf and infinite odds into +inf; both are handled below.
df = np.log2(df)

# Unlike the alpha cell, -inf scores are collapsed onto the finite minimum
# *before* the ranking is written, so they stay tied (no jitter is applied).
df = df.replace(-np.inf, df[~df.isin([-np.inf])].min())

# Round-trip through the .rnk file so the ranking is also kept on disk.
# header=False is explicit because the default for Series.to_csv differs between
# pandas versions, and the read-back below expects a header-less two-column file.
df.to_csv('../dat/figdata/res.genes.b.rnk', sep='\t', header=False)
rnk = pd.read_table("../dat/figdata/res.genes.b.rnk", header=None)

# Replace +inf scores by the largest finite score, scaled by a random factor in
# [1, 1.01) so the replaced entries are not all tied. A dedicated seeded
# generator (same seed as the prerank call) keeps the ranking reproducible
# across kernel restarts.
jitter_rng = np.random.RandomState(2000)
finite_max = df[~df.isin([np.inf])].max()
finite_min = df[~df.isin([-np.inf])].min()

is_pos_inf = rnk[1] == np.inf
if is_pos_inf.any():
    rnk.loc[is_pos_inf, 1] = finite_max * (1 + jitter_rng.uniform(size=int(is_pos_inf.sum())) / 100)

# No-op while the -inf replacement above is in effect; kept so the cell still
# behaves like the alpha cell if that replacement is removed.
is_neg_inf = rnk[1] == -np.inf
if is_neg_inf.any():
    rnk.loc[is_neg_inf, 1] = finite_min * (1 + jitter_rng.uniform(size=int(is_neg_inf.sum())) / 100)

# run gsea
# enrichr libraries are supported by gsea module. Just provide the name
gs_res = gp.prerank(
    rnk=rnk,
    gene_sets='../dat/1908/20190812_Arrojo_genesets_filtered.gmt',  # enrichr library names or gmt file
    permutation_num=4000,  # reduce number to speed up test
    outdir=None,  # do not write output to disk
    no_plot=True,  # Skip plotting
    weighted_score_type=1,
    seed=2000,
    min_size=10,
    max_size=5000,
    processes=8,
)

gs_res.res2d.sort_index()

Unnamed: 0_level_0,es,nes,pval,fdr,geneset_size,matched_size,genes,ledge_genes
Term,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
Alpha-state 1,-0.185698,,,1.0,2045,1986,TRIM41;ATP2A3;NCOR2;KMT2D;PRPF40B;NPEPL1;INS;S...,TMEM259;ATG9A;ETNK2;GIPC1;SLC25A39;PRR24;TMEM1...
Alpha-state 2,0.212311,0.732689,0.993,1.0,665,642,EDIL3;ARID1B;CCNYL1;PPP2R5C;TLE3;SLC39A11;SENP...,EDIL3;ARID1B;CCNYL1;PPP2R5C;TLE3;SLC39A11;SENP...
Alpha-state 3,0.30809,0.913401,0.611565,1.0,54,51,GSTZ1;ANGPTL4;AGT;G6PC2;PEMT;CDO1;SIL1;LOXL4;C...,GSTZ1;ANGPTL4;AGT;G6PC2;PEMT;CDO1;SIL1;LOXL4;C...
Alpha-state 4,-0.234361,-1.079642,0.243902,0.273191,121,112,FABP5;ANXA2;TUBA1C;TDRD9;C1orf168;CDHR3;PDE4D;...,SERTAD1;RPL9;ERRFI1;NR4A3;YWHAH;WEE1;HBEGF;HSP...
Alpha-state 5,0.212683,0.737763,0.99925,1.0,1163,1133,CKAP5;FTO;TVP23B;KPNB1;ATF7IP;MPZL1;SUGP2;FABP...,CKAP5;FTO;TVP23B;KPNB1;ATF7IP;MPZL1;SUGP2;FABP...
Alpha-state 6,0.259325,0.893709,0.83975,1.0,637,610,MAP3K2;KCNK1;FAM214A;RUFY3;ELL;NINJ1;BTG3;DTNA...,MAP3K2;KCNK1;FAM214A;RUFY3;ELL;NINJ1;BTG3;DTNA...
Beta-state1 genes,0.191234,0.664415,1.0,0.985546,1674,1633,TRIM41;ATP2A3;NCOR2;KMT2D;PRCC;NPEPL1;INS;SPAT...,TRIM41;ATP2A3;NCOR2;KMT2D;PRCC;NPEPL1;INS;SPAT...
Beta-state2 genes,0.218411,0.752953,0.99,1.0,665,644,CKAP5;ARID1B;MAP3K2;PPP2R5C;TLE3;FAM193A;CEP35...,CKAP5;ARID1B;MAP3K2;PPP2R5C;TLE3;FAM193A;CEP35...
Beta-state3 genes,0.207192,0.682976,0.964332,1.0,158,148,RFX3;BTG3;ASB9;PLCH2;KRT8;SGMS2;MOK;SPAG1;DLK1...,RFX3;BTG3;ASB9;PLCH2;KRT8;SGMS2;MOK;SPAG1;DLK1...
Beta-state4 genes,0.228851,0.792808,0.99325,1.0,973,949,UXS1;INPP4B;TVP23B;CASR;ATF7IP;MPZL1;MCTP2;SUG...,UXS1;INPP4B;TVP23B;CASR;ATF7IP;MPZL1;MCTP2;SUG...


### Save results

In [15]:
from gseapy.plot import gseaplot, heatmap

# Persist the numeric summary columns of both runs (the gene-list columns are
# left out of the CSVs).
# NOTE(review): the alpha outputs are labelled "seed2000" although the alpha
# prerank call in this notebook uses seed=1000 -- confirm which seed produced
# the files on disk.
summary_columns = ['es','nes','pval','fdr','geneset_size','matched_size']
gs_res.res2d[summary_columns].to_csv('../dat/figdata/GSEA_beta_arrogo_f_seed2000.csv')
gs_res_a.res2d[summary_columns].to_csv('../dat/figdata/GSEA_alpha_arrogo_f_seed2000.csv')

# One enrichment plot per gene set, beta run first and then alpha; each PDF is
# named after the term and written relative to the working directory.
plot_jobs = (
    (gs_res, '_b_seed2000_filtered.pdf'),
    (gs_res_a, '_a_seed2000_filtered.pdf'),
)
for run, pdf_suffix in plot_jobs:
    terms = run.res2d.index
    for term in terms:
        gseaplot(run.ranking, term=term, **run.results[term], ofname=term + pdf_suffix)