Aim: Pathway enrichment analysis

# Drug Nomenclature

    T0: First timepoint 
    (Time Zero, after selection of cells with sgRNA library, and starting point of drug dosing)
    DMSO: DMSO treated 
    Pi: PARP inhibitor (olaparib; AZD2281)
    Ri: ATR inhibitor (AZD6738)
    Wi: WEE1 inhibitor (AZD1775)
    Mi: ATM inhibitor (AZD0156)
    Ki: DNAPK inhibitor (AZD7648)


    PiRi: PARP inhibitor + ATR inhibitor
    PiWi: PARP inhibitor + WEE1 inhibitor
    PiMi: PARP inhibitor + ATM inhibitor
    PiKi: PARP inhibitor + DNAPK inhibitor

In [32]:
import sys
import pandas as pd
from scipy import stats
from functools import reduce
import matplotlib.pyplot as plt
import seaborn as sns
from venn import venn 
from upsetplot import from_contents,UpSet
from matplotlib.backends.backend_pdf import PdfPages
import stringdb

from screenpro.processing.plots import loadData

from utils import *

## Load data

In [33]:
help(loadData)

Help on function loadData in module screenpro.processing.plots:

loadData(experimentName, collapsedToTranscripts=True, premergedCounts=False)
    Parameters
    ----------
    experimentName :
        
    collapsedToTranscripts :
         (Default value = True)
    premergedCounts :
         (Default value = False)
    
    Returns
    -------



In [34]:
Ci = loadData('CRISPRi/Analysis/PiKiMiRiWi/PiKiMiRiWi_2reps')

Ca_k562 = loadData('CRISPRa/Analysis/k562/k562a_PiRi')
Ca_A549 = loadData('CRISPRa/Analysis/single_replicate_A549/PiRiWi_1stRep/CRISPRa_PiWiRi')

## CRISPRi Analysis

        gamma:T0:DMSO
        rho1:DMSO:Pi
        rho2:DMSO:Ki
        rho3:DMSO:Mi
        rho4:DMSO:Ri
        rho5:DMSO:Wi
        rho6:DMSO:PiKi
        rho7:DMSO:PiMi
        rho8:DMSO:PiRi
        rho9:DMSO:PiWi

In [35]:
Ci_data = {
    'Pi':{'rho':  get_score(Ci,'rho1','Pi')},
    'Ki':{'rho':  get_score(Ci,'rho2','Ki')},
    'Mi':{'rho':  get_score(Ci,'rho3','Mi')},
    'Ri':{'rho':  get_score(Ci,'rho4','Ri')},
    'Wi':{'rho':  get_score(Ci,'rho5','Wi')},
    'PiKi':{'rho':  get_score(Ci,'rho6','PiKi')},
    'PiMi':{'rho':  get_score(Ci,'rho7','PiMi')},
    'PiRi':{'rho':  get_score(Ci,'rho8','PiRi')},
    'PiWi':{'rho':  get_score(Ci,'rho9','PiWi')}
}

Pi ->
	mean(neg control gRNAs rho score): -0.0001401984993996408
	std(neg control gRNAs rho score): 0.12540825080834397
Ki ->
	mean(neg control gRNAs rho score): -0.0005673308068281182
	std(neg control gRNAs rho score): 0.16614558840684682
Mi ->
	mean(neg control gRNAs rho score): 0.0017851335633094487
	std(neg control gRNAs rho score): 0.14801638476027543
Ri ->
	mean(neg control gRNAs rho score): 0.0010093567889871565
	std(neg control gRNAs rho score): 0.1327425021656452
Wi ->
	mean(neg control gRNAs rho score): 0.0022317642323502387
	std(neg control gRNAs rho score): 0.1487822724803441
PiKi ->
	mean(neg control gRNAs rho score): -0.0033752033251339595
	std(neg control gRNAs rho score): 0.17291015247254843
PiMi ->
	mean(neg control gRNAs rho score): -0.004468641076695257
	std(neg control gRNAs rho score): 0.16403386577453327
PiRi ->
	mean(neg control gRNAs rho score): -0.00398767757688899
	std(neg control gRNAs rho score): 0.15472963895091643
PiWi ->
	mean(neg control gRNAs rho score)

### Data normalization 

In [36]:
Ci_df = pd.concat([Ci_data[drug]['rho'] for drug in Ci_data.keys()],axis=1).add_prefix('Ci.')
Ci_df_rho = Ci_df.filter(regex='rho$', axis=1)
Ci_df_rho_norm = Ci_df.filter(regex='rho.norm$', axis=1)
Ci_df_rho_norm.columns = Ci_df_rho.columns

### Run iPAGE

In [37]:
!mkdir -p CRISPRi_ipage

In [38]:
data = Ci_df_rho_norm
print ('CIRSPRi')
print ('_______')

for screen in {c.split(".")[1] for c in data.columns}:
    print (screen)
    data.loc[:,f'Ci.{screen}.rho'].to_csv(f'CRISPRi_ipage/{screen}.txt',sep='\t',header=False) 

CIRSPRi
_______
Ki
Ri
Mi
Pi
PiRi
PiMi
Wi
PiKi
PiWi


In [None]:
%%bash
export PAGEDIR='/data_gilbert/home/aarab/iPAGE'

cd CRISPRi_ipage
ls *.txt | parallel -j30 -k bash ../ipage_loop.sh
cd ../

In [39]:
ls *_ipage/

CRISPRa_ipage/:
[0m[38;5;27mPi[0m/  [38;5;27mPiRi[0m/  PiRi.txt  Pi.txt  [38;5;27mPiWi[0m/  PiWi.txt  [38;5;27mRi[0m/  Ri.txt  [38;5;27mWi[0m/  Wi.txt

CRISPRi_ipage/:
[38;5;27mKi[0m/     [38;5;27mPi[0m/         PiKi.txt  [38;5;27mPiRi[0m/     [38;5;27mPiWi[0m/     Ri.pdf  Wi.txt
Ki.txt  Pi_all.pdf  [38;5;27mPiMi[0m/     PiRi.pdf  PiWi.pdf  Ri.txt
[38;5;27mMi[0m/     [38;5;27mPiKi[0m/       PiMi.txt  PiRi.txt  PiWi.txt  [38;5;27mWi[0m/
Mi.txt  PiKi.pdf    Pi.pdf    Pi.txt    [38;5;27mRi[0m/       Wi.pdf


## CRISPRa Analysis

- __k562 cell line__

        ???
- __A549 cell line__

        1 replicate for all conditions
        PiRiWi_1stRep

        Phenotypes Reported as Log2(Fold Change)

        T0 = T0 
        DMSO = DMSO 
        Pi = Parpi 
        Ri = ATRi
        Wi = Wee1i 
        PiWi = Parpi/Wee1i combo 
        PiRi = Parpi/ATRi combo 

        Gamma:T0|DMSO
        Tau1:T0|Pi
        Tau2:T0|Wi
        Tau3:T0|Ri
        Tau4:T0|PiRi
        Tau5:T0|PiWi
        Rho1:DMSO|Pi
        Rho2:DMSO|Ri
        Rho3:DMSO|Wi
        Rho4:DMSO|PiRi
        Rho5:DMSO|PiWi


In [40]:
{score for score,_,_ in Ca_k562['gene scores'].columns.tolist()}

{'gamma', 'rho1', 'rho2'}

In [41]:
{score for score,_,_ in Ca_A549['gene scores'].columns.tolist()}

{'gamma',
 'rho1',
 'rho2',
 'rho3',
 'rho4',
 'rho5',
 'tau1',
 'tau2',
 'tau3',
 'tau4',
 'tau5'}

In [42]:
Ca_data = {
    'Pi':{'rho':  get_score(Ca_A549,'rho1','Pi',rep='Rep1')},
    'Ri':{'rho':  get_score(Ca_A549,'rho2','Ri',rep='Rep1')},
    'Wi':{'rho':  get_score(Ca_A549,'rho3','Wi',rep='Rep1')},
    'PiRi':{'rho':  get_score(Ca_A549,'rho4','PiRi',rep='Rep1')},
    'PiWi':{'rho':  get_score(Ca_A549,'rho5','PiWi',rep='Rep1')}
}

Pi ->
	mean(neg control gRNAs rho score): 0.015326290841790316
	std(neg control gRNAs rho score): 0.1291778609597079
Ri ->
	mean(neg control gRNAs rho score): 0.001196942012703095
	std(neg control gRNAs rho score): 0.12916627606309067
Wi ->
	mean(neg control gRNAs rho score): 0.0013140504852827142
	std(neg control gRNAs rho score): 0.13720420505235054
PiRi ->
	mean(neg control gRNAs rho score): 0.0007338311091316385
	std(neg control gRNAs rho score): 0.1369632742612412
PiWi ->
	mean(neg control gRNAs rho score): -0.0047454161159122855
	std(neg control gRNAs rho score): 0.13496411753300616


### Data normalization 

In [43]:
Ca_df = pd.concat([Ca_data[drug]['rho'] for drug in Ca_data.keys()],axis=1).add_prefix('Ca.')
Ca_df_rho = Ca_df.filter(regex='rho$', axis=1)
Ca_df_rho_norm = Ca_df.filter(regex='rho.norm$', axis=1)
Ca_df_rho_norm.columns = Ca_df_rho.columns

### Run iPAGE

In [44]:
!mkdir CRISPRa_ipage

mkdir: cannot create directory ‘CRISPRa_ipage’: File exists


In [45]:
data = Ca_df_rho_norm
print ('CIRSPRa')
print ('_______')

for screen in {c.split(".")[1] for c in data.columns}:
    print (screen)
    data.loc[:,f'Ca.{screen}.rho'].to_csv(f'CRISPRa_ipage/{screen}.txt',sep='\t',header=False) 

CIRSPRa
_______
Ri
Pi
Wi
PiRi
PiWi


In [None]:
%%bash
export PAGEDIR='/data_gilbert/home/aarab/iPAGE'

cd CRISPRa_ipage
ls *.txt | parallel -j10 -k bash ../ipage_loop.sh
cd ../

### ipager

In [46]:
import re
import sys
from glob import glob

sys.path.append('/data_gilbert/home/aarab/Projects/ipage-msigdb/')

import upsetplot as us
import ipage_down as ipd

from collections import ChainMap


In [47]:
from IPython import get_ipython
from IPython.core.magic import register_cell_magic

ipython = get_ipython()


@register_cell_magic
def pybash(line, cell):
    ipython.run_cell_magic('bash', '', cell.format(**globals()))

In [48]:
def read_ipage_intersections_file(gs_cluster_path,clust,gs=None,gene=None):
    with open(f'{gs_cluster_path}/output.ipage_intersections') as raw:
        lines = [line for line in raw.read().splitlines()]
        if gs:
            lines = [line.split('\t') for line in lines if re.search(gs, line)]
        elif gene:
            lines = [line.split('\t') for line in lines if re.search(gene, line)]
        else:
            lines = [line.split('\t') for line in lines]
    #     left  = dict([(line[0].split(' ')[0],line[2:]) for line in lines if line[1] == '0' ])
    #     right = dict([(line[0].split(' ')[0],line[2:]) for line in lines if line[1] == '10'])
    # return left,right
    return dict([(line[0].split(' ')[0],line[2:]) for line in lines if line[1] == clust ])


def merge_multiple_pvmat(pvmat_list):
    df = ipd.clean_bins_range(
        ipd.read_pvmatrix(pvmat_list[0])
    )

    cols = df.columns

    df = pd.concat(
        [df] + [
            ipd.read_pvmatrix(pvmat).set_axis(cols, axis=1, inplace=False) 
            for pvmat in pvmat_list[1:]
        ]
    )

    df = df.groupby(df.index).first()
    # ipd.style_clean_pvmatrix(df.iloc[:,[0,10]])    
    return df


def pvmat2bio_signal(pvmat,side,n_clust=2):
    if side == 'up':
        out = pvmat.loc[
            (pd.DataFrame(pvmat.iloc  [:,-n_clust:]  > 2).all(axis=1)) & 
            (pd.DataFrame(pvmat.iloc  [:,:-n_clust]  < 2).all(axis=1))
            ,:
        ].sort_values(by=pvmat.columns[-n_clust:].to_list()[::-1],ascending=False,axis=0)
    elif side == 'both':
        out = pvmat.loc[
            (pd.DataFrame(pvmat.iloc  [:,:n_clust]  > 2).all(axis=1)) & 
            (pd.DataFrame(pvmat.iloc  [:,-n_clust:]  > 2).all(axis=1)) & 
            (pd.DataFrame(pvmat.iloc  [:,n_clust+1:-n_clust]  < 2).all(axis=1))
            ,:
        ].sort_values(
            by=pvmat.columns[-n_clust:].to_list()[::-1] + pvmat.columns[:n_clust].to_list(),
            ascending=False,axis=0
        )
    elif side == 'down':
        out = pvmat.loc[
            (pd.DataFrame(pvmat.iloc  [:,:n_clust]  > 2).all(axis=1)) & 
            (pd.DataFrame(pvmat.iloc  [:,n_clust+1:]  < 2).all(axis=1))
            ,:
        ].sort_values(by=pvmat.columns[:n_clust].to_list(),ascending=False,axis=0)
    return out


def detect_gs_cluster(pvmat_list, gs):
    return [
        pvmat for pvmat in pvmat_list
        if gs in ipd.read_pvmatrix(pvmat).index.to_list()
        
    ]


def get_signals(pvmatrix):
    dfs = []
    for side in ['up','both','down']:
        num = 1
        while 1:
            df = pvmat2bio_signal(pvmatrix,side,n_clust=num)
            if df.shape[0] > 0:
                dfs.append(df)
                num += 1
            else:
                break
    if dfs: 
        return pd.concat(dfs)
    else: 
        return []

### Ci

In [49]:
!wc -l CRISPRi_ipage/Pi/msigdb_v7.4_c5*/pvmatrix.txt

   334 CRISPRi_ipage/Pi/msigdb_v7.4_c5.all/pvmatrix.txt
   143 CRISPRi_ipage/Pi/msigdb_v7.4_c5.go.bp/pvmatrix.txt
    57 CRISPRi_ipage/Pi/msigdb_v7.4_c5.go.cc/pvmatrix.txt
    36 CRISPRi_ipage/Pi/msigdb_v7.4_c5.go.mf/pvmatrix.txt
   238 CRISPRi_ipage/Pi/msigdb_v7.4_c5.go/pvmatrix.txt
    82 CRISPRi_ipage/Pi/msigdb_v7.4_c5.hpo/pvmatrix.txt
   890 total


In [50]:
screen = 'Pi'
pvmatrix = f'CRISPRi_ipage/{screen}/msigdb_v7.4_c5.go.cc/pvmatrix.txt'
expfile  = f'CRISPRi_ipage/{screen}.txt'
expfile_all = expfile.replace('.txt','_all.txt')

pybash('',"cp '{expfile}' '{expfile_all}'")
pybash('',"bash ipage_draw_matrix.sh '{expfile_all}' '{pvmatrix}'")
pybash('',"rm '{expfile_all}'")

Reading matrix ... Done.
Cluster rows .. best merge is 33, 53, 1.37900058039264e-09
best merge is 4, 5, 0.000718709493904068
best merge is 0, 2, 0.00106500533446163
best merge is 4, 8, 0.00229320211062617
best merge is 0, 4, 0.00428128022942542
best merge is 22, 34, 0.00621008663277967
best merge is 15, 16, 0.00633353176239115
best merge is 0, 6, 0.00650237494591921
best merge is 7, 9, 0.00666275547251005
best merge is 18, 38, 0.00692224898745986
best merge is 0, 7, 0.00835945504497545
best merge is 1, 11, 0.0118266793755686
best merge is 33, 49, 0.0121791041028114
best merge is 13, 28, 0.0136974566040392
best merge is 14, 15, 0.0147441895268262
best merge is 0, 18, 0.0153313022002986
best merge is 10, 14, 0.0175789265027223
best merge is 0, 22, 0.0207700861524081
best merge is 25, 26, 0.0213383131890437
best merge is 51, 52, 0.0220722424628482
best merge is 0, 1, 0.0251085009638798
best merge is 21, 23, 0.0268833947702243
best merge is 0, 54, 0.0302938329032822
best merge is 33, 42, 0

In [51]:
data = Ci_df_rho_norm
print ('CIRSPRi')
print ('_______')

pvmats = {}

for screen in {c.split(".")[1] for c in data.columns}:
    print (screen)
    pvmats_list = [file for pat in ['c5.go','c2.cp','h.all'] for file in glob(f'CRISPRi_ipage/{screen}/*{pat}*/pvmatrix.txt')]
    pvmats[screen] = {'list':pvmats_list,'pvmatrix':merge_multiple_pvmat(pvmats_list)}

CIRSPRi
_______
Ki
Ri
Mi
Pi
PiRi
PiMi
Wi
PiKi
PiWi


In [52]:
results = {}
for screen in pvmats: 
    pw_genes = []
    for file in pvmats[screen]['list']:
        for clust in ['0','1','2']:
            res = read_ipage_intersections_file(file.replace('/pvmatrix.txt',''),clust,gs='PRDX1')
            if res: 
                pw_genes.append(res)
    pws = {pw for l in pw_genes for pw in l}
    
    pvmatrix = pvmats[screen]['pvmatrix'].loc[pws,:]
    if len(pvmatrix):
        signal = get_signals(pvmatrix)
        if len(signal):
            results[screen] = {
                'pvmatrix':signal,
                'pw_genes':dict(ChainMap(*pw_genes))
            }

In [54]:
for screen in results: 
    pvmatrix = f'CRISPRi_ipage/{screen}_PRDX1_left_pvmatrix.txt'
    expfile  = f'CRISPRi_ipage/{screen}.txt'
    results[screen]['pvmatrix'].to_csv(pvmatrix,sep='\t')
    pybash('',"bash ipage_draw_matrix.sh '{expfile}' '{pvmatrix}'")
    pybash('',"rm -v '{pvmatrix}'")

Reading matrix ... Done.
Start drawing
1.93	-1.84
REACTOME_TP53_REGULATES_METABOLIC_GENES, 
PID_AR_PATHWAY, 
Outputing EPS file CRISPRi_ipage/Ri.txt_PAGE/Ri.txt.summary.eps
Convert to PDF CRISPRi_ipage/Ri.txt_PAGE/Ri.txt.summary.pdf
Finished.
‘CRISPRi_ipage/Ri.txt_PAGE/Ri.txt.summary.pdf’ -> ‘CRISPRi_ipage/Ri.pdf’
removed ‘CRISPRi_ipage/Ri.txt_PAGE/Ri.txt.summary.eps’
removed directory: ‘CRISPRi_ipage/Ri.txt_PAGE’
removed ‘CRISPRi_ipage/Ri_PRDX1_left_pvmatrix.txt’
Reading matrix ... Done.
Start drawing
2.32	-1.66
REACTOME_TP53_REGULATES_METABOLIC_GENES, 
Outputing EPS file CRISPRi_ipage/Pi.txt_PAGE/Pi.txt.summary.eps
Convert to PDF CRISPRi_ipage/Pi.txt_PAGE/Pi.txt.summary.pdf
Finished.
‘CRISPRi_ipage/Pi.txt_PAGE/Pi.txt.summary.pdf’ -> ‘CRISPRi_ipage/Pi.pdf’
removed ‘CRISPRi_ipage/Pi.txt_PAGE/Pi.txt.summary.eps’
removed directory: ‘CRISPRi_ipage/Pi.txt_PAGE’
removed ‘CRISPRi_ipage/Pi_PRDX1_left_pvmatrix.txt’
Reading matrix ... Done.
Start drawing
2.85	-1.74
REACTOME_TP53_REGULATES_METAB

In [None]:
results[screen

In [55]:
print ('REACTOME_TP53_REGULATES_METABOLIC_GENES')
print ()
for screen in results:
    if 'REACTOME_TP53_REGULATES_METABOLIC_GENES' in results[screen]['pw_genes'].keys(): 
        print ('\t',screen)
        print ('\t','rho score range:',','.join(results[screen]['pvmatrix'].columns[-2:]))
        print ('\t',results[screen]['pw_genes']['REACTOME_TP53_REGULATES_METABOLIC_GENES'])
        print ()


REACTOME_TP53_REGULATES_METABOLIC_GENES

	 Ri
	 rho score range: [1.06 1.59],[1.59 1.93]
	 ['COX6B1', 'DDIT4', 'G6PD', 'GPX2', 'PRDX1', 'PRKAB2', 'PRKAG1', 'PTEN', 'SESN2', 'TSC1', 'TXN', 'YWHAB', 'YWHAE', 'YWHAZ']

	 Pi
	 rho score range: [1.31 1.96],[1.96 2.32]
	 ['DDIT4', 'G6PD', 'PRDX1', 'PRKAA1', 'PTEN', 'TSC1', 'YWHAB', 'YWHAE', 'YWHAZ']

	 PiRi
	 rho score range: [1.51 2.43],[2.43 2.85]
	 ['DDIT4', 'PRDX1', 'PTEN', 'TSC1', 'TXN', 'YWHAE', 'YWHAZ']

	 Wi
	 rho score range: [0.82 1.36],[1.36 1.77]
	 ['LAMTOR2', 'LAMTOR3', 'LAMTOR4', 'LAMTOR5', 'MLST8', 'MTOR', 'PRDX1', 'RHEB', 'RPTOR', 'TXN']

	 PiWi
	 rho score range: [0.97 1.57],[1.57 1.97]
	 ['AGO3', 'LAMTOR2', 'LAMTOR3', 'LAMTOR4', 'LAMTOR5', 'MLST8', 'MTOR', 'PRDX1', 'RHEB', 'RPTOR', 'SLC38A9', 'TXN', 'YWHAB']



__Conclusion:__ 

- __REACTOME_TP53_REGULATES_METABOLIC_GENES__ pathway enriched in the first two clusters for given _rho_ scores of all PiRi, PiWi, Pi, Wi, Ri CRISPRi screens.  

### Ca

In [358]:
data = Ca_df_rho_norm
print ('CIRSPRa')
print ('_______')

pvmats = {}

for screen in {c.split(".")[1] for c in data.columns}:
    print (screen)
    pvmats_list = [file for pat in ['c5.go','c2.cp','h.all'] for file in glob(f'CRISPRa_ipage/{screen}/*{pat}*/pvmatrix.txt')]
    pvmats[screen] = {'list':pvmats_list,'pvmatrix':merge_multiple_pvmat(pvmats_list)}

CIRSPRa
_______
PiRi
PiWi
Pi
Wi
Ri


In [366]:
results = {}
for screen in pvmats: 
    pws = []
    for file in pvmats[screen]['list']:
        for clust in ['10']:
            res = read_ipage_intersections_file(file.replace('/pvmatrix.txt',''),clust,gs='PRDX1')
            if res: 
                pws.append(res)
    pvmatrix = pvmats[screen]['pvmatrix'].loc[{pw for l in pws for pw in l},:]
    if len(pvmatrix):
        signal = get_signals(pvmatrix)
        if len(signal):
            results[screen] = {
                'pvmatrix':signal,
                'genes':pws
            }

In [367]:
results

{}

In [360]:
for screen in results: 
    pvmatrix = f'CRISPRa_ipage/{screen}_PRDX1_left_pvmatrix.txt'
    expfile  = f'CRISPRa_ipage/{screen}.txt'
    results[screen]['pvmatrix'].to_csv(pvmatrix,sep='\t')
    pybash('',"bash ipage_draw_matrix.sh '{expfile}' '{pvmatrix}'")
    pybash('',"rm -v '{pvmatrix}'")

In [218]:
!date

Fri Feb 25 17:43:04 PST 2022
