In [1]:
import pandas as pd
import numpy as np

from scipy.stats import pearsonr as pcor

## Functional genomic analysis
Here we will analyse the cell-death-associated signature from a functional genomics perspective.

First, we will calculate gene expression - cell viability correlation coefficients from the Achilles-L1000-96h (and also the CTRP-L1000-24h) datasets.

In [2]:
# Achilles data: read the merged signature metadata and the signature matrix,
# then keep only the signatures whose 'pert_itime' is '96 h'
sig_info=pd.read_csv('../results/Achilles/sig_info_merged_lm.csv',
                     sep=',',header=0,index_col=[0])
signatures=pd.read_csv('../results/Achilles/signatures_merged_lm.csv',
                       sep=',',header=0,index_col=[0])
fil=(sig_info['pert_itime']=='96 h')
sig_info=sig_info.loc[fil]
# align the signature rows with the filtered metadata
signatures=signatures.loc[sig_info.index]

In [3]:
# Empty result table: one row per gene flagged pr_is_lm == 1 in the LINCS
# gene annotation, with columns for the gene symbol, Pearson r and p-value
gene_info=pd.read_csv('../data/LINCS/GSE92742/GSE92742_Broad_LINCS_gene_info.txt',
                      sep='\t',index_col=[0],header=0)
fil=(gene_info['pr_is_lm']==1)
gene_info=gene_info.loc[fil]
# gene ids are used below as column labels of `signatures`, so store them as strings
gene_info.index=gene_info.index.astype(str)
correlations=pd.DataFrame(columns=['pr_gene_symbol','Pearson_r','p_val'],
                          index=gene_info.index)
correlations['pr_gene_symbol']=gene_info['pr_gene_symbol']

In [4]:
# Pearson correlation of each gene's expression column with shRNA abundance,
# stored row by row and written to disk
abundance=sig_info['shRNA_abundance']
for gene_id in correlations.index:
    r,p=pcor(signatures[gene_id],abundance)
    correlations.loc[gene_id,'Pearson_r']=r
    correlations.loc[gene_id,'p_val']=p
correlations.to_csv('../results/functional/achilles_cors.csv')

In [5]:
# quick look at the genes with the highest gene - viability correlations
correlations.sort_values(by='Pearson_r',ascending=False).head(5)

Unnamed: 0_level_0,pr_gene_symbol,Pearson_r,p_val
pr_gene_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
890,CCNA2,0.342784,0
332,BIRC5,0.328564,0
7153,TOP2A,0.306413,0
11065,UBE2C,0.306173,0
983,CDK1,0.300557,0


In [6]:
# Same gene-wise correlation analysis on the CTRP tables: keep the '24 h'
# signatures and correlate every gene with the 'cpd_avg_pv' column
sig_info=pd.read_csv('../results/CTRP/sig_info_merged_lm.csv',
                     sep=',',header=0,index_col=[0])
signatures=pd.read_csv('../results/CTRP/signatures_merged_lm.csv',
                       sep=',',header=0,index_col=[0])
fil=(sig_info['pert_itime']=='24 h')
sig_info=sig_info.loc[fil]
signatures=signatures.loc[sig_info.index]
# fresh result table on the gene index prepared in the earlier cell
correlations=pd.DataFrame(columns=['pr_gene_symbol','Pearson_r','p_val'],
                          index=gene_info.index)
correlations['pr_gene_symbol']=gene_info['pr_gene_symbol']
readout=sig_info['cpd_avg_pv']
for gene_id in correlations.index:
    r,p=pcor(signatures[gene_id],readout)
    correlations.loc[gene_id,'Pearson_r']=r
    correlations.loc[gene_id,'p_val']=p
correlations.to_csv('../results/functional/ctrp_cors.csv')

In [5]:
# Permutation background for Achilles (used later): 1000 times, shuffle which
# signature row is paired with which shRNA abundance value and recompute the
# per-gene Pearson r
sig_info=pd.read_csv('../results/Achilles/sig_info_merged_lm.csv',
                     sep=',',header=0,index_col=[0])
signatures=pd.read_csv('../results/Achilles/signatures_merged_lm.csv',
                       sep=',',header=0,index_col=[0])
fil=(sig_info['pert_itime']=='96 h')
sig_info=sig_info.loc[fil]
signatures=signatures.loc[sig_info.index]
gene_info=pd.read_csv('../data/LINCS/GSE92742/GSE92742_Broad_LINCS_gene_info.txt',
                      sep='\t',index_col=[0],header=0)
fil=(gene_info['pr_is_lm']==1)
gene_info=gene_info.loc[fil]
gene_info.index=gene_info.index.astype(str)
# one result column per permutation, labelled '0' ... '999'
perm_labels=list(np.arange(1000).astype(str))
correlations=pd.DataFrame(columns=['pr_gene_symbol']+perm_labels,
                          index=gene_info.index)
correlations['pr_gene_symbol']=gene_info['pr_gene_symbol']
np.random.seed(19890904)
for perm in correlations.columns[1:]:
    # progress message every 100 permutations
    if not int(perm) % 100:
        print(perm)
    # shuffle the metadata labels, then reorder the signature rows to follow
    # them; the abundance values stay in place, so the pairing is randomised
    sig_info.index=np.random.choice(sig_info.index,len(sig_info.index),replace=False)
    signatures=signatures.loc[sig_info.index]
    abundance=sig_info['shRNA_abundance']
    for gene_id in correlations.index:
        r,_=pcor(signatures[gene_id],abundance)
        correlations.loc[gene_id,perm]=r
correlations.to_csv('../results/functional/achilles_bg_cors.csv')

0
100
200
300
400
500
600
700
800
900


In [21]:
# Permutation background for CTRP: 1000 times, shuffle which signature row is
# paired with which 'cpd_avg_pv' value and recompute the per-gene Pearson r
sig_info=pd.read_csv('../results/CTRP/sig_info_merged_lm.csv',
                     sep=',',header=0,index_col=[0])
signatures=pd.read_csv('../results/CTRP/signatures_merged_lm.csv',
                       sep=',',header=0,index_col=[0])
fil=(sig_info['pert_itime']=='24 h')
sig_info=sig_info.loc[fil]
signatures=signatures.loc[sig_info.index]
gene_info=pd.read_csv('../data/LINCS/GSE92742/GSE92742_Broad_LINCS_gene_info.txt',
                      sep='\t',index_col=[0],header=0)
fil=(gene_info['pr_is_lm']==1)
gene_info=gene_info.loc[fil]
gene_info.index=gene_info.index.astype(str)
# one result column per permutation, labelled '0' ... '999'
perm_labels=list(np.arange(1000).astype(str))
correlations=pd.DataFrame(columns=['pr_gene_symbol']+perm_labels,
                          index=gene_info.index)
correlations['pr_gene_symbol']=gene_info['pr_gene_symbol']
np.random.seed(19890904)
for perm in correlations.columns[1:]:
    # progress message every 100 permutations
    if not int(perm) % 100:
        print(perm)
    # shuffle the metadata labels, then reorder the signature rows to follow
    # them; the read-out values stay in place, so the pairing is randomised
    sig_info.index=np.random.choice(sig_info.index,len(sig_info.index),replace=False)
    signatures=signatures.loc[sig_info.index]
    readout=sig_info['cpd_avg_pv']
    for gene_id in correlations.index:
        r,_=pcor(signatures[gene_id],readout)
        correlations.loc[gene_id,perm]=r
correlations.to_csv('../results/functional/ctrp_bg_cors.csv')

0
100
200
300
400
500
600
700
800
900


Now, using these cell viability - gene expression correlations, we will perform several "pathway analysis"-like methods to get more detailed functional insight.

In [25]:
# Gene Ontology Enrichments


In [23]:
# PROGENy: read the PROGENy model table and the Achilles correlations
progeny=pd.read_csv('../../../Tools/PROGENy/models/model_NatComm+14_human.csv',
                    sep=',',header=0,index_col=[0])
correlations=pd.read_csv('../results/functional/achilles_cors.csv',
                         sep=',',header=0,index_col=[0])
# re-key the correlations by gene symbol and keep only the Pearson r column
correlations=correlations.set_index('pr_gene_symbol')[['Pearson_r']]

In [24]:
# Genes present in both the PROGENy model and the correlation table.
# Stored as a sorted list rather than a set: `genes` is used as a .loc
# indexer in the following cells, and pandas >= 2.0 raises a TypeError when
# a set is passed as an indexer. Sorting also fixes the row order across runs.
genes=sorted(set(progeny.index) & set(correlations.index))

In [25]:
# observed pathway scores: (1 x genes) correlation row times the
# (genes x pathways) PROGENy table, restricted to the shared genes
corr_row=correlations.loc[genes].T
scores=pd.DataFrame(np.dot(corr_row,progeny.loc[genes]),columns=progeny.columns)

In [26]:
# score table with 101 rows, one column per PROGENy pathway;
# row 0 holds the observed scores
bg=pd.DataFrame(columns=progeny.columns,index=range(101))
bg.loc[0]=scores.values

In [28]:
# Fill rows 1..100 of `bg` with pathway scores computed from gene-label
# permutations of the correlation table.
# The labels are shuffled on a copy: overwriting `correlations.index` in
# place would leave `correlations` scrambled after this cell, so re-running
# the observed-score cell would silently produce wrong values.
for i in range(1,101):
    shuffled=correlations.copy()
    shuffled.index=np.random.choice(correlations.index,len(correlations.index),False)
    bg.loc[i]=np.dot(shuffled.loc[genes].T,progeny.loc[genes])

In [32]:
# z-score every pathway column over all rows of bg
# (population standard deviation, ddof=0, as np.std uses)
bg_mean=bg.mean(axis=0)
bg_std=bg.std(axis=0,ddof=0)
(bg-bg_mean)/bg_std

Unnamed: 0,EGFR,Hypoxia,JAK-STAT,MAPK,NFkB,PI3K,TGFb,TNFa,Trail,VEGF,p53,Androgen,Estrogen,WNT
0,0.0715276,0.726864,-1.25248,0.398889,-3.36943,-0.262405,0.465005,-2.94872,0.431859,-0.474312,-0.789967,0.194101,1.09104,-1.5572
1,-0.114085,-0.358337,0.128904,-1.00022,-0.693578,-0.792708,-0.459393,-1.53202,-0.460252,-1.20189,1.33882,0.328413,1.99532,-0.527992
2,-1.58498,1.45415,0.826117,-2.05057,-0.234923,-1.00224,-1.02556,-0.52703,-1.01006,0.789437,0.144637,0.386393,0.243618,0.628335
3,0.037781,-0.762484,0.322457,-0.253927,-2.36676,-0.61614,0.833067,-3.2586,-0.683243,-1.26225,0.0213599,-0.0309414,0.0958506,0.215025
4,-0.455839,-2.84878,1.61259,0.0546807,2.17105,-0.445492,-0.131617,1.10367,2.50661,2.57896,0.54812,-0.185701,0.0730438,-0.780912
5,0.777214,-0.703167,-1.12115,0.986049,0.056015,-2.24887,-0.148393,0.245069,-1.71266,0.0244466,-1.73841,0.756522,-2.17685,1.04961
6,0.378126,1.66551,1.3092,0.377864,0.36557,-2.11677,-1.19885,-0.855954,-0.32644,0.073768,-0.14126,1.92444,-0.882701,-0.954368
7,1.73632,-0.00600754,-0.287114,2.30022,-0.442614,-0.0450444,0.627041,-0.859186,-0.612185,0.291489,0.607422,-0.667888,-0.318699,1.21682
8,-1.08022,-0.153386,-2.29013,-1.01233,0.780421,0.426852,-2.03722,0.954884,0.777675,-1.08015,-0.172444,-1.13152,-0.710732,-0.336244
9,1.74444,0.127242,-0.955401,1.65624,-1.34981,1.2241,-0.934368,-1.27961,0.555153,0.917655,-0.344311,0.314785,0.632242,2.38204
