In [2]:
from subprocess import call, check_call

import numpy as np
import pandas as pd

from matplotlib import pyplot as plt
import seaborn as sns

from scipy.stats import pearsonr as pcor

%matplotlib inline
sns.set_style('whitegrid')

## Functional genomic analysis
Here we analyse the cell death associated signature from a functional genomics perspective.

First, we calculate gene expression - cell viability correlation coefficients from the Achilles-L1000-96h (and also the CTRP-L1000-24h) datasets.

In [38]:
# Achilles data: load the signature metadata and the signature matrix,
# then keep only the rows whose pert_itime is '96 h' in both tables.
sig_info = pd.read_csv('../results/Achilles/sig_info_merged_lm.csv',
                       sep=',', header=0, index_col=[0])
signatures = pd.read_csv('../results/Achilles/signatures_merged_lm.csv',
                         sep=',', header=0, index_col=[0])
sig_info = sig_info[sig_info['pert_itime'] == '96 h']
# the row labels of the retained metadata select the matching signature rows
signatures = signatures.loc[sig_info.index]

In [39]:
# Build an empty table (one row per selected gene) that will hold the correlations.
gene_info = pd.read_csv('../data/LINCS/GSE92742/GSE92742_Broad_LINCS_gene_info.txt',
                        sep='\t', index_col=[0], header=0)
# gene-set selection: only rows flagged with pr_is_lm == 1 are kept
fil = gene_info['pr_is_lm'] == 1
gene_info = gene_info.loc[fil]
# gene ids are turned into strings -- presumably to match the string column
# labels of the signature table; verify against the signature file header
gene_info.index = gene_info.index.astype(str)
result_columns = ['pr_gene_symbol', 'Pearson_r', 'p_val']
correlations = pd.DataFrame(index=gene_info.index, columns=result_columns)
correlations['pr_gene_symbol'] = gene_info['pr_gene_symbol']

In [40]:
# Pearson correlation (r and p-value) between each gene's signature values
# and the shRNA_abundance column, stored row by row and then written to disk.
viability = sig_info['shRNA_abundance']
for gene_id in correlations.index:
    r, p = pcor(signatures[gene_id], viability)
    correlations.loc[gene_id, ['Pearson_r', 'p_val']] = r, p
correlations.to_csv('../results/functional/achilles_cors_lm.csv')

In [41]:
# quick look at the genes with the largest gene - viability correlation
correlations.sort_values(by='Pearson_r', ascending=False).head(5)

Unnamed: 0_level_0,pr_gene_symbol,Pearson_r,p_val
pr_gene_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
890,CCNA2,0.342784,0
332,BIRC5,0.328564,0
7153,TOP2A,0.306413,0
11065,UBE2C,0.306173,0
983,CDK1,0.300557,0


In [42]:
# Repeat the correlation analysis for CTRP: '24 h' signatures,
# correlated against the cpd_avg_pv column.
sig_info = pd.read_csv('../results/CTRP/sig_info_merged_lm.csv',
                       sep=',', header=0, index_col=[0])
signatures = pd.read_csv('../results/CTRP/signatures_merged_lm.csv',
                         sep=',', header=0, index_col=[0])
sig_info = sig_info[sig_info['pert_itime'] == '24 h']
signatures = signatures.loc[sig_info.index]

# fresh result table over the same genes as before
result_columns = ['pr_gene_symbol', 'Pearson_r', 'p_val']
correlations = pd.DataFrame(index=gene_info.index, columns=result_columns)
correlations['pr_gene_symbol'] = gene_info['pr_gene_symbol']

viability = sig_info['cpd_avg_pv']
for gene_id in correlations.index:
    r, p = pcor(signatures[gene_id], viability)
    correlations.loc[gene_id, ['Pearson_r', 'p_val']] = r, p
correlations.to_csv('../results/functional/ctrp_cors_lm.csv')

## Gene Ontology Enrichment
We perform Gene Ontology (Biological Process) enrichment with the piano R package (*fgsea* method), using the correlation values as gene-level statistics. The enriched GO terms are related to cell death / proliferation and inflammation processes.

In [14]:
# Gene Ontology Enrichments
# check_call returns 0 on success and raises CalledProcessError when the R script
# exits with a non-zero status, so a failed run cannot go unnoticed and leave the
# following cells reading result files from an earlier run.
check_call(['Rscript','GO_BP_enrichments.R'])

0

In [18]:
# inspect the GO results: the ten terms with the smallest 'p adj (dist.dir.dn)'
data = pd.read_csv('../results/functional/enrichments/GOBP_achilles.tsv',
                   sep='\t', header=0, index_col=[0])
data.sort_values(by='p adj (dist.dir.dn)').head(10)

Unnamed: 0,Name,Genes (tot),Stat (dist.dir),p (dist.dir.up),p adj (dist.dir.up),p (dist.dir.dn),p adj (dist.dir.dn),Genes (up),Genes (down)
1732,GO_REGULATION_OF_SECRETION,56,-0.48515,,,0.000192,0.02028,16,40
2054,GO_VASCULATURE_DEVELOPMENT,51,-0.50775,,,0.000194,0.02028,12,39
1880,GO_RESPONSE_TO_OXYGEN_CONTAINING_COMPOUND,162,-0.36686,,,0.000183,0.02028,58,104
184,GO_CELLULAR_RESPONSE_TO_ORGANIC_SUBSTANCE,193,-0.34273,,,0.000181,0.02028,66,127
1819,GO_RESPONSE_TO_BACTERIUM,55,-0.51211,,,0.000192,0.02028,15,40
497,GO_INFLAMMATORY_RESPONSE,41,-0.59845,,,0.00019,0.02028,6,35
1612,GO_REGULATION_OF_MULTICELLULAR_ORGANISMAL_DEVE...,160,-0.35962,,,0.000183,0.02028,57,103
287,GO_DEFENSE_RESPONSE,93,-0.47241,,,0.000188,0.02028,27,66
40,GO_ANATOMICAL_STRUCTURE_FORMATION_INVOLVED_IN_...,75,-0.50289,,,0.000189,0.02028,20,55
1840,GO_RESPONSE_TO_EXTERNAL_STIMULUS,159,-0.37962,,,0.000183,0.02028,60,99


## KEGG pathway enrichment
We also performed KEGG pathway enrichment analysis, similarly to the GO enrichment.

In [19]:
# KEGG pathway Enrichments
# check_call returns 0 on success and raises CalledProcessError when the R script
# exits with a non-zero status, instead of silently returning the error code.
check_call(['Rscript','KEGG_enrichments.R'])

0

In [21]:
# inspect the KEGG results: the ten pathways with the smallest 'p adj (dist.dir.dn)'
data = pd.read_csv('../results/functional/enrichments/KEGG_achilles.tsv',
                   sep='\t', header=0, index_col=[0])
data.sort_values(by='p adj (dist.dir.dn)').head(10)

Unnamed: 0,Name,Genes (tot),Stat (dist.dir),p (dist.dir.up),p adj (dist.dir.up),p (dist.dir.dn),p adj (dist.dir.dn),Genes (up),Genes (down)
23,KEGG_CYTOKINE_CYTOKINE_RECEPTOR_INTERACTION,17,-0.73142,,,0.000193,0.006919,2,15
62,KEGG_NATURAL_KILLER_CELL_MEDIATED_CYTOTOXICITY,18,-0.67053,,,0.000195,0.006919,1,17
97,KEGG_TOLL_LIKE_RECEPTOR_SIGNALING_PATHWAY,17,-0.62931,,,0.001741,0.041199,3,14
49,KEGG_INSULIN_SIGNALING_PATHWAY,28,-0.52477,,,0.002896,0.0514,9,19
8,KEGG_APOPTOSIS,23,-0.54688,,,0.003684,0.052317,7,16
16,KEGG_CALCIUM_SIGNALING_PATHWAY,20,-0.56626,,,0.00465,0.055028,6,14
43,KEGG_HEDGEHOG_SIGNALING_PATHWAY,6,-0.79044,,,0.007429,0.058608,0,6
73,KEGG_PATHWAYS_IN_CANCER,75,-0.36908,,,0.006781,0.058608,27,48
79,KEGG_PRION_DISEASES,9,-0.6982,,,0.006868,0.058608,1,8
15,KEGG_B_CELL_RECEPTOR_SIGNALING_PATHWAY,19,-0.54774,,,0.00934,0.066316,3,16


## Transcription factor regulon enrichment
To infer the activities of Transcription Factors (TFs) - which potentially have a causal role in the observed gene expression changes - we performed TF regulon enrichment analysis. Regulons for TFs are already included in the repository, but can also be downloaded manually from the [DoRothEA repository](https://github.com/saezlab/DoRothEA).

In [44]:
# Transcription Factor regulon enrichment using DoRothEA and viper
# check_call returns 0 on success and raises CalledProcessError when the R script
# exits with a non-zero status, instead of silently returning the error code.
check_call(['Rscript','TF_regulon_enrichments.R'])

0

In [45]:
# Keep only regulons of confidence level A or B (C, D and E are removed)
# and label every remaining row by the TF name alone.
data = pd.read_csv('../results/functional/enrichments/DoRothEA.csv',
                   sep=',', header=0, index_col=[0])
# row labels are split on '_': the first field is used as the TF name and the
# first character of the second field as the confidence level
regulon_fields = [label.split('_') for label in data.index]
tf_names = [fields[0] for fields in regulon_fields]
good_msk = np.array([fields[1][0] in ['A', 'B'] for fields in regulon_fields], dtype=bool)
data.index = tf_names
data = data[good_msk]
data.columns = ['NES']

In [46]:
# NOTE: the table is written to the same path it was loaded from, so the
# original enrichment output on disk is replaced by the filtered version.
dorothea_path = '../results/functional/enrichments/DoRothEA.csv'
data.to_csv(dorothea_path)
print(data.sort_values(by='NES'))

             NES
FOXO3  -3.816854
PRDM14 -3.440174
ESR2   -2.888764
POU2F1 -2.772940
TP53   -2.701049
USF2   -2.554411
STAT1  -2.442873
FOS    -2.313918
ETS1   -2.248835
JUN    -2.153735
ESR1   -1.969761
SPI1   -1.950526
SMAD4  -1.942263
TWIST1 -1.937114
FOXO1  -1.922666
SMAD3  -1.901023
FOXO4  -1.836478
TFAP2A -1.746101
ETV4   -1.743225
FLI1   -1.582792
SOX2   -1.581124
ATF2   -1.571681
AR     -1.498928
KLF4   -1.479860
USF1   -1.413601
VDR    -1.411384
HIF1A  -1.381960
PAX8   -1.331544
CTCF   -1.326600
ATF4   -1.325524
...          ...
CREB1  -0.429856
MYB    -0.287150
FOXL2  -0.286375
SREBF1 -0.276869
RELA   -0.213521
STAT3  -0.149852
WT1    -0.081892
RARA   -0.008986
FOXP1   0.014709
ELK1    0.103107
SP1     0.127504
CEBPD   0.134245
HNF4A   0.218703
ZNF263  0.450042
STAT5A  0.451467
TFAP2C  0.461781
ETS2    0.795876
E2F3    1.058977
TCF7L2  1.080351
FOXA1   1.217310
YY1     1.519722
SREBF2  1.527549
BACH1   1.759120
MYC     1.821395
ATF1    1.863959
FOXM1   2.468741
TFDP1   3.0689

We also compared these results with the average gene essentialities of the TF genes (from project Achilles). Our hypothesis was that, since TFs can be the causal factors behind cell death / proliferation related gene expression changes, TF activity changes and gene essentialities should correlate.

In [55]:
# Mean dependency score per gene across all columns of the Achilles table.
gene_dep = pd.read_csv('../data/Achilles/D2_combined_gene_dep_scores.csv', sep=',',
                       header=0, index_col=[0])
# row labels are cut at the first ' (' so that only the leading part remains
gene_dep.index = [label.split(' (')[0] for label in gene_dep.index]
# row-wise mean, stored as a one-column frame
gene_dep = gene_dep.mean(axis=1).to_frame('Mean gene essentiality')

In [59]:
# TFs present in both the essentiality table and the enrichment results
tfs = list(set(gene_dep.index) & set(data.index))

tf_activity = data.loc[tfs, 'NES']
tf_essentiality = gene_dep.loc[tfs, 'Mean gene essentiality']
print('Correlation between activity and essentiality of transcription factors: r=%f p=%f' %
      pcor(tf_activity, tf_essentiality))

Correlation between activity and essentiality of transcription factors: r=-0.322410 p=0.005087


## Signaling pathway footprint analysis (PROGENy)
We also calculated the pathway activity changes associated with cell death / proliferation. For this we used the PROGENy framework. The model matrix for PROGENy is already included in the repository, but can also be downloaded manually from the [PROGENy repository](https://github.com/saezlab/progeny).

In [30]:
# The overlap between landmark genes and PROGENy genes is low, so the
# bing gene - cell viability correlations are used here instead.
# To produce that file, re-run the first part of this notebook with bing genes:
# replace fil=gene_info['pr_is_lm']==1 by fil=gene_info['pr_is_bing']==1 in the
# second cell and rename the output files to *_bing.*
correlations = pd.read_csv('../results/functional/achilles_cors_bing.csv',
                           sep=',', header=0, index_col=[0])
progeny = pd.read_csv('../data/Functional/PROGENy.csv',
                      sep=',', header=0, index_col=[0])
# index by gene symbol and keep only the correlation coefficient column
correlations = correlations.set_index('pr_gene_symbol')[['Pearson_r']]

In [31]:
# genes available in both the PROGENy matrix and the correlation table
shared_genes = set(progeny.index) & set(correlations.index)
genes = list(shared_genes)
print(len(genes))

898


In [32]:
# Pathway scores: row 0 holds the real scores, rows 1..1000 hold scores obtained
# after randomly permuting the gene labels of the correlation table.
n_permutations = 1000
scores = pd.DataFrame(index=range(n_permutations + 1), columns=progeny.columns)
# the PROGENy weights of the shared genes are the same for every row
gene_weights = progeny.loc[genes]
# first the real pathway activity
scores.loc[0] = np.dot(correlations.loc[genes].T, gene_weights)
# then 1000 random permutations of the gene labels.
# The labels are permuted on a copy: `correlations` itself stays untouched, so
# re-running this cell recomputes row 0 from the real gene labels instead of
# from labels scrambled by a previous run.
permuted = correlations.copy()
np.random.seed(19890904)
for i in range(1, n_permutations + 1):
    permuted.index = np.random.choice(permuted.index, len(permuted.index), False)
    scores.loc[i] = np.dot(permuted.loc[genes].T, gene_weights)

In [33]:
# Express the real scores (row 0) relative to the permutation rows (rows 1 onwards):
# subtract the permutation mean and divide by the permutation standard deviation.
null_scores = scores.iloc[1:]
null_mean = np.mean(null_scores, 0)
null_std = np.std(null_scores, 0)
scores = ((scores - null_mean) / null_std).loc[0]
scores = scores.to_frame('Pathway activity')

In [34]:
# store the normalised pathway activities and show them
progeny_out = '../results/functional/enrichments/PROGENy.csv'
scores.to_csv(progeny_out)
print(scores)

         Pathway activity
EGFR              2.94481
Hypoxia           2.29985
JAK-STAT         -3.26629
MAPK              4.33919
NFkB             -1.26535
PI3K               2.9206
TGFb             -1.38173
TNFa             -1.18426
Trail            0.914288
VEGF              0.33155
p53              -1.99774
Androgen          1.53871
Estrogen          2.36513
WNT              -1.76168


## Association with drug sensitivity
We will calculate the "cell viability signature score" for the cancer cell lines in the GDSC panel, and show that this score is associated with drug sensitivity. We calculate the "cell viability signature score" by predicting cell viability (using the Achilles model) from the baseline gene expression of the GDSC cell lines.

In [2]:
# we use normalised gene expression, but with lowess smoothening for standard deviation
from statsmodels.nonparametric.smoothers_lowess import lowess

In [267]:
# translating gene ids
# check_call returns 0 on success and raises CalledProcessError when the R script
# exits with a non-zero status, instead of silently returning the error code.
check_call(['Rscript','gene_translate.R'])

0

In [268]:
# Build the GDSC expression matrix indexed by HGNC symbol, plus a per-gene
# normalised version whose scale comes from a lowess fit of std against mean.
expression = pd.read_csv('../data/GDSC/sanger1018_brainarray_ensemblgene_rma.txt',
                         sep='\t', header=0, index_col=[0])
gene_anno = pd.read_csv('../data/GDSC/ensembl_hgnc.csv', sep=',', header=0, index_col=[0])
# keep the first row for every Ensembl id, then the first row for every HGNC
# symbol, and finally drop rows that contain any missing value
gene_anno = (gene_anno
             .drop_duplicates('ensembl_gene_id')
             .drop_duplicates('hgnc_symbol')
             .dropna())
# Ensembl id -> HGNC symbol lookup
gene_anno = gene_anno.set_index('ensembl_gene_id')['hgnc_symbol']
# restrict both tables to the shared Ensembl ids and relabel the rows by symbol
genes = list(set(gene_anno.index) & set(expression.index))
expression = expression.loc[genes]
gene_anno = gene_anno[genes]
expression.index = gene_anno.values
expression.to_csv('../data/GDSC/gex.csv', sep=',')
# per-gene mean and std; the std used for scaling is the lowess prediction
# at each gene's mean (return_sorted=False keeps the input order)
gene_mean = np.mean(expression, 1)
gene_std = np.std(expression, 1)
std_pred = lowess(gene_std, gene_mean, return_sorted=False)
expression_norm = ((expression.T - gene_mean) / std_pred).T
expression_norm.to_csv('../data/GDSC/norm_gex.csv')

In [80]:
# Reload the normalised expression and keep the cell lines that have both an
# MSI status and a classified TCGA cancer type in the annotation sheet.
expression_norm = pd.read_table('../data/GDSC/norm_gex.csv', sep=',', header=0, index_col=[0])
# `skipfooter` is the supported spelling of the former `skip_footer` keyword;
# it leaves out the last row of the sheet
cell_anno = pd.read_excel('../data/GDSC/Cell_Lines_Details.xlsx', skipfooter=1)
msi_col = 'Microsatellite \ninstability Status (MSI)'
tissue_col = 'Cancer Type\n(matching TCGA label)'
fil = (cell_anno[msi_col].notnull()
       & cell_anno[tissue_col].notnull()
       & (cell_anno[tissue_col] != 'UNABLE TO CLASSIFY'))
# .copy() so the id conversion below acts on an independent frame, not a slice
cell_anno = cell_anno[fil].copy()
# COSMIC ids as strings, so they can be intersected with the expression column labels
cell_anno['COSMIC identifier'] = cell_anno['COSMIC identifier'].astype(str)
cosmics = list(set(cell_anno['COSMIC identifier']) & set(expression_norm.columns))
expression_norm = expression_norm[cosmics]

We will calculate the association between the "cell viability signature score" and drug sensitivity, and compare it with random signatures (produced by training our Achilles model on sample-wise or gene-wise randomised data) and with drug - gene expression associations.

In [7]:
from sklearn.linear_model import Ridge

In [8]:
# Random (null) models: Ridge coefficients obtained after breaking the link
# between genes and viability, either by shuffling the gene labels or by
# shuffling the viability values across samples.
np.random.seed(19890904)
signature=pd.read_table('../results/Achilles/signatures_merged_lm.csv',
                            sep=',',header=0,index_col=[0])
sig_info=pd.read_table('../results/Achilles/sig_info_merged_lm.csv',
                            sep=',',header=0,index_col=[0])
fil=sig_info['pert_itime']=='96 h'
sig_info=sig_info[fil]
signature=signature[fil]
randomised_genes=pd.DataFrame(index=signature.columns,columns=range(1000))
randomised_samples=pd.DataFrame(index=signature.columns,columns=range(1000))
viability=sig_info['shRNA_abundance']

# gene randomisation
# Shuffling the column labels leaves the data matrix itself unchanged, so the
# fitted coefficients are identical in every iteration: the model is fitted
# once and each iteration only draws the gene labels the coefficients are
# stored under.
model=Ridge()
model.fit(signature,viability)
gene_coef=model.coef_
for i in range(1000):
    if (i%100) == 0:
        print(i,flush=True)
    shuffled_genes=np.random.choice(signature.columns,
                                    len(signature.columns),False)
    randomised_genes.loc[shuffled_genes,i]=gene_coef
randomised_genes.to_csv('../results/functional/linear_models/random_gene_coef.csv',sep=',')

# sample randomisation
# The viability values are shuffled across samples (always starting from the
# original order) and the model is refitted for every permutation.
for i in range(1000):
    if (i%100) == 0:
        print(i,flush=True)
    shuffled_viability=np.random.choice(viability,len(viability),False)
    model=Ridge()
    model.fit(signature,shuffled_viability)
    randomised_samples[i]=model.coef_
randomised_samples.to_csv('../results/functional/linear_models/random_sample_coef.csv',sep=',')

0
100
200
300
400
500
600
700
800
900
0
100
200
300
400
500
600
700
800
900


We create a matrix containing real and randomised model coefficients, and will calculate "signature scores" using them.

In [144]:
# Collect the real model and the 2 x 1000 random models in one coefficient
# matrix indexed by gene symbol.
# NOTE(review): .iloc[:-1] drops the last row of the real model file --
# presumably a non-gene row such as an intercept; confirm against the file.
model_true = pd.read_table('../results/model/final_models/achilles.csv', sep=',',
                           header=0, index_col=[0]).iloc[:-1, :]
model_true.index = model_true.index.astype(int)
model_true.columns = ['pr_gene_symbol', 'real_model']

model_random_sample = pd.read_table('../results/functional/linear_models/random_sample_coef.csv',
                                    sep=',', header=0, index_col=[0])
model_random_sample.columns = ['sample_%i' % k for k in range(1000)]

model_random_gene = pd.read_table('../results/functional/linear_models/random_gene_coef.csv',
                                  sep=',', header=0, index_col=[0])
model_random_gene.columns = ['gene_%i' % k for k in range(1000)]

# axis is passed by keyword: the positional form is rejected by current pandas
all_models = pd.concat([model_true, model_random_gene, model_random_sample], axis=1)
all_models = all_models.set_index('pr_gene_symbol')

In [145]:
# Signature scores for every cell line: normalised expression (cell lines x genes)
# multiplied by the coefficient matrix (genes x models), restricted to shared genes.
genes = list(set(expression_norm.index) & set(all_models.index))
print('We have %i common genes between cancer cell lines and LINCS models.' % len(genes))
all_models = all_models.loc[genes]
all_models.to_csv('../results/functional/linear_models/all_models_coef.csv', sep=',')
signature_scores = np.dot(expression_norm.loc[genes].T, all_models.loc[genes])
all_scores = pd.DataFrame(signature_scores,
                          index=expression_norm.columns, columns=all_models.columns)
# the per-gene expression values are appended as additional columns;
# axis is passed by keyword because the positional form is rejected by current pandas
all_scores = pd.concat([all_scores, expression_norm.loc[genes, all_scores.index].T], axis=1)
all_scores.to_csv('../results/functional/linear_models/all_scores.csv', sep=',')

We have 903 common genes between cancer cell lines and LINCS models.


Now we will compare these signature scores with drug sensitivity for the GDSC panel. We will fit linear models between drug sensitivity (LN_IC50) and signature score (using histology type and microsatellite instability as covariates). We will store the p-values and coefficients of these models, and will also calculate the partial correlation between sensitivity and signature score.

In [1]:
import statsmodels.formula.api as smf
from statsmodels.sandbox.stats.multicomp import multipletests
from scipy.stats import pearsonr

In [3]:
# Load the scores, the drug response table and the cell line annotation;
# all COSMIC ids are converted to strings so the three tables can be matched.
all_scores = pd.read_table('../results/functional/linear_models/all_scores.csv', sep=',',
                           header=0, index_col=0)
all_scores.index = all_scores.index.astype(str)
drug_sensitivity = pd.read_excel('../data/GDSC/v17.3_fitted_dose_response.xlsx')
drug_sensitivity['COSMIC_ID'] = drug_sensitivity['COSMIC_ID'].astype(str)
# `skipfooter` is the supported spelling of the former `skip_footer` keyword;
# it leaves out the last row of the sheet
cell_anno = pd.read_excel('../data/GDSC/Cell_Lines_Details.xlsx', skipfooter=1)
cell_anno['COSMIC identifier'] = cell_anno['COSMIC identifier'].astype(str)
cell_anno.index = cell_anno['COSMIC identifier']

In [4]:
# keep only measurements of cell lines that have signature scores,
# then attach tissue and MSI status from the annotation table
has_scores = drug_sensitivity['COSMIC_ID'].isin(all_scores.index)
drug_sensitivity = drug_sensitivity[has_scores]
cosmic_ids = drug_sensitivity['COSMIC_ID'].values
drug_sensitivity['Tissue'] = cell_anno.loc[cosmic_ids,
                                           'Cancer Type\n(matching TCGA label)'].values
drug_sensitivity['MSI'] = cell_anno.loc[cosmic_ids,
                                        'Microsatellite \ninstability Status (MSI)'].values

In [114]:
# empty drugs x score-columns tables for coefficients, p-values and partial correlations
drugs = list(set(drug_sensitivity['DRUG_ID']))
results_coefs, results_pvals, results_pcors = (
    pd.DataFrame(index=drugs, columns=all_scores.columns) for _ in range(3)
)

In [130]:
# For every score column and every drug:
#   * fit LN_IC50 ~ Score + Tissue + MSI and store the Score coefficient and p-value
#   * store the partial correlation between LN_IC50 and Score, i.e. the Pearson r
#     between the residuals of LN_IC50 ~ Tissue + MSI and Score ~ Tissue + MSI
model_columns=['COSMIC_ID','Tissue','Score','MSI','LN_IC50','AUC']

# The rows belonging to a drug and the covariate-only fit of LN_IC50 do not
# depend on the score column, so both are computed once per drug instead of
# once per (score column, drug) pair.
drug_rows={}
sensitivity_resid={}
for drug in drugs:
    drug_rows[drug]=np.flatnonzero((drug_sensitivity['DRUG_ID']==drug).values)
    covariate_data=drug_sensitivity.iloc[drug_rows[drug]][['Tissue','MSI','LN_IC50']]
    model_1=smf.ols('LN_IC50 ~ Tissue + MSI',data=covariate_data).fit()
    sensitivity_resid[drug]=model_1.resid

for m in all_scores.columns:
    # current score column, looked up per measurement via the cell line id
    drug_sensitivity.loc[:,'Score']=all_scores.loc[drug_sensitivity['COSMIC_ID'].values,
                                                  m].values
    for drug in drugs:
        model_data=drug_sensitivity.iloc[drug_rows[drug]][model_columns]
        model_0=smf.ols('LN_IC50 ~ Score + Tissue + MSI',data=model_data).fit()
        results_coefs.loc[drug,m]=model_0.params['Score']
        results_pvals.loc[drug,m]=model_0.pvalues['Score']
        model_2=smf.ols('Score ~ Tissue + MSI',data=model_data).fit()
        results_pcors.loc[drug,m]=pcor(sensitivity_resid[drug],model_2.resid)[0]
results_coefs.to_csv('../results/functional/linear_models/all_scores_lm_coef.csv',sep=',')
results_pcors.to_csv('../results/functional/linear_models/all_scores_lm_pcor.csv',sep=',')
results_pvals.to_csv('../results/functional/linear_models/all_scores_lm_pval.csv',sep=',')

Based on our results, our real signature score shows significant associations with drug sensitivity for several drugs. Most of these drugs are not targeted therapies but "cytotoxic" agents. This raises the question whether our signature score is associated with "general drug sensitivity" [Geeleher et al. Genome Biology](https://genomebiology.biomedcentral.com/articles/10.1186/s13059-016-1050-9). To calculate general drug sensitivity, we fitted a linear model for drug sensitivity (LN_IC50) using cell line and drug IDs. We used the coefficients for cell lines as general drug sensitivity.

In [29]:
# Linear model of LN_IC50 on cell line and drug identity, without an intercept.
# Drug ids are converted to strings first -- presumably so that the formula
# treats them as categories rather than numbers; confirm.
drug_sensitivity['DRUG_ID'] = drug_sensitivity['DRUG_ID'].astype(str)
glds_formula = 'LN_IC50 ~ 0 + COSMIC_ID + DRUG_ID'
model = smf.ols(glds_formula, data=drug_sensitivity).fit()

In [39]:
# General level of drug sensitivity (GLDS): the fitted coefficient of every
# cell line, combined with the scores, tissue and MSI status of that cell line.
general_sensitivity = model.params
# cell line parameters are the ones whose name starts with 'C'
is_cell_line = np.array([name.startswith('C') for name in general_sensitivity.index], dtype=bool)
general_sensitivity = general_sensitivity[is_cell_line]
# the cell line id is the text between '[' and the closing ']' of the parameter name
general_sensitivity.index = [name.split('[')[1][:-1] for name in general_sensitivity.index]
general_sensitivity = general_sensitivity.to_frame('GLDS')
# axis is passed by keyword: the positional form is rejected by current pandas
general_sensitivity = pd.concat([general_sensitivity,
                                 all_scores.loc[general_sensitivity.index]], axis=1)
general_sensitivity['Tissue'] = cell_anno.loc[general_sensitivity.index,
                                              'Cancer Type\n(matching TCGA label)']
general_sensitivity['MSI'] = cell_anno.loc[general_sensitivity.index,
                                           'Microsatellite \ninstability Status (MSI)']
general_sensitivity.to_csv('../results/functional/linear_models/GLDS_data.csv', sep=',')

In [41]:
# Correlation ('cor') and partial correlation given Tissue and MSI ('pcor')
# between GLDS and every score column.
# The score columns are everything between the first column (GLDS) and the
# last two columns (Tissue, MSI).
correlations=pd.DataFrame(index=general_sensitivity.columns[1:-2],columns=['cor','pcor'])
model_1=smf.ols('GLDS ~ Tissue + MSI',data=general_sensitivity).fit()
# The current score is placed in a small working frame rather than in
# general_sensitivity itself: adding a 'Score' column there would shift the
# columns[1:-2] slice above, so a second run of this cell would treat 'Tissue'
# as a score column.
model_frame=general_sensitivity[['GLDS','Tissue','MSI']].copy()
for m in correlations.index:
    model_frame['Score']=general_sensitivity[m]
    r=pcor(model_frame['GLDS'],model_frame['Score'])[0]
    correlations.loc[m,'cor']=r
    model_2=smf.ols('Score ~ Tissue + MSI',data=model_frame).fit()
    r=pcor(model_1.resid,model_2.resid)[0]
    correlations.loc[m,'pcor']=r
correlations.to_csv('../results/functional/linear_models/GLDS_cor.csv',sep=',')

In [46]:
# Fraction of rows whose partial correlation is smaller than that of the real model.
# NOTE(review): `correlations` holds the real model and the single-gene expression
# columns as well as the random models, so the denominator is not limited to
# random signatures -- confirm this is the intended reference set.
real_pcor = correlations.loc['real_model', 'pcor']
n_smaller = np.sum(correlations['pcor'] < real_pcor)
print('Smaller partial correaltion:', n_smaller / len(correlations),)

Smaller partial correaltion: 0.0361570247934


Based on this our signature score shows significantly (p=0.0361) better correlation with general drug sensitivity than random signatures.

As previously [shown](), drug sensitivity (using classical metrics like IC50 and AUC) correlates with the division time of the cells (i.e. cells dividing faster are more sensitive to drugs). Based on this, we analysed the association between signature score and cell division time. First we analysed data on cell division rate. We need to download **DS1.zip** from [Hafner et al, Scientific Data 2017](https://datadryad.org/resource/doi:10.5061/dryad.03n60). **DS1_datafile.xlsx** goes to *'../data/GDSC/'*.

In [93]:
# division-rate table from the Hafner et al. data set
division_rate_file = '../data/GDSC/DS1_datafile.xlsx'
division_time = pd.read_excel(division_rate_file)

In [94]:
# one row per biological replicate, then the mean division rate per cell name
division_time = (
    division_time
    .drop_duplicates('Biological Replicate ID')
    [['Cell Name', 'Nominal Division Rate']]
    .groupby('Cell Name')
    .mean()
)

In [95]:
# Translate cell names to COSMIC ids and index the division rates by COSMIC id.
cell_anno.index = cell_anno['Sample Name']
# reindex returns NaN for cell names that are absent from the annotation
# (label-based .loc raises KeyError for missing labels in current pandas);
# those cell lines are removed right below
division_time['COSMIC_ID'] = cell_anno['COSMIC identifier'].reindex(division_time.index)
fil = division_time['COSMIC_ID'].notnull()
division_time = division_time[fil]
division_time.index = division_time['COSMIC_ID']
division_time = pd.DataFrame(division_time['Nominal Division Rate'])

In [96]:
# Attach the scores of every cell line; cell lines without scores come back as
# all-NaN rows from reindex (label-based .loc raises KeyError for missing labels
# in current pandas) and are dropped together with any other incomplete row.
# axis is passed by keyword: the positional form is rejected by current pandas.
division_time = pd.concat([division_time, all_scores.reindex(division_time.index)], axis=1)
division_time = division_time.dropna()

In [97]:
# Pearson r and p-value between division rate and the real signature score
division_rate = division_time['Nominal Division Rate']
real_score = division_time['real_model']
pcor(division_rate, real_score)

(-0.11779684036549784, 0.5138347487074193)

We did not get a significant correlation between Division Rate and Signature Scores. However, the division times and the gene expression (i.e. the calculated signature scores) came from different studies (Hafner et al. and GDSC, respectively). It has been previously shown that laboratory-specific factors (like the media used, etc.) can influence division time. In the [gCSI study](https://www.nature.com/articles/nature17987) gene expression and division time were measured in the same place, so we analysed these data as well. We can access the [data](http://research-pub.gene.com/gCSI-cellline-data/) and follow the instructions there to install it.

In [98]:
# just extracts data from gCSI library
# check_call returns 0 on success and raises CalledProcessError when the R script
# exits with a non-zero status, instead of silently returning the error code.
check_call(['Rscript','get_gCSI_data.R'])

0

In [181]:
# load the gCSI annotation table and the genomics matrix
anno = pd.read_csv('../data/gCSI/genomics_info.csv', sep=',', header=0, index_col=[0])
genomics = pd.read_csv('../data/gCSI/genomics.csv', sep=',', header=0, index_col=[0])

In [182]:
# Expression matrix: the columns whose name starts with 'vsd.', relabelled by
# the second '.'-separated field of the column name.
vsd_columns = [name for name in genomics.columns if name.split('.')[0] == 'vsd']
gex = genomics[vsd_columns]
gex.columns = [name.split('.')[1] for name in gex.columns]
# annotation rows with a symbol, first occurrence of every symbol only
anno = anno[anno['Symbol'].notnull()].drop_duplicates('Symbol')
# restrict both tables to the shared ids, then rename the expression columns to symbols
genes = list(set(anno.index) & set(gex.columns))
anno = anno.loc[genes, 'Symbol']
gex = gex[genes]
gex.columns = anno[gex.columns].values

In [183]:
# Calculate the real and random signature scores for the gCSI cell lines.
# NOTE(review): .iloc[:-1] drops the last row of the real model file --
# presumably a non-gene row such as an intercept; confirm against the file.
model_true = pd.read_table('../results/model/final_models/achilles.csv', sep=',',
                           header=0, index_col=[0]).iloc[:-1, :]
model_true.index = model_true.index.astype(int)
model_true.columns = ['pr_gene_symbol', 'real_model']

model_random_sample = pd.read_table('../results/functional/linear_models/random_sample_coef.csv',
                                    sep=',', header=0, index_col=[0])
model_random_sample.columns = ['sample_%i' % k for k in range(1000)]

model_random_gene = pd.read_table('../results/functional/linear_models/random_gene_coef.csv',
                                  sep=',', header=0, index_col=[0])
model_random_gene.columns = ['gene_%i' % k for k in range(1000)]

# axis is passed by keyword: the positional form is rejected by current pandas
models = pd.concat([model_true, model_random_gene, model_random_sample], axis=1)
models = models.set_index('pr_gene_symbol')

genes = list(set(gex.columns) & set(models.index))
print(len(genes),'shared gene between model and expression data')
# scores = expression (cell lines x genes) times coefficients (genes x models);
# the per-gene expression values are appended as additional columns
scores = pd.DataFrame(np.dot(gex[genes], models.loc[genes]),
                      index=gex.index, columns=models.columns)
scores = pd.concat([scores, gex.loc[scores.index, genes]], axis=1)

962 shared gene between model and expression data


In [184]:
# Doubling time and tissue: keep the cell lines that have a doubling time and a
# tissue class other than 'Unclassified', then align annotation and scores.
cell_anno = pd.read_csv('../data/gCSI/cell_line_info.csv', sep=',', header=0, index_col=[0])
usable = cell_anno['DoublingTime'].notnull() & (cell_anno['TissueMetaclass'] != 'Unclassified')
cell_anno = cell_anno[usable]
cells = list(set(scores.index) & set(cell_anno.index))
cell_anno = cell_anno.loc[cells]
scores = scores.loc[cells]

In [185]:
# append tissue class and doubling time as the last two columns and store the table
scores['Tissue'] = cell_anno.loc[scores.index, 'TissueMetaclass']
scores['DoublingTime'] = cell_anno.loc[scores.index, 'DoublingTime']
scores.to_csv('../results/functional/linear_models/gCSI_data.csv', sep=',')

In [186]:
# Correlation ('cor') and partial correlation given Tissue ('pcor') between
# doubling time and every score column.
# The score columns are all columns except the last two (Tissue, DoublingTime).
correlations=pd.DataFrame(index=scores.columns[:-2],columns=['cor','pcor'])
model_1=smf.ols('DoublingTime ~ Tissue',data=scores).fit()
# The current score is placed in a small working frame rather than in `scores`
# itself: adding a 'Score' column there would shift the columns[:-2] slice
# above, so a second run of this cell would treat 'Tissue' as a score column.
model_frame=scores[['DoublingTime','Tissue']].copy()
for m in correlations.index:
    model_frame['Score']=scores[m]
    r=pcor(model_frame['DoublingTime'],model_frame['Score'])[0]
    correlations.loc[m,'cor']=r
    model_2=smf.ols('Score ~ Tissue',data=model_frame).fit()
    r=pcor(model_1.resid,model_2.resid)[0]
    correlations.loc[m,'pcor']=r
correlations.to_csv('../results/functional/linear_models/gCSI_cor.csv',sep=',')

In [187]:
# Fraction of rows whose partial correlation is smaller than that of the real model.
# NOTE(review): `correlations` holds the real model and the single-gene expression
# columns as well as the random models, so the denominator is not limited to
# random signatures -- confirm this is the intended reference set.
real_pcor = correlations.loc['real_model', 'pcor']
n_smaller = np.sum(correlations['pcor'] < real_pcor)
print('Smaller partial correaltion:', n_smaller / len(correlations),)

Smaller partial correaltion: 0.00236247046912


Based on these results, our signature score is also associated with doubling time.