# Generate list of stromal genes with negative association with TCGAov tumour cellularity (a.k.a. purity)

Script related to Consensus gene sets

### Aim:
- Generate list of stromal genes with negative correlation with TCGAov tumour purity, as TIMER has done for immune cells, in this case for stromal cells (endothelial and fibroblasts so far).

In [1]:
!python --version

Python 3.6.8 :: Anaconda custom (64-bit)


## Import packages

In [1]:
import scipy.stats
import pandas as pd

In [2]:
home = "~/git_repos/HGSOC_TME_Heterogeneity/"

## Load ESTIMATE's stromal genes

In [3]:
gene_sets = pd.read_csv(
    home + "Data/3/Hallmarks_ConsensusTME_GeneSets.txt",
    sep = "\t"
)

gene_sets.columns.values

array(['HALLMARK_TNFA_SIGNALING_VIA_NFKB', 'HALLMARK_HYPOXIA',
       'HALLMARK_CHOLESTEROL_HOMEOSTASIS', 'HALLMARK_MITOTIC_SPINDLE',
       'HALLMARK_WNT_BETA_CATENIN_SIGNALING',
       'HALLMARK_TGF_BETA_SIGNALING', 'HALLMARK_IL6_JAK_STAT3_SIGNALING',
       'HALLMARK_DNA_REPAIR', 'HALLMARK_G2M_CHECKPOINT',
       'HALLMARK_APOPTOSIS', 'HALLMARK_NOTCH_SIGNALING',
       'HALLMARK_ADIPOGENESIS', 'HALLMARK_ESTROGEN_RESPONSE_EARLY',
       'HALLMARK_ESTROGEN_RESPONSE_LATE', 'HALLMARK_ANDROGEN_RESPONSE',
       'HALLMARK_MYOGENESIS', 'HALLMARK_PROTEIN_SECRETION',
       'HALLMARK_INTERFERON_ALPHA_RESPONSE',
       'HALLMARK_INTERFERON_GAMMA_RESPONSE', 'HALLMARK_APICAL_JUNCTION',
       'HALLMARK_APICAL_SURFACE', 'HALLMARK_HEDGEHOG_SIGNALING',
       'HALLMARK_COMPLEMENT', 'HALLMARK_UNFOLDED_PROTEIN_RESPONSE',
       'HALLMARK_PI3K_AKT_MTOR_SIGNALING', 'HALLMARK_MTORC1_SIGNALING',
       'HALLMARK_E2F_TARGETS', 'HALLMARK_MYC_TARGETS_V1',
       'HALLMARK_MYC_TARGETS_V2',
       'HALLMARK_

In [4]:
stroma_geneset = gene_sets["StromalSignature"]

stroma_geneset.head()

0      DCN
1    PAPPA
2    SFRP4
3    THBS2
4     LY86
Name: StromalSignature, dtype: object

In [5]:
type(stroma_geneset)

pandas.core.series.Series

## Load TCGAov gene expression

In [6]:
TCGAov_rna = pd.read_csv(
    home + "Data/1/TCGAov_RNAseq.txt",
    sep = "\t",
    index_col = "GeneSymbol"
)

TCGAov_rna.head()

Unnamed: 0_level_0,TCGA-04-1348-01,TCGA-04-1357-01,TCGA-04-1362-01,TCGA-04-1364-01,TCGA-04-1365-01,TCGA-04-1514-01,TCGA-04-1519-01,TCGA-09-0364-01,TCGA-09-0366-01,TCGA-09-0367-01,...,TCGA-61-2102-01,TCGA-61-2104-01,TCGA-61-2109-01,TCGA-61-2110-01,TCGA-61-2111-01,TCGA-61-2113-01,TCGA-OY-A56P-01,TCGA-OY-A56Q-01,TCGA-VG-A8LO-01,TCGA-WR-A838-01
GeneSymbol,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
A1BG,66.4695,65.5664,41.6412,187.0368,23.9295,32.8123,46.3418,162.8522,88.5518,192.7354,...,63.7012,99.1485,38.3684,140.3668,94.7103,70.4394,43.1736,33.6854,27.9604,16.2092
A1BG-AS1,36.3243,28.1315,23.2465,114.6008,10.4957,20.19,17.2007,83.0906,65.6758,55.1265,...,30.3602,71.1797,26.0239,77.2415,61.489,106.1623,47.8864,25.3482,17.9853,42.945
A1CF,0.0,0.0,0.331,0.0,0.0,0.0,0.0,0.9034,0.0,0.0,...,0.0,0.0,0.0,1.9057,0.0,0.0,0.0,0.0,0.0,0.0
A2M,5899.8279,9384.4401,3350.4207,1455.2316,3999.3792,3224.5797,1986.1593,1487.3247,10557.8325,4603.9013,...,10264.1373,7220.7664,7200.1144,1819.9428,6179.7358,5365.7972,8773.5936,10006.7877,1888.1746,4205.6055
A2M-AS1,118.4566,111.0026,71.3613,67.8607,52.1501,224.7582,182.065,120.2838,53.6419,92.9112,...,146.613,45.8648,64.4638,28.5565,100.6605,23.5625,84.2153,582.3492,59.8084,143.491


## Load TCGAov ABSOLUTE tumour cellularity values

In [7]:
TCGA_ABSOLUTE = home + 'Data/3/TCGA_mastercalls.abs_tables_JSedit.fixed.txt'

TCGA_ABSOLUTE = pd.read_csv(TCGA_ABSOLUTE,
                            sep='\t',
                            index_col='array')

TCGA_ABSOLUTE.head()

Unnamed: 0_level_0,sample,call status,purity,ploidy,Genome doublings,Coverage for 80% power,Cancer DNA fraction,Subclonal genome fraction,solution
array,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
TCGA-OR-A5J1-01,TCGA-OR-A5J1-01A-11D-A29H-01,called,0.9,2.0,0.0,9.0,0.9,0.02,new
TCGA-OR-A5J2-01,TCGA-OR-A5J2-01A-11D-A29H-01,called,0.89,1.3,0.0,6.0,0.84,0.16,new
TCGA-OR-A5J3-01,TCGA-OR-A5J3-01A-11D-A29H-01,called,0.93,1.27,0.0,5.0,0.89,0.11,new
TCGA-OR-A5J4-01,TCGA-OR-A5J4-01A-11D-A29H-01,called,0.87,2.6,1.0,12.0,0.89,0.08,new
TCGA-OR-A5J5-01,TCGA-OR-A5J5-01A-11D-A29H-01,called,0.93,2.79,1.0,12.0,0.95,0.15,new


### Select purity (tumour cellularity) column

In [8]:
TCGA_ABSOLUTE = TCGA_ABSOLUTE.purity

TCGA_ABSOLUTE.head()

array
TCGA-OR-A5J1-01    0.90
TCGA-OR-A5J2-01    0.89
TCGA-OR-A5J3-01    0.93
TCGA-OR-A5J4-01    0.87
TCGA-OR-A5J5-01    0.93
Name: purity, dtype: float64

## Correlate gene expression with tumour cellularity

### For each gene calculate its  Pearson's correlation with tumour cellularity across samples

#### NOTE: Pearson correlation as TIMER has done for immune related genes

In [9]:
TCGAov_rna = TCGAov_rna.T

TCGAov_rna.head()

GeneSymbol,A1BG,A1BG-AS1,A1CF,A2M,A2M-AS1,A2ML1,A4GALT,A4GNT,AAAS,AACS,...,ZWILCH,ZWINT,ZXDA,ZXDB,ZXDC,ZYG11A,ZYG11B,ZYX,ZZEF1,ZZZ3
TCGA-04-1348-01,66.4695,36.3243,0.0,5899.8279,118.4566,7.5289,92.498,0.5378,2186.0715,1151.1159,...,928.9002,794.5684,36.0312,235.2783,827.1041,5.6467,560.0968,15871.2019,505.7811,475.9344
TCGA-04-1357-01,65.5664,28.1315,0.0,9384.4401,111.0026,54.6875,298.1771,0.0,1073.5677,249.349,...,195.1107,217.4479,110.026,583.3333,1242.1875,3.9062,600.2604,5378.9062,805.3385,415.3646
TCGA-04-1362-01,41.6412,23.2465,0.331,3350.4207,71.3613,5.6263,697.9919,0.0,1140.8147,581.1635,...,421.6182,605.3235,93.3304,788.6746,1575.0325,56.5939,915.7627,6137.2982,901.5315,803.8987
TCGA-04-1364-01,187.0368,114.6008,0.0,1455.2316,67.8607,5.0883,147.9842,0.0,1607.473,785.7155,...,913.9953,1079.9878,13.9928,137.3836,1138.503,13.1447,811.5809,5972.3706,372.7166,444.8006
TCGA-04-1365-01,23.9295,10.4957,0.0,3999.3792,52.1501,3.3148,203.1038,0.6027,535.7842,928.7329,...,646.5632,1882.7784,35.8596,328.4617,1455.7782,7.2322,1031.7915,7211.9934,514.0877,787.1026


### Select only stromal genes in the TCGAov gene expression matrix

In [10]:
TCGAov_rna_stromal = TCGAov_rna[set(TCGAov_rna.columns.values) & set(stroma_geneset)]

TCGAov_rna_stromal.head()

GeneSymbol,C1QB,C1QA,COL8A2,ENPEP,GIMAP5,FRZB,SERPING1,COL10A1,DCN,ISLR,...,VCAM1,ARHGAP28,AOC3,CD33,ZNF423,WISP1,RGS4,MAF,ECM2,TLR7
TCGA-04-1348-01,10164.2915,6170.207,165.9048,177.736,418.9298,114.278,19434.2565,118.5803,1644.797,1044.9045,...,671.1482,38.1823,27.9645,108.6314,114.278,54.8535,86.0446,542.3501,55.6601,116.9669
TCGA-04-1357-01,19589.1927,10033.2031,209.6354,88.5417,671.875,154.2969,19902.3438,825.5208,4611.3281,2503.2552,...,521.4844,29.2969,77.474,149.0885,318.3594,68.3594,109.375,1046.224,91.1458,421.224
TCGA-04-1362-01,3141.1291,1940.0799,174.4152,93.3304,96.309,56.9249,6737.9883,18.5337,1208.9921,452.0896,...,56.263,26.1457,35.0816,40.046,223.3971,30.7792,55.2701,338.2398,32.103,150.2553
TCGA-04-1364-01,144.592,137.8076,44.5225,79.7164,30.9537,159.0088,9896.7079,4.2402,996.8792,296.3924,...,49.1867,183.1781,424.0235,2.1201,585.1524,14.4168,49.6107,100.4936,39.4342,7.2084
TCGA-04-1365-01,4292.9034,3111.9482,108.4827,179.5992,276.0283,1077.294,11157.752,34.3529,1119.4817,390.2366,...,178.9965,73.5272,45.8038,65.9937,68.1031,44.8998,91.3063,276.0283,26.2167,71.1165


### Select only TCGAov samples for ABSOLUTE tumour cellularity scores

In [11]:
TCGAov_ABSOLUTE = TCGA_ABSOLUTE[list(TCGA_ABSOLUTE.index & TCGAov_rna_stromal.index)]

TCGAov_ABSOLUTE.head()

array
TCGA-04-1348-01    0.76
TCGA-04-1357-01    0.52
TCGA-04-1362-01    0.86
TCGA-09-0364-01    0.89
TCGA-09-0369-01    0.90
Name: purity, dtype: float64

### Check nan

In [12]:
TCGAov_ABSOLUTE.isnull().values.any()

True

### Remove samples that do not have tumour cellularity data

In [13]:
TCGAov_ABSOLUTE = TCGAov_ABSOLUTE[~TCGAov_ABSOLUTE.isnull()]

### Match expression matrix samples to tumour cellularity data

In [14]:
TCGAov_rna_stromal = TCGAov_rna_stromal.loc[TCGAov_ABSOLUTE.index]

TCGAov_rna_stromal.head()

GeneSymbol,C1QB,C1QA,COL8A2,ENPEP,GIMAP5,FRZB,SERPING1,COL10A1,DCN,ISLR,...,VCAM1,ARHGAP28,AOC3,CD33,ZNF423,WISP1,RGS4,MAF,ECM2,TLR7
array,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
TCGA-04-1348-01,10164.2915,6170.207,165.9048,177.736,418.9298,114.278,19434.2565,118.5803,1644.797,1044.9045,...,671.1482,38.1823,27.9645,108.6314,114.278,54.8535,86.0446,542.3501,55.6601,116.9669
TCGA-04-1357-01,19589.1927,10033.2031,209.6354,88.5417,671.875,154.2969,19902.3438,825.5208,4611.3281,2503.2552,...,521.4844,29.2969,77.474,149.0885,318.3594,68.3594,109.375,1046.224,91.1458,421.224
TCGA-04-1362-01,3141.1291,1940.0799,174.4152,93.3304,96.309,56.9249,6737.9883,18.5337,1208.9921,452.0896,...,56.263,26.1457,35.0816,40.046,223.3971,30.7792,55.2701,338.2398,32.103,150.2553
TCGA-09-0364-01,442.2165,279.152,39.7498,136.4141,28.0055,460.7363,9112.6421,4.0653,1506.8789,630.1247,...,402.015,261.5356,116.9909,8.1306,7247.1116,43.8151,46.0736,1014.0717,68.207,5.4204
TCGA-09-0369-01,3830.2561,2882.4549,305.3635,409.8917,149.1583,381.7043,12493.4762,14.0937,18318.5778,11582.0837,...,326.504,550.8287,343.8275,64.8897,859.1283,134.4774,214.3417,1424.0507,87.2048,45.5109


In [15]:
TCGAov_rna_stromal.shape

(293, 137)

In [16]:
TCGAov_rna_stromal.head()

GeneSymbol,C1QB,C1QA,COL8A2,ENPEP,GIMAP5,FRZB,SERPING1,COL10A1,DCN,ISLR,...,VCAM1,ARHGAP28,AOC3,CD33,ZNF423,WISP1,RGS4,MAF,ECM2,TLR7
array,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
TCGA-04-1348-01,10164.2915,6170.207,165.9048,177.736,418.9298,114.278,19434.2565,118.5803,1644.797,1044.9045,...,671.1482,38.1823,27.9645,108.6314,114.278,54.8535,86.0446,542.3501,55.6601,116.9669
TCGA-04-1357-01,19589.1927,10033.2031,209.6354,88.5417,671.875,154.2969,19902.3438,825.5208,4611.3281,2503.2552,...,521.4844,29.2969,77.474,149.0885,318.3594,68.3594,109.375,1046.224,91.1458,421.224
TCGA-04-1362-01,3141.1291,1940.0799,174.4152,93.3304,96.309,56.9249,6737.9883,18.5337,1208.9921,452.0896,...,56.263,26.1457,35.0816,40.046,223.3971,30.7792,55.2701,338.2398,32.103,150.2553
TCGA-09-0364-01,442.2165,279.152,39.7498,136.4141,28.0055,460.7363,9112.6421,4.0653,1506.8789,630.1247,...,402.015,261.5356,116.9909,8.1306,7247.1116,43.8151,46.0736,1014.0717,68.207,5.4204
TCGA-09-0369-01,3830.2561,2882.4549,305.3635,409.8917,149.1583,381.7043,12493.4762,14.0937,18318.5778,11582.0837,...,326.504,550.8287,343.8275,64.8897,859.1283,134.4774,214.3417,1424.0507,87.2048,45.5109


### Check patient ID correspondance between data sets

In [17]:
False in (TCGAov_rna_stromal.index == TCGAov_ABSOLUTE.index)

False

### Check nan in expression matrix

In [18]:
TCGAov_rna_stromal.isnull().values.any()

False

### Calculate Spearman's correlation and keep genes with rho <= -0.2 and p-val <= 0.05

In [19]:
stromal_genes_kept = set()

for stromal_gene in TCGAov_rna_stromal:
    
    rho, p_val = scipy.stats.spearmanr(
        a = TCGAov_rna_stromal[stromal_gene],
        b = TCGAov_ABSOLUTE
    )
    
    if rho <= -0.2 and p_val <= 0.05:
    
        stromal_genes_kept.add(stromal_gene)
        

In [20]:
len(stromal_genes_kept)

125

### Create dataframe

In [24]:
stromal_genes_kept_df = pd.DataFrame({
    "Gene_Symbol" : pd.Series(list(stromal_genes_kept)),
    "Tumour_type" : "OV"
})

stromal_genes_kept_df.set_index(
    "Gene_Symbol",
    inplace = True
)

stromal_genes_kept_df.head()

Unnamed: 0_level_0,Tumour_type
Gene_Symbol,Unnamed: 1_level_1
C1QB,OV
C1QA,OV
COL8A2,OV
ENPEP,OV
GIMAP5,OV


In [26]:
stromal_genes_kept_df.shape

(125, 1)

### Save file

In [27]:
stromal_genes_kept_df.to_csv(
    "../../Data/3/TCGAov_ESTIMATE_stromal_genes_keep.txt",
    sep = "\t"
)

# End script