### Code for making Figure S9

In [1]:
library(ggplot2)
library("ggsci")
library(dplyr)
library(reshape)


Attaching package: ‘dplyr’


The following objects are masked from ‘package:stats’:

    filter, lag


The following objects are masked from ‘package:base’:

    intersect, setdiff, setequal, union



Attaching package: ‘reshape’


The following object is masked from ‘package:dplyr’:

    rename




In [2]:
# Make the data directory the working directory; the relative paths passed to
# load() and pdf() in later cells resolve against it.
# NOTE(review): "/R data" is a hard-coded absolute path -- adjust per machine.
setwd("/R data")

# Restore the objects saved in this file into the workspace. The following
# cells read `ssGSEA_module_results`, presumably restored here -- confirm.
load("ssGSEA_module_results.Rdata")

In [3]:
# List the objects currently in the workspace (check of what load() restored).
ls()

### Calculate the median ssGSEA score of each module across patients in each cohort

In [5]:
# Worked example on a single cohort (THCA).
# Transpose the score matrix once; the column names of the transposed matrix
# are used as the module names.
mod.scores <- t(ssGSEA_module_results$tumour$THCA)
mod.names <- colnames(mod.scores)

# One median per column of the transposed matrix, i.e. one per module
mod.meds <- apply(X = mod.scores, MARGIN = 2, FUN = median)

# Distribution of the per-module medians
summary(mod.meds)

    Min.  1st Qu.   Median     Mean  3rd Qu.     Max. 
-0.10426 -0.01218  0.03200  0.07930  0.17144  0.35279 

In [6]:
# Threshold for the top 25% of modules: the 75th percentile of the per-module
# medians. quantile() is called directly instead of picking element 5 out of
# summary(), which depends on the layout of the summary vector. na.rm = TRUE
# mirrors summary(), which drops missing values before computing quartiles.
top.quartile <- quantile(mod.meds, probs = 0.75, na.rm = TRUE, names = FALSE)

# Split the module names into those at or above the threshold and those below
list(
    "Top.25prct" = mod.names[mod.meds >= top.quartile],
    "Bottom.75prct" = mod.names[mod.meds < top.quartile]
)

In [7]:
# For every element of ssGSEA_module_results$tumour (one score matrix per
# cohort), split the module names by median score.
# Result: a list with the same names as the input; each element is a list with
#   "Top.25prct"    - modules whose median is >= the 75th percentile
#   "Bottom.75prct" - modules whose median is below it
Top.quartile.modules.each.tumour <- lapply(
    ssGSEA_module_results$tumour,
    FUN = function(Scores) {
        # Transpose so that the module names are the column names
        mod.scores <- t(Scores)
        mod.names <- colnames(mod.scores)
        mod.meds <- apply(mod.scores, 2, median)

        # 75th percentile taken directly from quantile() rather than by
        # position from summary(); na.rm = TRUE mirrors summary(), which
        # drops missing values before computing quartiles.
        top.quartile <- quantile(mod.meds, probs = 0.75, na.rm = TRUE,
                                 names = FALSE)

        list(
            "Top.25prct" = mod.names[mod.meds >= top.quartile],
            "Bottom.75prct" = mod.names[mod.meds < top.quartile]
        )
    }
)

### Now apply to all cohorts

In [8]:
# Restore the saved module-preservation objects (paths are relative to the
# working directory). Later cells read `all_preservation_t_to_n2`, presumably
# restored by the second file -- confirm.
load("objects/all_preservation_n_to_t2.Rdata")
load("objects/all_preservation_t_to_n2.Rdata")

In [9]:
# List the workspace again to confirm which objects the two load() calls added.
ls()

In [10]:
# Preview the first rows: one row per module (Cluster) per cohort (Tumour),
# with its novelty Category among other columns.
head(all_preservation_t_to_n2)

Unnamed: 0_level_0,Tumour,Cluster,Per_50,Age,Cluster_name,Diff_per_UC,Cluster_size,Preservation_ratio,Category
Unnamed: 0_level_1,<fct>,<fct>,<dbl>,<fct>,<chr>,<dbl>,<int>,<dbl>,<fct>
1,BLCA,skyblue,3,MC,BLCA_tumour_skyblue,-19.864468,104,0.028846154,High_score
2,BLCA,yellow,2,UC,BLCA_tumour_yellow,12.711385,1423,0.001405481,Low_score
3,BLCA,white,5,Mixed,BLCA_tumour_white,1.687936,105,0.047619048,High_score
4,BLCA,darkturquoise,3,Mixed,BLCA_tumour_darkturquoise,10.947195,201,0.014925373,Median_score
5,BLCA,purple,3,Mixed,BLCA_tumour_purple,11.341206,332,0.009036145,Low_score
6,BLCA,grey60,3,MC,BLCA_tumour_grey60,-1.894995,219,0.01369863,Median_score


In [11]:
# Column-wise overview. In the recorded output, Per_50 and Preservation_ratio
# contain Inf values and Category has a fourth level, Inf_score, in addition
# to Low_score, Median_score and High_score.
summary(all_preservation_t_to_n2)

     Tumour           Cluster        Per_50       Age      Cluster_name      
 HNSC   : 58   darkgrey   : 14   Min.   :  1   UC   :102   Length:588        
 LUSC   : 58   black      : 13   1st Qu.:  2   Mixed:379   Class :character  
 BRCA   : 47   brown      : 12   Median :  3   MC   :107   Mode  :character  
 KICH   : 46   cyan       : 12   Mean   :Inf                                 
 LUAD   : 45   darkgreen  : 12   3rd Qu.:  4                                 
 ESCA   : 38   darkmagenta: 12   Max.   :Inf                                 
 (Other):296   (Other)    :513                                               
  Diff_per_UC        Cluster_size     Preservation_ratio          Category  
 Min.   :-29.1763   Min.   :  34.00   Min.   :0.0003027   Low_score   :195  
 1st Qu.:  0.7469   1st Qu.:  95.75   1st Qu.:0.0090909   Median_score:190  
 Median :  8.8115   Median : 174.00   Median :0.0176835   High_score  :202  
 Mean   :  7.3154   Mean   : 295.52   Mean   :      Inf   Inf_score 

### Get the proportion of High Novelty modules in each cohort that has matched tumour & normal samples

#### Iterate through each cohort, subsetting the module info from all_preservation_t_to_n2

#### Then extract top quartile modules from Top.quartile.modules.each.tumour

#### For each novelty category (Low, Medium, High), calculate the fraction of that category's modules that fall in the top quartile, and record it per cohort

In [None]:
# For each cohort and each novelty category (Low / Median / High score):
#   (# modules of that category among the cohort's top-quartile modules) /
#   (# modules of that category in the cohort).
# Result: `prop.high.novelty`, a list named by cohort, each element a numeric
# vector of length 3 in the order Low, Median, High.
prop.high.novelty <- list()

# Categories are selected by name, so the result does not depend on where the
# Inf_score level sits within the Category factor.
novelty.levels <- c("Low_score", "Median_score", "High_score")

for (cohort in unique(all_preservation_t_to_n2$Tumour)) {

    print(cohort)

    # All modules of this cohort (not called `subset`, which would mask
    # base::subset); dplyr::filter is qualified because stats::filter exists.
    cohort.mods <- dplyr::filter(all_preservation_t_to_n2, Tumour == cohort)

    print(dim(cohort.mods))

    # Names of this cohort's top-quartile modules
    top.mods <- Top.quartile.modules.each.tumour[[cohort]][["Top.25prct"]]

    # Rows of the cohort whose module is in the top quartile
    top.cohort.mods <- dplyr::filter(cohort.mods, Cluster %in% top.mods)

    # Counts per category, in the fixed order given by novelty.levels;
    # anything outside those three levels is ignored.
    top.counts <- table(factor(top.cohort.mods$Category, levels = novelty.levels))
    all.counts <- table(factor(cohort.mods$Category, levels = novelty.levels))

    prop.high.novelty[[cohort]] <- as.numeric(top.counts / all.counts)
}

In [13]:
# Stack the per-cohort vectors into a cohort x category matrix. rbind() keeps
# the list names as row names verbatim, whereas a round trip through
# data.frame() would pass them through make.names().
nov.props <- do.call(rbind, prop.high.novelty)

# Column order follows the order of the values stored per cohort
colnames(nov.props) <- c("Low", "Medium", "High")
head(nov.props)

Unnamed: 0,Low,Medium,High
BLCA,0.2,0.2,0.4
BRCA,0.1875,0.2,0.375
COAD,0.1666667,0.08333333,0.4615385
ESCA,0.2307692,0.16666667,0.3846154
HNSC,0.3157895,0.26315789,0.2
KICH,0.2666667,0.26666667,0.25


In [14]:
# Reshape the cohort x category matrix to long format: one row per
# (cohort, category) pair, with both identifiers as factors whose levels follow
# the matrix row/column order. Base as.data.frame(as.table()) is used instead
# of melt(nov.props, as.is=TRUE): the recorded run still emitted
# "'as.is' should be specified by the caller" warnings, so that argument
# evidently had no effect.
m.nov.props <- as.data.frame(as.table(nov.props), stringsAsFactors = TRUE)

colnames(m.nov.props) <- c("Cohort", "Novelty", "Proportion")

summary(m.nov.props)

head(m.nov.props)

# Six-number summary of the proportions within each novelty category
aggregate(m.nov.props$Proportion, by = list(m.nov.props$Novelty), FUN = summary)

“'as.is' should be specified by the caller; using TRUE”
“'as.is' should be specified by the caller; using TRUE”


     Cohort     Novelty     Proportion     
 BLCA   : 3   Low   :16   Min.   :0.08333  
 BRCA   : 3   Medium:16   1st Qu.:0.19687  
 COAD   : 3   High  :16   Median :0.25000  
 ESCA   : 3               Mean   :0.26454  
 HNSC   : 3               3rd Qu.:0.34091  
 KICH   : 3               Max.   :0.46154  
 (Other):30                                

Unnamed: 0_level_0,Cohort,Novelty,Proportion
Unnamed: 0_level_1,<fct>,<fct>,<dbl>
1,BLCA,Low,0.2
2,BRCA,Low,0.1875
3,COAD,Low,0.1666667
4,ESCA,Low,0.2307692
5,HNSC,Low,0.3157895
6,KICH,Low,0.2666667


Group.1,x
<fct>,"<dbl[,6]>"
Low,"0.08333333, 0.1780303, 0.2052632, 0.2079543, 0.2500000, 0.3157895"
Medium,"0.08333333, 0.1954545, 0.2649123, 0.2703068, 0.3437500, 0.4285714"
High,"0.12500000, 0.2500000, 0.3484848, 0.3153530, 0.3884615, 0.4615385"


### Now use ggplot2 to make a boxplot showing the proportion of modules in each Novelty category across all cohorts.

In [16]:
# Boxplot of the per-cohort proportions, one box per novelty category,
# written to Modules_novelty_props_box.pdf in the working directory.
g <- ggplot(m.nov.props, aes(x = Novelty, y = Proportion))

# Assemble the complete plot before opening the device, so an error while
# building it cannot leave a graphics device open.
# The dashed line at 0.25 is presumably the proportion expected if novelty
# were unrelated to top-quartile membership -- confirm.
novelty.boxplot <- g +
    geom_boxplot(aes(fill = Novelty), outlier.shape = NA) +
    scale_fill_jco() +
    theme_bw() +
    theme(
        panel.grid.major = element_blank(),
        panel.grid.minor = element_blank(),
        text = element_text(size = 20)
    ) +
    geom_hline(yintercept = 0.25, col = "darkorange", lwd = 2, lty = 2)

pdf("Modules_novelty_props_box.pdf")

#png("Modules_novelty_props_box.png")

# Explicit print(): ggplot objects are only drawn when printed, and
# auto-printing does not happen under source() or inside a function, which
# would leave the PDF empty.
print(novelty.boxplot)

dev.off()