/
Complete_Expression_Analysis.R
300 lines (247 loc) · 15.7 KB
/
Complete_Expression_Analysis.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
# Script to analyze expression data
# Sadhika Malladi, Jonathan Ma, Andrew Beck
# The following script includes:
# 1. Expression and clinical data consolidation
# 2. Wilcoxon Rank-Sum test with respect to sex in each tissue
# 3. Benjamini-Hochberg method to adjust p-values
# 4. Fisher's Method to combine cancer-specific p-values
# 5. Fold Change Computation
# 6. GWAS-Like Analysis
# 7. Gene Set Analysis
# Set your working directory to wherever the RData file is stored
setwd('~/googledrive/ExpressionSexProject/Expression_Signatures/')
load("TCGA.Clin.RNAseq.v2.RData")
# Load relevant libraries
library(gdata)
library(MADAM)
# Uncomment to install GWASTools package:
# source("http://bioconductor.org/biocLite.R")
# biocLite("GWASTools")
library(GWASTools)
library(gap)
library(NCBI2R)
library(GSA)
# Create a data frame with clinical information ---------------------------
# Includes several overall graphs to describe trends
expression.data <- dat
clinical.data <- clin
expression.data <- t(expression.data)
# Create a data frame with tissue type, barcode, and sample type (tumor or normal)
expression.dataframe <- data.frame(t(data.frame(strsplit(rownames(expression.data),"-|\\."),stringsAsFactors=F)))
expression.dataframe$Barcode <- apply(expression.dataframe,MARGIN=1,function(x)paste(x[c(2:4)],collapse="-"))
expression.dataframe <- expression.dataframe[,c("X1","X5","Barcode")]
colnames(expression.dataframe) <- c("TissueType","SampleType","Barcode")
expression.dataframe$SampleType=substr(expression.dataframe$SampleType,1,2)
# Patterns for tumor and normal sample types
# Normal: Number 11 followed by any capital letter
# Tumor: Number 0 followed by 1 or 3 followed by any capital letter
Tumor.expression <- t(expression.data[ expression.dataframe$SampleType=="01" | expression.dataframe$SampleType=="03",])
Normal.expression <- t(expression.data[ expression.dataframe$SampleType=="11",])
Tumor.dataframe <- expression.dataframe[ expression.dataframe$SampleType=="01"|expression.dataframe$SampleType=="03",]
Normal.dataframe <- expression.dataframe[ expression.dataframe$SampleType=="11",]
# Rownames of clinical data are barcodes.
# To access rows of clinical data, you need a vector of barcodes.
rownames(clinical.data) <- toupper(gsub("[A-Z]+.(tcga).([a-z0-9]+).([a-z0-9]+)","\\1-\\2-\\3",rownames(clinical.data)))
colnames(Tumor.expression) <- Tumor.dataframe$Barcode
colnames(Normal.expression) <- Normal.dataframe$Barcode
# We only want to do computations on normals and tumors for which we have clinical data
# (in this case, gender)
Normal.dataframe <- Normal.dataframe[Normal.dataframe$Barcode%in% rownames(clinical.data),]
Normal.expression <- Normal.expression[ ,Normal.dataframe$Barcode]
Tumor.dataframe <- Tumor.dataframe[ Tumor.dataframe$Barcode %in% rownames(clinical.data),]
Tumor.expression <- Tumor.expression[ ,Tumor.dataframe$Barcode]
# Include age for future analyses
Normal.dataframe$yearstobirth <- clinical.data[ Normal.dataframe$Barcode,"yearstobirth"]
Normal.dataframe$sex <- clinical.data[ Normal.dataframe$Barcode,"gender"]
Tumor.dataframe$yearstobirth <- clinical.data[ Tumor.dataframe$Barcode,"yearstobirth"]
Tumor.dataframe$sex <- clinical.data[ Tumor.dataframe$Barcode,"gender"]
# Note: sort returns reordered values, order returns reordered indices
# Order dataframe by ordering the ages and returning reordered indices
Tumor.dataframe <- Tumor.dataframe[ order(Tumor.dataframe$yearstobirth),]
# Plotting age, colored by tissue type -- not used in later analyses, just for fun
plot(sort(Tumor.dataframe$yearstobirth),col=as.factor(Tumor.dataframe$TissueType), main="Age colored by tissue type",xlab="Tissue Types (as factors)",ylab="Age")
# Age distribution by tissue type
boxplot(as.numeric(Tumor.dataframe$yearstobirth)~Tumor.dataframe$TissueType, main="Age vs Tissue Type (Tumor)")
boxplot(as.numeric(Normal.dataframe$yearstobirth)~Normal.dataframe$TissueType, main="Age vs Tissue Type (Normal)")
Tumor.TypeBySex <- table(Tumor.dataframe$TissueType,Tumor.dataframe$sex)
Normal.TypeBySex <- table(Normal.dataframe$TissueType,Normal.dataframe$sex)
# Subset tumor and normal dataframes so that they only include the mixed tissue types
# Mixed tissue types have more than 10 cases in male and more than 10 cases in female
Tumor.dataframe <- Tumor.dataframe[ Tumor.dataframe$TissueType %in% rownames(Tumor.TypeBySex)[apply(Tumor.TypeBySex,1,function(x)sum(x>10)==2)],]
Normal.dataframe <- Normal.dataframe[ Normal.dataframe$TissueType %in% rownames(Normal.TypeBySex)[apply(Normal.TypeBySex,1,function(x)sum(x>10)==2)],]
Tumor.expression <- Tumor.expression[,Tumor.dataframe$Barcode]
Normal.expression <- Normal.expression[,Normal.dataframe$Barcode]
# Removes cases that have "0" (i.e. less than 10) males or females
Tumor.dataframe$TissueType=drop.levels(Tumor.dataframe$TissueType)
# Plot tissue type by sex
tissueBySex<-table(Tumor.dataframe$TissueType,Tumor.dataframe$sex)
tissueBySex<-as.table(tissueBySex[order(apply(tissueBySex,1,sum)),])
plot(tissueBySex,col=c("hotpink","lightblue"),las=3,main="Gender Distribution of 404 Normal Samples of 10 Types",ylab="",xlab="")
# p value matrices: no data is inputted here, only dimensions are set
# row = genes, col = tissue type (unique)
Tumor.WilcP <- matrix(NA,nrow=nrow(Tumor.expression),ncol=length(unique(Tumor.dataframe$TissueType)),dimnames=list(rownames(Tumor.expression),unique(Tumor.dataframe$TissueType)))
Normal.WilcP <- matrix(NA,nrow=nrow(Normal.expression),ncol=length(unique(Normal.dataframe$TissueType)),dimnames=list(rownames(Normal.expression),unique(Normal.dataframe$TissueType)))
Tumor.WilcEst <- Tumor.WilcP
Normal.WilcEst <- Normal.WilcP
# Remember: these will be saved to your working directory!
saveRDS(Tumor.expression,file="TumorExpression.RDS")
saveRDS(Normal.expression,file="NormalExpression.RDS")
saveRDS(Tumor.dataframe,file="TumorDataframe.RDS")
saveRDS(Normal.dataframe,file="NormalDataframe.RDS")
# Calculate p-values ------------------------------------------------------
for(i in colnames(Tumor.WilcP)) {
# Get all barcodes of the current tissue type
Tumor.tissueCases<-Tumor.dataframe$Barcode[Tumor.dataframe$TissueType==i]
# Expression values of all genes for current tissue type (genes=rows, barcodes=columns)
Tumor.tissueExpression<-Tumor.expression[,Tumor.tissueCases]
# A list of genders of all barcodes (cases) of the current tissue type
Tumor.tissueSex<-Tumor.dataframe$sex[Tumor.dataframe$TissueType==i]
# Compute the wilcoxon rank-sum test with respect to sex using the expression for the
# current tissue.
# In this step, we are saving the median differences
Tumor.WilcEst[,i]=apply(Tumor.tissueExpression,1,function(x)(
if(sum(x>0)>(length(x)/10)){
wilcox.test(x~Tumor.tissueSex,conf.int=T)$est
}else{
NA
})
)
# Extract the p-value
Tumor.WilcP[,i]=apply(Tumor.tissueExpression,1,function(x)((wilcox.test(x~Tumor.tissueSex)$p.value)))
cat("Tissue ", i, " Tumor")
cat("\n")
}
saveRDS(Tumor.WilcEst,"MedianDiffEstimate_Tumors_6.23.RDS")
saveRDS(Tumor.WilcP,"WilcoxonRawP_Tumors_6.23.RDS")
# Same procedure as above, but for normal tissues
for(i in colnames(Normal.WilcP)) {
Normal.tissueCases<-Normal.dataframe$Barcode[Normal.dataframe$TissueType==i]
Normal.tissueExpression<-Normal.expression[,Normal.tissueCases]
Normal.tissueSex<-Normal.dataframe$sex[Normal.dataframe$TissueType==i]
Normal.WilcEst[,i]=apply(Normal.tissueExpression,1,function(x)(
if(sum(x>0)>(length(x)/10)){
wilcox.test(x~Normal.tissueSex,conf.int=T)$est
}else{
NA
})
)
Normal.WilcP[,i]=apply(Normal.tissueExpression,1,function(x)((wilcox.test(x~Normal.tissueSex)$p.value)))
cat("Tissue ", i, " Normal")
cat("\n")
}
saveRDS(Normal.WilcEst,"MedianDiffEstimate_Normals_6.23.RDS")
saveRDS(Normal.WilcP,"WilcoxonRawP_Normals_6.23.RDS")
# Adjust p-values ---------------------------------------------------------
# Manually adjust p values in order to find the most significant genes in each cancer type separately
adj.Tumor.WilcP.vector=c(Tumor.WilcP)
adj.Tumor.WilcP.vector=p.adjust(adj.Tumor.WilcP.vector,method='fdr')
adj.Tumor.WilcP=matrix(adj.Tumor.WilcP.vector,ncol=ncol(Tumor.WilcP),byrow=F)
rownames(adj.Tumor.WilcP)=rownames(Tumor.WilcP)
colnames(adj.Tumor.WilcP)=colnames(Tumor.WilcP)
adj.Normal.WilcP.vector=c(Normal.WilcP)
adj.Normal.WilcP.vector=p.adjust(adj.Normal.WilcP.vector,method='fdr')
adj.Normal.WilcP=matrix(adj.Normal.WilcP.vector,ncol=ncol(Normal.WilcP),byrow=F)
rownames(adj.Normal.WilcP)=rownames(Normal.WilcP)
colnames(adj.Normal.WilcP)=colnames(Normal.WilcP)
saveRDS(adj.Tumor.WilcP,file='WilcoxonAdjustedP_Tumors_6.24.RDS')
saveRDS(adj.Normal.WilcP,file='WilcoxonAdjustedP_Normals_6.24.RDS')
# Fisher's Method -----------------------------------------------------------
# Combine the p values generated by the wilcoxon test to form pan-cancer p values
Tumor.MetaPs=fisher.method(Tumor.WilcP,na.rm=T)
write.table(Tumor.MetaPs,"Meta.Ps_Tumor_6.23.14.txt",col.names=NA,sep="\t")
Normal.MetaPs=fisher.method(Normal.WilcP,na.rm=T)
write.table(Normal.MetaPs,"Meta.Ps_Normals_6.23.14.txt",col.names=NA,sep="\t")
# Fold Change Computation -------------------------------------------------
# Fold Change - mean, median, sum
# male/female
Tumor.ExpressionMale=Tumor.expression[,Tumor.dataframe$sex=="male"]
Tumor.ExpressionFemale=Tumor.expression[,Tumor.dataframe$sex=="female"]
Normal.ExpressionMale=Normal.expression[,Normal.dataframe$sex=="male"]
Normal.ExpressionFemale=Normal.expression[,Normal.dataframe$sex=="female"]
Tumor.FC=rowMeans(Tumor.ExpressionMale)/rowMeans(Tumor.ExpressionFemale)
Normal.FC=rowMeans(Normal.ExpressionMale)/rowMeans(Normal.ExpressionFemale)
Tumor.FCMed=apply(Tumor.ExpressionMale,1,median)/apply(Tumor.ExpressionFemale,1,median)
Normal.FCMed=apply(Normal.ExpressionMale,1,median)/apply(Normal.ExpressionFemale,1,median)
Tumor.onRatio=apply(Tumor.ExpressionMale,1,sum)/apply(Tumor.ExpressionFemale,1,sum)
Normal.onRatio=apply(Normal.ExpressionMale,1,sum)/apply(Normal.ExpressionFemale,1,sum)
# Correlation between normal and tumor fold changes
cor.test(Tumor.FC,Normal.FC,method="sp")
cor.test(Tumor.FCMed,Normal.FCMed,method="sp")
cor.test(Tumor.onRatio,Normal.onRatio,method="sp")
plot(Tumor.FC,Tumor.onRatio,main="Fold Change vs Number of nonzero (Tumor)")
# Aggregate and save all fold change analyses
FCs=cbind(Tumor.FC,Normal.FC,Tumor.FCMed,Normal.FCMed,Tumor.onRatio,Normal.onRatio)
rownames(FCs)=colnames(expression.data)
write.table(FCs,"FoldChanges_Gender.txt",col.names=NA,sep="\t")
# GWAS Like Analysis ------------------------------------------------------
# Note: Meta.Ps.combined.txt combines the p-values of male and female cases through meta-analysis
Full.pVals=read.table("Meta.Ps.combined.txt",sep="\t",header=T,row.names=1)
Full.pVals=Full.pVals[complete.cases(Full.pVals),]
# Preliminary Manhattan Plot
mhtplot(data.frame(Full.pVals))
Full.pVals=Full.pVals[order(Full.pVals[,"chr"],Full.pVals[,"pos"]),]
# pdf("ManhattanPlot_GenderPs_TumorsAndNormals.pdf") # uncomment to save to PDF
par(mfrow=c(2,1))
manhattanPlot(Full.pVals[,"Tumor_P.Adjust"],Full.pVals[,"UCSC_chrom.as"],ylim=c(0,15),signif=NULL,main="Manhattan Plot of Gender-Associated Transcripts in Tumor")
manhattanPlot(Full.pVals[,"Normal_P.Adjust"],Full.pVals[,"UCSC_chrom.as"],ylim=c(0,11),signif=NULL,main="Manhattan Plot of Gender-Associated Transcripts in Normal")
# dev.off()
# pdf("SignificanceAlongXandY.pdf") # uncomment to save to PDF
par(mfrow=c(1,2))
plot(Full.pVals[Full.pVals[,"UCSC_chrom.as"]=="Y","UCSC_pos.as"],-log10(Full.pVals[Full.pVals[,"UCSC_chrom.as"]=="Y","Tumor_P.value"]),col="blue",main="Significance Along Y-Chromosome",ylab="-log10(p)",xlab = "Y-Chromosomal Position")
plot(Full.pVals[Full.pVals[,"UCSC_chrom.as"]=="X","UCSC_pos.as"],-log10(Full.pVals[Full.pVals[,"UCSC_chrom.as"]=="X","Tumor_P.value"]),col="blue",main="Significance Along X-Chromosome",ylab="-log10(p)",xlab = "X-Chromosomal Position")
# dev.off()
plot(Full.pVals[,"Tumor_S"],Full.pVals[,"Normal_S"])
# Spearman's correlation coefficient: significance of correlation between tumor and non-tumor pvalues
cor.test(Full.pVals[,"Tumor_S"],Full.pVals[,"Normal_S"],method="sp")
# correlation between p-values of tumor and non-tumor in males
cor.test(Full.pVals[Full.pVals[,"UCSC_chrom.as"]=="Y","Tumor_S"],Full.pVals[Full.pVals[,"UCSC_chrom.as"]=="Y","Normal_S"],method="sp")
# correlation between p-values of tumor and non-tumor in females
cor.test(Full.pVals[Full.pVals[,"UCSC_chrom.as"]=="X","Tumor_S"],Full.pVals[Full.pVals[,"UCSC_chrom.as"]=="X","Normal_S"],method="sp")
# tumors and non-tumors p-value significance in non-sex chromosomes
cor.test(Full.pVals[Full.pVals[,"UCSC_chrom.as"]!="X" & Full.pVals[,"UCSC_chrom.as"]!="Y","Tumor_S"],Full.pVals[Full.pVals[,"UCSC_chrom.as"]!="X" & Full.pVals[,"UCSC_chrom.as"]!="Y","Normal_S"])
# pdf("ScatterOfS_Stats_SexAndNonSex.pdf")
# par(mfrow=c(3,1))
#plots in same order as correlation tests above
plot(Full.pVals[Full.pVals[,"UCSC_chrom.as"]=="Y","Tumor_S"],Full.pVals[Full.pVals[,"UCSC_chrom.as"]=="Y","Normal_S"],method="sp")
plot(Full.pVals[Full.pVals[,"UCSC_chrom.as"]=="X","Tumor_S"],Full.pVals[Full.pVals[,"UCSC_chrom.as"]=="X","Normal_S"])
plot(Full.pVals[Full.pVals[,"UCSC_chrom.as"]!="X" & Full.pVals[,"UCSC_chrom.as"]!="Y","Tumor_S"],Full.pVals[Full.pVals[,"UCSC_chrom.as"]!="X" & Full.pVals[,"UCSC_chrom.as"]!="Y","Normal_S"])
# dev.off()
# similar analysis in Normals
plot(-log10(Full.pVals[,"Tumor_P.value"]))
#p value vs chromosome number
boxplot(-log10(Full.pVals[,"Tumor_P.value"])~Full.pVals[,"UCSC_chrom.as"])
#not really used for anything - still included.
miss=read.table("noChrom.txt",sep="\t")
miss=as.numeric(miss[,1])
genes=unlist(lapply(strsplit(row.names(Tumor.MetaPs),"|",fixed=T),function(x)(x[[2]])))
# Gene Set Analysis (GSA) -------------------------------------------------
# Gene Set Analysis -- this section was not used because analysis was too general
library(GSA)
# entrez is one of the forms used to identify genes
geneset.obj<- GSA.read.gmt("msigdb.v4.0.entrez.gmt")
# 3D array: genesets x number of attributes x number of tissue types
outArray=array(NA,dim=c(length(geneset.obj$genesets),5,length(unique(Tumor.dataframe$TissueType))),dimnames=list(genesets=geneset.obj$geneset.names,attributes=c("Gene_set","Gene_set_name","Score","p-value","FDR"),unique(Tumor.dataframe$TissueType)))
# x = all tumor expression data for one tissue type
# y = male/female matrix with male = 1 and female = 2 (a vector)
# gene names are row names of expression data
# multiple cases of one tissue type so only loop over unique tissue types
for(i in 1:length(unique(Tumor.dataframe$TissueType))){
x=Tumor.expression[,Tumor.dataframe$TissueType==unique(Tumor.dataframe$TissueType)[i]]
y=Tumor.dataframe$sex[Tumor.dataframe$TissueType==unique(Tumor.dataframe$TissueType)[i]]
y[y=="male"]=1
y[y=="female"]=2
# The second string of the split string is the numeric value of the gene
# genenames is a vector of all numeric values of genes
genenames=unlist(lapply(strsplit(rownames(Tumor.expression),"|",fixed=T),function(xx)(xx[2])))
genesets=geneset.obj$genesets
geneset.names=geneset.obj$geneset.names
# Note: 'genenames' and 'genesets' are the names of the parameters of the GSA function
# returns significance, measured in p-values, of sets of genes in genesets as applied to genenames
GSA.obj<-GSA(x,y, genenames=genenames, genesets=genesets, resp.type="Two class unpaired", nperms=100)
# list out the gene sets with false discovery rates (p values) lower than the false discovery rate cutpoint (FDRcut).
# FDR=false discovery rate (i.e. rate of false positives), which is the same as the significance level
out=GSA.listsets(GSA.obj, geneset.names=geneset.names,FDRcut=1,maxchar=9999)
# combine the positive and negative into a table by rows
outTable=rbind(out$positive,out$negative)
write.table(outTable,file=paste('GSA_',unique(Tumor.dataframe$TissueType)[i]),sep='\t')
}