# Methylation analysis tutorial
Authors: Owen Chapman, Lukas Chavez  
Nagoya City University and UC San Diego  
Last updated July 2025  

In [None]:
# Packages used by this tutorial, loaded with their startup messages suppressed.
suppressPackageStartupMessages(library(dplyr))
suppressPackageStartupMessages(library(minfi)) # Package for loading and preprocessing methylation data from many sources including Illumina. See https://bioconductor.org/packages/devel/bioc/vignettes/minfi/inst/doc/minfi.html
# library(minfiData) # Example dataset for minfi vignette.
# library(conumee) # Package for estimating genomic copy number from methylation. See https://bioconductor.org/packages/devel/bioc/vignettes/conumee/inst/doc/conumee.html
suppressPackageStartupMessages(library(Rtsne))  # t-distributed stochastic neighbor embedding (t-SNE).
suppressPackageStartupMessages(library(sva)) # One of many batch correction algorithms.
suppressPackageStartupMessages(library(tictoc)) # Timing helpers tic() and toc().

# Stratification of patient medulloblastoma tumors by clustering of Illumina Infinium DNA methylation array data

Download the dataset from <https://datasets.genepattern.org/data/chapman/MBlandscape_subset_400.RData> (to run locally, save it under `./data/`).  
Published by Northcott et al., The whole-genome landscape of medulloblastoma subtypes, Nature 2017.  
The raw microarray data are available (access-controlled; do not redistribute) from the European Genome-phenome Archive (EGA, http://www.ebi.ac.uk/ega/) under accession number EGAS00001001953.  


In [None]:
## Load preprocessed beta values of 400 medulloblastoma patients.
## load() restores the objects stored in the .RData file into the current
## workspace; the following cells expect it to provide a variable `allbeta`.

tic("Load MB dataset")
load("data/MBlandscape_subset_400.RData") # for a local dataset stored in ./data
# If using GenePattern Notebook, can alternatively use:
# load(url("https://datasets.genepattern.org/data/chapman/MBlandscape_subset_400.RData"))
toc()
## MB data is now in a variable called `allbeta`.
## Fail early, with a clear message, if the file did not provide it; otherwise
## the error would only surface later as "object 'allbeta' not found".
if (!exists("allbeta")) {
  stop("Expected object `allbeta` was not found after loading the dataset.",
       call. = FALSE)
}

In [None]:
# Rank the CpG sites by how strongly their beta values vary across the cohort:
# compute the standard deviation of every row of `allbeta` (one row per CpG),
# ignoring missing values, then reorder the rows from most to least variable.
# (Slow step: sd() is called once per row.)
tic("Order CpG sites by variance")
allbeta.sd <- apply(
  allbeta,
  MARGIN = 1,
  FUN = sd,
  na.rm = TRUE
)
allbeta.ordered <- allbeta[order(allbeta.sd, decreasing = TRUE), ]
head(allbeta.ordered)
toc() # 47s

In [None]:
# Calculate the Pearson correlation between all tumors based on the most
# variable CpGs (the top 5k), cluster the samples hierarchically and draw the
# correlation matrix as a heatmap.
tic("Sample correlation heatmap")
# Cap at the number of available CpGs so the row subset below can never index
# past the end of `allbeta.ordered` (e.g. when run on a smaller dataset).
n.var <- min(5000, nrow(allbeta.ordered))
b <- allbeta.ordered[seq_len(n.var), , drop = FALSE]

# Calculate Pearson correlations between samples/patients (the columns of b).
# NOTE(review): cor() defaults to use = "everything", so missing beta values
# in b would propagate as NA correlations - confirm the selected CpGs have none.
b.xcor <- cor(b, method = "pearson")

# Perform hierarchical sample clustering (average linkage) using
# 1 - correlation as the distance between samples.
b.xdend <- as.dendrogram(hclust(as.dist(1 - b.xcor), method = "average"))

# Visualisation of the pre-computed hierarchical sample/patient clustering.
# The blue-white-red palette spans correlations from -1 to 1 (zlim).
cols <- colorRampPalette(c("blue", "white", "red"))(100)
# cexCol shrinks the column labels as the number of samples grows, clamped to
# the range [0.07, 1]; row labels are suppressed (labRow = NA). The plot title
# is the dimension of b (number of CpGs x number of samples).
heatmap(b.xcor, Rowv = b.xdend, col = cols, symm = TRUE, zlim = c(-1, 1),
        scale = "none", useRaster = TRUE,
        cexCol = max(min(125 * ncol(b)^-1.25, 1), 0.07), labRow = NA,
        main = paste(dim(b), collapse = "x"))
#--> symmetric matrix of similarities between patients
toc() # 46s

In [None]:
# t-SNE - t-distributed stochastic neighbor embedding:
# non-linear dimension reduction of the samples, using 1 - Pearson correlation
# (from `b.xcor`) as the pairwise distance.
tic("t-sne")
# Fixed seed so the embedding is reproducible. Passed as an integer rather than
# a character string, which set.seed() would only accept via implicit coercion.
set.seed(202401L)
# Perplexity is capped at 30 and, for small cohorts, reduced so that
# 3 * perplexity does not exceed (number of samples - 1).
# theta = 0 requests exact t-SNE; pca = FALSE skips the initial PCA step.
Y <- Rtsne(as.dist(1 - b.xcor), verbose = FALSE, check_duplicates = FALSE,
           is_distance = TRUE,
           perplexity = min(floor((ncol(b) - 1) / 3), 30), theta = 0,
           pca = FALSE, max_iter = 10000)$Y
# Build a square plotting window: both axes are centred on the mid-point of
# their own range and share the larger of the two ranges as their span.
Y.range <- apply(Y, 2, range)
Y.diff <- apply(Y.range, 2, diff)
Y.center <- apply(Y.range, 2, mean)
plot(Y,
     xlim = Y.center[1] + c(-0.5, 0.5) * max(Y.diff),
     ylim = Y.center[2] + c(-0.5, 0.5) * max(Y.diff),
     xlab = "TSNE 1", ylab = "TSNE 2", pch = 20, cex = 1, col = "black",
     main = paste(dim(b), collapse = "x"), las = 2)
toc() # 9s