# Differential Methylation Analysis Notebook

This notebook performs a series of data import, preprocessing, and differential methylation analyses. The analysis consists of:

- Installing and loading necessary packages
- Loading the data (SiTH and methylation data)
- Merging datasets and removing duplicates
- Differential analysis for primary samples (samples with SITH between 0.7 and 0.8, inclusive, versus all other samples)
- Differential analysis for non-primary samples with SITH as a continuous variable
- Differential analysis for non-primary samples with INT_IQR as a continuous variable


In [1]:
## Section: Install Necessary Packages

# Install BiocManager only when it is missing. requireNamespace() checks that
# the package is available without attaching it to the search path; every use
# below is namespace-qualified (BiocManager::install), so attaching is unneeded.
if (!requireNamespace("BiocManager", quietly = TRUE)) {
  install.packages("BiocManager")
}

# Select the Bioconductor release used by this notebook.
BiocManager::install(version = "3.20")


'getOption("repos")' replaces Bioconductor standard repositories, see
'help("repositories", package = "BiocManager")' for details.
Replacement repositories:
    CRAN: https://cran.r-project.org

Bioconductor version 3.20 (BiocManager 1.30.25), R 4.4.2 (2024-10-31)

Old packages: 'KernSmooth', 'MASS', 'Matrix', 'class', 'cluster', 'foreign',
  'nnet', 'rlang', 'rpart', 'spatial', 'survival'



In [2]:
# Install limma from Bioconductor (skipped when the installed version is
# already the same as or newer than the available one).
BiocManager::install("limma")

'getOption("repos")' replaces Bioconductor standard repositories, see
'help("repositories", package = "BiocManager")' for details.
Replacement repositories:
    CRAN: https://cran.r-project.org

Bioconductor version 3.20 (BiocManager 1.30.25), R 4.4.2 (2024-10-31)

"package(s) not installed when version(s) same as or greater than current; use
  `force = TRUE` to re-install: 'limma'"
Old packages: 'KernSmooth', 'MASS', 'Matrix', 'class', 'cluster', 'foreign',
  'nnet', 'rlang', 'rpart', 'spatial', 'survival'



In [3]:
## Section: Load Libraries and Set Threading Options

library(data.table)
library(limma)
library(parallel)
library(Matrix)
library(RhpcBLASctl)

# Set the number of threads for BLAS operations and data.table.
# detectCores() is documented to return NA when the core count cannot be
# determined, so fall back to a single thread instead of passing NA on.
num_cores <- detectCores()
if (is.na(num_cores) || num_cores < 1L) {
  num_cores <- 1L
}
blas_set_num_threads(num_cores)
setDTthreads(threads = num_cores)


In [4]:
## Section: Load Data

# Read the SiTH table (tab-separated) into a data.table.
sith <- fread("../_INPUTS_/pcawg_sith.tsv", sep = "\t")

# Read the methylation table (tab-separated). fread() already returns a
# data.table, so no conversion is needed and both names below refer to the
# same object.
pcawg_methylation_long <- pcawg_methylation <- fread(
  "../_INPUTS_/pcawg_methylation_long.tsv",
  sep = "\t"
)


In [5]:
## Section: Preprocess and Merge Data

# The methylation table identifies samples by 'icgc_sample_id'; give the SiTH
# table the same column name (silently skipped when 'SampleId' is absent).
setnames(sith, old = "SampleId", new = "icgc_sample_id", skip_absent = TRUE)

# Inner join: keep only samples present in both tables, without re-sorting.
merged_long_data <- merge(
  x = pcawg_methylation_long,
  y = sith,
  by = "icgc_sample_id",
  all = FALSE,
  sort = FALSE
)

# Key (and thereby sort) the table on sample + probe, then keep the first row
# of every sample/probe pair. 'icgc_specimen_id' is not part of the key.
setkeyv(merged_long_data, c("icgc_sample_id", "probe_id"))
merged_long_data <- unique(merged_long_data, by = key(merged_long_data))

# Drop rows whose sample has no SiTH score.
merged_long_data <- merged_long_data[!is.na(SITH)]

# Split by sample type. data.table treats an NA condition as FALSE, so rows
# with a missing sample_type end up in neither subset.
primary_data <- merged_long_data[sample_type == "primary"]
non_primary_data <- merged_long_data[sample_type != "primary"]
# Second, independent subset with the same rows; it is modified by reference
# separately from non_primary_data in the INT_IQR analysis.
non_primary_data1 <- merged_long_data[sample_type != "primary"]


In [7]:
## Section: Differential Analysis for Primary Samples

### Define Groups and Create Sample Information

# Label every row by whether its SITH score lies in the closed window
# [0.7, 0.8] ("Within") or not ("Outside"). The column is added by reference.
primary_data[, SITH_Group := {
  in_window <- SITH >= 0.7 & SITH <= 0.8
  ifelse(in_window, "Within", "Outside")
}]

# One row per distinct (sample, SITH, group) combination.
sample_info_primary <- unique(
  primary_data[, .(icgc_sample_id, SITH, SITH_Group)]
)

### Map Sample and Probe Identifiers to Indices

# Matrix columns follow the row order of sample_info_primary; matrix rows
# follow the first appearance of each probe in primary_data.
samples_primary <- sample_info_primary$icgc_sample_id
probes <- unique(primary_data$probe_id)

# Named lookup vectors: identifier -> 1-based position.
sample_indices_primary <- seq_along(samples_primary)
names(sample_indices_primary) <- samples_primary
probe_indices <- seq_along(probes)
names(probe_indices) <- probes

# Attach the matrix coordinates to every row of primary_data (by reference).
primary_data[, sample_idx := sample_indices_primary[icgc_sample_id]]
primary_data[, probe_idx := probe_indices[probe_id]]

### Create a Sparse Methylation Matrix

# Probes x samples matrix of methylation values.
# NOTE(review): sparseMatrix() leaves every (probe, sample) pair that has no
# row in primary_data as an implicit 0, which the downstream model fit would
# see as a measured value of 0 - confirm the table is complete or that this is
# intended.
methylation_matrix_primary <- sparseMatrix(
  i = primary_data[["probe_idx"]],
  j = primary_data[["sample_idx"]],
  x = primary_data[["methylation_value"]],
  dims = c(length(probes), length(samples_primary)),
  dimnames = list(probes, samples_primary)
)

# Re-align the sample table with the matrix columns so that design-matrix rows
# correspond to matrix columns.
sample_info_primary <- sample_info_primary[
  match(
    colnames(methylation_matrix_primary),
    sample_info_primary$icgc_sample_id
  )
]

### Prepare the Design Matrix and Contrast

# No-intercept design: one indicator column per group, named after the group.
group_factor_primary <- factor(
  sample_info_primary$SITH_Group,
  levels = c("Outside", "Within")
)
design_matrix_primary <- model.matrix(~ 0 + group_factor_primary)
colnames(design_matrix_primary) <- levels(group_factor_primary)

# Contrast of interest: Within minus Outside.
contrast_primary <- makeContrasts(
  Within_vs_Outside = Within - Outside,
  levels = design_matrix_primary
)

### Define and Run Differential Analysis Function for Primary Samples

#' Fit a limma model, test one contrast, and save the ranked probe table.
#'
#' @param methylation_matrix Probes-by-samples matrix passed to lmFit().
#' @param design_matrix Design matrix passed to lmFit().
#' @param contrast_matrix Contrast matrix passed to contrasts.fit()
#'   (e.g. built with makeContrasts()).
#' @param label Suffix of the output file name; the table is written to
#'   "../_OUTPUTS_/dif_res_<label>.csv".
#' @return A data.table holding the topTable() columns for every probe, sorted
#'   by p-value, plus a trailing `probe_id` column with the row names of the
#'   topTable() result.
perform_differential_analysis <- function(methylation_matrix, design_matrix, contrast_matrix, label) {
  # Linear model per probe, then the requested contrast, then empirical Bayes
  # moderation of the variances.
  fit <- lmFit(methylation_matrix, design_matrix)
  fit2 <- contrasts.fit(fit, contrast_matrix)
  fit2 <- eBayes(fit2)

  # Full ranked table (number = Inf), FDR-adjusted, sorted by p-value.
  tT <- topTable(
    fit2,
    adjust.method = "fdr",
    sort.by = "P",
    number = Inf
  )

  # as.data.table() discards row names by default, which would leave the
  # returned table without probe identifiers. Carry them over explicitly; the
  # column is appended last so existing column positions are unchanged.
  differential_results <- as.data.table(tT)
  differential_results[, probe_id := rownames(tT)]

  # Write the data.frame (not the data.table) so the row names are saved too.
  write.table(tT, file = paste0("../_OUTPUTS_/dif_res_", label, ".csv"), sep = ",")

  differential_results
}

# Run the Within-vs-Outside comparison on the primary-sample matrix. The ranked
# table is also written to ../_OUTPUTS_/dif_res_primary_window.csv.
differential_results_primary <- perform_differential_analysis(
  methylation_matrix_primary,
  design_matrix_primary,
  contrast_primary,
  label = "primary_window"
)


In [8]:
## Section: Differential Analysis for Non-Primary Samples (SITH as a Continuous Variable)

### Prepare Sample Information and Map Indices

# One row per distinct (sample, SITH) pair among the non-primary samples.
sample_info_non_primary <- unique(
  non_primary_data[, .(icgc_sample_id, SITH)]
)

# Sample identifiers in matrix-column order, plus a name -> position lookup.
samples_non_primary <- sample_info_non_primary$icgc_sample_id
sample_indices_non_primary <- seq_along(samples_non_primary)
names(sample_indices_non_primary) <- samples_non_primary

# Attach the matrix coordinates to every row (by reference). Probe positions
# reuse `probe_indices`, which was built from the primary samples' probes.
# NOTE(review): a probe that never occurs in the primary data would get an NA
# index here - confirm both subsets cover the same probes.
non_primary_data[, sample_idx := sample_indices_non_primary[icgc_sample_id]]
non_primary_data[, probe_idx := probe_indices[probe_id]]

### Create a Sparse Methylation Matrix for Non-Primary Samples

# Probes x samples matrix of methylation values.
# NOTE(review): (probe, sample) pairs without a row in non_primary_data stay as
# implicit zeros in the sparse matrix - confirm that is intended.
methylation_matrix_non_primary <- sparseMatrix(
  i = non_primary_data[["probe_idx"]],
  j = non_primary_data[["sample_idx"]],
  x = non_primary_data[["methylation_value"]],
  dims = c(length(probes), length(samples_non_primary)),
  dimnames = list(probes, samples_non_primary)
)

# Re-align the sample table with the matrix columns so that design-matrix rows
# correspond to matrix columns.
sample_info_non_primary <- sample_info_non_primary[
  match(
    colnames(methylation_matrix_non_primary),
    sample_info_non_primary$icgc_sample_id
  )
]

### Prepare the Design Matrix (Continuous SITH)

# Intercept plus one slope column named "SITH".
design_matrix_corr <- model.matrix(~ SITH, data = sample_info_non_primary)

### Define and Run Differential Analysis Function for Continuous SITH

#' Fit a limma model with a continuous covariate and save the results split by
#' the sign of the estimated coefficient.
#'
#' Previously `label_neg` was ignored and the complete table was written under
#' `label_pos`; the positive and negative subsets now each go to their own file.
#' P-values are adjusted over all probes before the table is split.
#'
#' @param methylation_matrix Probes-by-samples matrix passed to lmFit().
#' @param design_matrix Design matrix passed to lmFit().
#' @param coef_index Name or index of the design-matrix column to test,
#'   forwarded to topTable(coef = ).
#' @param label_pos File-name suffix for probes with logFC > 0; written to
#'   "../_OUTPUTS_/dif_res_<label_pos>.csv".
#' @param label_neg File-name suffix for probes with logFC < 0; written to
#'   "../_OUTPUTS_/dif_res_<label_neg>.csv".
#' @return A list with data.tables `pos` (logFC > 0) and `neg` (logFC < 0).
#'   Each holds the topTable() columns plus a trailing `probe_id` column with
#'   the row names of the topTable() result. Probes with logFC equal to 0 or
#'   NA appear in neither element.
perform_differential_analysis_continuous <- function(methylation_matrix, design_matrix, coef_index, label_pos, label_neg) {
  # Linear model per probe followed by empirical Bayes moderation.
  fit <- lmFit(methylation_matrix, design_matrix)
  fit2 <- eBayes(fit)

  # Full ranked table for the requested coefficient, FDR-adjusted.
  tT <- topTable(
    fit2,
    coef = coef_index,
    adjust.method = "fdr",
    sort.by = "P",
    number = Inf
  )

  # which() drops NA comparisons, matching how data.table treats NA in `i`.
  pos_rows <- which(tT$logFC > 0)
  neg_rows <- which(tT$logFC < 0)

  # as.data.table() discards row names by default; keep the probe identifiers
  # in an explicit column appended after the topTable() columns.
  differential_results <- as.data.table(tT)
  differential_results[, probe_id := rownames(tT)]

  # Write the data.frame subsets so the probe row names are saved in the files.
  write.table(
    tT[pos_rows, , drop = FALSE],
    file = paste0("../_OUTPUTS_/dif_res_", label_pos, ".csv"),
    sep = ","
  )
  write.table(
    tT[neg_rows, , drop = FALSE],
    file = paste0("../_OUTPUTS_/dif_res_", label_neg, ".csv"),
    sep = ","
  )

  list(
    pos = differential_results[pos_rows],
    neg = differential_results[neg_rows]
  )
}

# Test the "SITH" coefficient of the continuous design on the non-primary
# sample matrix.
results_corr <- perform_differential_analysis_continuous(
  methylation_matrix_non_primary,
  design_matrix_corr,
  coef_index = "SITH",
  label_pos = "corr_SITH",
  label_neg = "inverse_corr_SITH"
)


"Zero sample variances detected, have been offset away from zero"


In [9]:
## Section: Differential Analysis for Non-Primary Samples (INT_IQR as a Continuous Variable)

### Prepare Sample Information and Map Indices (Using INT_IQR)

# Drop rows whose sample has no INT_IQR value, mirroring the SITH filter applied
# during preprocessing. With the default na.action, model.matrix() would
# otherwise omit those samples from the design while they remained as matrix
# columns, and the fit would fail on the dimension mismatch.
non_primary_data1 <- non_primary_data1[!is.na(INT_IQR)]

# Create a sample information table as data.table. It is built from
# non_primary_data1, the same table the matrix below is built from.
sample_info_non_primary1 <- unique(non_primary_data1[, .(
  icgc_sample_id,
  INT_IQR
)])

# Create mappings for samples
samples_non_primary1 <- sample_info_non_primary1$icgc_sample_id

sample_indices_non_primary1 <- setNames(seq_along(samples_non_primary1), samples_non_primary1)

# Map sample IDs and probe IDs to indices in non_primary_data1. Probe positions
# reuse `probe_indices`, which was built from the primary samples' probes.
non_primary_data1[, `:=`(
  sample_idx = sample_indices_non_primary1[icgc_sample_id],
  probe_idx = probe_indices[probe_id]
)]

### Create a Sparse Methylation Matrix for the INT_IQR Analysis

# Dimensions and column names come from samples_non_primary1, the vector the
# column indices above were derived from (the SITH analysis' sample vector was
# used here before, which tied this cell to the previous one).
# NOTE(review): (probe, sample) pairs without a row in non_primary_data1 stay as
# implicit zeros in the sparse matrix - confirm that is intended.
methylation_matrix_non_primary1 <- sparseMatrix(
  i = non_primary_data1$probe_idx,
  j = non_primary_data1$sample_idx,
  x = non_primary_data1$methylation_value,
  dims = c(length(probes), length(samples_non_primary1)),
  dimnames = list(probes, samples_non_primary1)
)

# Ensure sample_info_non_primary1 is in the same order as columns in methylation_matrix_non_primary1
sample_info_non_primary1 <- sample_info_non_primary1[match(colnames(methylation_matrix_non_primary1), sample_info_non_primary1$icgc_sample_id)]

### Prepare the Design Matrix (Continuous INT_IQR)

# Intercept plus one slope column named "INT_IQR".
design_matrix_corr1 <- model.matrix(~ INT_IQR, data = sample_info_non_primary1)

### Define and Run Differential Analysis Function for Continuous INT_IQR

#' Fit a limma model with a continuous covariate and save the results split by
#' the sign of the estimated coefficient (used for the INT_IQR analysis).
#'
#' Previously `label_neg` was ignored and the complete table was written under
#' `label_pos`; the positive and negative subsets now each go to their own file.
#' P-values are adjusted over all probes before the table is split.
#'
#' @param methylation_matrix Probes-by-samples matrix passed to lmFit().
#' @param design_matrix Design matrix passed to lmFit().
#' @param coef_index Name or index of the design-matrix column to test,
#'   forwarded to topTable(coef = ).
#' @param label_pos File-name suffix for probes with logFC > 0; written to
#'   "../_OUTPUTS_/dif_res_<label_pos>.csv".
#' @param label_neg File-name suffix for probes with logFC < 0; written to
#'   "../_OUTPUTS_/dif_res_<label_neg>.csv".
#' @return A list with data.tables `pos` (logFC > 0) and `neg` (logFC < 0).
#'   Each holds the topTable() columns plus a trailing `probe_id` column with
#'   the row names of the topTable() result. Probes with logFC equal to 0 or
#'   NA appear in neither element.
perform_differential_analysis_continuous1 <- function(methylation_matrix, design_matrix, coef_index, label_pos, label_neg) {
  # Linear model per probe followed by empirical Bayes moderation.
  fit <- lmFit(methylation_matrix, design_matrix)
  fit2 <- eBayes(fit)

  # Full ranked table for the requested coefficient, FDR-adjusted.
  tT <- topTable(
    fit2,
    coef = coef_index,
    adjust.method = "fdr",
    sort.by = "P",
    number = Inf
  )

  # which() drops NA comparisons, matching how data.table treats NA in `i`.
  pos_rows <- which(tT$logFC > 0)
  neg_rows <- which(tT$logFC < 0)

  # as.data.table() discards row names by default; keep the probe identifiers
  # in an explicit column appended after the topTable() columns.
  differential_results <- as.data.table(tT)
  differential_results[, probe_id := rownames(tT)]

  # Write the data.frame subsets so the probe row names are saved in the files.
  write.table(
    tT[pos_rows, , drop = FALSE],
    file = paste0("../_OUTPUTS_/dif_res_", label_pos, ".csv"),
    sep = ","
  )
  write.table(
    tT[neg_rows, , drop = FALSE],
    file = paste0("../_OUTPUTS_/dif_res_", label_neg, ".csv"),
    sep = ","
  )

  list(
    pos = differential_results[pos_rows],
    neg = differential_results[neg_rows]
  )
}

# Test the "INT_IQR" coefficient of the continuous design on the non-primary
# sample matrix built for the INT_IQR analysis.
results_corr1 <- perform_differential_analysis_continuous1(
  methylation_matrix_non_primary1,
  design_matrix_corr1,
  coef_index = "INT_IQR",
  label_pos = "corr_INT_IQR",
  label_neg = "inverse_corr_INT_IQR"
)


"Zero sample variances detected, have been offset away from zero"
