In [None]:
library(data.table)
library(dplyr)
library(readxl)
library(ggplot2)
library(purrr)
library(stringr)
library(vsn)
library(hexbin)
library(FactoMineR)
library(factoextra)
library(variancePartition)
library(tidyr)
library(pheatmap)
library(limma)

In [None]:
# Load the sample information (phenotype) table
samples.info <- read.table(
  file = "/grehawi/splice-reg-prj/new-data/Diff-Analysis/combined_pheno_withCT.csv"
)

In [None]:
# Load the logCPM, batch-corrected expression matrices:
# one at gene level and one at transcript level
gene.count.corrected.final <- read.table(
  file = "/grehawi/splice-reg-prj/new-data/Diff-Analysis/gene.logCPM.corrected.final.matrix"
)
trx.count.corrected.final <- read.table(
  file = "/grehawi/splice-reg-prj/new-data/Diff-Analysis/trx.logCPM.corrected.final.matrix"
)

In [None]:
# Preview the first rows of the gene-level matrix
head(gene.count.corrected.final, n = 6L)

In [None]:
# Preview the first rows of the transcript-level matrix
head(trx.count.corrected.final, n = 6L)

In [None]:
# Dimensions of the gene-level matrix
dim(gene.count.corrected.final)

In [None]:
# Dimensions of the transcript-level matrix
dim(trx.count.corrected.final)

In [None]:
# Distribution of all gene-level values after batch correction
# (plot size is set for the notebook renderer)
options(repr.plot.width = 10, repr.plot.height = 8)
hist(x = as.matrix(gene.count.corrected.final))

In [None]:
# Distribution of all transcript-level values after batch correction
hist(x = as.matrix(trx.count.corrected.final))

In [None]:
# Cases are the samples with ltany_di equal to 1
samples.cases <- filter(samples.info, ltany_di == 1)

In [None]:
# Controls are the samples with ltany_di equal to 0
samples.controls <- filter(samples.info, ltany_di == 0)

In [None]:
# Size of the case sample table
dim(samples.cases)

In [None]:
# Size of the control sample table
dim(samples.controls)

## 1. Calculate isoform ratios

In [None]:
# Transcript-to-gene lookup table (columns transcript_id and gene_id are used below)
trx.genes <- read.table(
  file = "/grehawi/splice-reg-prj/data/transcriptsID-geneID.txt"
)
head(trx.genes, n = 6L)
# Number of distinct transcripts and distinct genes in the lookup table
n_distinct(trx.genes$transcript_id)
n_distinct(trx.genes$gene_id)

In [None]:
# Isoform ratio = transcript-level value / value of the transcript's parent gene,
# computed per sample for every transcript at once.
#   * Transcripts whose parent gene is absent from the gene matrix, or that have
#     no entry in the transcript-to-gene table, are dropped from the result.
#   * Undefined ratios (NA/NaN, e.g. 0/0) are imputed with the mean ratio of that
#     isoform across individuals.

# Both matrices must describe the same samples. Align the gene columns to the
# transcript columns by name so a ratio never pairs two different samples.
stopifnot(setequal(colnames(gene.count.corrected.final),
                   colnames(trx.count.corrected.final)))
trx.mat <- as.matrix(trx.count.corrected.final)
gene.mat <- as.matrix(gene.count.corrected.final)[, colnames(trx.mat), drop = FALSE]

# Parent gene of each transcript row (NA when the transcript is not in the table)
trx.parent.gene <- as.character(
  trx.genes$gene_id[match(rownames(trx.mat), as.character(trx.genes$transcript_id))]
)

# Row of the gene matrix holding each transcript's parent gene (NA when absent)
gene.row <- match(trx.parent.gene, rownames(gene.mat))
has.gene <- !is.na(gene.row)

# Element-wise ratio; row and column names come from the transcript matrix
ir.mat <- trx.mat[has.gene, , drop = FALSE] /
  gene.mat[gene.row[has.gene], , drop = FALSE]

# Replace undefined ratios with the per-isoform mean across individuals
ir.row.means <- rowMeans(ir.mat, na.rm = TRUE)
ir.undefined <- is.na(ir.mat)
ir.mat[ir.undefined] <- ir.row.means[row(ir.mat)[ir.undefined]]

isophorm.ratios <- as.data.frame(ir.mat)

In [None]:
# Size of the isoform-ratio table
dim(isophorm.ratios)

In [None]:
# Persist the isoform ratios to disk
write.table(
  x = isophorm.ratios,
  file = "/grehawi/splice-reg-prj/new-data/PreNet_processing/isoform_ratios.matrix"
)

In [None]:
# Reload the stored isoform ratios
isophorm.ratios <- read.table(
  file = "/grehawi/splice-reg-prj/new-data/PreNet_processing/isoform_ratios.matrix"
)

## 2. PCA analysis to remove outliers as a final processing step

In [None]:
# Centre and scale every column of the isoform-ratio table
IR.standarized <- isophorm.ratios %>%
  mutate(across(everything(), function(column.values) as.vector(scale(column.values))))

In [None]:
# Centre and scale every column of the gene-level table
gene.count.standarized <- gene.count.corrected.final %>%
  mutate(across(everything(), function(column.values) as.vector(scale(column.values))))

In [None]:
# Gene-level columns belonging to case samples
genes.count.cases <- gene.count.standarized[
  , colnames(gene.count.standarized) %in% samples.cases$combined_id
]

In [None]:
# Gene-level columns belonging to control samples
genes.count.controls <- gene.count.standarized[
  , colnames(gene.count.standarized) %in% samples.controls$combined_id
]

In [None]:
dim(genes.count.cases)

In [None]:
# Isoform-ratio columns belonging to case samples
IR.cases <- IR.standarized[
  , colnames(IR.standarized) %in% samples.cases$combined_id
]

In [None]:
# Isoform-ratio columns belonging to control samples
IR.controls <- IR.standarized[
  , colnames(IR.standarized) %in% samples.controls$combined_id
]

In [None]:
# Distribution of the standardised gene-level values
hist(x = as.matrix(gene.count.standarized))

In [None]:
# Distribution of the raw isoform ratios
hist(x = as.matrix(isophorm.ratios))

In [None]:
# Distribution of the standardised isoform ratios
hist(x = as.matrix(IR.standarized))

### 2.1 PCA on genes ( control samples )

In [None]:
# Controls: PCA of the standardised gene-level values, one point per sample
options(repr.plot.width = 12, repr.plot.height = 8)

pca_controls <- PCA(
  X = t(genes.count.controls),
  ncp = 10,
  graph = FALSE)

# Colour samples by plate. The plate of each plotted sample is looked up by
# sample ID, because PCA rows follow the column order of genes.count.controls,
# which is not guaranteed to equal the row order of samples.controls.
fviz_pca_ind(pca_controls,
             geom = "text", labelsize = 2,
             habillage = as.factor(
               samples.controls$plates[match(colnames(genes.count.controls),
                                             samples.controls$combined_id)]))
#outliers here: MUC23416_S234, MUC23424_S242, MUC23436_S254, MUC23418_S236, MUC23320_S138, MUC23344_S162, 
# X23L002691_S43, MUC23321_S139

In [None]:
head(genes.count.controls, n = 6L)

In [None]:
# How many standard deviations the removed control samples lie from the mean
controls_genes_outliers <- c(
  'MUC23416_S234', 'MUC23424_S242', 'MUC23436_S254', 'MUC23418_S236',
  'MUC23320_S138', 'MUC23344_S162', 'X23L002691_S43', 'MUC23321_S139'
)

# PCA without additional scaling; keep the per-sample scores as a data frame
pca_controls_genes <- prcomp(t(genes.count.controls), scale. = FALSE)
pc_scores <- pca_controls_genes$x
pc_scores_df <- as.data.frame(pc_scores)

# Keep only the flagged samples (selected by sample ID, i.e. by row name)
selected_samples <- pc_scores_df[rownames(pc_scores_df) %in% controls_genes_outliers, ]

# Mean and standard deviation of PC1 and PC2 over all control samples
pc1_mean <- mean(pc_scores_df$PC1)
pc2_mean <- mean(pc_scores_df$PC2)
pc1_sd <- sd(pc_scores_df$PC1)
pc2_sd <- sd(pc_scores_df$PC2)

# z-scores: distance from the mean in units of standard deviation
selected_samples$PC1_zscore <- (selected_samples$PC1 - pc1_mean) / pc1_sd
selected_samples$PC2_zscore <- (selected_samples$PC2 - pc2_mean) / pc2_sd

selected_samples
# Outlier samples are those with z-score off at least 2 from the mean on either PC1 or PC2

### 2.2 PCA on genes ( cases samples )

In [None]:
# Cases: PCA of the standardised gene-level values, one point per sample

options(repr.plot.width = 12, repr.plot.height = 8)

pca_cases <- PCA(
  X = t(genes.count.cases),
  ncp = 10,
  graph = FALSE)

# Colour samples by plate. The plate of each plotted sample is looked up by
# sample ID, because PCA rows follow the column order of genes.count.cases,
# which is not guaranteed to equal the row order of samples.cases.
fviz_pca_ind(pca_cases,
             geom = "text", labelsize = 2,
             habillage = as.factor(
               samples.cases$plates[match(colnames(genes.count.cases),
                                          samples.cases$combined_id)]))
#outliers here: MUC23251_S69, MUC23279_S97, MUC23326_S144, MUC23338_S156, MUC23213_S31, MUC23362_S180

In [None]:
# How many standard deviations the removed case samples lie from the mean
cases_genes_outliers <- c(
  'MUC23251_S69', 'MUC23279_S97', 'MUC23326_S144',
  'MUC23338_S156', 'MUC23213_S31', 'MUC23362_S180'
)

# PCA without additional scaling; keep the per-sample scores as a data frame
pca_cases_genes <- prcomp(t(genes.count.cases), scale. = FALSE)
pc_scores <- pca_cases_genes$x
pc_scores_df <- as.data.frame(pc_scores)

# Keep only the flagged samples (selected by sample ID, i.e. by row name)
selected_samples <- pc_scores_df[rownames(pc_scores_df) %in% cases_genes_outliers, ]

# Mean and standard deviation of PC1 and PC2 over all case samples
pc1_mean <- mean(pc_scores_df$PC1)
pc2_mean <- mean(pc_scores_df$PC2)
pc1_sd <- sd(pc_scores_df$PC1)
pc2_sd <- sd(pc_scores_df$PC2)

# z-scores: distance from the mean in units of standard deviation
selected_samples$PC1_zscore <- (selected_samples$PC1 - pc1_mean) / pc1_sd
selected_samples$PC2_zscore <- (selected_samples$PC2 - pc2_mean) / pc2_sd

selected_samples
# Outlier samples are those with z-score off at least 2 from the mean on either PC1 or PC2

### 2.3 PCA on genes (cases and controls samples)

In [None]:
# Cases and controls together: PCA of the standardised gene-level values

options(repr.plot.width = 12, repr.plot.height = 8)

pca_gene_all <- PCA(
  X = t(gene.count.standarized),
  ncp = 10,
  graph = FALSE)

# Colour samples by plate. The plate of each plotted sample is looked up by
# sample ID, because PCA rows follow the column order of gene.count.standarized,
# which is not guaranteed to equal the row order of samples.info.
fviz_pca_ind(pca_gene_all,
             geom = "text", labelsize = 2,
             habillage = as.factor(
               samples.info$plates[match(colnames(gene.count.standarized),
                                         samples.info$combined_id)]))

### 2.4 PCA on genes (cases and controls samples) after outliers filtering

In [None]:
# Samples flagged as outliers in the gene-level PCAs (controls followed by cases)
gene.level.outliers <- c('MUC23416_S234', 'MUC23424_S242', 'MUC23436_S254', 'MUC23418_S236', 'MUC23320_S138',
                         'MUC23344_S162', 'X23L002691_S43', 'MUC23321_S139', 'MUC23251_S69', 'MUC23279_S97',
                         'MUC23326_S144', 'MUC23338_S156', 'MUC23213_S31', 'MUC23362_S180')

In [None]:
# Drop the gene-level outliers from both the expression table and the sample table
gene.count.tmp <- gene.count.standarized[, !colnames(gene.count.standarized) %in% gene.level.outliers]
samples.info.tmp <- samples.info[!samples.info$combined_id %in% gene.level.outliers, ]
options(repr.plot.width = 12, repr.plot.height = 8)

pca_gene_all <- PCA(
  X = t(gene.count.tmp),
  ncp = 10,
  graph = FALSE)

# Colour samples by plate. The plate of each plotted sample is looked up by
# sample ID, because PCA rows follow the column order of gene.count.tmp,
# which is not guaranteed to equal the row order of samples.info.tmp.
fviz_pca_ind(pca_gene_all,
             geom = "text", labelsize = 2,
             habillage = as.factor(
               samples.info.tmp$plates[match(colnames(gene.count.tmp),
                                             samples.info.tmp$combined_id)]))

### 2.5 PCA on isoforms (case samples)

In [None]:
# Cases: PCA of the standardised isoform ratios (IR), one point per sample

options(repr.plot.width = 12, repr.plot.height = 8)

pca_IR_cases <- PCA(
  X = t(IR.cases),
  ncp = 10,
  graph = FALSE)

# Colour samples by plate. The plate of each plotted sample is looked up by
# sample ID, because PCA rows follow the column order of IR.cases,
# which is not guaranteed to equal the row order of samples.cases.
fviz_pca_ind(pca_IR_cases,
             geom = "text", labelsize = 2,
             habillage = as.factor(
               samples.cases$plates[match(colnames(IR.cases),
                                          samples.cases$combined_id)]))
#outliers here: MUC23411_S229, MUC23412_S230, MUC23245_S63, MUC23425_S243, MUC23423_S241,
# MUC23409_S227, MUC23492_S310, MUC23325_S143, MUC23356_S174, MUC23460_S278, MUC23456_S274, 
# MUC23410_S228, MUC23183_S1

### 2.6 PCA on isoforms (control samples)

In [None]:
# Controls: PCA of the standardised isoform ratios (IR), one point per sample
options(repr.plot.width = 12, repr.plot.height = 8)

pca_IR_controls <- PCA(
  X = t(IR.controls),
  ncp = 10,
  graph = FALSE)

# Colour samples by plate. The plate of each plotted sample is looked up by
# sample ID, because PCA rows follow the column order of IR.controls,
# which is not guaranteed to equal the row order of samples.controls.
fviz_pca_ind(pca_IR_controls,
             geom = "text", labelsize = 2,
             habillage = as.factor(
               samples.controls$plates[match(colnames(IR.controls),
                                             samples.controls$combined_id)]))
#outliers here: MUC23257_S75, X23L002725_S77, MUC23333_S151, MUC23416_S234, MUC23341_S159

### 2.7 PCA on isoforms (cases and controls samples)

In [None]:
# Cases and controls together: PCA of the standardised isoform ratios (IR)
options(repr.plot.width = 12, repr.plot.height = 8)

pca_IR_all <- PCA(
  X = t(IR.standarized),
  ncp = 10,
  graph = FALSE)

# Colour samples by plate. The plate of each plotted sample is looked up by
# sample ID, because PCA rows follow the column order of IR.standarized,
# which is not guaranteed to equal the row order of samples.info.
fviz_pca_ind(pca_IR_all,
             geom = "text", labelsize = 2,
             habillage = as.factor(
               samples.info$plates[match(colnames(IR.standarized),
                                         samples.info$combined_id)]))

### 2.8 PCA on isoform (cases and controls) after outliers filtering

In [None]:
# Samples flagged as outliers in the isoform-ratio PCAs (cases followed by controls)
trx.level.outliers <- c('MUC23411_S229', 'MUC23412_S230', 'MUC23245_S63', 'MUC23425_S243', 'MUC23423_S241',
                        'MUC23409_S227', 'MUC23492_S310', 'MUC23325_S143', 'MUC23356_S174', 'MUC23460_S278',
                        'MUC23456_S274', 'MUC23410_S228', 'MUC23183_S1', 'MUC23257_S75', 'X23L002725_S77',
                        'MUC23333_S151', 'MUC23416_S234', 'MUC23341_S159')

In [None]:
# Drop the isoform-level outliers from both the IR table and the sample table
IR.standarized.tmp <- IR.standarized[, !colnames(IR.standarized) %in% trx.level.outliers]
samples.info.tmp <- samples.info[!samples.info$combined_id %in% trx.level.outliers, ]
options(repr.plot.width = 12, repr.plot.height = 8)

pca_trx_all <- PCA(
  X = t(IR.standarized.tmp),
  ncp = 10,
  graph = FALSE)

# Colour samples by plate. The plate of each plotted sample is looked up by
# sample ID, because PCA rows follow the column order of IR.standarized.tmp,
# which is not guaranteed to equal the row order of samples.info.tmp.
fviz_pca_ind(pca_trx_all,
             geom = "text", labelsize = 2,
             habillage = as.factor(
               samples.info.tmp$plates[match(colnames(IR.standarized.tmp),
                                             samples.info.tmp$combined_id)]))

### 3. Remove outliers from gene, transcript, and samples tables

In [None]:
# Pool the isoform-level and gene-level outlier IDs (a sample may appear in both)
outliers <- c(trx.level.outliers, gene.level.outliers)

In [None]:
# Number of distinct outlier samples
n_distinct(outliers)

In [None]:
# Remove every outlier sample from the sample table and from both data tables
sample_data_final <- samples.info[
  !samples.info$combined_id %in% outliers,
]
IR.standarized.final <- IR.standarized[
  , !colnames(IR.standarized) %in% outliers
]
gene.count.standarized.final <- gene.count.standarized[
  , !colnames(gene.count.standarized) %in% outliers
]

In [None]:
# Sizes of the filtered isoform-ratio and gene-level tables
dim(IR.standarized.final)
dim(gene.count.standarized.final)

In [None]:
# number of cases
# Cases are the samples with ltany_di equal to 1 (the same coding used to build
# samples.cases). %in% is used so rows with a missing ltany_di are not counted.
dim(sample_data_final[sample_data_final$ltany_di %in% 1, ])

In [None]:
# number of controls
# Controls are the samples with ltany_di equal to 0 (the same coding used to build
# samples.controls). %in% is used so rows with a missing ltany_di are not counted.
dim(sample_data_final[sample_data_final$ltany_di %in% 0, ])

In [None]:
# Save the outlier-filtered sample table
write.table(
  x = sample_data_final,
  file = "/grehawi/splice-reg-prj/new-data/PreNet_processing/combined_pheno_withCT_Outfiltered.csv"
)
# Save the outlier-filtered, standardised gene-level table
write.table(
  x = gene.count.standarized.final,
  file = "/grehawi/splice-reg-prj/new-data/PreNet_processing/gene_count_final.matrix"
)
# Save the outlier-filtered, standardised isoform-ratio table
write.table(
  x = IR.standarized.final,
  file = "/grehawi/splice-reg-prj/new-data/PreNet_processing/IR_final.matrix"
)