In [None]:
library(ggplot2)
library(rtracklayer)
library(dplyr)
library(tidyr)
library(data.table)
library(matrixStats)

library(readxl)
library(purrr)
library(stringr)
library(sva)
library(DESeq2)
library(vsn)
library(hexbin)
library(FactoMineR)
library(factoextra)
library(variancePartition)
library(pheatmap)
library(tximport)

## OPTIMA & BeCOME Samples Data file preprocessing

In [None]:
# Plate-location sheet of the randomised samples (text file with header row).
samples_information <- read.table(
  file = "/grehawi/splice-reg-prj/data/Jonas-batchCorrection-data/01_become_optima_randomization_pheno_052020_emptyPos_jonas.txt",
  header = TRUE
)
# The original Excel workbook is read as well because it records the batches
# in which the PBMCs were handled.
sample_information_excel <- read_excel(
  path = "/grehawi/splice-reg-prj/data/Jonas-batchCorrection-data/Ubersicht_RNA_AusPBMC_Mai2020.xlsx",
  sheet = "Randomisierung"
)
# First rows and dimensions of the plate-location table.
head(samples_information)
dim(samples_information)

The extraction batch is the same as the plate on which the samples were loaded, and I already have this information in the plates variable. Therefore, I only use the preparation batch info, i.e. the information about which samples were handled together when they were put onto the plate.

In [None]:
# Reduce the Excel sheet to proband code and PBMC preparation batch (renamed
# to ID / preparation_pbmc_batch). Missing batch entries are filled downwards
# from the previous non-missing value, then rows without an ID are removed.
sample_information_batches <- sample_information_excel %>%
  dplyr::select(
    ID = Proband_Code,
    preparation_pbmc_batch = `Vorbereitung PBMC Batch`
  ) %>%
  tidyr::fill(preparation_pbmc_batch) %>%
  filter(!is.na(ID))

# Attach the preparation batch to every sample of the plate-location table.
samples_information <- left_join(
  samples_information,
  sample_information_batches,
  by = "ID"
)

In [None]:
# Inspect the joined table: first rows, dimensions, and the number of distinct
# IDs (to compare against the row count).
head(samples_information)
dim(samples_information)
length(unique(samples_information$ID))

### Excluding samples

- Some samples have too high CRP (>20mg/L), they get excluded.

- Also, as detected in the file test_differential_expression, some samples got mixed up and the sex does not fit. I exclude these samples as well. As these samples (except one) are all on one plate in one column, I also exclude the two other samples in this column as I’m not sure if they have been mixed up as well. I can’t know this, as they are both female.

In [None]:
# IDs of samples to exclude due to high CRP
ids_exclude_crp <- readRDS("/grehawi/splice-reg-prj/data/Jonas-batchCorrection-data/01_QC_hsCRP_high_CRP_excluded_ids.rds")

# IDs of samples to exclude due to mix-up (sex mismatch)
ids_exclude_mixup <- readRDS("/grehawi/splice-reg-prj/data/Jonas-batchCorrection-data/sex_mismatch_id.rds")
# The complete plate column in which the mismatch happened is removed, so the
# two other samples of that column ("PTP0328" and "FOR0064") are appended.
# FOR0055 is contained 3 times because it was sequenced several times with
# different RNA amounts, hence the de-duplication on ID.
# The table is rebuilt row-wise instead of overwriting the ID column in place:
# the in-place assignment only worked while the number of duplicates removed
# happened to equal the number of IDs added, and it left any other column
# attached to the wrong ID.
ids_exclude_mixup <- ids_exclude_mixup %>%
  dplyr::distinct(ID, .keep_all = TRUE) %>%
  dplyr::bind_rows(
    data.frame(ID = c("PTP0328", "FOR0064"), stringsAsFactors = FALSE)
  )

# Lookup table used below to attach the matched sample identifiers by ID.
id_match <- readRDS("/grehawi/splice-reg-prj/data/Jonas-batchCorrection-data/matched_ids.Rds")

ids_exclude_crp <- ids_exclude_crp %>%
  left_join(id_match, by = "ID")

ids_exclude_mixup <- ids_exclude_mixup %>%
  left_join(id_match, by = "ID")
# Combined list of all excluded samples (CRP and mix-up).
ids_exclude <- bind_rows(ids_exclude_crp, ids_exclude_mixup)

In [None]:
# Show the id_match row(s) of a single ID.
# NOTE(review): the other IDs hard-coded in this notebook start with "FOR" or
# "PTP"; confirm that 'FOE0054' is not a typo.
id_match[id_match$ID == 'FOE0054', ]

In [None]:
# Add combined_id and library_rna_amount from id_match to every sample.
samples_information <- left_join(
  samples_information,
  select(id_match, ID, combined_id, library_rna_amount),
  by = "ID"
)
head(samples_information)
dim(samples_information)

In [None]:
# Remove every sample whose combined_id is on the CRP or the mix-up
# exclusion list.
samples_information_filtered <- samples_information[
  !(samples_information$combined_id %in%
      c(ids_exclude_crp$combined_id, ids_exclude_mixup$combined_id)),
]
dim(samples_information_filtered)

In [None]:
# Number of distinct IDs left after the exclusions.
length(unique(samples_information_filtered$ID))

### ADD GC content and RIN information to the samples file

In [None]:
# GC content table (joined further below on its sample_id column).
gc_content <- readRDS("/grehawi/splice-reg-prj/data/Jonas-batchCorrection-data/gc_content.Rds")
# RIN table: keep ID and RIN only, drop rows where either is missing, and keep
# the first row of every ID.
rin <- readRDS("/grehawi/splice-reg-prj/data/Jonas-batchCorrection-data/rin.rds") %>%
  select(ID, RIN) %>%
  filter(!is.na(ID), !is.na(RIN)) %>%
  distinct(ID, .keep_all = TRUE)

In [None]:
# Attach the GC content (keyed by combined_id in the sample table and by
# sample_id in the GC table), then check dimensions and distinct IDs.
samples_information_filtered <- left_join(
  samples_information_filtered,
  gc_content,
  by = c("combined_id" = "sample_id")
)
dim(samples_information_filtered)
length(unique(samples_information_filtered$ID))

# Attach the RIN value of every ID and repeat the checks.
samples_information_filtered <- left_join(
  samples_information_filtered,
  rin,
  by = "ID"
)
head(samples_information_filtered)
dim(samples_information_filtered)
length(unique(samples_information_filtered$ID))

In [None]:
# Number of distinct combined_id values after adding GC content and RIN.
length(unique(samples_information_filtered$combined_id))

In [None]:
# Number of distinct IDs after adding GC content and RIN.
length(unique(samples_information_filtered$ID))

### Case/Controls Split

In [None]:
# Diagnosis split provided by Janine (strict controls, cases and life-time
# diagnoses). Rows without a value in ltany_di are dropped right away.
janine.diag.split <- read.table("become_optima_lt_diag.txt", header = TRUE) %>%
  drop_na(ltany_di)
head(janine.diag.split)
dim(janine.diag.split)

In [None]:
# Number of controls (rows with ltany_di == 0); the first value of dim() is
# the row count.
dim(janine.diag.split[janine.diag.split$ltany_di == 0, ])

In [None]:
# Number of cases (rows with ltany_di == 2); the first value of dim() is the
# row count.
dim(janine.diag.split[janine.diag.split$ltany_di == 2, ])

In [None]:
# Number of life-time diagnoses (rows with ltany_di == 1); the first value of
# dim() is the row count.
dim(janine.diag.split[janine.diag.split$ltany_di == 1, ])

In [None]:
# Collapse the coding to two groups: 0 == controls and 1 == everything else
# (only for differential analysis and batch correction). Code 2 is recoded
# to 1, so it is merged with the rows already coded 1.
janine.diag.split$ltany_di[janine.diag.split$ltany_di == 2] <- 1

In [None]:
# Build the final pheno table: keep only ID and ltany_di from the diagnosis
# table and attach the sample information to it by ID.
janine.diag.split.sub <- janine.diag.split[
  , colnames(janine.diag.split) %in% c("ID", "ltany_di")
]
dim(janine.diag.split.sub)
samples_information_filtered <- left_join(
  janine.diag.split.sub,
  samples_information_filtered,
  by = "ID"
)
# Keep only the rows that have no missing value in any column.
samples_information_filtered <- samples_information_filtered[
  complete.cases(samples_information_filtered),
]
dim(samples_information_filtered)

### Cutadapt pairs info

In [None]:
# Tab-separated MultiQC export with the cutadapt read pairs passing the QC.
cutadapt_pairs <- read.table(
  file = "/grehawi/splice-reg-prj/data/Jonas-batchCorrection-data/mqc_cutadapt_filtered_reads_plot_1.txt",
  header = TRUE,
  sep = "\t"
)
head(cutadapt_pairs)

In [None]:
# Rows and columns of the cutadapt table.
dim(cutadapt_pairs)

In [None]:
# Derive the sample name (everything before "_L00") and sum the pairs passing
# the filters over all rows that share that name.
cutadapt_pairs_edited <- cutadapt_pairs %>%
  mutate(sample_short = str_extract(Sample, ".+(?=_L00)")) %>%
  group_by(sample_short) %>%
  summarise(total_pairs = sum(Pairs.passing.filters))
# Attach the per-sample total to the pheno table via combined_id.
samples_information_filtered <- left_join(
  samples_information_filtered,
  cutadapt_pairs_edited,
  by = c("combined_id" = "sample_short")
)
head(samples_information_filtered)
dim(samples_information_filtered)

### BMI info

In [None]:
# BMI table (file name indicates observed plus imputed values); show the
# first rows and the number of distinct IDs it covers.
bmi <- readRDS("/grehawi/splice-reg-prj/data/Jonas-batchCorrection-data/bmi_observed_imputed_new.rds")
head(bmi)
length(unique(bmi$ID))

In [None]:
# Add the BMI columns by ID; this is the final pheno table of the old data.
samples_information_filtered_final <- left_join(
  samples_information_filtered,
  bmi,
  by = "ID"
)
head(samples_information_filtered_final)
dim(samples_information_filtered_final)

In [None]:
# Save the final pheno table. write.table() is called with its defaults, so
# the file is space-delimited with row names despite the .csv extension.
write.table(samples_information_filtered_final, "/grehawi/splice-reg-prj/data/samples_pheno.csv")

In [None]:
# Number of distinct IDs in the final pheno table.
length(unique(samples_information_filtered_final$ID))

In [None]:
# Number of controls (ltany_di == 0) in the final pheno table.
dim(samples_information_filtered_final[samples_information_filtered_final$ltany_di == 0,])

In [None]:
# Number of cases including life-time diagnoses (ltany_di == 1) in the final
# pheno table.
dim(samples_information_filtered_final[samples_information_filtered_final$ltany_di == 1,])

## New Data (IST samples): samples file processing

In [None]:
# Pheno table of the new (IST) samples; the file is space-delimited with a
# header row.
samples_information_newData = read.table("/grehawi/splice-reg-prj/new-data/RNA-seq/newData-pheno.csv", sep=" ",header = TRUE)
head(samples_information_newData)

In [None]:
# Distinct values of the Status column (compared against 'control' and 'case'
# further below).
unique(samples_information_newData$Status)

### Add Info to IST samples file :GC content and cutadapt (total_pairs)

In [None]:
# GC content of the new samples.
gc_content_newData <- readRDS("/grehawi/splice-reg-prj/new-data/RNA-seq/gc_content.Rds")
head(gc_content_newData)
# Attach it to the new pheno table via Sample_Name, then check the dimensions
# and the number of distinct Sample_ID values.
samples_information_newData <- left_join(
  samples_information_newData,
  gc_content_newData,
  by = c("Sample_Name")
)
dim(samples_information_newData)
length(unique(samples_information_newData$Sample_ID))

In [None]:
# Tab-separated MultiQC export with the cutadapt read pairs passing the QC
# for the new data; show the first rows and the dimensions.
cutadapt_pairs_newData <- read.table("/spliceReg-prj-Ghalia/MultiQC-newData/multiqc_data/mqc_cutadapt_filtered_reads_plot_1.txt",
                             header = TRUE, sep = "\t")
head(cutadapt_pairs_newData)
dim(cutadapt_pairs_newData)

In [None]:
# Derive two sample identifiers from the MultiQC sample label: the part before
# "_L00" (Sample_Name_Full) and the part before "_S" (Sample_Name), then sum
# the pairs passing the filters per sample.
# NOTE(review): ".+" is greedy, so Sample_Name ends at the LAST "_S" in the
# label; confirm that the labels contain "_S" only once.
# The result is explicitly ungrouped: summarise() over two grouping variables
# otherwise returns a table that is still grouped by Sample_Name.
# Note that this overwrites the cutadapt_pairs_edited object of the old data.
cutadapt_pairs_edited <- cutadapt_pairs_newData %>%
  mutate(Sample_Name_Full = str_extract(Sample, ".+(?=_L00)")) %>%
  mutate(Sample_Name = str_extract(Sample, ".+(?=_S)")) %>%
  group_by(Sample_Name, Sample_Name_Full) %>%
  summarise(total_pairs = sum(Pairs.passing.filters)) %>%
  ungroup()
head(cutadapt_pairs_edited)
# Attach Sample_Name_Full and total_pairs to the new pheno table.
samples_information_newData <- samples_information_newData %>%
  left_join(cutadapt_pairs_edited, by = c("Sample_Name"))
head(samples_information_newData)
dim(samples_information_newData)

### Add BMI information to the samples sheet for new data

In [None]:
# In the column with extra info the word 'cave' marks values that are not
# really accurate; there are 2 such probands in total: 'IST0007' and 'IST0014'.
# Proband IST0041 has no weight; it was removed when preparing the file
# because it does not exist in our samples.
# --> the missing and inaccurate values are imputed later.
bmi_newData <- read.table(
  "/grehawi/splice-reg-prj/new-data/RNA-seq/BMI_Return_2_complete.txt",
  header = TRUE,
  sep = "\t",
  na.strings = "NA"
)
# Treat empty strings as missing values.
bmi_newData[bmi_newData == ""] <- NA
# BMI = weight / height^2.
# NOTE(review): assumes Weight is in kg and Height in metres -- TODO confirm.
bmi_newData$BMI <- bmi_newData$Weight / bmi_newData$Height^2
head(bmi_newData)
dim(bmi_newData)

In [None]:
# Keep only probands that occur in the new pheno table, and only the proband
# number and BMI columns.
bmi_newData <- bmi_newData[
  bmi_newData$Probanden_Nr %in% samples_information_newData$Probanden_Nr,
  colnames(bmi_newData) %in% c("Probanden_Nr", "BMI")
]
dim(bmi_newData)

In [None]:
# Look up every sample's BMI in the BMI sheet by proband number. match() keeps
# the result in the row order of samples_information_newData. The previous
# merge() re-sorted the rows by Probanden_Nr, so assigning its columns back by
# position could attach a BMI to the wrong sample.
bmi_from_sheet <- bmi_newData$BMI[
  match(samples_information_newData$Probanden_Nr, bmi_newData$Probanden_Nr)
]
if ("BMI" %in% colnames(samples_information_newData)) {
  # Prefer a BMI already present in the pheno table, fall back to the sheet.
  samples_information_newData$BMI <- ifelse(
    !is.na(samples_information_newData$BMI),
    samples_information_newData$BMI,
    bmi_from_sheet
  )
} else {
  # No BMI column yet: take the values from the sheet as they are.
  samples_information_newData$BMI <- bmi_from_sheet
}

In [None]:
# Samples that still have no BMI after combining both sources.
bmi_na_values <- samples_information_newData[is.na(samples_information_newData$BMI),]
bmi_na_values

In [None]:
# Impute the BMI for 3 probands in total: IST0007 and IST0014 (inaccurate
# values) plus the proband with a missing BMI (IST0029).
# The imputed value is the mean BMI of the control samples. The two probands
# with inaccurate values are left out of that mean, so the values being
# replaced cannot influence their own replacement.
inaccurate_probands <- c("IST0007", "IST0014")
subdf <- samples_information_newData[samples_information_newData$Status == 'control', ]
control_bmi_mean <- mean(
  subdf$BMI[!(subdf$Probanden_Nr %in% inaccurate_probands)],
  na.rm = TRUE
)
needs_imputation <- is.na(samples_information_newData$BMI) |
  samples_information_newData$Probanden_Nr %in% inaccurate_probands
samples_information_newData$BMI[needs_imputation] <- control_bmi_mean

In [None]:
# Save the extended pheno table of the new data (write.table() defaults:
# space-delimited with row names, despite the .csv extension).
write.table(samples_information_newData, "/grehawi/splice-reg-prj/new-data/RNA-seq/newData-pheno-extended.csv")

In [None]:
# Samples with very low mapped reads: 23L002733, 23L002687, 23L002675,
# 23L002659. These samples are all treated with Dex and are removed from our
# samples later anyway.
# Samples with failed GC content: 23L002666, 23L002680, 23L002722, 23L002725.
samples_faildGC_lowMapped <- samples_information_newData[
  samples_information_newData$Sample_Name %in% c(
    '23L002733', '23L002687', '23L002675', '23L002659',
    '23L002666', '23L002680', '23L002722', '23L002725'
  ),
]
samples_faildGC_lowMapped

In [None]:
# Save the pheno rows of the low-mapped / failed-GC samples.
write.table(samples_faildGC_lowMapped, "/grehawi/splice-reg-prj/new-data/RNA-seq/samples_pheno_newData_faildGC_lowMapped.csv")

## Read RSEM gene estimated count using tximport pkg

## First read transcript files

In [None]:
# Directory with the RSEM results of the old data.
# Note: `dir` is also the name of a base R function; the name is kept because
# a later cell reads this variable.
dir <- "/spliceReg-prj-Ghalia/RSEM-ouput/RSEM-count"
# One "<combined_id>.isoforms.results" file per sample, named by combined_id
# so that the imported matrices carry the sample IDs as column names.
transcript_files <- setNames(
  file.path(
    dir,
    paste0(samples_information_filtered_final$combined_id, ".isoforms.results")
  ),
  samples_information_filtered_final$combined_id
)
# Transcript-level import (transcripts in, transcripts out).
txi.rsem.transcripts <- tximport(
  transcript_files,
  type = "rsem",
  txIn = TRUE,
  txOut = TRUE
)
head(txi.rsem.transcripts$counts)
#head(files)

In [None]:
# Abundance is the TPM provided by the tool.
# Counts are the estimated counts from the tool (in this case fractional).
head(txi.rsem.transcripts)

In [None]:
# Transcripts x samples of the old data.
dim(txi.rsem.transcripts$counts)

In [None]:
# Build the transcript-to-gene map from the first sample's isoform file,
# using its transcript_id and gene_id columns.
tmp <- fread(transcript_files[1])
head(tmp)
tx2gene <- tmp[, c("transcript_id", "gene_id")]
head(tx2gene)

In [None]:
# Same transcript-level import for the new data; the files are named after
# Sample_Name_Full, which also becomes the column name of the matrices.
newData_dir <- "/spliceReg-prj-Ghalia/RSEM-output-newData/RSEM-count"
transcript_files_newData <- setNames(
  file.path(
    newData_dir,
    paste0(samples_information_newData$Sample_Name_Full, ".isoforms.results")
  ),
  samples_information_newData$Sample_Name_Full
)
txi.rsem.transcripts.newData <- tximport(
  transcript_files_newData,
  type = "rsem",
  txIn = TRUE,
  txOut = TRUE
)
# head(txi.rsem.transcripts.newData$counts)

In [None]:
# Transcripts x samples of the new data.
dim(txi.rsem.transcripts.newData$counts)

In [None]:
# Transcript-to-gene map of the new data, taken from the first new sample's
# isoform file (transcript_id and gene_id columns).
tmp <- fread(transcript_files_newData[1])
head(tmp)
tx2gene_newData <- tmp[, c("transcript_id", "gene_id")]
head(tx2gene_newData)
dim(tx2gene_newData)

In [None]:
# Save the transcript-to-gene map of the old data.
write.table(tx2gene, "/grehawi/splice-reg-prj/data/transcriptsID-geneID.txt")

In [None]:
# Gene-level RSEM results of the old data, one "<combined_id>.genes.results"
# file per sample, named by combined_id.
gene.files <- setNames(
  file.path(
    dir,
    paste0(samples_information_filtered_final$combined_id, ".genes.results")
  ),
  samples_information_filtered_final$combined_id
)
txi.rsem.gene <- tximport(
  gene.files,
  type = "rsem",
  txIn = FALSE,
  txOut = FALSE,
  tx2gene = tx2gene
)
head(txi.rsem.gene$counts)
dim(txi.rsem.gene$counts)

In [None]:
# Gene-level RSEM results of the new data, one file per Sample_Name_Full.
gene.files.newData <- setNames(
  file.path(
    newData_dir,
    paste0(samples_information_newData$Sample_Name_Full, ".genes.results")
  ),
  samples_information_newData$Sample_Name_Full
)
txi.rsem.gene.newData <- tximport(
  gene.files.newData,
  type = "rsem",
  txIn = FALSE,
  txOut = FALSE,
  tx2gene = tx2gene_newData
)
head(txi.rsem.gene.newData$counts)
dim(txi.rsem.gene.newData$counts)

In [None]:
# Gene lengths of the old data: gene_id and effective_length as reported in
# the first sample's gene file.
gene.f <- fread(gene.files[1])
head(gene.f)
gene.lengths = gene.f[, c("gene_id", "effective_length")]
head(gene.lengths)

In [None]:
# Gene lengths of the new data: gene_id and effective_length as reported in
# the first new sample's gene file.
gene.f.newData <- fread(gene.files.newData[1])
head(gene.f.newData)
gene.lengths.newData = gene.f.newData[, c("gene_id", "effective_length")]
head(gene.lengths.newData)

In [None]:
# Save the gene lengths of the old data.
write.table(gene.lengths, "/grehawi/splice-reg-prj/data/gene-lenghts.txt")

In [None]:
# Do the count tables of old and new data have identical row names (same
# names in the same order)? If so, their columns can be concatenated.
# Genes:
identical(
  rownames(txi.rsem.gene$counts),
  rownames(txi.rsem.gene.newData$counts)
)

# Transcripts:
identical(
  rownames(txi.rsem.transcripts$counts),
  rownames(txi.rsem.transcripts.newData$counts)
)

## Filter out zero-length genes/trx and also those with 0 expression across all samples 

In [None]:
# Flag genes that are unexpressed in every sample (maximum abundance of 0)
# and that also have a length of 0 in at least one sample.
zero_length_and_unexpressed <-
  (apply(txi.rsem.gene$abundance, 1, max) == 0) &
  (apply(txi.rsem.gene$length, 1, min) == 0)

# Remove the flagged rows from all three matrices of the tximport list.
matrix_slots <- c("length", "abundance", "counts")
txi.rsem.gene[matrix_slots] <- lapply(
  txi.rsem.gene[matrix_slots],
  function(m) m[!zero_length_and_unexpressed, ]
)
dim(txi.rsem.gene$counts)
dim(txi.rsem.gene$length)
dim(txi.rsem.gene$abundance)

In [None]:
# New data: flag genes that are unexpressed in every sample (maximum
# abundance of 0) and that also have a length of 0 in at least one sample.
zero_length_and_unexpressed <-
  (apply(txi.rsem.gene.newData$abundance, 1, max) == 0) &
  (apply(txi.rsem.gene.newData$length, 1, min) == 0)

# Remove the flagged rows from all three matrices of the tximport list.
matrix_slots <- c("length", "abundance", "counts")
txi.rsem.gene.newData[matrix_slots] <- lapply(
  txi.rsem.gene.newData[matrix_slots],
  function(m) m[!zero_length_and_unexpressed, ]
)
dim(txi.rsem.gene.newData$counts)
dim(txi.rsem.gene.newData$length)
dim(txi.rsem.gene.newData$abundance)

In [None]:
# Genes present in both filtered count matrices (new and old data).
intersect_genes = intersect(rownames(txi.rsem.gene.newData$counts), rownames(txi.rsem.gene$counts))

In [None]:
# Restrict the new data to the genes shared with the old data. Rows are
# selected by name, in the order of intersect_genes, so the row order is
# defined by that vector and not by each matrix's own order (the matrices are
# later bound column-wise by position). length and abundance are subset as
# well to keep the three matrices of the tximport list consistent.
matrix_slots <- c("length", "abundance", "counts")
txi.rsem.gene.newData[matrix_slots] <- lapply(
  txi.rsem.gene.newData[matrix_slots],
  function(m) m[intersect_genes, , drop = FALSE]
)

In [None]:
# Restrict the old data to the genes shared with the new data. Rows are
# selected by name, in the order of intersect_genes, so the row order is
# defined by that vector and not by each matrix's own order (the matrices are
# later bound column-wise by position). length and abundance are subset as
# well to keep the three matrices of the tximport list consistent.
matrix_slots <- c("length", "abundance", "counts")
txi.rsem.gene[matrix_slots] <- lapply(
  txi.rsem.gene[matrix_slots],
  function(m) m[intersect_genes, , drop = FALSE]
)

In [None]:
# Genes x samples of the new data after matching.
dim(txi.rsem.gene.newData$counts)

In [None]:
# Genes x samples of the old data after matching.
dim(txi.rsem.gene$counts)

In [None]:
# Flag transcripts that are unexpressed in every sample (maximum abundance of
# 0) and that also have a length of 0 in at least one sample.
zero_length_and_unexpressed <-
  (apply(txi.rsem.transcripts$abundance, 1, max) == 0) &
  (apply(txi.rsem.transcripts$length, 1, min) == 0)

# Remove the flagged rows from all three matrices of the tximport list.
matrix_slots <- c("length", "abundance", "counts")
txi.rsem.transcripts[matrix_slots] <- lapply(
  txi.rsem.transcripts[matrix_slots],
  function(m) m[!zero_length_and_unexpressed, ]
)
dim(txi.rsem.transcripts$counts)
dim(txi.rsem.transcripts$length)
dim(txi.rsem.transcripts$abundance)

In [None]:
# New data: flag transcripts that are unexpressed in every sample (maximum
# abundance of 0) and that also have a length of 0 in at least one sample.
zero_length_and_unexpressed <-
  (apply(txi.rsem.transcripts.newData$abundance, 1, max) == 0) &
  (apply(txi.rsem.transcripts.newData$length, 1, min) == 0)

# Remove the flagged rows from all three matrices of the tximport list.
matrix_slots <- c("length", "abundance", "counts")
txi.rsem.transcripts.newData[matrix_slots] <- lapply(
  txi.rsem.transcripts.newData[matrix_slots],
  function(m) m[!zero_length_and_unexpressed, ]
)
dim(txi.rsem.transcripts.newData$counts)
dim(txi.rsem.transcripts.newData$length)
dim(txi.rsem.transcripts.newData$abundance)

In [None]:
# Save the gene and transcript count matrices of the new data (in their
# current, filtered state).
write.table(txi.rsem.gene.newData$counts, '/grehawi/splice-reg-prj/new-data/RNA-seq/gene_counts_newData.matrix')
write.table(txi.rsem.transcripts.newData$counts, '/grehawi/splice-reg-prj/new-data/RNA-seq/trx_counts_newData.matrix')

In [None]:
# Concatenate the count tables of old and new data column-wise.
# cbind() binds matrices by row position and ignores row names, so both
# matrices are first reduced to their shared features and put into the same
# order. This matters for the transcripts in particular: they are filtered
# separately per data set and never intersected, so their rows may differ.
shared_transcripts <- intersect(
  rownames(txi.rsem.transcripts$counts),
  rownames(txi.rsem.transcripts.newData$counts)
)
final.txi.rsem.transcripts <- cbind(
  txi.rsem.transcripts$counts[shared_transcripts, , drop = FALSE],
  txi.rsem.transcripts.newData$counts[shared_transcripts, , drop = FALSE]
)

shared_genes <- intersect(
  rownames(txi.rsem.gene$counts),
  rownames(txi.rsem.gene.newData$counts)
)
final.txi.rsem.genes <- cbind(
  txi.rsem.gene$counts[shared_genes, , drop = FALSE],
  txi.rsem.gene.newData$counts[shared_genes, , drop = FALSE]
)

In [None]:
# First rows of the combined transcript count matrix.
head(final.txi.rsem.transcripts)

In [None]:
# First rows of the combined gene count matrix.
head(final.txi.rsem.genes)

## Check for Dex effect on gene expression in the new data

In [None]:
# Plot size for the notebook output.
options(repr.plot.width = 12, repr.plot.height = 8)
# PCA on the gene counts of the new data with samples as rows (hence the
# transpose); the first 10 components are kept.
pca_gene.count <- PCA(
  X = t(txi.rsem.gene.newData$counts),
  ncp = 10,
  graph = FALSE
)

# Plot the samples as text labels, coloured by their Dex value.
fviz_pca_ind(
  pca_gene.count,
  geom = "text",
  labelsize = 2,
  habillage = as.factor(samples_information_newData$Dex)
)

## Filter genes/transcripts to keep those with at least 10 reads in at least 95% of samples

In [None]:
# Keep the genes with at least 10 reads in at least 95% of the samples.
# The sample threshold is derived from the matrix instead of being hard-coded;
# for the 343 + 96 = 439 samples it is floor(95 * 439 / 100) = 417, the value
# used so far.
min_samples <- floor(95 * ncol(final.txi.rsem.genes) / 100)
# Per gene: number of samples reaching the read threshold.
genes_toKeep <- rowSums(final.txi.rsem.genes >= 10) >= min_samples

final.txi.rsem.genes <- final.txi.rsem.genes[genes_toKeep, ]
dim(final.txi.rsem.genes)

In [None]:
# Keep the transcripts with at least 10 reads in at least 95% of the samples.
# The sample threshold is derived from the matrix instead of being hard-coded;
# for 439 samples it is floor(95 * 439 / 100) = 417, the value used so far.
min_samples <- floor(95 * ncol(final.txi.rsem.transcripts) / 100)
# Per transcript: number of samples reaching the read threshold.
transcripts_toKeep <- rowSums(final.txi.rsem.transcripts >= 10) >= min_samples

final.txi.rsem.transcripts <- final.txi.rsem.transcripts[transcripts_toKeep, ]
dim(final.txi.rsem.transcripts)

In [None]:
# Save the filtered, combined gene count matrix.
write.table(final.txi.rsem.genes, '/grehawi/splice-reg-prj/new-data/RNA-seq/gene_counts_combined.matrix')

In [None]:
# Save the filtered, combined transcript count matrix.
write.table(final.txi.rsem.transcripts, '/grehawi/splice-reg-prj/new-data/RNA-seq/trx_counts_combined.matrix')

In [None]:
# Reload the combined gene counts from disk. read.table() returns a data
# frame and, with its default check.names = TRUE, rewrites column names that
# are not syntactically valid (e.g. prefixes an "X" to names starting with a
# digit).
final.txi.rsem.genes = read.table('/grehawi/splice-reg-prj/new-data/RNA-seq/gene_counts_combined.matrix')

In [None]:
# Reload the combined transcript counts from disk. read.table() returns a
# data frame and, with its default check.names = TRUE, rewrites column names
# that are not syntactically valid (e.g. prefixes an "X" to names starting
# with a digit).
final.txi.rsem.transcripts = read.table('/grehawi/splice-reg-prj/new-data/RNA-seq/trx_counts_combined.matrix')

In [None]:
# First rows of the reloaded gene counts (check the column names).
head(final.txi.rsem.genes)

## Check for Dex effect on gene expression in the new data after filtering step

In [None]:
# Plot size for the notebook output.
options(repr.plot.width = 12, repr.plot.height = 8)
# The new-data samples are the last columns of the combined matrix, one per
# row of samples_information_newData (the old data was bound first). Their
# positions are derived from the pheno table instead of a hard-coded offset
# (previously 344 = 343 old samples + 1), which also guarantees that the
# colouring vector below has one entry per plotted sample.
n_new_samples <- nrow(samples_information_newData)
stopifnot(n_new_samples <= ncol(final.txi.rsem.genes))
new_sample_cols <- utils::tail(
  seq_len(ncol(final.txi.rsem.genes)),
  n_new_samples
)
# PCA with samples as rows (hence the transpose), first 10 components kept.
pca_gene.count <- PCA(
  X = t(final.txi.rsem.genes[, new_sample_cols]),
  ncp = 10,
  graph = FALSE
)

# Plot the samples as text labels, coloured by their Dex value.
fviz_pca_ind(
  pca_gene.count,
  geom = "text",
  labelsize = 2,
  habillage = as.factor(samples_information_newData$Dex)
)

In [None]:
# We can see a clear Dex effect, hence the two data sets are merged without
# the Dex samples (baseline only).
# The count matrices were written with write.table() and read back with
# read.table(), which made their column names syntactically valid (an "X" is
# put in front of names that start with a digit). The same transformation is
# applied to the sample names here. make.names() is used instead of pasting
# an "X" unconditionally, so re-running this cell does not add a second "X"
# and names that were already valid stay unchanged.
samples_information_newData$Sample_Name_Full <- make.names(
  samples_information_newData$Sample_Name_Full,
  unique = TRUE
)

# Now filter the Dex samples out of the pheno table.
samples_information_newData_noDex <- samples_information_newData[
  !(samples_information_newData$Dex == 1),
]

# Remove some unnecessary columns from the new pheno table to make it more
# readable.
samples_information_newData_noDex <- samples_information_newData_noDex[
  ,
  !(colnames(samples_information_newData_noDex) %in% c(
    'Library.Protocol', 'Mergefastq', 'Read.length', 'Organism',
    'Sample_Project', 'Sample_ID', 'sampleId', 'Genotyped'
  ))
]



In [None]:
# Columns that both pheno tables share and that must be kept for the merge:
# Age, Sex, BMI, well, row, column, plate, ltany_di (control/case status,
# called Status in the new pheno table), mean_gc_content, RIN, total_pairs.
# ID corresponds to Probenbezeichung and combined_id to Sample_Name_Full.

# Old name -> new name; columns that are not present are simply skipped.
rename_map <- c(
  Probenbezeichung = "ID",
  Sample_Name_Full = "combined_id",
  Age = "age",
  Sex = "sex",
  plate = "plates",
  column = "column_newData",
  row = "row_newData",
  well = "well_newData",
  Status = "ltany_di"
)
current_names <- names(samples_information_newData_noDex)
to_rename <- current_names %in% names(rename_map)
current_names[to_rename] <- unname(rename_map[current_names[to_rename]])
names(samples_information_newData_noDex) <- current_names

# Binarize the 'ltany_di' column: 1 for 'case', 0 for everything else.
samples_information_newData_noDex$ltany_di <- as.numeric(
  samples_information_newData_noDex$ltany_di == 'case'
)

# Plate 5 marks the extra plate of the new sequencing run.
samples_information_newData_noDex$plates <- 5
head(samples_information_newData_noDex)

In [None]:
# Samples x columns of the new pheno table without Dex samples.
dim(samples_information_newData_noDex)

In [None]:
# Print the old pheno table to compare its columns with the new one.
samples_information_filtered_final

In [None]:
# Save the new pheno table without Dex samples.
write.table(samples_information_newData_noDex, "/grehawi/splice-reg-prj/new-data/RNA-seq/newData-pheno-extended-noDex.csv")

In [None]:
# Read the saved table back and count the controls (ltany_di == 0) as a
# sanity check. The object is not called `t`, which would mask base::t().
noDex_pheno_check <- read.table("/grehawi/splice-reg-prj/new-data/RNA-seq/newData-pheno-extended-noDex.csv")

dim(noDex_pheno_check[noDex_pheno_check$ltany_di == 0, ])

In [None]:
# Add a column to both pheno tables indicating the sequencing run, because
# this is expected to be the largest confounder: 1 = old data, 2 = new data.
samples_information_newData_noDex$sequence_run = 2
samples_information_filtered_final$sequence_run = 1

In [None]:
# Concatenate the two pheno tables row-wise. bind_rows() matches columns by
# name; a column that exists in only one of the tables is kept and filled
# with NA for the rows of the other table.
combined_pheno <- bind_rows(samples_information_filtered_final, samples_information_newData_noDex)
combined_pheno

In [None]:
# Use combined_id as row names (this requires the IDs to be unique).
rownames(combined_pheno) = combined_pheno$combined_id
head(combined_pheno)

In [None]:
# Save the combined pheno table (space-delimited, with row names).
write.table(combined_pheno, "/grehawi/splice-reg-prj/new-data/RNA-seq/combined-pheno.csv")

In [None]:
# Reload the combined pheno table from disk.
combined_pheno = read.table('/grehawi/splice-reg-prj/new-data/RNA-seq/combined-pheno.csv')

In [None]:
# The cases added with the new data have to be removed later in the pipeline,
# so their combined_id values are collected here.
cases_newData <- samples_information_newData_noDex$combined_id[
  samples_information_newData_noDex$ltany_di == 1
]

In [None]:
# Save the IDs of the new-data cases.
saveRDS(cases_newData, '/grehawi/splice-reg-prj/new-data/RNA-seq/newData-casesIDs.rds')

In [None]:
# Filter the Dex samples out of the gene and transcript count tables by
# keeping only the columns that belong to a sample of the combined pheno
# table.
# Both sides are passed through make.names() before comparing: the column
# names may or may not have been rewritten by read.table(), and comparing
# them with raw IDs would silently drop every sample whose name differs only
# by that rewriting.
pheno_ids <- make.names(combined_pheno$combined_id)
gene_cols <- make.names(colnames(final.txi.rsem.genes))
transcript_cols <- make.names(colnames(final.txi.rsem.transcripts))

# Report pheno samples that have no count column instead of losing them
# unnoticed.
missing_ids <- setdiff(pheno_ids, intersect(gene_cols, transcript_cols))
if (length(missing_ids) > 0) {
  warning(
    "Samples in combined_pheno without a count column: ",
    paste(missing_ids, collapse = ", "),
    call. = FALSE
  )
}

final.txi.rsem.genes <- final.txi.rsem.genes[, gene_cols %in% pheno_ids]
final.txi.rsem.transcripts <- final.txi.rsem.transcripts[, transcript_cols %in% pheno_ids]

In [None]:
# Save the gene and transcript counts of the merged data without Dex samples.
write.table(final.txi.rsem.genes, '/grehawi/splice-reg-prj/new-data/RNA-seq/gene_counts_combined_noDex.matrix')
write.table(final.txi.rsem.transcripts, '/grehawi/splice-reg-prj/new-data/RNA-seq/trx_counts_combined_noDex.matrix')

## Biotype analysis 

In [None]:
# Import the GTF annotation (GRCh38, release 97 according to the file name).
# NOTE(review): this path starts with "/home/grehawi", while the other paths
# in this notebook start with "/grehawi" -- confirm that this is intended.
gencode.annotation = rtracklayer::import("/home/grehawi/splice-reg-prj/data/Homo_sapiens.GRCh38.97.gtf")

In [None]:
# Convert the imported annotation to a data frame and show the first rows.
gencode.annotation.df = as.data.frame(gencode.annotation)
head(gencode.annotation.df)

In [None]:
# Annotation columns needed for the biotype analysis; columns that are not
# present in the annotation are skipped.
selection <- c(
  "type", "gene_id", "gene_name", "gene_biotype", "start", "end",
  "transcript_id", "transcript_name", "transcript_biotype"
)
sub.gcode.anno.df <- gencode.annotation.df[
  , colnames(gencode.annotation.df) %in% selection
]

In [None]:
# First rows of the reduced annotation table.
head(sub.gcode.anno.df)

In [None]:
# Save the reduced annotation table.
write.table(sub.gcode.anno.df, "/grehawi/splice-reg-prj/new-data/subsetted_gencode_annotation.txt")