In [None]:
library(data.table)
library(dplyr)
library(readxl)
library(ggplot2)
library(purrr)
library(stringr)
library(sva)
library(DESeq2)
library(vsn)
#library(hexbin)
library(FactoMineR)
library(factoextra)
library(variancePartition)
library(tidyr)
library(pheatmap)
library(limma)

# Processing of samples

## 1. Filtering samples: remove duplicate samples, only consider strict controls (no lifetime diag), etc.

In [None]:
# Load the combined phenotype table and inspect its first rows and its dimensions.
sample_data <- read.table(file = "/grehawi/splice-reg-prj/new-data/RNA-seq/combined-pheno.csv")
sample_data %>% head()
sample_data %>% dim()

In [None]:
# Number of rows versus number of distinct subject IDs; a difference means repeated IDs.
sample_data$ID %>% length()
sample_data$ID %>% unique() %>% length()

In [None]:
# Dimensions of the case subset (ltany_di == 1), old and new run together
sample_data[sample_data$ltany_di == 1, ] %>% dim()

In [None]:
# Dimensions of the control subset (ltany_di == 0): old run plus 37 from the new run
sample_data[sample_data$ltany_di == 0, ] %>% dim()

In [None]:
# Average of total_pairs over the samples with sequence_run == 2 (the IST study)
mean(sample_data$total_pairs[sample_data$sequence_run == 2])

## 1.1 remove duplicate samples

In [None]:
# Tabulate how often each subject ID occurs (columns Var1 = ID, Freq = count)
n_occur = data.frame(table(sample_data$ID))
n_occur

In [None]:
# Pull every row whose subject ID occurs more than once in sample_data
duplicated.samples <- sample_data[is.element(sample_data$ID, n_occur$Var1[n_occur$Freq > 1]), ]
# Show only the columns needed to choose between the duplicates.
# Original observation: RIN is the same across duplicates, so the version with
# library_rna_amount == 200 is the one to keep.
duplicated.samples[, is.element(colnames(duplicated.samples), c("combined_id", "ID", "RIN", "library_rna_amount"))]

In [None]:
# Among the repeated samples, select the versions whose library_rna_amount is not 200 ...
duplicated.samples <- duplicated.samples[duplicated.samples$library_rna_amount != 200, ]
# ... and drop exactly those rows (matched on combined_id) from the full table
sample_data_nodupl <- sample_data[!is.element(sample_data$combined_id, duplicated.samples$combined_id), ]
sample_data_nodupl %>% dim()

## 1.2 Remove samples with lifetime diagnosis

In [None]:
# Read the lifetime-diagnosis table and keep only the rows with a known ltany_di.
# It is used to restrict the cohort to strict controls and cases (no lifetime diagnosis).
janine.diag.split <- read.table("/grehawi/splice-reg-prj/new-data/become_optima_lt_diag.txt", header = TRUE) %>%
  drop_na(ltany_di)
janine.diag.split %>% head()
janine.diag.split %>% dim()

In [None]:
# Controls: rows with ltany_di == 0
janine.diag.split[janine.diag.split$ltany_di == 0, ] %>% dim()

In [None]:
# Cases: rows with ltany_di == 2
janine.diag.split[janine.diag.split$ltany_di == 2, ] %>% dim()

In [None]:
# Lifetime diagnoses: rows with ltany_di == 1
janine.diag.split[janine.diag.split$ltany_di == 1, ] %>% dim()

In [None]:
# Subject IDs carrying a lifetime diagnosis (ltany_di == 1)
ids_lifetime_diag <- janine.diag.split[janine.diag.split$ltany_di == 1, colnames(janine.diag.split) == "ID"]
ids_lifetime_diag

In [None]:
# Exclude every de-duplicated sample whose subject ID is in ids_lifetime_diag
sample_data_final <- sample_data_nodupl[!is.element(sample_data_nodupl$ID, ids_lifetime_diag), ]
sample_data_final %>% dim()

## 1.3 Remove case samples added with the new data

In [None]:
# Identifiers of the case samples that came with the new data (matched on combined_id below)
new_cases <- readRDS(file = "/grehawi/splice-reg-prj/new-data/RNA-seq/newData-casesIDs.rds")
new_cases

In [None]:
# Drop those new case samples from the final sample table
sample_data_final <- sample_data_final[!is.element(sample_data_final$combined_id, new_cases), ]

In [None]:
# Dimensions of the remaining control subset (ltany_di == 0)
sample_data_final[sample_data_final$ltany_di == 0, ] %>% dim()

In [None]:
# Dimensions of the remaining case subset (ltany_di == 1)
sample_data_final[sample_data_final$ltany_di == 1, ] %>% dim()

## Add cell-types proportions to samples_info file

In [None]:
# Cell-type proportion table; its row names are used as the join key below
ct <- read.table(file = "/grehawi/splice-reg-prj/new-data/batch_corr/dtangle-cellTypes-on-raw-LM22.txt")

In [None]:
# Expose the row names as a combined_id column, then attach the cell-type columns
# to the sample table with a left join on combined_id
ct[["combined_id"]] <- rownames(ct)
sample_data_final_with_CT <- left_join(sample_data_final, ct, by = "combined_id")
sample_data_final_with_CT %>% head()

In [None]:
# Save the phenotype table together with the cell-type proportions.
# The joined frame sample_data_final_with_CT is written here: the join result is never
# assigned back to sample_data_final, so writing sample_data_final would produce a
# "withCT" file that lacks the columns of `ct`.
# NOTE(review): this path starts with /home/grehawi while the other paths in this
# notebook start with /grehawi -- confirm that this is intended.
write.table(sample_data_final_with_CT, "/home/grehawi/splice-reg-prj/new-data/Diff-Analysis/combined_pheno_withCT.csv")

In [None]:
# Number of distinct subject IDs left in the final sample table
sample_data_final$ID %>% unique() %>% length()

In [None]:
# IST samples are the rows with sequence_run == 2.
# For those rows, set t0_bdi to zero and label the study column as IST.
sample_data_final$t0_bdi[sample_data_final$sequence_run == 2] <- 0
sample_data_final$study[sample_data_final$sequence_run == 2] <- "IST"

In [None]:
# Export the IST subset (sequence_run == 2) without row names
IST_samples <- sample_data_final[sample_data_final$sequence_run == 2, ]
write.table(x = IST_samples, file = '/grehawi/splice-reg-prj/new-data/Submission_related/IST_samples.txt', row.names = FALSE)

# Export the OPTIMA/BeCOME subset (sequence_run == 1) without row names
OPTIMA_BeCOME_samples <- sample_data_final[sample_data_final$sequence_run == 1, ]
write.table(x = OPTIMA_BeCOME_samples, file = '/grehawi/splice-reg-prj/new-data/Submission_related/OPTIMA_BeCOME_samples.txt', row.names = FALSE)


In [None]:
# Save the final phenotype table together with the cell-type proportions.
# The file name promises "withCT", but sample_data_final itself never receives the
# columns of `ct`. The join is redone here (instead of reusing sample_data_final_with_CT)
# so that the IST edits to t0_bdi and study made on sample_data_final are included.
write.table(sample_data_final %>% left_join(ct, by = "combined_id"),
            "/grehawi/splice-reg-prj/new-data/Diff-Analysis/combined_pheno_withCT.csv")

In [None]:
# Dimensions of the final sample table
sample_data_final %>% dim()

In [None]:
# Controls (ltany_di == 0) from OPTIMA
sample_data_final[sample_data_final$ltany_di == 0 & sample_data_final$study == 'optima', ] %>% dim()
# Controls (ltany_di == 0) from BeCOME
sample_data_final[sample_data_final$ltany_di == 0 & sample_data_final$study == 'become', ] %>% dim()
# Controls (ltany_di == 0) from IST
sample_data_final[sample_data_final$ltany_di == 0 & sample_data_final$study == 'IST', ] %>% dim()

In [None]:
# Cases (ltany_di == 1) from OPTIMA
sample_data_final[sample_data_final$ltany_di == 1 & sample_data_final$study == 'optima', ] %>% dim()
# Cases (ltany_di == 1) from BeCOME
sample_data_final[sample_data_final$ltany_di == 1 & sample_data_final$study == 'become', ] %>% dim()

In [None]:
# Average of total_pairs over the IST samples (sequence_run == 2) of the final table
mean(sample_data_final$total_pairs[sample_data_final$sequence_run == 2])

## The following code is to fill Table 1 of the manuscript

In [None]:
# Load the processed CIDI diagnosis table and inspect its columns, first rows and size
cidi.depr.diag <- readRDS(file = "/grehawi/splice-reg-prj/new-data/cidi_joanas/cidi_single_tidy_processed_jonas.rds")
cidi.depr.diag %>% colnames()
cidi.depr.diag %>% head()
cidi.depr.diag %>% dim()

In [None]:
# Unaffected subjects (ltany_di == 0) that also have an entry in the CIDI table
intersect(cidi.depr.diag$ID, sample_data_final$ID[sample_data_final$ltany_di == 0]) %>% length()

In [None]:
# Affected subjects (ltany_di == 1) that also have an entry in the CIDI table
intersect(cidi.depr.diag$ID, sample_data_final$ID[sample_data_final$ltany_di == 1]) %>% length()

In [None]:
# How many of all affected samples (ltany_di == 1) have a current CIDI diagnosis of
# full dysthymia, subthreshold depression or full depression?
cidi.depr.diag.sup.aff = cidi.depr.diag[cidi.depr.diag$ID %in% sample_data_final$ID[sample_data_final$ltany_di == 1],]
length(unique(cidi.depr.diag.sup.aff$ID))
# which() keeps only the rows where the combined flag is TRUE. A bare logical index
# would turn every NA flag into an all-NA row, and that NA ID would then be counted
# as one extra subject by length(unique(...)).
dysth_depr_samples_aff = cidi.depr.diag.sup.aff[which(cidi.depr.diag.sup.aff$Dysthymia_full_curr == 1 |
                                        cidi.depr.diag.sup.aff$Depression_subthr_curr == 1 |
                                        cidi.depr.diag.sup.aff$Depression_full_curr == 1), ]
length(unique(dysth_depr_samples_aff$ID))

In [None]:
# How many of all unaffected samples (ltany_di == 0) have a current CIDI diagnosis of
# full dysthymia, subthreshold depression or full depression?
cidi.depr.diag.sup.unaff = cidi.depr.diag[cidi.depr.diag$ID %in% sample_data_final$ID[sample_data_final$ltany_di == 0],]
# which() keeps only the rows where the combined flag is TRUE. A bare logical index
# would turn every NA flag into an all-NA row, and that NA ID would then be counted
# as one extra subject by length(unique(...)).
dysth_depr_samples_unaff = cidi.depr.diag.sup.unaff[which(cidi.depr.diag.sup.unaff$Dysthymia_full_curr == 1 |
                                        cidi.depr.diag.sup.unaff$Depression_subthr_curr == 1 |
                                        cidi.depr.diag.sup.unaff$Depression_full_curr == 1), ]
length(unique(dysth_depr_samples_unaff$ID))

In [None]:
# Rows of the final table that belong to the OPTIMA study
optima_samples_info <- sample_data_final[sample_data_final$study == "optima", ]
optima_samples_info %>% dim()

In [None]:
# Rows of the final table that belong to the BeCOME study
become_samples_info <- sample_data_final[sample_data_final$study == "become", ]
become_samples_info %>% dim()

In [None]:
# Rows of the final table that belong to the IST study
ist_samples_info <- sample_data_final[sample_data_final$study == "IST", ]
ist_samples_info %>% dim()

In [None]:
# How many of the affected OPTIMA samples (ltany_di == 1) have a current CIDI diagnosis
# of full dysthymia, subthreshold depression or full depression?
cidi.depr.diag.sup.optima = cidi.depr.diag[cidi.depr.diag$ID %in% optima_samples_info$ID[optima_samples_info$ltany_di == 1],]
length(unique(cidi.depr.diag.sup.optima$ID))
# which() keeps only the rows where the combined flag is TRUE. A bare logical index
# would turn every NA flag into an all-NA row, and that NA ID would then be counted
# as one extra subject by length(unique(...)).
dysth_depr_samples_optima = cidi.depr.diag.sup.optima[which(cidi.depr.diag.sup.optima$Dysthymia_full_curr == 1 |
                                        cidi.depr.diag.sup.optima$Depression_subthr_curr == 1 |
                                        cidi.depr.diag.sup.optima$Depression_full_curr == 1), ]
length(unique(dysth_depr_samples_optima$ID))

In [None]:
# How many of the affected BeCOME samples (ltany_di == 1) have a current CIDI diagnosis
# of full dysthymia, subthreshold depression or full depression?
# lt is life time, curr is current diagnosis
cidi.depr.diag.sup.become.affeceted = cidi.depr.diag[cidi.depr.diag$ID %in% become_samples_info$ID[become_samples_info$ltany_di == 1],]
# which() keeps only the rows where the combined flag is TRUE. A bare logical index
# would turn every NA flag into an all-NA row, and that NA ID would then be counted
# as one extra subject by length(unique(...)).
dysth_depr_samples_become_affected = cidi.depr.diag.sup.become.affeceted[which(cidi.depr.diag.sup.become.affeceted$Dysthymia_full_curr == 1 |
                                        cidi.depr.diag.sup.become.affeceted$Depression_subthr_curr == 1 |
                                        cidi.depr.diag.sup.become.affeceted$Depression_full_curr == 1), ]
length(unique(dysth_depr_samples_become_affected$ID))

In [None]:
# How many of the unaffected BeCOME samples (ltany_di == 0) have a current CIDI diagnosis
# of full dysthymia, subthreshold depression or full depression?
# lt is life time, curr is current diagnosis
cidi.depr.diag.sup.become.unaffeceted = cidi.depr.diag[cidi.depr.diag$ID %in% become_samples_info$ID[become_samples_info$ltany_di == 0],]
# which() keeps only the rows where the combined flag is TRUE. A bare logical index
# would turn every NA flag into an all-NA row, and that NA ID would then show up in
# the ID list and be counted as one extra subject.
dysth_depr_samples_become_unaffected = cidi.depr.diag.sup.become.unaffeceted[which(cidi.depr.diag.sup.become.unaffeceted$Dysthymia_full_curr == 1 |
                                        cidi.depr.diag.sup.become.unaffeceted$Depression_subthr_curr == 1 |
                                        cidi.depr.diag.sup.become.unaffeceted$Depression_full_curr == 1), ]
length(unique(dysth_depr_samples_become_unaffected$ID))
unique(dysth_depr_samples_become_unaffected$ID)

In [None]:
# BDI (t0_bdi) mean and SD, affected individuals (total)
sample_data_final$t0_bdi[sample_data_final$ltany_di == 1] %>% mean()
sample_data_final$t0_bdi[sample_data_final$ltany_di == 1] %>% sd()
# BDI (t0_bdi) mean and SD, unaffected individuals (total)
sample_data_final$t0_bdi[sample_data_final$ltany_di == 0] %>% mean()
sample_data_final$t0_bdi[sample_data_final$ltany_di == 0] %>% sd()

In [None]:
# BDI (t0_bdi) mean and SD, affected individuals (BeCOME)
become_samples_info$t0_bdi[become_samples_info$ltany_di == 1] %>% mean()
become_samples_info$t0_bdi[become_samples_info$ltany_di == 1] %>% sd()
# BDI (t0_bdi) mean and SD, unaffected individuals (BeCOME)
become_samples_info$t0_bdi[become_samples_info$ltany_di == 0] %>% mean()
become_samples_info$t0_bdi[become_samples_info$ltany_di == 0] %>% sd()

In [None]:
# BDI (t0_bdi) mean and SD, affected individuals (OPTIMA)
optima_samples_info$t0_bdi[optima_samples_info$ltany_di == 1] %>% mean()
optima_samples_info$t0_bdi[optima_samples_info$ltany_di == 1] %>% sd()
# BDI (t0_bdi) mean and SD, unaffected individuals (OPTIMA)
optima_samples_info$t0_bdi[optima_samples_info$ltany_di == 0] %>% mean()
optima_samples_info$t0_bdi[optima_samples_info$ltany_di == 0] %>% sd()

In [None]:
# BDI (t0_bdi) mean and SD, affected individuals (IST)
ist_samples_info$t0_bdi[ist_samples_info$ltany_di == 1] %>% mean()
ist_samples_info$t0_bdi[ist_samples_info$ltany_di == 1] %>% sd()
# BDI (t0_bdi) mean and SD, unaffected individuals (IST)
ist_samples_info$t0_bdi[ist_samples_info$ltany_di == 0] %>% mean()
ist_samples_info$t0_bdi[ist_samples_info$ltany_di == 0] %>% sd()

In [None]:
# Age mean and SD, affected individuals (total)
sample_data_final$age[sample_data_final$ltany_di == 1] %>% mean()
sample_data_final$age[sample_data_final$ltany_di == 1] %>% sd()
# Age mean and SD, unaffected individuals (total)
sample_data_final$age[sample_data_final$ltany_di == 0] %>% mean()
sample_data_final$age[sample_data_final$ltany_di == 0] %>% sd()

In [None]:
# Age mean and SD, affected individuals (BeCOME)
become_samples_info$age[become_samples_info$ltany_di == 1] %>% mean()
become_samples_info$age[become_samples_info$ltany_di == 1] %>% sd()
# Age mean and SD, unaffected individuals (BeCOME)
become_samples_info$age[become_samples_info$ltany_di == 0] %>% mean()
become_samples_info$age[become_samples_info$ltany_di == 0] %>% sd()

In [None]:
# Age mean and SD, affected individuals (OPTIMA)
optima_samples_info$age[optima_samples_info$ltany_di == 1] %>% mean()
optima_samples_info$age[optima_samples_info$ltany_di == 1] %>% sd()
# Age mean and SD, unaffected individuals (OPTIMA)
optima_samples_info$age[optima_samples_info$ltany_di == 0] %>% mean()
optima_samples_info$age[optima_samples_info$ltany_di == 0] %>% sd()

In [None]:
# Age mean and SD, affected individuals (IST)
ist_samples_info$age[ist_samples_info$ltany_di == 1] %>% mean()
ist_samples_info$age[ist_samples_info$ltany_di == 1] %>% sd()
# Age mean and SD, unaffected individuals (IST)
ist_samples_info$age[ist_samples_info$ltany_di == 0] %>% mean()
ist_samples_info$age[ist_samples_info$ltany_di == 0] %>% sd()

In [None]:
# BMI mean and SD, affected individuals (total)
sample_data_final$BMI[sample_data_final$ltany_di == 1] %>% mean()
sample_data_final$BMI[sample_data_final$ltany_di == 1] %>% sd()
# BMI mean and SD, unaffected individuals (total)
sample_data_final$BMI[sample_data_final$ltany_di == 0] %>% mean()
sample_data_final$BMI[sample_data_final$ltany_di == 0] %>% sd()

In [None]:
# BMI mean and SD, affected individuals (BeCOME)
become_samples_info$BMI[become_samples_info$ltany_di == 1] %>% mean()
become_samples_info$BMI[become_samples_info$ltany_di == 1] %>% sd()
# BMI mean and SD, unaffected individuals (BeCOME)
become_samples_info$BMI[become_samples_info$ltany_di == 0] %>% mean()
become_samples_info$BMI[become_samples_info$ltany_di == 0] %>% sd()

In [None]:
# BMI mean and SD, affected individuals (OPTIMA)
optima_samples_info$BMI[optima_samples_info$ltany_di == 1] %>% mean()
optima_samples_info$BMI[optima_samples_info$ltany_di == 1] %>% sd()

In [None]:
# BMI mean and SD, unaffected individuals (IST) -- the filter is ltany_di == 0
ist_samples_info$BMI[ist_samples_info$ltany_di == 0] %>% mean()
ist_samples_info$BMI[ist_samples_info$ltany_di == 0] %>% sd()

In [None]:
# Females among affected individuals (total)
sample_data_final[sample_data_final$ltany_di == 1 & sample_data_final$sex == 'female', ] %>% dim()
# Females among unaffected individuals (total)
sample_data_final[sample_data_final$ltany_di == 0 & sample_data_final$sex == 'female', ] %>% dim()

In [None]:
# Females among affected individuals (BeCOME)
become_samples_info[become_samples_info$ltany_di == 1 & become_samples_info$sex == 'female', ] %>% dim()
# Females among unaffected individuals (BeCOME)
become_samples_info[become_samples_info$ltany_di == 0 & become_samples_info$sex == 'female', ] %>% dim()

In [None]:
# Females among affected individuals (OPTIMA)
optima_samples_info[optima_samples_info$ltany_di == 1 & optima_samples_info$sex == 'female', ] %>% dim()
# Females among unaffected individuals (OPTIMA)
optima_samples_info[optima_samples_info$ltany_di == 0 & optima_samples_info$sex == 'female', ] %>% dim()

In [None]:
# Females among unaffected individuals (IST)
ist_samples_info[ist_samples_info$ltany_di == 0 & ist_samples_info$sex == 'female', ] %>% dim()

In [None]:
# Medication information, restricted to the subjects that are cases (ltany_di == 1)
medication_data <- readRDS(file = '/grehawi/splice-reg-prj/data/clinical-data/medication_data_both.Rds')
cases_all <- sample_data_final[sample_data_final$ltany_di == 1, ]
medication_data <- medication_data[is.element(medication_data$ID, cases_all$ID), ]
medication_data

In [None]:
# Case IDs that have no entry in medication_data.
# According to Janine, these 3 samples are free from medication.
setdiff(x = sample_data_final$ID[sample_data_final$ltany_di == 1], y = medication_data$ID)

In [None]:
# Add the three cases without a medication record as unmedicated ('FALSE') rows
missing_samples_df <- data.frame(
  ID = c('PTP0458', 'PTP0734', 'PTP1328'),
  t0_any_psych_medication = rep('FALSE', 3)
)
medication_data <- rbind(medication_data, missing_samples_df)

In [None]:
# Attach the phenotype columns of the cases to the medication table (left join on ID)
sample_data_with_medication_final <- left_join(medication_data, cases_all, by = "ID")
# Split the joined table by study
sample_data_with_medication_final_Become <- sample_data_with_medication_final[sample_data_with_medication_final$study == 'become', ]
sample_data_with_medication_final_optima <- sample_data_with_medication_final[sample_data_with_medication_final$study == 'optima', ]
# Distinct medicated subjects from BeCOME
sample_data_with_medication_final_Become$ID[sample_data_with_medication_final_Become$t0_any_psych_medication == 'TRUE'] %>%
  unique() %>%
  length()
# Distinct medicated subjects from OPTIMA
sample_data_with_medication_final_optima$ID[sample_data_with_medication_final_optima$t0_any_psych_medication == 'TRUE'] %>%
  unique() %>%
  length()

In [None]:
# Distinct medicated subjects across all cases
sample_data_with_medication_final$ID[sample_data_with_medication_final$t0_any_psych_medication == 'TRUE'] %>%
  unique() %>%
  length()