## Project

This folder contains the three workflows used to process all previously created CCLF lines for which we have RNA or WES data. We extract the relevant data, process it through a Jupyter notebook, and apply the three post-processing steps we use in CCLE.

We then output a set of folders containing any of: CN maps, CNV seg files, SNP MAF files, fusions, RNA profiles, and RNA plots within CCLE cell lines,
which we will upload to a CCLE bucket.

### check data types available and create DFs for each pipelines

In [None]:
# Look up the combined segment file recorded on the sample set `sample_set_id`
# (`refwm` and `sample_set_id` are defined outside this excerpt).
aggregated = refwm.get_entities('sample_set').loc[sample_set_id]["combined_seg_file"]
aggregated

In [None]:
# Copy that file to the local path the R copy-number cells read from.
! gsutil cp $aggregated "temp/cnv_ccle.called.seg"

## Copy number pipeline

In [None]:
%%R
# Run the shared library/annotation setup script.
source('../JKBio/gkugener/RScripts/load_libraries_and_annotations.R')

In [None]:
%%R
# Release-level settings for the copy-number cells.
genome_version <- 'hg38'
release <- '19Q3'
# NOTE(review): hg38_cyto_band_reference is not referenced again in the visible
# notebook; the extendEndsOfSegments() call passes
# '../JKBio/data/hg38_cytoband.gz' as a literal instead.
hg38_cyto_band_reference <- 'data/hg38_cytoband.gz'
# Local path of the aggregated segment file for the new samples.
new_samples_copy_number_broad_wes <- 'temp/cnv_ccle.called.seg'

In [None]:
%%R
# Post-processing helpers (presumably where processSegments() and the other
# helpers called below are defined -- confirm).
source("CCLE_postp_function.R")

In [None]:
%%R
# Previous release copy number profiles. This line will need to be updated as well
# Only the seven listed columns are kept.
wes.priority.cn.seg.profiles <- taigr::load.from.taiga(data.name='segmented-cn-wes-prioritzed-7fe1', data.file='wes.19Q3.segmented') %>%
  dplyr::select(DepMap_ID, Chromosome, Start, End, Num_Probes, Segment_Mean, Source)

## If want to reprocess something

In [None]:
#%%R
#segments_unfiltered <- readr::read_csv('temp/wes.19Q3.segmented.cn')
# (Optional shortcut, left commented out: reload a previously written segment
# table instead of recomputing it.)

In [None]:
# NOTE(review): `prevversion` is not used anywhere else in the visible notebook.
prevversion = '5c76b219-404f-479a-9b7c-7cf5699fc161_aggregate_CN_segments_wrkflw_c0119f63-73d2-4f47-a6ec-88f24f74f3f2_call-aggregate_CN_segments_19Q3.called.seg'

In [None]:
%%R
# Build the segment table for the new samples by chaining the helper steps,
# then combine it with the previous release's profiles.
segments <- processSegments(new_samples_copy_number_broad_wes)
segments <- filterForCCLE(segments)
# interpolateGapsInSegmented() returns a list; only its `segs` element is kept.
segments <- interpolateGapsInSegmented(segments)$segs
# NOTE(review): the cytoband path is hard-coded here and differs from
# hg38_cyto_band_reference ('data/hg38_cytoband.gz') set in the settings cell
# -- confirm which one is correct.
segments <- extendEndsOfSegments(segments,'../JKBio/data/hg38_cytoband.gz')
print(segments)
print(head(wes.priority.cn.seg.profiles))
segments_unfiltered <- reprioritizeData(segments, wes.priority.cn.seg.profiles)
# Fill in the gaps on the entire dataset
# Extend start sites to 1, end sites to the end of the chromosome?

In [None]:
%%R
# Paths handed to filterBlackListedLine() in the next cell.
embargoed <- "WES_embargoed.txt"
blacklisted <- "blacklist.txt"

In [None]:
%%R
# TODO: IF seqnames (CHR) are 1-9 values, append "chr" in front of each! (bulk)
# ifelse(grepl("[0-9]+", new_copy_number$ChrChromosome), "chr"+, "no")
# Both calls start from segments_unfiltered, so the two results are independent.
# NOTE(review): neither result is written to disk or used by a later visible cell.
segments_blacklisted <- filterBlackListedLine(filepath=blacklisted,segments_unfiltered)
segments_embargoed <- filterBlackListedLine(filepath=embargoed,segments_unfiltered)

In [None]:
%%R
# Save the merged segment table as CSV.
# NOTE(review): the file is written with a ".cn1" suffix, while the Python
# validation cell reads a ".segmented.cn" file -- confirm the intended name.
write.table(
  segments_unfiltered,
  file = paste0("temp/wes.", release, ".segmented.cn1"),
  sep = ',', quote = FALSE, row.names = FALSE)

In [None]:
%%R
# Gene annotation table used to build the gene-level matrix.
entrezgenes <- generateEntrezGenes()

In [None]:
%%R
head(corner(entrezgenes))

In [None]:
%%R
res <- generateGeneLevelMatrixFromSegments(entrezgenes, segments_unfiltered)

In [None]:
%%R
# Only the hg38 gene-level element of the result is used.
genematrix_unfiltered <- res$gene_level_data_hg38
corner(genematrix_unfiltered)

In [None]:
%%R
# Row names are written here (row.names = TRUE), unlike the segment table.
# TRUE/FALSE are spelled out because T and F are ordinary, reassignable variables.
write.table(
  genematrix_unfiltered,
  file = paste0('temp/wes.', release, '.gene.cn'),
  sep = ',', quote = FALSE, row.names = TRUE)

## Validation step

In [None]:
# NOTE(review): the R cells write 'temp/wes.19Q3.gene.cn'; with this suffix the
# reads below look for 'temp/wes.19Q3.blacklisted.*' files that no visible
# cell creates -- confirm how those files are produced.
release="19Q3.blacklisted"

In [None]:
genecn = pd.read_csv('temp/wes.'+release+'.gene.cn', sep = ',')

In [None]:
# Drop the first column of the loaded table.
genecn = genecn[genecn.columns[1:]]

In [None]:
# Inspect one cell line. "Unnamed: 0.1" is a pandas auto-generated name for a
# column whose header was empty in the CSV.
genecn[genecn["Unnamed: 0.1"]=="ACH-000658"]

In [None]:
# Previously released gene-level table, for comparison.
wes_19Q3_gene = tc.get(name='segmented-cn-wes-prioritzed-7fe1', version=33, file='wes.19Q3.gene')

In [None]:
wes_19Q3_gene.loc['ACH-000658']

In [None]:
prevsegmentcn = pd.read_csv('temp/cnv_ccle.called.seg', sep = '\t')
segmentcn = pd.read_csv('temp/wes.'+release+'.segmented.cn', sep = ',')

In [None]:
# Keep only the text after the last underscore of each identifier.
genecn.index = [i[-1] for i in genecn.index.str.split('_').tolist()]
segmentcn["DepMap_ID"] = [i[-1] for i in segmentcn["DepMap_ID"].str.split('_').tolist()]

In [None]:
# Read both exclusion lists, one entry per line with the trailing newline
# stripped. `with` closes each file handle as soon as it has been read; the
# original left both handles open.
with open("WES_embargoed.txt") as embargoed_file:
    embargoed = [line.rstrip('\n') for line in embargoed_file]
with open("blacklist.txt") as blacklisted_file:
    blacklisted = [line.rstrip('\n') for line in blacklisted_file]

In [None]:
# Compare sample identifiers between an earlier taiga version and the new tables.
prev = set(tc.get(name='segmented-cn-wes-prioritzed-7fe1', version=27, file='wes.19Q3interim.gene').index.values.tolist())
new1 = set(genecn.index.values.tolist())
# NOTE(review): this reads a 'Sample' column, while other cells address
# segmentcn by 'DepMap_ID' -- confirm both columns exist in the file.
new2 = set(segmentcn['Sample'].values.tolist())
# Prints: previous count, overlap with the gene table, gene-table count,
# and overlap between the gene and segment tables.
print(len(prev), len(prev & new1), len(new1), len(new1 & new2))

In [None]:
checkAmountOfSegments(segmentcn,thresh = 750)

In [None]:
checkGeneChangeAccrossAll(genecn, thresh=1.5)

In [None]:
# Unique sample names from the segment table (overwritten two cells below).
newsamples =  list(set(segmentcn["Sample"].tolist()))

In [None]:
segmentcn[segmentcn["DepMap_ID"]=="ACH-000658"]

In [None]:
# Replaces the list built above with the samples of the workspace sample set.
newsamples= refwm.get_sample_sets().loc[sample_set_id].samples

In [None]:
# Download the segment plot of every new sample into temp/.
samples = refwm.get_samples()
plots = samples.loc[samples.index.isin(newsamples)]["modeled_segments_plot_tumor"].values
for plot in plots:
    ! gsutil cp $plot temp/

In [None]:
# Show each downloaded plot, located by the last component of its remote path.
for plot in plots:
    display(Image('temp/'+plot.split('/')[-1]))

## RNA pipeline

In [None]:
%%R
# Release label used in the expression output file names.
release <- '19Q3'

In [None]:
%%R
library('taigr')

In [None]:
%%R
# Run the shared library/annotation setup script.
source('../JKBio/gkugener/RScripts/load_libraries_and_annotations.R')

In [None]:
%%R
# Post-processing helpers (presumably where readCounts()/readTPM() and the
# other helpers called below are defined -- confirm).
source('CCLE_postp_function.R')

In [None]:
res = refwm.get_sample_sets().loc['all_samples']
res

In [None]:
res = refwm.get_sample_sets().loc['All_samples']
rsem_genes_expected_count = res['rsem_genes_expected_count']
rsem_genes_tpm = res['rsem_genes_tpm']
rsem_transcripts_tpm = res['rsem_transcripts_tpm']
! gsutil cp $rsem_genes_expected_count "temp/" & gsutil cp $rsem_genes_tpm "temp/" & gsutil cp $rsem_transcripts_tpm "temp/"

In [None]:
%%R
# alternative one
# NOTE(review): if both cells are run in order, the next cell overwrites this
# list, so these paths are only used when the next cell is skipped.
download_paths <- list(
  counts_genes='temp/DM19Q2_PATHS_CORRECTED_V2.rsem_genes_expected_count.txt.gz',
  tpm_genes='temp/DM19Q2_PATHS_CORRECTED_V2.rsem_genes_tpm.txt.gz',
  tpm_transcripts='temp/DM19Q2_PATHS_CORRECTED_V2.rsem_transcripts_tpm.txt.gz')

In [None]:
%%R
# These files are downloaded from FireCloud/Terra
# Same three keys as the list above, pointing at different file names.
download_paths <- list(
  tpm_genes='temp/expression.genes.tpm.txt.gz',
  tpm_transcripts='temp/expression.transcript.tpm.txt.gz',
  counts_genes='temp/expression.expectedcount.txt.gz')

In [None]:
%%R
# Load the three expression tables from the paths selected above.
tpm_transcripts <- readTranscripts(download_paths$tpm_transcripts)
counts_genes <- readCounts(download_paths$counts_genes)
tpm_genes <- readTPM(download_paths$tpm_genes)

In [None]:
%%R
# Normalise the column names of each table with renameFunction()
# (for cases where CCLE_name and DepMap_ID values are mixed).
colnames(counts_genes) <- renameFunction(colnames(counts_genes))
colnames(tpm_genes) <- renameFunction(colnames(tpm_genes))
colnames(tpm_transcripts) <- renameFunction(colnames(tpm_transcripts))

### data exploration and QC

In [None]:
%%R
# Spot-check one sample column.
tpm_genes[,'ACH-001767']

In [None]:
%%R
# Quick QC
# We are looking for samples with a worrying amount of zeros
zero_threshold <- 39000
# Count exact zeros per sample column (columns 3 onward; the first two hold
# gene/transcript identifiers). sum(..., na.rm = TRUE) counts only true zeros;
# the original length(x[x == 0]) also counted every NA as a zero.
number_zeros <- apply(
  tpm_genes[, 3:ncol(tpm_genes)], 2,
  FUN = function(x) sum(x == 0, na.rm = TRUE))
nzdf <- data.frame(CL=names(number_zeros), nz=number_zeros, stringsAsFactors = FALSE)

In [None]:
%%R
# Sort by decreasing zero count, then keep samples strictly below the threshold.
number_zeros <- number_zeros[order(-number_zeros)]
number_zeros <- number_zeros[number_zeros < zero_threshold]
pass <- number_zeros %>% names()

# These samples failed
failed <- setdiff(colnames(tpm_genes), pass) %>% .[!(. %in% c('gene_id', 'transcript_id(s)'))]

# Restrict all three tables to the identifier columns plus the passing samples.
counts_genes %<>% dplyr::select(c("gene_id","transcript_id(s)", pass))
tpm_genes %<>% dplyr::select(c("gene_id","transcript_id(s)", pass))
tpm_transcripts %<>% dplyr::select(c("transcript_id", "gene_id", pass))

In [None]:
%%R
failed

In [None]:
%%R 

In [None]:
%%R
# Plot of the samples that fail
# Histogram of zero counts per sample with the threshold as a dashed line.
# Labels use `>=` so that every failed sample is labelled: the pass test is
# `nz < zero_threshold`, so a sample exactly at the threshold fails too.
plot <- ggplot(nzdf, aes(nz)) +
  geom_histogram(bins = 100, color='black', fill='white') +
  geom_vline(xintercept = zero_threshold, linetype=2) +
  geom_label_repel(data = nzdf %>% filter(nz >= zero_threshold), aes(x=nz, y=0, label=CL), size=5, fill=rgb(1,1,1,0.5))

ggsave(plot, filename ='temp/ggplot.png', width=20, height = 20)

In [None]:
# Show the saved figure in the notebook.
display(Image(filename='temp/ggplot.png'))

In [None]:
%%R
# Write the three QC-filtered expression tables as TSV, without row names.
# The gene-level TPM file is named '.genes.tsv': the original omitted the
# leading dot (producing 'expression.19Q3genes.tsv'), which matched neither the
# Python reader below nor the '.genes.tsv' file the notebook saves later.
write.table(
  counts_genes,
  file = paste0('temp/expression.', release, '.counts.tsv'),
  sep = '\t', row.names = FALSE, quote = FALSE)
write.table(
  tpm_genes,
  file = paste0('temp/expression.', release, '.genes.tsv'),
  sep = '\t', row.names = FALSE, quote = FALSE)
write.table(
  tpm_transcripts,
  file = paste0('temp/expression.', release, '.transcripts.tsv'),
  sep = '\t', row.names = FALSE, quote = FALSE)

# Validation

In [None]:
# NOTE(review): the Python `release` was last set to "19Q3.blacklisted" in the
# copy-number validation cells; it must equal the R value ('19Q3') for these
# paths to match the files written above.
counts_genes = pd.read_csv('temp/expression.'+ release + '.counts.tsv', sep='\t')

In [None]:
# Read back the gene-level TPM table under the same name the R cell writes
# (the original looked for a '.tpm.tsv' file that no cell produces).
tpm_genes = pd.read_csv('temp/expression.'+ release + '.genes.tsv', sep='\t')
tpm_transcripts = pd.read_csv('temp/expression.'+ release + '.transcripts.tsv', sep='\t')

In [None]:
DM19Q2counts = tc.get(name='depmap-expression-87f8', version=7, file='DM19Q2.counts')
DM19Q2tpm = tc.get(name='depmap-expression-87f8', version=7, file='DM19Q2.tpm')
DM19Q2transcripts = tc.get(name='depmap-expression-87f8', version=7, file='DM19Q2.transcripts')

In [None]:
tpm_genes['ACH-000309'] = DM19Q2tpm['ACH-000309']
tpm_genes['ACH-001852'] = DM19Q2tpm['ACH-001852']
tpm_transcripts['ACH-001852'] = DM19Q2transcripts['ACH-001852']
tpm_transcripts['ACH-000309'] = DM19Q2transcripts['ACH-000309']
counts_genes['ACH-000309'] = DM19Q2counts['ACH-000309']
counts_genes['ACH-001852'] = DM19Q2counts['ACH-001852']

In [None]:
tpm_genes.to_csv('temp/expression.'+ release + '.genes.tsv', sep='\t')
tpm_transcripts.to_csv('temp/expression.'+ release + '.transcripts.tsv', sep='\t')
counts_genes.to_csv('temp/expression.'+ release + '.counts.tsv', sep='\t')

In [None]:
# Boolean mask of column names containing 'ibm'.
counts_genes.columns.str.contains('ibm')

In [None]:
prev = tc.get(name='depmap-expression-87f8', version=7, file='DM19Q2.tpm')

In [None]:
# log2(x + 1) transform of every column from the third onward, in memory only
# (the files were already saved in an earlier cell).
tpm_genes[tpm_genes.columns[2:]] = tpm_genes[tpm_genes.columns[2:]].apply(lambda x: np.log2(x+1))

In [None]:
# Column-name sets; `prev` is rebound from the DataFrame to its column set.
new1 = set(counts_genes.columns.values.tolist())
prev = set(prev.columns.values.tolist())

In [None]:
new3 = set(tpm_transcripts.columns.values.tolist())
new2 = set(tpm_genes.columns.values.tolist())

In [None]:
# Prints set sizes and overlaps, then the columns only in the new TPM table
# and the columns only in the previous release.
print(len(prev), len(prev & new2), len(new1), len(new1 & new2 & new3), new2-prev, prev-new2)

In [None]:
metadata = tc.get(name='internal-19q2-9504', version=24, file='sample_info')

In [None]:
metadata

In [None]:
# Train set: 'ACH' columns present in both the new counts and the previous
# release. Test set: 'ACH' columns present only in the new counts.
trainame = [val for val in new1.intersection(prev) if val.startswith('ACH')]
testname = [val for val in new1.difference(prev) if val.startswith('ACH')]

# Rank genes by their variance across the training samples and keep the
# positions of the `genetolookfor` most variable ones.
genetolookfor = 2000
gene_var = counts_genes[trainame].var(axis=1).values
print(len(gene_var))
sorting = np.argsort(gene_var)[-genetolookfor:]

In [None]:
# Test samples whose ID does not appear in the metadata DepMap_ID column.
unregistered = set(testname) - set(metadata["DepMap_ID"].values.tolist())
unregistered

In [None]:
# Number of zero entries in the counts column of one sample.
len(counts_genes['ACH-001767']) - np.count_nonzero(counts_genes['ACH-001767'].values)

In [None]:
# creating and reordering train and test sets
# Column selections are built once as ordered lists so the data rows and
# their labels are guaranteed to line up. The original indexed the DataFrame
# with a `set`, which has no defined order and is rejected as an indexer by
# recent pandas versions.
traincols = [val for val in trainame if val not in unregistered]
testcols = [val for val in testname if val not in unregistered]

# Rows of each matrix are samples; columns are the genes selected by `sorting`.
traindata = counts_genes[traincols].values[sorting].T
trainlabels = [metadata[metadata["DepMap_ID"]==val]["disease"].values[0] for val in traincols]

testdata = counts_genes[testcols].values[sorting].T
testlabels = [metadata[metadata["DepMap_ID"]==val]["disease"].values[0] for val in testcols]

In [None]:
traindata.shape

In [None]:
# learn KNN classifier to the metadata diseases
neigh = KNeighborsClassifier(n_neighbors=5)
neigh.fit(traindata, trainlabels) 
predicted = neigh.predict(testdata)

In [None]:
predicted[1]

In [None]:
# Stack train and test for plotting: train points get colour 0.
labels = trainlabels + testlabels
colors=[0]*len(trainlabels)
# NOTE(review): the test colours are a hard-coded list of 11 values; it must be
# edited by hand whenever the number or order of test samples changes.
colors.extend([1,2,2,2,2,1,2,2,2,1,2])
data = np.vstack([traindata,testdata])

In [None]:
# plot them with TSNE, highlight the points that failed and show colors for diseases
dimred = TSNE(2,10).fit_transform(data)

In [None]:
scatter(dimred, labels=labels,colors=colors, radi=1.9)

## Fusion post processing

In [None]:
# NOTE(review): `submission_id2` is only assigned two cells below; on a fresh
# top-to-bottom run this cell raises NameError unless an earlier part of the
# notebook (outside this excerpt) defines it.
terra.wait_for_submission(submission_id2)

In [None]:
# Fetch the workflow configuration for fusion aggregation.
aggregate = refwm.get_config('Aggregate_Fusion_Calls')
aggregate

In [None]:
# Push the configuration back and launch it on the 'All_samples' set.
refwm.update_config(aggregate)
submission_id2 = refwm.create_submission(aggregate['name'], 'All_samples')

In [None]:
terra.wait_for_submission(submission_id2)

In [None]:
refwm.get_sample_sets().loc['All_samples']['fusions_star']

In [None]:
! gsutil cp $aggregated "temp/expression.fusion.tsv"

### Overview

This document contains the code used to generate the unfiltered and filtered versions of the fusion datasets for the release. The bottom of the document also contains some comparisons between the release fusion dataset, CCLE2 fusion calls, and the translocation data from CCLE2.

In [None]:
%%R
# Run the shared setup script and the post-processing helpers, then set the
# fusion input path and the release label used in output file names.
source('../JKBio/gkugener/RScripts/load_libraries_and_annotations.R')
source("CCLE_postp_function.R")
filepath <- 'temp/expression.fusion.tsv'
release <- '19Q3'

## Generate filtered fusion table

Release: `r release`

We want to apply filters to the fusion table to reduce the number of artifacts in the dataset. Specifically, we filter the following:

* Remove fusions involving mitochondrial chromosomes, or HLA genes, or immunoglobulin genes
* Remove red herring fusions (from STAR-Fusion annotations column)
* Remove recurrent in CCLE (>= 25 samples)
* Remove fusion with (SpliceType=" INCL_NON_REF_SPLICE" and LargeAnchorSupport="No" and FFPM < 0.1)
* Remove fusions with FFPM < 0.05 (STAR-Fusion suggests using 0.1, but looking at the translocation data, this looks like it might be too aggressive)

In [None]:
%%R
# Read the aggregated fusion calls, then derive the filtered table from them.
unfiltered_fusions <- readFusions(filepath)
filtered_fusions <- filterFusions(unfiltered_fusions)

In [None]:
%%R
# Save the files (to be uploaded to taiga)
# TRUE/FALSE are spelled out because T and F are ordinary, reassignable variables.
write.table(
  unfiltered_fusions,
  file = paste0('temp/fusions.', release, '.unfiltered.tsv'),
  sep = '\t', quote = FALSE, row.names = FALSE
)
write.table(
  filtered_fusions,
  file = paste0('temp/fusions.', release, '.filtered.tsv'),
  sep = '\t', quote = FALSE, row.names = FALSE
)

# Validation

## Validation Protocol:

To validate fusions, one should be able to list all cell lines with known fusions (e.g. Ewing sarcoma) and, for each new cell line in this set of known-fusion cell lines, check whether the expected fusion is present. Fusion quality is validated this way.

In [None]:
# check that all cell lines are present in the unfiltered fusion table
df = pd.read_csv('temp/fusions.'+release+'.unfiltered.tsv', sep='\t')
# For each unique DepMap_ID value, keep the first 10 characters of the piece
# after the first underscore.
new = set([i.split('_')[1][:10] for i in list(set(df["DepMap_ID"].tolist()))])
# NOTE(review): `prev` is whatever the RNA validation cells left behind (a set
# of previous-release expression column names) -- confirm that is the intended
# reference set here.
print(prev - new, new-prev)

In [None]:
# TODO (not implemented): remove fusions from the samples that failed the expression QC threshold


In [None]:
unfiltered = pd.read_csv('temp/fusions.'+release+'.unfiltered.tsv', sep='\t')
filtered = pd.read_csv('temp/fusions.'+ release+ '.filtered.tsv',sep='\t')

In [None]:
# Rewrite each ID as 'ACH' + the text between '_ACH' and the next '.'.
unfiltered["DepMap_ID"] = ['ACH' + i.split('_ACH')[1].split('.')[0] for i in unfiltered["DepMap_ID"]]

In [None]:
# Same ID rewrite for the filtered table.
filtered["DepMap_ID"] = ['ACH' + i.split('_ACH')[1].split('.')[0] for i in filtered["DepMap_ID"]]

In [None]:
q1filtered = tc.get(name='depmap-fusions-7990', version=1, file='fusions.19q1.filtered')
q1unfiltered = tc.get(name='depmap-fusions-7990', version=1, file='fusions.19q1.unfiltered')

In [None]:
filtered

In [None]:
q1filtered

In [None]:
filtered = pd.concat([filtered,q1filtered[q1filtered['DepMap_ID']=='ACH-000309']], sort=False)
unfiltered = pd.concat([unfiltered,q1unfiltered[q1unfiltered['DepMap_ID']=='ACH-000309']], sort=False)

In [None]:
unfiltered.to_csv('temp/fusions.'+release+'.unfiltered.tsv', sep='\t')
filtered.to_csv('temp/fusions.'+ release+ '.filtered.tsv',sep='\t')

## Mutation pipeline

In [None]:
%%R
# Every other section of this notebook sources the setup script from
# '../JKBio/gkugener/...'; this cell used '../gkugener/...'. Both paths are
# aligned with the rest of the notebook.
# NOTE(review): confirm that DM_OMICS/Annotations.RData also lives under
# ../JKBio/gkugener.
source('../JKBio/gkugener/RScripts/load_libraries_and_annotations.R')
load('../JKBio/gkugener/DM_OMICS/Annotations.RData')
# There are some cell lines the celllinemapr does not know how to map so we need to load this data object for now (from old datasets)

In [None]:
%%R
source('CCLE_postp_function.R')
release <- '19Q3'

In [None]:
%%R
library(tidyverse)
library(data.table)
library(magrittr)
library(taigr)
library(celllinemapr) # To pull out DepMap_IDs from CCLE_names where needed
# Path of the merged MAF file; the next cell rebinds this name to the loaded data.
newly_merged_maf <- 'temp/mutation.mergedmaf.txt'

In [None]:
%%R
# NOTE(review): the variable changes from a path to the loaded table here, so
# re-running this cell alone passes the table back into readMutations().
newly_merged_maf <- readMutations(newly_merged_maf)

In [None]:
%%R
new_release <- createSNPs(newly_merged_maf)

In [None]:
%%R
names(new_release)

### (1.5) Quick adding

Here, rather than rerunning the entire analysis, because we know we are adding only WES samples, we can download the previous release's MAF, add the samples, update any annotations, and perform any global filters at the end.

First we need to do an additional step of filtering on coverage and number 

In [None]:
%%R
## check if some rows have nans
# Number of rows whose Hugo_Symbol is NA.
length(which(is.na(new_release$Hugo_Symbol)))

In [None]:
%%R
dim(new_release)

In [None]:
%%R
# The result is used as a list with `merged` and `removed_from_maf` elements.
filtered <- filterAllelicFraction(new_release)

In [None]:
%%R
# Second filter, fed with both elements of the previous result.
filtered <- filterMinCoverage(filtered$merged, filtered$removed_from_maf)

In [None]:
%%R
# dplyr::rename(new = old): the column Tumor_Seq_Allele1 becomes
# Tumor_Seq_Allele2. The original had a stray comma after `filtered$merged`,
# which is a syntax error.
filtered$merged %<>% dplyr::rename(Tumor_Seq_Allele2=Tumor_Seq_Allele1)

In [None]:
%%R
# Save the ready to upload file to upload to taiga
# The original call had a doubled comma after the data argument, leaving the
# `file` slot empty and shifting the path onto the next positional parameter.
# The path is now passed explicitly as `file`.
write.table(
  filtered$merged,
  file = paste0('temp/mutations.', release, '.tsv'),
  sep = '\t', quote = FALSE, row.names = FALSE)

# Validation

### uploads