# Preprocessing and loading data

In [None]:
from __future__ import print_function

from depmapomics.config import *

from depmapomics import loading, tracker
from depmapomics import mutations as omics_mut
from depmapomics import copynumbers as omics_cn
from genepy import terra
from genepy.utils import helper as h

import dalmatian as dm
from bokeh.plotting import output_notebook

# Load IPython's autoreload extension and switch it to mode 2.
%load_ext autoreload
%autoreload 2

# Call bokeh's output_notebook() (imported above) to direct Bokeh output to the notebook.
output_notebook()

## Adding new data

We are looking for new samples in a range of workspaces.

They are quite messy and might contain duplicates, broken file paths, etc.

- We thus look at the bam files one by one and compare them with our own bams.
- We remove broken files and duplicates, and add the new version of a cell line's bam if we find one.

In [None]:
# Only runs when the isCCLE flag is truthy (presumably provided by the
# depmapomics.config star-import — confirm).
if isCCLE:
    print("loading new WGS data")
    # Result is kept in `wgssamples`, which the following cells read and %store.
    wgssamples = loading.loadWGS(SAMPLESETNAME)

In [None]:
if isCCLE:
    ref = tracker.getCCLETracker()
    print('samples without wgs:')
    print(set(LINES_TO_RELEASE) - (set(wgssamples.arxspan_id) | set(ref[ref.datatype=='wgs'].arxspan_id)))
    print('\nsamples without DNAseq:')
    print(set(LINES_TO_RELEASE) - (set(wgssamples.arxspan_id) | set(ref[ref.datatype.isin(['wes', 'wgs'])].arxspan_id)))
    %store wgssamples

In [None]:
# HERE APPLY ANY STEP TO CLEANUP LINES FUTHER (from)
%store -r rnasamples
%store -r wgssamples
###############################
rnasamples.loc["CDS-nOdwML", 'participant_id'] = "PT-Elndt09g"
rnasamples.loc[["CDS-ugq3tZ",
"CDS-MyY7Gc"], 'cellosaurus_id'] = ["U", "CVCL_D843"]

wgssamples.loc[["CDS-WXSgfq",
"CDS-uLf8M0",
"CDS-5pxach",
"CDS-pX3bvG",
"CDS-JsZ9Nh",
"CDS-BXIBwR",
"CDS-BEbnKg",
"CDS-xvtYY4"],"cellosaurus_id"] = ["CVCL_D842",
"CVCL_D843",
"CVCL_0B81",
"CVCL_8915",
"CVCL_5040",
"CVCL_W891",
"CVCL_5402",
"CVCL_1793"]
#################################
%store rnasamples
%store wgssamples

In [None]:
if isCCLE:
    print('sorting our patient_id for new samples on both WGS and RNAseq')
    %store -r rnasamples
    %store -r wgssamples
    for val in set(wgssamples.arxspan_id)&set(rnasamples.arxspan_id):
        r = rnasamples[rnasamples.arxspan_id == val] 
        w = wgssamples[wgssamples.arxspan_id == val]
        if len(set(r.participant_id) | set(w.participant_id)) >1:
            print("sorting out: "+val)
            v = r.participant_id[0]
            rnasamples.loc[r.index, 'participant_id'] = v
            wgssamples.loc[w.index, 'participant_id'] = v
    %store rnasamples
    %store wgssamples

In [None]:
# Display `samples` as the cell output.
# NOTE(review): `samples` is never assigned in the visible notebook (only
# `wgssamples` / `rnasamples` are) — confirm it is defined elsewhere, e.g. by
# the depmapomics.config star-import, or whether `wgssamples` was intended.
samples

In [None]:
if isCCLE:
    print("uploading samples to the tracker and Terra: " + SAMPLESETNAME)
    # Hand the prepared WGS sample table to loading.update for this sample set,
    # pointing it at the WGS bam bucket and the WGS reference workspace.
    loading.update(
        wgssamples,
        samplesetname=SAMPLESETNAME,
        stype="wgs",
        bucket="gs://cclebams/wgs/",
        refworkspace=WGSWORKSPACE,
    )

# run the pipeline

We use Dalmatian to send requests to Terra. We run a set of 5 functions to generate the mutation dataset:

*   For new samples in DepMap, run the ICE version of this task. CCLE2 samples used Agilent targets, so this pipeline should be used instead. The pipelines are identical in terms of their outputs, but the proper targets, baits, and pseudo normal should be used based on how the samples were sequenced.

    **ICE_CGA_Production_Analysis_Pipeline_Cell_Lines_copy** (cclf/CGA_Production_Analysis_Pipeline_Cell_Lines_debuggingSnapshot ID: 22) OR


    **AGILENT_CGA_Production_Analysis_Pipeline_Cell_Lines** (cclf/CGA_Production_Analysis_Pipeline_Cell_Lines_debuggingSnapshot ID: 22)

*   **common_variant_filter** (breardon/common_variant_filterSnapshot ID: 3)
*   **filterMAF_on_CGA_pipeline** (gkugener/filterMAF_on_CGA_pipelineSnapshot ID: 8)
*   **aggregateMAFs_selectFields** (ccle_mg/aggregateMAFs_selectFieldsSnapshot ID: 1)

The outputs to be downloaded will be saved in the sample set that was run. The output we use for the release is:


*   **passedCGA_filteredMAF_aggregated** 

There are several other tasks in this workspace. In brief:



*   **CGA_Production_Analysis_Pipeline_Cell_Lines** (lelagina/CGA_Production_Analysis_Pipeline_Cell_LinesSnapshot ID: 12). This task is the same as the ICE and AGILENT prefixed version above, except that it relied on pulling the baits and targets to use from the metadata stored for the samples. Having AGILENT and ICE versions specified made the uploading and running process easier.
*   **SANGER_CGA_Production_Analysis_Pipeline_Cell_Lines** (cclf/CGA_Production_Analysis_Pipeline_Cell_Lines_debuggingSnapshot ID: 22). This task was trying to run the CGA pipeline on the Sanger WES data, using a Sanger pseudo normal. In its current implementation, this task fails to complete for the samples.
*   **UNFILTERED_aggregateMAFs_selectFields** (ccle_mg/aggregateMAFs_selectFieldsSnapshot ID: 1). Aggregates the MAF outputted by the CGA cell line pipeline prior to the common variant filter and germline filtering tasks. This can give us insight to which mutations are getting filtered out when. We may want to potentially include this MAF in the release so people can see why certain mutations of interest may be getting filtered out.
*   WES_DM_Mutation_Calling_Pipeline_(standard |expensive) (gkugener/WES_DM_Mutation_Calling_PipelineSnapshot ID: 2). This was a previous mutation calling pipeline implemented for CCLE. We do not use this pipeline any more as the CGA pipeline looks better.
*   aggregate_filterMAF_CGA (CCLE/aggregate_filterMAF_CGASnapshot ID: 1). An aggregation MAF task that we used in the past. We do not use this task anymore.
*   calculate_mutational_burden (breardon/calculate_mutational_burdenSnapshot ID: 21). This task can be used to calculate the mutational rate of the samples. We do not make use of this data in the release although it could be of interest.
*   summarizeWigFile (breardon/summarizeWigFileSnapshot ID: 5). CCLF ran this task (might be necessary for the mutational burden task). For our workflow, we do not run it.

### cleaning workspaces

In [None]:
# Switch read by the cleanup cell below (`if doCleanup:`); set to False to skip it.
doCleanup=True

In [None]:
if doCleanup:
    print("cleaning workspaces")
    torm = await terra.deleteHeavyFiles(WGSWORKSPACE)
    h.parrun(['gsutil rm '+i for i in torm], cores=8)
    terra.removeFromFailedWorkflows(WGSWORKSPACE, dryrun=False, everythingFor=['Realign_WES_GATK4','Generate_uBAM_File_List','BamToUnmappedRGBams_MC','CGA_WES_CCLE_ICE','CGA_WES_CCLE_AGILENT'])

## On Terra

In [None]:
# TODO: update with latest workspace parameters from our repo

In [None]:
print("running Terra pipeline")
refwm = dm.WorkspaceManager(WGSWORKSPACE)
submission_id = refwm.create_submission("WGS_pipeline", SAMPLESETNAME, 'sample_set', expression='this.samples')
await terra.waitForSubmission(WGSWORKSPACE, submission_id)

In [None]:
# Submit the "WGS_aggregate" configuration on the 'all' entity, then await
# terra.waitForSubmission on the returned id.
# Relies on `refwm` created in the previous cell.
submission_id = refwm.create_submission("WGS_aggregate", 'all')
await terra.waitForSubmission(WGSWORKSPACE, submission_id)

### Save the workflow configurations used

In [None]:
# Save the WGS workspace via terra.saveWorkspace, targeting data/<SAMPLESETNAME>/WGSconfig/.
terra.saveWorkspace(WGSWORKSPACE,'data/'+SAMPLESETNAME+'/WGSconfig/')

## On local


### Copy Number

In [None]:
# Run the copy-number post-processing for this sample set. The two returned
# values are passed to omics_cn.ProcessForAchilles in the next cell.
wespriosegs, wgspriosegs = omics_cn.CCLEPostProcessing(samplesetname=SAMPLESETNAME)

### Achilles CN

In [None]:
# Feed the two outputs of omics_cn.CCLEPostProcessing (previous cell) into the
# Achilles copy-number processing for the same sample set.
omics_cn.ProcessForAchilles(wespriosegs, wgspriosegs, samplesetname=SAMPLESETNAME,)

### Somatic Mutations

In [None]:
# Run the somatic-mutation post-processing for this sample set.
# The return value (if any) is not captured here.
omics_mut.CCLEPostProcessing(samplesetname=SAMPLESETNAME)

In [None]:
# Call omics_mut.analyzeUnfiltered on the "allcurrent" sample set.
# NOTE(review): presumably this inspects the unfiltered MAF described in the
# pipeline notes above — confirm against the depmapomics implementation.
omics_mut.analyzeUnfiltered(allsampleset="allcurrent")

### Structural variants

### germline Mutations