# Loading the necessary packages

In [None]:
from __future__ import print_function

from depmapomics import constants
from depmapomics import env_config

from depmapomics import dm_omics
from depmapomics import mutations as omics_mut
from depmapomics import copynumbers as omics_cn
from depmapomics import fingerprinting as fp

from mgenepy import terra
import dalmatian as dm
from bokeh.plotting import output_notebook

# Load IPython's autoreload extension and select mode 2, so that edits to the
# imported project modules are picked up without restarting the kernel
# (see the IPython autoreload documentation for the exact reload semantics).
%load_ext autoreload
%autoreload 2

# Direct Bokeh plot output to this notebook.
output_notebook()

In [None]:
# Notebook-wide switches:
#   isCCLE    -- gates the cells wrapped in `if isCCLE:` below.
#   doCleanup -- gates the final workspace-cleanup cell.
isCCLE, doCleanup = True, False

# Loading new data

Currently, sequencing data for DepMap is generated by the Genomics Platform (GP) at the Broad, which deposits it into several different Terra workspaces. Therefore, the first step of this pipeline is to look at these workspaces and:

 - identify new samples by looking at the bam files and comparing them with the bams we have already onboarded
 - remove duplicates and ones with broken file paths
 - map files to profiles in Gumbo, if possible
 - onboard new samples and new versions of old cell lines if we find any

#### The following two cells scan the delivery workspaces and add new samples to gumbo. They are currently being reworked so that this step can be run regularly off-cycle.

In [None]:
# Currently working on running this step off-cycle
# if isCCLE:
#     print("loading new WGS data")
#     from depmap_omics_upload import loading
#     wgssamples, unmapped = loading.loadFromMultipleWorkspaces(WGSWORKSPACES, EXTRACT_DEFAULTS["sm_id"], "SMIDOrdered", "wgs", bamcol="cram_path")

In [None]:
# Currently working on running this step off-cycle
# if isCCLE:
#     from depmap_omics_upload import loading
#     # write samples to Sequencing table, copy bam files to internal storage bucket:
#     wgssamples, cmds = loading.addSamplesToGumbo(wgssamples, 'wgs', WGS_GCS_PATH, filetypes=["cram", "crai"])

#### All WGS sequencingIDs in gumbo that are not in the WGS terra workspace yet are considered "new" for the current release. Here we add them to the terra processing workspace as a sample set.

In [None]:
if isCCLE:
    from depmap_omics_upload import loading

    # Add the new WGS samples from gumbo to the WGS Terra workspace under
    # this release's sample set name, and also to the 'allcurrent' set.
    loading.addSamplesToDepMapWorkspace(
        'wgs',
        env_config.WGSWORKSPACE,
        samplesetname=constants.SAMPLESETNAME,
        add_to_samplesets=['allcurrent'],
    )

# Run SNP fingerprinting, new (rna + wgs) vs all existing samples

In [None]:
if isCCLE:
    # NOTE(review): rna_all and wgs_all are not assigned in any cell visible in
    # this notebook; they must be defined before this cell is run
    # (TODO confirm where they are expected to come from).
    # The call is awaited at top level, so it must run in a kernel that
    # supports top-level await (e.g. IPython/Jupyter).
    updated_lod_mat, mismatches, matches = await fp._CCLEFingerPrint(rna_all, wgs_all)

# Run pipeline on Terra

We are using Dalmatian to send requests to Terra. See [our readme](https://github.com/broadinstitute/depmap_omics/blob/master/documentation/DepMap_processing_pipeline.md) for detailed breakdown of the subtasks in our WGS pipeline.

For non-internal users, please make sure that your workspace is correctly set up.

To set up your workspace, follow the instructions in the README page.

In [None]:
print("running Terra pipeline")
refwm = dm.WorkspaceManager(env_config.WGSWORKSPACE)
submission_id = refwm.create_submission("WGS_pipeline", constants.SAMPLESETNAME, 'sample_set', expression='this.samples')
await terra.waitForSubmission(env_config.WGSWORKSPACE, submission_id)

In [None]:
submission_id = refwm.create_submission("Aggregate_CN_seg_files", 'all')
await terra.waitForSubmission(env_config.WGSWORKSPACE, submission_id)

### Save the workflow configurations used

In [None]:
# Save the workspace configuration under data/<SAMPLESETNAME>/WGSconfig/.
terra.saveWorkspace(
    env_config.WGSWORKSPACE,
    'data/' + constants.SAMPLESETNAME + '/WGSconfig/',
)

# Postprocessing on local


### Copy Number

In [None]:
if isCCLE:
    wespriosegs, wgspriosegs = await dm_omics.cnPostProcessing(samplesetname=constants.SAMPLESETNAME, wesrefworkspace=env_config.WESCNWORKSPACE, wgsrefworkspace=env_config.WGSWORKSPACE, dryrun=True)
else:
    segments, genecn, failed, purecn_segments, purecn_genecn, loh_status, feature_table = await omics_cn.postProcess(env_config.WGSWORKSPACE, sampleset=constants.SAMPLESETNAME)

### Somatic Mutations

In [None]:
if isCCLE:
    await dm_omics.mutationPostProcessing(wesrefworkspace=env_config.WESCNWORKSPACE, wgsrefworkspace=env_config.WGSWORKSPACE)
else:
    await omics_mut.postProcess(env_config.WGSWORKSPACE, samplesetname=constants.SAMPLESETNAME)

# Subset and upload

Based on release dates and embargo status in gumbo, subset and upload datasets for each release audience, and hand off to the portal team.

In [None]:
from depmap_omics_upload import tracker
from depmap_omics_upload import upload
from mgenepy.utils import helper as h

In [None]:
# Initialise the virtual datasets for this release's sample set; the result
# is kept in `virtual` and passed to the upload cells below.
virtual = upload.initVirtualDatasets(
    samplesetname=constants.SAMPLESETNAME,
)

In [None]:
# Call upload.uploadAuxTables, passing the `virtual` value assigned in the
# previous cell as taiga_ids.
upload.uploadAuxTables(taiga_ids=virtual)

In [None]:
# Build the model-level matrices from the configured numeric-matrix and
# table file mappings; guide matrices are not uploaded
# (upload_guide_matrices=False).
upload.makeModelLvMatrices(
    virtual_ids=virtual,
    files_nummat=env_config.LATEST2FN_NUMMAT_MODEL,
    files_table=env_config.LATEST2FN_TABLE_MODEL_BETA,
    upload_guide_matrices=False,
)

In [None]:
# Build the PR-level matrices from the configured numeric-matrix and table
# file mappings.
upload.makePRLvMatrices(
    virtual_ids=virtual,
    files_nummat=env_config.LATEST2FN_NUMMAT_PR,
    files_table=env_config.LATEST2FN_TABLE_PR_BETA,
)

### cleaning workspaces

In [None]:
if doCleanup:
    print("cleaning workspaces")
    torm = await terra.deleteHeavyFiles(env_config.WGSWORKSPACE)
    h.parrun(['gsutil rm '+i for i in torm], cores=8)
    terra.removeFromFailedWorkflows(env_config.WGSWORKSPACE, dryrun=False, everythingFor=['Realign_WES_GATK4','Generate_uBAM_File_List','BamToUnmappedRGBams_MC','CGA_WES_CCLE_ICE','CGA_WES_CCLE_AGILENT'])