# PureCN Curation
Notebook to select PureCN absolute copy number calls for manual curation.

William Colgan (wcolgan@broadinstitute.org)

In [None]:
# Load packages

import os
import subprocess

import dalmatian as dm
import numpy as np
import pandas as pd
pd.options.mode.chained_assignment = None

In [None]:
# Pipeline parameters
# Workspace identifier handed to dalmatian's WorkspaceManager
WORKSPACE = "broad-firecloud-ccle/DepMap_WES_CN_hg38"
# Samples with a goodness of fit below this value (and not non-aberrant) are flagged for curation
MIN_GOF = 70
# Samples with a ploidy above this value are flagged for curation
MAX_PLOIDY = 5
SAVE_PDF_DIR = "output/solutions/" # directory where all solution PDF files are saved
SAVE_TOCURATE_FILE = "to_curate.csv" # location of the csv file where solution indices are listed

## Select calls to Curate
Criteria for samples that need to be curated: (goodness of fit < MIN_GOF and the sample is not flagged NON-ABERRANT) or ploidy > MAX_PLOIDY.

In [None]:
# Connect to the workspace named by WORKSPACE and fetch its sample table
wm = dm.WorkspaceManager(WORKSPACE)
samples_df = wm.get_samples()
# Offline alternative: read the sample table from a local TSV file instead
#samples_df = pd.read_csv("~/Downloads/sample.tsv",sep = "\t")

In [None]:
# Goodness of fit: the first run of digits found in the PureCN comment.
# Comments with no digits (or missing comments) default to 100.
# expand=False returns a Series, so a plain column is assigned.
samples_df['PureCN_gof'] = (
    samples_df.PureCN_comment.str.extract(r'([0-9]+)', expand=False)
    .fillna(100)
    .astype(int)
)
# na=False: a missing comment must count as "not flagged". Without it
# str.contains returns NaN for missing values, and astype(bool) turns NaN
# into True, silently marking those samples as non-aberrant.
samples_df['Non_aberrant'] = (
    samples_df.PureCN_comment.str.contains("NON-ABERRANT", na=False).astype(bool)
)

In [None]:
# Keep only the rows whose ploidy is not the literal string 'NA'
has_ploidy = samples_df["PureCN_ploidy"] != 'NA'
samples_df = samples_df.loc[has_ploidy]

In [None]:
# A sample needs manual curation when either
#   * its goodness of fit is below MIN_GOF and it is not flagged non-aberrant, or
#   * its ploidy is above MAX_PLOIDY.
poor_fit = (samples_df.PureCN_gof < MIN_GOF) & ~samples_df.Non_aberrant
high_ploidy = samples_df.PureCN_ploidy.astype(float) > MAX_PLOIDY

# Explicit copy: the columns added below belong to to_curate only, and the
# assignments no longer depend on SettingWithCopyWarning being silenced.
to_curate = samples_df[poor_fit | high_ploidy].copy()
to_curate["PureCN_curated_solution"] = ""  # to be filled in by hand
to_curate["PureCN_failed"] = ""            # to be filled in by hand
to_curate["PureCN_curated"] = True

to_curate_df = to_curate.loc[:, ['PureCN_ploidy', 'PureCN_comment', 'PureCN_curated',
                                 'PureCN_curated_solution', 'PureCN_failed']]
# Saves lines that need to be manually curated as a csv file
# (the index is written as the first column)
to_curate_df.to_csv(SAVE_TOCURATE_FILE)

In [None]:
# Display the table of samples selected for manual curation
to_curate_df

## Download Solution PDFs

In [None]:
# Copy the solution PDF of every sample selected for curation into SAVE_PDF_DIR.
# Make sure the destination directory exists before copying into it.
os.makedirs(SAVE_PDF_DIR, exist_ok=True)

# Missing entries are skipped, as str.cat did in the shell-string version.
pdf_uris = to_curate.PureCN_solutions_pdf.dropna().tolist()

if pdf_uris:
    # Argument list instead of a concatenated shell string: no shell parsing
    # of the paths, so spaces or shell metacharacters cannot alter the command.
    gsutil_result = subprocess.run(["gsutil", "-m", "cp", *pdf_uris, SAVE_PDF_DIR])
    if gsutil_result.returncode != 0:
        # Report the failure rather than dropping the exit status.
        print("gsutil exited with status " + str(gsutil_result.returncode)
              + "; some solution PDFs may be missing from " + SAVE_PDF_DIR)
else:
    print("No solution PDFs to download")

# Manually inspect and update solutions 

Now that all the solution PDFs and "to_curate.csv" have been downloaded, open the solution PDFs locally and inspect them manually, following the guidelines here: https://docs.google.com/document/d/1Rte0xKK3ZE_UV6MWepdXRIbAehUJg8FuLaDckrWhPTQ/edit?usp=sharing. For every sample in to_curate.csv, fill in the PureCN_curated_solution column with the index of the most reasonable solution. Note that solution indices are 1-based. If none of the solutions looks right, set PureCN_failed = "TRUE" and PureCN_curated_solution = 0.

# Once PureCN_curated_solution is filled in for all samples in to_curate.csv, run the following cells to sync the manually curated columns to the Terra workspace

In [None]:
# Read the manually curated table back; its first column is used as the index.
curated = pd.read_csv(SAVE_TOCURATE_FILE, index_col=0)

curated_cols = ['PureCN_curated', 'PureCN_curated_solution', 'PureCN_failed']
# Remove any existing copy of the curated columns before joining:
#   * errors='ignore' avoids a KeyError when a column is not present yet;
#   * dropping all three avoids join's ValueError on overlapping column names,
#     which otherwise occurs when this cell is re-run or when the sample table
#     already carries these columns.
samples_df = samples_df.drop(columns=curated_cols, errors='ignore').join(
    curated[curated_cols])
# Samples that were not in the curation table get False for both flags.
samples_df['PureCN_curated'] = samples_df['PureCN_curated'].fillna(False)
samples_df['PureCN_failed'] = samples_df['PureCN_failed'].fillna(False)

In [None]:
# Drop the helper columns created in this notebook, then push the table to the workspace
helper_columns = ['PureCN_gof', 'Non_aberrant']
upload_df = samples_df.drop(columns=helper_columns)
wm.upload_samples(upload_df)
# Offline alternative: write the table to a local TSV file instead
#samples_df.drop(['PureCN_gof','Non_aberrant'], axis = 1).to_csv("~/Desktop/sample.tsv",sep = "\t",index=False)