# CCLE SNP Fingerprinting Pipeline
Author: William Colgan (wcolgan@broadinstitute.org)

In [None]:
# Load packages

import pandas as pd
import numpy as np
import dalmatian as dm
import subprocess
from taigapy import TaigaClient
from functools import reduce
from genepy.google import gcp
import asyncio
from genepy import terra

# Taiga client reused by the later cells that fetch and upload the LOD matrix
tc = TaigaClient()

In [None]:
# Pipeline parameters

# Scratch directory on the local machine for intermediate files
working_dir = "temp/"

# Google Cloud Storage locations:
#   fingerprints_dir - bucket folder containing fingerprints
#   vcf_list_dir     - bucket folder containing lists of vcf files
fingerprints_dir = "gs://fc-secure-6b6a3e1a-6fb8-4d30-b0df-a359e6c5d6e6/fingerprints/"
vcf_list_dir = "gs://fc-secure-6b6a3e1a-6fb8-4d30-b0df-a359e6c5d6e6/vcf_lists/"

# Batch size for crosscheck_vcf.
# If more than 200 bams are being run this should be decreased.
crosscheck_batch_size = 500
# Set to True to rebuild and re-upload the vcf batch list files
recreate_batch = False

# Terra workspace ("namespace/name") holding the fingerprinting workflows
WORKSPACE = "broad-firecloud-ccle/CCLE_SNP_QC"

# Columns of the sample tables that hold the bam / bai paths
bamcolname = ["legacy_bam_filepath", "legacy_bai_filepath"]
# Name of the column that receives the sample id (taken from the table index)
sid = "id"

# Name of the set submitted to the crosscheck workflow
allsampleset = "all_samples"

# Name of the sample set for this run; also used as the new vcf list file name
sampleset = "21Q3"

# Taiga dataset / file holding the fingerprint LOD matrix
taiga_dataset = "ccle-bam-fingerprints-6f30"
taiga_filename = "fingerprint_lod_matrix"

## Generate Fingerprint VCFs

Here we use Dalmatian to run the `fingerprint_bam_with_liftover` workflow on Terra. 
This workflow calls Picard ExtractFingerprint to generate a fingerprint VCF and then 
calls Picard LiftoverVcf to convert this VCF to hg38. To fingerprint hg38 BAM files, just run `fingerprint_bam` instead.

In [None]:
# Restore `rnasamples` and `wgssamples` from IPython's %store database.
# NOTE(review): presumably saved by an earlier notebook — confirm where they are produced.
%store -r rnasamples
%store -r wgssamples

In [None]:
# Stack the RNA and WGS sample tables, then keep only the bam/bai path columns.
fbams = pd.concat([rnasamples, wgssamples])
# Take an explicit copy: adding a column to a slice of `fbams` otherwise
# triggers pandas' SettingWithCopyWarning.
bams = fbams[bamcolname].copy()
# Expose the table index as a regular column named by `sid`
bams[sid] = bams.index
bams # a [bamcolname]+[id] dataframe

In [None]:
# Create batch files listing all vcfs in fingerprints dir and upload to bucket
# (NEW VERSION ONLY) only needed when the batches have to be recreated
#
# NOTE(review): `vcf_list` is not assigned anywhere in this cell (both candidate
# definitions below are commented out). It must be defined before running this
# cell with recreate_batch = True, otherwise a NameError is raised.

if recreate_batch:
    #vcf_list = gcp.lsFiles([vcf_list_dir])
    #vcf_list = wm.get_samples()["fingerprint_vcf"].tolist()
    batches = []
    # Walk the vcf list in windows of `crosscheck_batch_size` entries
    for i, start in enumerate(range(0, len(vcf_list), crosscheck_batch_size)):
        batch_path = working_dir + "vcf_batch_" + str(i)
        # Context manager guarantees the file is closed even if the write fails
        with open(batch_path, 'w') as f:
            f.write("\n".join(vcf_list[start:start + crosscheck_batch_size]))
        batches.append(batch_path)
    # Upload every local batch file to the vcf list folder of the bucket
    gcp.cpFiles(batches, vcf_list_dir)

In [None]:
# Upload sample sheet
wm = dm.WorkspaceManager(WORKSPACE).disable_hound()

# One row per bam: bam path, bai path, and the sample id reused as participant id
sheet_columns = ["bam_filepath", "bai_filepath", "sample_id", "participant_id"]
sheet_values = bams[bamcolname + [sid, sid]].values
samples_df = pd.DataFrame(sheet_values, columns=sheet_columns)
samples_df = samples_df.set_index('sample_id')

# Register the samples in the workspace, then group them under `sampleset`
wm.upload_samples(samples_df, add_participant_samples=True)
wm.update_sample_set(sampleset, samples_df.index)

In [None]:
# Submit jobs 
#submission_id = wm.create_submission("fingerprint_bam_with_liftover", sampleset, 'sample_set', expression='this.samples')
await terra.waitForSubmission(WORKSPACE, submission_id)

## Crosscheck Fingerprint VCFs

Here we use Dalmatian to run the `crosscheck_vcfs` workflow on Terra. This workflow calls Picard CrosscheckFingerprints to compare the new fingerprint VCFs to batches of existing fingerprint VCFs in `fingerprints_dir`.

In [None]:
# Create list with new vcfs and upload to bucket
f = open(working_dir + sampleset, 'w')
f.write(('\n').join(wm.get_samples().loc[samples_df.index, 'fingerprints'].tolist()))
f.close()
gcp.cpFiles(working_dir + sampleset, vcf_list_dir)
rm = working_dir + sampleset
! rm $rm

In [None]:
# Upload sample sheet
if recreate_batch:
    # `batches` holds local paths (working_dir + file name). Entity ids and
    # bucket URLs must use the bare file name, as the else-branch does for
    # `sampleset`, otherwise they would contain the local "temp/" prefix.
    batch_names = [path.rsplit("/", 1)[-1] for path in batches]
    sample_group_df = pd.DataFrame(data={
        "entity:sample_group_id": batch_names,
        "vcf_group": [vcf_list_dir + name for name in batch_names],
    }).set_index('entity:sample_group_id')
else:
    # Single group pointing at the list of vcfs created for this run
    sample_group_df = pd.DataFrame(data={
        "entity:sample_group_id": [sampleset],
        "vcf_group": [vcf_list_dir + sampleset],
    }).set_index('entity:sample_group_id')
#in case it does not work
sample_group_df.to_csv("../temp.tsv", sep='\t')
# Show the sample groups already present before uploading the new one(s)
print(wm.get_entities('sample_group').index.tolist())
wm.upload_entities("sample_group", sample_group_df)
#wm.update_entity_set(allsampleset, wm.get_entities('sample_group').index)

In [None]:
# Submit jobs
#conf = wm.get_config("crosscheck_vcfs")
conf['inputs']['crosscheck.run_crosscheck.vcf_second_input_file'] = '"'+vcf_list_dir+sampleset+'"'
wm.update_config(conf)
submission_id = wm.create_submission("crosscheck_vcfs", allsampleset, 'sample_set',expression='this.samples')
await terra.waitForSubmission(WORKSPACE, submission_id)

## Update LOD matrix

Here we update the fingerprint LOD matrix on Taiga with the new fingerprints.

In [None]:
# Generate matrix with LOD score for new fingerprint vcfs
# Paths of the crosscheck output tables, one per sample group. Stored under a
# dedicated name so the `samples_df` sample sheet is not overwritten by a list.
crosscheck_outputs = wm.get_entities("sample_group")['cross_checks_out'].tolist()
# Each table is long-format (one row per sample pair); pivot it into a
# LEFT_SAMPLE x RIGHT_SAMPLE matrix of LOD scores. Lines starting with '#' are skipped.
new_lod_list = [
    pd.read_csv(path, sep='\t', comment='#').pivot(
        index="LEFT_SAMPLE", columns="RIGHT_SAMPLE", values="LOD_SCORE")
    for path in crosscheck_outputs
]
# Stack the per-group matrices row-wise, then transpose so RIGHT_SAMPLE ids index the rows
new_lod_mat = pd.concat(new_lod_list)
new_lod_mat.index.name = None
new_lod_mat = new_lod_mat.T

In [None]:
# Update LOD matrix ( have to update (A+a)*(B+b) = (AB)+(aB)+(Ab)+(ab))
prev_lod_mat = tc.get(name=taiga_dataset, file=taiga_filename)

# Use ordered, de-duplicated lists rather than sets: pandas >= 2.0 rejects set
# indexers in .loc, and lists give a deterministic row/column order.
new_ids = list(dict.fromkeys(new_lod_mat.index))
new_id_lookup = set(new_ids)
# Previously known samples that are not being replaced by a new fingerprint
old_ids = [i for i in dict.fromkeys(prev_lod_mat.index) if i not in new_id_lookup]
all_ids = old_ids + new_ids

# Rows: old samples then new samples; columns: old samples  -> (AB) over (aB)
updated_lod_mat = pd.concat(
    (prev_lod_mat.loc[old_ids, old_ids], new_lod_mat.loc[new_ids, old_ids]),
    axis=0,
)
# Append the columns for the new samples                     -> (Ab) and (ab)
updated_lod_mat = pd.concat(
    (updated_lod_mat.loc[all_ids, old_ids], new_lod_mat.transpose().loc[all_ids]),
    axis=1,
)
# Saved locally; this is the file later uploaded to Taiga
updated_lod_mat.to_csv(working_dir+taiga_filename+'.csv')

In [None]:
from depmapomics import tracker

In [None]:
# Load the CCLE sample tracker and add the rows of the newly processed samples.
ref = tracker.getCCLETracker()
# DataFrame.append was removed in pandas 2.0; pd.concat is the equivalent call.
ref = pd.concat([ref, fbams])

In [None]:
# Collect pairs of samples whose LOD score is above 500 although they are
# recorded under different participants.
# NOTE(review): `v` is not defined in the visible cells — presumably the LOD
# matrix to inspect (e.g. `updated_lod_mat`); confirm before running.
print("issues with ")
info_cols = ['arxspan_id','version','datatype','participant_id','stripped_cell_line_name']
# Maps a description of each offending sample to the list of samples it matches
l = {}
for x, y in np.argwhere(v.values>500):
    i, j = v.index[x], v.columns[y]
    # A sample trivially matches itself
    if i == j:
        continue
    # Matches within the same participant are expected
    if ref.loc[i]['participant_id'] == ref.loc[j]['participant_id']:
        continue
    key = '_'.join(ref.loc[i, info_cols].astype(str).values.tolist())
    # Group by sample as we go, so the matches of the last sample are recorded
    # too (they were previously dropped when the loop ended).
    l.setdefault(key, []).append(tuple(ref.loc[j, info_cols].values))

In [None]:
# Display the contents of `l` for inspection
l

In [None]:
# Ad-hoc lookup: show the RNA sample rows whose arxspan_id is ACH-002061
rnasamples[rnasamples.arxspan_id=="ACH-002061"]

In [None]:
# For every cell line in the new samples, print the pairs (new sample, tracked
# sample of the same arxspan_id) whose LOD score is below 100.
previ = ''
l = {}
tracker_cols = ['arxspan_id','version','datatype','participant_id']
for u in set(fbams.arxspan_id):
    # Rows: new samples of this cell line; columns: all tracked samples of it
    new_sample_ids = fbams[fbams.arxspan_id==u].index
    tracked_sample_ids = ref[ref.arxspan_id==u].index.tolist()
    res = v.loc[new_sample_ids, tracked_sample_ids]
    low_pairs = [(res.index[x], res.columns[y]) for x, y in np.argwhere(res.values<100)]
    for i, j in low_pairs:
        left_info = tuple(ref.loc[i, tracker_cols].values)
        right_info = tuple(ref.loc[j, tracker_cols + ['blacklist']])
        print('__________________________')
        print(res.loc[i,j])
        print(i,':', left_info,j,':',right_info)

In [None]:
# Upload updated LOD matrix to Taiga
# Description of the single file to upload: the CSV written to the working directory
lod_matrix_upload = {
    "path": working_dir+taiga_filename+'.csv',
    "name": taiga_filename,
    "format": "NumericMatrixCSV",
    "encoding": "utf-8"
}
tc.update_dataset(
    dataset_permaname=taiga_dataset,
    changes_description="New bam fingerprints added for "+sampleset,
    upload_files=[lod_matrix_upload],
    add_all_existing_files=True,
)