# CCLE SNP Fingerprinting Pipeline
Author: William Colgan (wcolgan@broadinstitute.org)

In [None]:
# Load packages

import asyncio
import subprocess
from functools import reduce

import dalmatian as dm
import numpy as np
import pandas as pd
from genepy import terra
from genepy.google import gcp
from taigapy import TaigaClient

# Taiga client shared by the cells that fetch and update the LOD matrix dataset
tc = TaigaClient()

In [None]:
# Pipeline parameters

# --- Terra workspace -------------------------------------------------------

# Workspace handed to dalmatian's WorkspaceManager and to the submission waits
WORKSPACE = "broad-firecloud-ccle/CCLE_SNP_QC"

# Columns of `bams` that hold the bam and bai paths, in that order
bamcolname = ["bam_filepath", "bai_filepath"]

# Sample set for this run; also used as the name of the new-vcf list file
sampleset = "21Q3"

# Entity set that the crosscheck submission is run on
allsampleset = "all_samples"

# --- Storage locations -----------------------------------------------------

# Local directory to save intermediate files to
working_dir = "temp/"

# GC storage bucket containing fingerprints
fingerprints_dir = "gs://fc-secure-6b6a3e1a-6fb8-4d30-b0df-a359e6c5d6e6/fingerprints/"

# GC storage bucket containing lists of vcf files
vcf_list_dir = "gs://fc-secure-6b6a3e1a-6fb8-4d30-b0df-a359e6c5d6e6/vcf_lists/"

# --- Crosscheck batching ---------------------------------------------------

# Batch size for crosscheck_vcf. If more than 200 bams are being run this
# should be decreased
crosscheck_batch_size = 500

# Set to True to rebuild and re-upload the vcf batch list files
recreate_batch = False

# --- Taiga -----------------------------------------------------------------

# Dataset permaname and file name of the fingerprint LOD matrix
taiga_dataset = "ccle-bam-fingerprints-6f30"
taiga_filename = "fingerprint_lod_matrix"

## Generate Fingerprint VCFs

Here we use Dalmatian to run the `fingerprint_bam_with_liftover` workflow on Terra. 
This workflow calls Picard ExtractFingerprint to generate a fingerprint VCF and then 
calls Picard LiftoverVcf to convert this VCF to hg38. To fingerprint hg38 bam files, just run `fingerprint_bam` instead.

In [None]:
# Reload the `bams` variable from IPython's %store persistence
%store -r bams

In [None]:
# Display `bams`: a dataframe with the `bamcolname` columns plus an "id" column
bams

In [None]:
# Create batch files listing all vcfs in the fingerprints dir and upload them
# to the bucket. Only needed when the batches have to be recreated.

if recreate_batch:
    # NOTE(review): `vcf_list` has to be defined before this branch can run;
    # both candidate definitions below are currently disabled.
    #vcf_list = gcp.lsFiles([vcf_list_dir])
    #vcf_list = wm.get_samples()["fingerprint_vcf"].tolist()
    batches = []
    for i, start in enumerate(range(0, len(vcf_list), crosscheck_batch_size)):
        batch_path = working_dir + "vcf_batch_" + str(i)
        # `with` closes the file even when the write raises
        with open(batch_path, 'w') as f:
            f.write("\n".join(vcf_list[start:start + crosscheck_batch_size]))
        batches.append(batch_path)
    # Copy every local batch list into the vcf list bucket
    gcp.cpFiles(batches, vcf_list_dir)

In [None]:
# Upload sample sheet
wm = dm.WorkspaceManager(WORKSPACE).disable_hound()

# Build the sheet column by column. Assigning from a frame that selects "id"
# twice fails on recent pandas ("Cannot set a DataFrame with multiple columns
# to the single column ..."), so each target column is spelled out instead.
bam_col, bai_col = bamcolname
samples_df = pd.DataFrame(
    {
        "bam_filepath": bams[bam_col].to_numpy(),
        "bai_filepath": bams[bai_col].to_numpy(),
        # the same id is used for the participant and for the sample
        "participant_id": bams["id"].to_numpy(),
    },
    index=pd.Index(bams["id"], name="sample_id"),
)
wm.upload_samples(samples_df, add_participant_samples=True)
# Register the uploaded samples under this run's sample set
wm.update_sample_set(sampleset, samples_df.index)

In [None]:
# Submit jobs: run the fingerprinting workflow on every sample of the sample
# set, then block until the submission has finished.
# (`asyncio` and `terra` come from the import cell at the top of the notebook.)
submission_id = wm.create_submission(
    "fingerprint_bam_with_liftover",
    sampleset,
    'sample_set',
    expression='this.samples',
)
# NOTE(review): asyncio.run() raises RuntimeError when an event loop is already
# running (e.g. inside a Jupyter kernel); there, awaiting the call directly may
# be required instead - confirm for the environment this runs in.
asyncio.run(terra.waitForSubmission(WORKSPACE, submission_id))

## Crosscheck Fingerprint VCFs

Here we use Dalmatian to run the `crosscheck_vcfs` workflow on Terra. This workflow calls Picard CrosscheckFingerprints to compare the new fingerprint VCFs to batches of existing fingerprint VCFs in `fingerprints_dir`.

In [None]:
# Create list with new vcfs and upload to bucket
new_vcf_paths = wm.get_samples().loc[samples_df.index, 'fingerprints'].tolist()
vcf_list_path = working_dir + sampleset
with open(vcf_list_path, 'w') as f:
    # One path per line. Joining on '.vcf\n' appended ".vcf" to every entry
    # except the last one, so the entries were written inconsistently.
    f.write("\n".join(new_vcf_paths))
# Passed as a list, matching how the batch lists are uploaded
gcp.cpFiles([vcf_list_path], vcf_list_dir)

In [None]:
# Upload sample sheet
if recreate_batch:
    # `batches` holds local paths (working_dir + file name). The files were
    # copied into vcf_list_dir, so - as for the single-list case below - the
    # entity id and the bucket path are built from the file name only.
    batch_names = [path.rsplit("/", 1)[-1] for path in batches]
    sample_group_df = pd.DataFrame(data={
        "sample_group_id": batch_names,
        "vcf_input_file": [vcf_list_dir + name for name in batch_names],
    })
else:
    sample_group_df = pd.DataFrame(data={
        "sample_group_id": [sampleset],
        "vcf_input_file": [vcf_list_dir + sampleset],
    })
# Both branches index by entity id before uploading
sample_group_df = sample_group_df.set_index('sample_group_id')
wm.upload_entities("sample_group", sample_group_df)
# update_entity_set takes the entity type first, then the set id and members
wm.update_entity_set("sample_group", allsampleset, wm.get_entities('sample_group').index)

In [None]:
# Submit jobs: point the crosscheck workflow at the list of new vcfs, submit
# it, then block until the submission has finished.
# (`asyncio` and `terra` come from the import cell at the top of the notebook.)
conf = wm.get_config("crosscheck_vcfs")
# NOTE(review): Terra input values are expressions; a literal path may need to
# be wrapped in double quotes - confirm against the workflow configuration.
conf['inputs']['vcf_second_input_file'] = vcf_list_dir+sampleset
wm.update_config(conf)
# `allsampleset` is a set of sample_group entities and the results are read
# back from sample_group entities, so the submission targets that set type.
submission_id = wm.create_submission(
    "crosscheck_vcfs",
    allsampleset,
    'sample_group_set',
    expression='this.sample_groups',
)
# NOTE(review): asyncio.run() raises RuntimeError when an event loop is already
# running (e.g. inside a Jupyter kernel); there, awaiting the call directly may
# be required instead - confirm for the environment this runs in.
asyncio.run(terra.waitForSubmission(WORKSPACE, submission_id))

## Update LOD matrix

Here we update the fingerprint LOD matrix on Taiga with the new fingerprints.

In [None]:
# Generate matrix with LOD score for new fingerprint vcfs

# Crosscheck output files, one per sample_group. Kept under its own name so
# the `samples_df` sample sheet is not overwritten with a plain list.
crosscheck_files = wm.get_entities("sample_group")['cross_checks_out'].tolist()

new_lod_list = []
for crosscheck_file in crosscheck_files:
    # could be pd concat
    # lines starting with '#' in the tab-separated output are skipped
    df = pd.read_csv(crosscheck_file, sep='\t', comment='#')
    # rows: LEFT_SAMPLE, columns: RIGHT_SAMPLE, cells: LOD_SCORE
    lod_mat = df.pivot(index="LEFT_SAMPLE", columns="RIGHT_SAMPLE", values="LOD_SCORE")
    new_lod_list.append(lod_mat)

# Join the per-file matrices side by side on their shared row index
new_lod_mat = reduce(lambda x, y: pd.merge(x, y, left_index=True, right_index=True), new_lod_list)
new_lod_mat.index.name = None

In [None]:
# Update LOD matrix ( have to update (A+a)*(B+b) = (AB)+(aB)+(Ab)+(ab))
lod_mat = tc.get(name=taiga_dataset, file=taiga_filename)
new_ids = set(new_lod_mat.index)
old_ids = set(lod_mat.index).difference(new_ids)

# .loc no longer accepts sets as indexers (TypeError since pandas 2.0), and a
# set has no stable order; use ordered lists that follow the matrices' indexes.
new_id_list = new_lod_mat.index.unique().tolist()
old_id_list = [sample_id for sample_id in lod_mat.index.unique() if sample_id not in new_ids]
all_id_list = old_id_list + new_id_list

# Old-sample columns: existing old-vs-old scores stacked on the new samples'
# scores against the old samples
updated_lod_mat = pd.concat(
    (lod_mat.loc[old_id_list, old_id_list], new_lod_mat.loc[new_id_list, old_id_list]),
    axis=0,
)
# New-sample columns: the transposed new matrix, appended for every row
updated_lod_mat = pd.concat(
    (updated_lod_mat.loc[all_id_list, old_id_list], new_lod_mat.transpose().loc[all_id_list]),
    axis=1,
)
updated_lod_mat.to_csv(working_dir+taiga_filename+'.csv')

In [None]:
# Persist `updated_lod_mat` with IPython's %store magic
%store updated_lod_mat

In [None]:
# Upload updated LOD matrix to Taiga
tc.update_dataset(
    dataset_permaname=taiga_dataset,
    changes_description="New bam fingerprints added for "+sampleset,
    # File specs given as a list of dicts belong to `upload_files`; the
    # `upload_file_path_dict` keyword does not take this format.
    upload_files=[
        {
            "path": working_dir+taiga_filename+'.csv',
            "name": taiga_filename,
            "format": "NumericMatrixCSV",
            "encoding": "utf-8"
        }
    ],
    # keep the files of the previous dataset version in the new version
    add_all_existing_files=True,
)