# CCLE SNP Fingerprinting Pipeline
Author: William Colgan (wcolgan@broadinstitute.org)

In [236]:
# Load packages

import pandas as pd
import numpy as np
import dalmatian as dm
import subprocess
from taigapy import TaigaClient
from functools import reduce

tc = TaigaClient()

In [360]:
# Pipeline parameters

# CSV file of bams to fingerprint with columns: id, bam_filepath, bai_filepath
bams_sheet = "/Users/wcolgan/Desktop/bams_to_fingerprint.csv" 

# Local directory to save intermediate files to
working_dir = "/Users/wcolgan/Desktop/fingerprint/"

# GC storage bucket containing fingerprints
fingerprints_dir = "gs://fc-secure-6b6a3e1a-6fb8-4d30-b0df-a359e6c5d6e6/fingerprints"

# GC storage bucket containing lists for vcf files
vcf_list_dir = "gs://fc-secure-6b6a3e1a-6fb8-4d30-b0df-a359e6c5d6e6/vcf_lists"

# Batch sise for crosscheck_vcf. If more than 200 bams are being run this should be decreased
crosscheck_batch_size = 500

## Generate Fingerprint VCFs

Here we use Dalmatian to run the `fingerprint_bam_with_liftover` workflow on Terra. 
This workflow calls Picard ExtractFingerprint to generate a fingerprint VCF and then 
calls Picard LiftoverVcf to covert this vcf to hg38. To fingerprint hg38 bam files just run `fingerprint_bam` instead.

In [42]:
bams = pd.read_csv(bams_sheet)
bams

Unnamed: 0,id,bam_filepath,bai_filepath
0,CDS-23oatC,gs://cclebams/raindance/G16640/NCI-H1915/curre...,gs://cclebams/raindance/G16640/NCI-H1915/curre...
1,CDS-QfMo0t,gs://cclebams/raindance/G16640/HH/current/HH.bam,gs://cclebams/raindance/G16640/HH/current/HH.bai
2,CDS-VzUBNL,gs://cclebams/raindance/G16640/LXF-289/current...,gs://cclebams/raindance/G16640/LXF-289/current...


In [10]:
wm = dm.WorkspaceManager("broad-firecloud-ccle/CCLE_SNP_QC")

In [43]:
# Upload sample sheet
samples_df = bams[["bam_filepath","bai_filepath"]]
samples_df[["sample_id"]] = bams[["id"]]
samples_df[["participant_id"]] = bams[["id"]]
samples_df = samples_df.set_index('sample_id')
wm.upload_samples(samples_df, add_participant_samples=True)
wm.update_sample_set('all_samples', samples_df.index)

Successfully imported 3 participants.
Successfully imported 3 samples.
  * The FireCloud data model currently does not provide participant.samples
    Adding "participant.samples_" as an explicit attribute.
    Finished attaching samples to 3 participants
Hound executing batch upload of 12 records
Successfully imported 1 sample sets:
  * all_samples (3 samples)


In [53]:
# Submit jobs 
wm.create_submission("fingerprint_bam_with_liftover",'all_samples','sample_set',expression='this.samples')
#wm.create_submission("fingerprint_bam",'all_samples','sample_set',expression='this.samples')

Successfully created submission 1390a3ce-a2f8-409d-a86d-3c0384e64d10.


'1390a3ce-a2f8-409d-a86d-3c0384e64d10'

In [None]:
# Monitor jobs
wm.get_submission_status()

## Move Fingerprint VCFs

Here we move the fingerprint vcf files into `fingerprints_dir`

In [97]:
samples_df = wm.get_samples()
cmd = 'echo -e "{}" | gsutil -m cp -I {}'.format('\n'.join(samples_df["fingerprint_vcf"]), fingerprints_dir)
subprocess.check_call(cmd, shell=True, executable='/bin/bash')

0

## Crosscheck Fingerprint VCFs

Here we use Dalmation to run the `crosscheck_vcfs` workflow on Terra. This workflow calls Picard CrosscheckFingerprints to compare the new fingerprint vcfs to batches of existing fingerprint vcfs in `fingerprints_dir`

In [361]:
# Create batch files listing all vcfs in fingerprints dir and upload to bucket
cmd = "gsutil ls "+fingerprints_dir
out, err = subprocess.Popen(cmd, shell=True, stdout=subprocess.PIPE, executable='/bin/bash').communicate()
vcf_list = str(out).split("\\n")
vcf_list = vcf_list[1:len(vcf_list)-1]
num = 0
batches = [] 
for i in range(0, len(vcf_list), crosscheck_batch_size):
    f = open(working_dir + "vcf_batch_"+str(num), 'w')
    f.write("\n".join(vcf_list[i:i + crosscheck_batch_size]))
    f.close()
    batches += ["vcf_batch_"+str(num)]
    num += 1
cmd = 'echo -e "{}" | gsutil -m cp -I {}'.format(working_dir+('\n'+working_dir).join(batches),vcf_list_dir)
subprocess.check_call(cmd, shell=True, executable='/bin/bash')

0

In [351]:
# Create list with new vcfs and upload to bucket
f = open(working_dir + "new_vcfs", 'w')
f.write(fingerprints_dir+'/'+('.vcf\n'+fingerprints_dir+'/').join(bams["id"])+'.vcf')
f.close()
cmd = 'gsutil cp '+working_dir+'new_vcfs '+vcf_list_dir
subprocess.check_call(cmd, shell=True, executable='/bin/bash')

0

In [347]:
# Upload sample sheet
samples_df = pd.DataFrame(data={"sample_id" : batches, "participant_id" : batches, \
                    "vcf_input_file" : [vcf_list_dir+'/new_vcfs']*len(batches), \
                   "vcf_second_input_file" : [vcf_list_dir+'/'+x for x in batches]})
samples_df = samples_df.set_index('sample_id')
wm.upload_samples(samples_df, add_participant_samples=True)
wm.update_sample_set('all_samples', samples_df.index)

Successfully imported 12 participants.
Successfully imported 12 samples.
  * The FireCloud data model currently does not provide participant.samples
    Adding "participant.samples_" as an explicit attribute.
    Finished attaching samples to 15 participants
Hound executing batch upload of 60 records
Sample set "all_samples" (12 samples) successfully updated.


In [352]:
# Submit jobs
wm.create_submission("crosscheck_vcfs",'all_samples','sample_set',expression='this.samples')

Successfully created submission 5ef1d80d-a5d9-4b52-835b-b5bead5dae67.


'5ef1d80d-a5d9-4b52-835b-b5bead5dae67'

In [None]:
# Monitor jobs
wm.get_submission_status()

## Update LOD matrix

Here we update the fingerprint LOD matrix on taiga with the new fingerprints

In [371]:
# Download crosscheck results
samples_df = wm.get_samples()
cmd = 'echo -e "{}" | gsutil -m cp -I {}'.format('\n'.join(samples_df.loc[batches,"crosscheck_out"]),working_dir)
subprocess.check_call(cmd, shell=True, executable='/bin/bash')

0

In [380]:
# Generate matrix with LOD score for new fingerprint vcfs
new_lod_list = []
for batch in batches:
    df = pd.read_csv(working_dir+batch+"_crosscheck",sep='\t',comment='#')
    lod_mat = df.pivot(index = "LEFT_SAMPLE",columns="RIGHT_SAMPLE",values = "LOD_SCORE")
    new_lod_list += [lod_mat]
new_lod_mat = reduce(lambda x, y: pd.merge(x, y, left_index = True, right_index = True), new_lod_list)
new_lod_mat.index.name = None
new_lod_mat

RIGHT_SAMPLE,CDS-000dBy,CDS-00Nrci,CDS-00rz9N,CDS-010xbm,CDS-01bI6z,CDS-02TzJp,CDS-02ltm1,CDS-02waxZ,CDS-04TUV3,CDS-04j2qH,...,CDS-ztl1X2,CDS-zu3dLJ,CDS-zuxWuZ,CDS-zvEfPE,CDS-zveZ9V,CDS-zvrgDc,CDS-zvvAOM,CDS-zx92JS,CDS-zyxdAN,CDS-zzbiLf
CDS-23oatC,-94.284684,-122.99953,-123.655088,-84.356566,-86.213755,-92.719186,0.0,-112.404489,-97.072316,-68.507172,...,-47.291486,-97.681978,-137.193205,-89.912195,-64.553341,-98.109644,-75.392459,-97.419513,-92.086247,-104.100597
CDS-QfMo0t,-94.983583,-147.947205,-95.976123,-109.823965,-100.514152,-114.737916,0.0,-139.54113,-109.777991,-84.796627,...,-56.90168,-111.504015,-128.873864,-104.861319,-73.690429,-118.464489,-110.689836,-107.049804,-110.146897,-90.764134
CDS-VzUBNL,-99.422741,-137.322642,-106.501876,-109.943256,-106.39727,-104.116092,0.0,-125.199889,-86.127213,-51.771305,...,-59.525901,-111.01043,-143.055577,-109.00378,-73.797286,-99.229976,-90.85396,-133.635805,-127.439504,-99.963662


In [381]:
lod_mat =  tc.get(name='ccle-bam-fingerprints-6f30',file='fingerprint_lod_matrix')

[##################]100% |   7.8 MiB/s | 250.2 MiB / 250.2 MiB | Time:  0:00:31


In [386]:
# Update LOD matrix
new_ids = set(new_lod_mat.index)
old_ids = set(lod_mat.index).difference(new_ids)
updated_lod_mat = pd.concat((lod_mat.loc[old_ids,old_ids],new_lod_mat.loc[new_ids,old_ids]), axis=0)
updated_lod_mat = pd.concat((updated_lod_mat.loc[new_ids.union(old_ids),old_ids], \
                             new_lod_mat.transpose().loc[new_ids.union(old_ids,new_ids)]), axis=1)
updated_lod_mat

In [237]:
# Upload updated LOD matrix to Tiaga
updated_lod_mat.to_csv(working_dir+'fingerprint_lod_matrix.csv')
tc.update_dataset(dataset_permaname = "ccle-bam-fingerprints-6f30",
                  changes_description="New bam fingerprints added",
                  upload_file_path_dict=[
                    {
                        "path": working_dir+'fingerprint_lod_matrix.csv',
                        "name": "fingerprint_lod_matrix",
                        "format": "NumericMatrixCSV",
                        "encoding": "utf-8"
                    }
                 ],
                 add_all_existing_files=True)

AttributeError: 'list' object has no attribute 'items'