In [None]:
import csv
import functools
import os
import subprocess
import sys
from shlex import split

import numpy as np
import yaml
import ipywidgets as widgets
from ipywidgets import interactive, interact, fixed
from ipywidgets.embed import embed_minimal_html

from interactPlots import *

In [None]:
# Load the pipeline options (install paths, thread count, batch name, ...) from YAML.
# safe_load only builds plain Python objects from the file.
with open("/home/adefalco/options.yaml") as options_file:
    data = yaml.safe_load(options_file)

# Echo one setting as a quick check that the file was parsed.
print(data['SENTIEON_INSTALL_DIR'])

In [None]:
# Export the settings that the shell commands launched by this notebook inherit.
# NOTE(review): BCFTOOLS_PLUGINS is filled from data['bcfdir'], which the
# "bcftools filter" cells also use as the executable itself -- confirm which
# of the two meanings the option is supposed to have.
os.environ.update({
    "SENTIEON_INSTALL_DIR": data['SENTIEON_INSTALL_DIR'],
    "SENTIEON_LICENSE": data['SENTIEON_LICENSE'],
    "BCFTOOLS_PLUGINS": data['bcfdir'],
})

In [None]:
#Set working directory

# Each batch gets its own directory below the directory the notebook runs in.
# NOTE: this cell reads os.getcwd() and a later cell calls os.chdir() into the
# work directory, so re-running this cell after that chdir nests a second
# <batch>/ level. Restart the kernel before re-running.
data['workdir'] = os.getcwd()+ "/" + data['batch'] + "/"

# makedirs(exist_ok=True) replaces the isdir()/mkdir() pair: it is a no-op when
# the directory already exists and also creates missing parent directories.
os.makedirs(data['workdir'], exist_ok=True)

# NOTE(review): "$SAMPLE" is stored literally and is only expanded by the shell
# when a command containing it is run. No visible cell exports a SAMPLE
# environment variable, so confirm it is set (otherwise it expands to nothing).
data['SENTIEON_TMPDIR'] = data['workdir']+"/$SAMPLE/tmpdir"

In [None]:
#Read batch

# Tab-separated sample sheet named <batch>.txt in the current directory.
# Later cells read column 0 as the normal sample id and column 1 as the tumor id.
data['samplelist'] = os.getcwd()+ "/" +data['batch']+".txt"

# newline='' is the mode the csv module documents for files handed to csv.reader.
with open(data['samplelist'], 'r', newline='') as f:
    # csv.reader yields an empty list for a blank line; drop those so that the
    # row[0]/row[1] lookups in later cells cannot fail on them.
    batch = [row for row in csv.reader(f, delimiter='\t') if row]

# Number of rows in the sheet. len() replaces the former
# np.size(batch)/np.size(batch, 1), which gives the same number for a
# rectangular sheet but fails when rows have differing column counts.
batchSize = len(batch)
print(tabulate(batch))

In [None]:
# Dropdown widgets shared by the interactive table/plot cells further down.

# Tumor ids are the second column of the sample sheet.
tumors = [entry[1] for entry in batch]

# Picker limited to the individual tumor samples.
widgTum = widgets.Dropdown(options=tumors, disabled=False)

# Same choices plus an extra 'ALL' entry, which starts out selected.
lists = tumors + ['ALL']
widgTumAll = widgets.Dropdown(options=lists, value='ALL', disabled=False)

# Only the normal id of the first sheet row is offered (no 'ALL' entry).
normals = [batch[0][0]]
widgNor = widgets.Dropdown(options=normals, disabled=False)

In [None]:
# The commands built below use relative output file names, so switch into the
# batch work directory before any of them is run.
os.chdir(data['workdir'])
# Dry-run switch: while False, each cell only prints its command; set it to
# True to have the cells execute the commands through subprocess as well.
exec_scripts = False #Run shell scripts

# 1a. Mapping reads with BWA-MEM, sorting for normal sample
The results of this call depend on the number of threads used. To obtain results that are independent of the thread count, add the chunk-size option `-K 10000000` (already included in the command below).

In [None]:
# Text written into the pipe by the "|| echo -n" fallback when bwa exits non-zero.
ERROR = "error"
# The normal sample id comes from the first row of the sample sheet.
SAMPLE = batch[0][0]

# Align the normal FASTQ pair with BWA-MEM and pipe the output straight into
# "sentieon util sort". The command is a list of tokens joined by single spaces.
command = " ".join([
    "(" + data['SENTIEON_INSTALL_DIR'] + "/bin/sentieon", "bwa", "mem", "-M",
    "-R", "'@RG\\tID:" + SAMPLE + "N\\tSM:" + SAMPLE + "N\\tPL:" + data['platform'] + "'",
    "-t", str(data['nt']),
    "-K", "10000000",
    data['fasta'],
    data['fastq_folder_NORMAL'] + "/*_" + data['fastq_1_suffix'],
    data['fastq_folder_NORMAL'] + "/*_" + data['fastq_2_suffix'],
    "||", "echo", "-n", ERROR, ")",
    "|",
    data['SENTIEON_INSTALL_DIR'] + "/bin/sentieon", "util", "sort",
    "-o", SAMPLE + "N_sorted.bam",
    "-t", str(data['nt']),
    "--sam2bam", "-i", "-",
])
print(command, end='\n\n')
if exec_scripts:
    subprocess.check_output(command, shell=True)

# 1a. Mapping reads with BWA-MEM, sorting for tumor sample
The results of this call depend on the number of threads used. To obtain results that are independent of the thread count, add the chunk-size option `-K 10000000` (already included in the command below).

In [None]:
# BWA-MEM alignment piped into "sentieon util sort", once per tumor in the sheet.
# FASTQ files are looked up in <fastq_folder>/<TUMOR>/.
for i in range(batchSize):
    TUMOR = batch[i][1]
    command = " ".join([
        "(" + data['SENTIEON_INSTALL_DIR'] + "/bin/sentieon", "bwa", "mem", "-M",
        "-R", "'@RG\\tID:" + TUMOR + "T\\tSM:" + TUMOR + "T\\tPL:" + data['platform'] + "'",
        "-t", str(data['nt']),
        "-K", "10000000",
        data['fasta'],
        data['fastq_folder'] + "/" + TUMOR + "/*_" + data['fastq_1_suffix'],
        data['fastq_folder'] + "/" + TUMOR + "/*_" + data['fastq_2_suffix'],
        "||", "echo", "-n", ERROR, ")",
        "|",
        data['SENTIEON_INSTALL_DIR'] + "/bin/sentieon", "util", "sort",
        "-o", TUMOR + "T_sorted.bam",
        "-t", str(data['nt']),
        "--sam2bam", "-i", "-",
    ])
    print(command, end='\n\n')
    if exec_scripts:
        subprocess.check_output(command, shell=True)

# 2a. Metrics for normal sample

In [None]:
# One driver call over the sorted normal BAM that runs five metrics algorithms,
# each writing its own <SAMPLE>N_*_metrics.txt file.
command1 = " ".join([
    data['SENTIEON_INSTALL_DIR'] + "/bin/sentieon", "driver",
    "-r", data['fasta'],
    "-t", str(data['nt']),
    "-i", SAMPLE + "N_sorted.bam",
    "--algo", "MeanQualityByCycle", SAMPLE + "N_mq_metrics.txt",
    "--algo", "QualDistribution", SAMPLE + "N_qd_metrics.txt",
    "--algo", "GCBias", "--summary", SAMPLE + "N_gc_summary.txt", SAMPLE + "N_gc_metrics.txt",
    "--algo", "AlignmentStat", "--adapter_seq", "''", SAMPLE + "N_aln_metrics.txt",
    "--algo", "InsertSizeMetricAlgo", SAMPLE + "N_is_metrics.txt",
])
print(command1, end='\n\n')
if exec_scripts:
    subprocess.check_output(command1, shell=True)


In [None]:
# Interactive view of selected columns of the normal alignment metrics.
# NOTE(review): the metrics cell writes "<SAMPLE>N_aln_metrics.txt" while the
# suffix passed here starts with "_N" -- confirm how tableShow builds the name.
interact(
    tableShow,
    Sample=widgNor,
    file=fixed("_N_aln_metrics.txt"),
    cols=fixed([0, 1, 2, 5, 6]),
    listSample=fixed([]),
);

In [None]:
# Render the GC-bias metrics of the normal sample as a PDF report.
command2 = " ".join([
    data['SENTIEON_INSTALL_DIR'] + "/bin/sentieon", "plot", "GCBias",
    "-o", SAMPLE + "N_gc-report.pdf",
    SAMPLE + "N_gc_metrics.txt",
])
print(command2, end='\n\n')
if exec_scripts:
    subprocess.check_output(command2, shell=True)

In [None]:
# Viewer for the normal sample's GC-bias report.
interact(
    plots,
    Sample=widgNor,
    file=fixed("_N_gc-report.pdf"),
    normal=fixed(True),
    listSample=fixed([]),
);
#embed_minimal_html('export.html', views=test, title='Widgets export')

In [None]:
# Render the quality-distribution metrics of the normal sample as a PDF report.
command3 = " ".join([
    data['SENTIEON_INSTALL_DIR'] + "/bin/sentieon", "plot", "QualDistribution",
    "-o", SAMPLE + "N_qd-report.pdf",
    SAMPLE + "N_qd_metrics.txt",
])
print(command3, end='\n\n')
if exec_scripts:
    subprocess.check_output(command3, shell=True)

In [None]:
# Viewer for the normal sample's quality-distribution report.
interact(
    plots,
    Sample=widgNor,
    file=fixed("_N_qd-report.pdf"),
    normal=fixed(True),
    listSample=fixed([]),
);

In [None]:
# Render the mean-quality-by-cycle metrics of the normal sample as a PDF report.
command4 = " ".join([
    data['SENTIEON_INSTALL_DIR'] + "/bin/sentieon", "plot", "MeanQualityByCycle",
    "-o", SAMPLE + "N_mq-report.pdf",
    SAMPLE + "N_mq_metrics.txt",
])
print(command4, end='\n\n')
if exec_scripts:
    subprocess.check_output(command4, shell=True)

In [None]:
# Viewer for the normal sample's mean-quality-by-cycle report.
interact(
    plots,
    Sample=widgNor,
    file=fixed("_N_mq-report.pdf"),
    normal=fixed(True),
    listSample=fixed([]),
);

In [None]:
# Render the insert-size metrics of the normal sample as a PDF report.
command5 = " ".join([
    data['SENTIEON_INSTALL_DIR'] + "/bin/sentieon", "plot", "InsertSizeMetricAlgo",
    "-o", SAMPLE + "N_is-report.pdf",
    SAMPLE + "N_is_metrics.txt",
])
print(command5, end='\n\n')
if exec_scripts:
    subprocess.check_output(command5, shell=True)

In [None]:
# Viewer for the normal sample's insert-size report.
interact(
    plots,
    Sample=widgNor,
    file=fixed("_N_is-report.pdf"),
    normal=fixed(True),
    listSample=fixed([]),
);

# 2a. Metrics for tumor sample

In [None]:
# One driver call per tumor over its sorted BAM, running five metrics
# algorithms that each write their own <TUMOR>T_*_metrics.txt file.
for i in range(batchSize):
    TUMOR = batch[i][1]
    command1 = " ".join([
        data['SENTIEON_INSTALL_DIR'] + "/bin/sentieon", "driver",
        "-r", data['fasta'],
        "-t", str(data['nt']),
        "-i", TUMOR + "T_sorted.bam",
        "--algo", "MeanQualityByCycle", TUMOR + "T_mq_metrics.txt",
        "--algo", "QualDistribution", TUMOR + "T_qd_metrics.txt",
        "--algo", "GCBias", "--summary", TUMOR + "T_gc_summary.txt", TUMOR + "T_gc_metrics.txt",
        "--algo", "AlignmentStat", "--adapter_seq", "''", TUMOR + "T_aln_metrics.txt",
        "--algo", "InsertSizeMetricAlgo", TUMOR + "T_is_metrics.txt",
    ])
    print(command1, end='\n\n')
    if exec_scripts:
        subprocess.check_output(command1, shell=True)

In [None]:
# Interactive view of selected columns of the tumor alignment metrics; the
# dropdown also offers 'ALL', with the full tumor list passed as listSample.
interact(
    tableShow,
    Sample=widgTumAll,
    file=fixed("_T_aln_metrics.txt"),
    cols=fixed([0, 1, 2, 5, 6]),
    listSample=fixed(tumors),
);

CATEGORY: One of UNPAIRED (for a fragment run), FIRST_OF_PAIR (metrics for only the first read in a paired run), SECOND_OF_PAIR (metrics for only the second read in a paired run), or PAIR (metrics aggregated over both first and second reads in a pair).

TOTAL_READS: The total number of reads

PF_READS_ALIGNED: The number of reads that aligned to the reference sequence

PCT_PF_READS_ALIGNED: The percentage of reads that aligned to the reference sequence

In [None]:
# Render each tumor's GC-bias metrics as a PDF report.
for i in range(batchSize):
    TUMOR = batch[i][1]
    command2 = " ".join([
        data['SENTIEON_INSTALL_DIR'] + "/bin/sentieon", "plot", "GCBias",
        "-o", TUMOR + "T_gc-report.pdf",
        TUMOR + "T_gc_metrics.txt",
    ])
    print(command2, end='\n\n')
    if exec_scripts:
        subprocess.check_output(command2, shell=True)

In [None]:
# Viewer for the tumor GC-bias reports (single tumor or 'ALL').
interact(
    plots,
    Sample=widgTumAll,
    file=fixed("_T_gc-report.pdf"),
    normal=fixed(False),
    listSample=fixed(tumors),
);

In [None]:
# Render each tumor's quality-distribution metrics as a PDF report.
for i in range(batchSize):
    TUMOR = batch[i][1]
    command3 = " ".join([
        data['SENTIEON_INSTALL_DIR'] + "/bin/sentieon", "plot", "QualDistribution",
        "-o", TUMOR + "T_qd-report.pdf",
        TUMOR + "T_qd_metrics.txt",
    ])
    print(command3, end='\n\n')
    if exec_scripts:
        subprocess.check_output(command3, shell=True)

In [None]:
# Viewer for the tumor quality-distribution reports (single tumor or 'ALL').
interact(
    plots,
    Sample=widgTumAll,
    file=fixed("_T_qd-report.pdf"),
    normal=fixed(False),
    listSample=fixed(tumors),
);

In [None]:
# Render each tumor's mean-quality-by-cycle metrics as a PDF report.
for i in range(batchSize):
    TUMOR = batch[i][1]
    command4 = " ".join([
        data['SENTIEON_INSTALL_DIR'] + "/bin/sentieon", "plot", "MeanQualityByCycle",
        "-o", TUMOR + "T_mq-report.pdf",
        TUMOR + "T_mq_metrics.txt",
    ])
    print(command4, end='\n\n')
    if exec_scripts:
        subprocess.check_output(command4, shell=True)

In [None]:
# Viewer for the tumor mean-quality-by-cycle reports (single tumor or 'ALL').
interact(
    plots,
    Sample=widgTumAll,
    file=fixed("_T_mq-report.pdf"),
    normal=fixed(False),
    listSample=fixed(tumors),
);
#embed_minimal_html('export.html', views=[widgTumAll], title='Widgets export')

In [None]:
# Render each tumor's insert-size metrics as a PDF report.
for i in range(batchSize):
    TUMOR = batch[i][1]
    command5 = " ".join([
        data['SENTIEON_INSTALL_DIR'] + "/bin/sentieon", "plot", "InsertSizeMetricAlgo",
        "-o", TUMOR + "T_is-report.pdf",
        TUMOR + "T_is_metrics.txt",
    ])
    print(command5, end='\n\n')
    if exec_scripts:
        subprocess.check_output(command5, shell=True)

In [None]:
# Viewer for the tumor insert-size reports (single tumor or 'ALL').
interact(
    plots,
    Sample=widgTumAll,
    file=fixed("_T_is-report.pdf"),
    normal=fixed(False),
    listSample=fixed(tumors),
);

# 3a. Remove Duplicate Reads for normal sample

In [None]:
# First dedup pass for the normal sample: LocusCollector writes the score file
# that the following Dedup command reads through --score_info.
command1 = " ".join([
    data['SENTIEON_INSTALL_DIR'] + "/bin/sentieon", "driver",
    "-t", str(data['nt']),
    "-i", SAMPLE + "N_sorted.bam",
    "--algo", "LocusCollector",
    "--fun", "score_info",
    SAMPLE + "N_score.txt",
])
print(command1, end='\n\n')
if exec_scripts:
    subprocess.check_output(command1, shell=True)

In [None]:
# Second dedup pass for the normal sample: Dedup with --rmdup reads the score
# file and writes <SAMPLE>N_deduped.bam plus a metrics file.
command2 = " ".join([
    data['SENTIEON_INSTALL_DIR'] + "/bin/sentieon", "driver",
    "-t", str(data['nt']),
    "-i", SAMPLE + "N_sorted.bam",
    "--algo", "Dedup", "--rmdup",
    "--score_info", SAMPLE + "N_score.txt",
    "--metrics", SAMPLE + "N_dedup_metrics.txt",
    SAMPLE + "N_deduped.bam",
])
print(command2, end='\n\n')
if exec_scripts:
    subprocess.check_output(command2, shell=True)

# 3a. Remove Duplicate Reads for tumor sample

In [None]:
# First dedup pass per tumor: LocusCollector writes the score file that the
# following Dedup command reads through --score_info.
for i in range(batchSize):
    TUMOR = batch[i][1]
    command1 = " ".join([
        data['SENTIEON_INSTALL_DIR'] + "/bin/sentieon", "driver",
        "-t", str(data['nt']),
        "-i", TUMOR + "T_sorted.bam",
        "--algo", "LocusCollector",
        "--fun", "score_info",
        TUMOR + "T_score.txt",
    ])
    print(command1, end='\n\n')
    if exec_scripts:
        subprocess.check_output(command1, shell=True)

In [None]:
# Second dedup pass per tumor: Dedup with --rmdup reads the score file and
# writes <TUMOR>T_deduped.bam plus a metrics file.
for i in range(batchSize):
    TUMOR = batch[i][1]
    command2 = " ".join([
        data['SENTIEON_INSTALL_DIR'] + "/bin/sentieon", "driver",
        "-t", str(data['nt']),
        "-i", TUMOR + "T_sorted.bam",
        "--algo", "Dedup", "--rmdup",
        "--score_info", TUMOR + "T_score.txt",
        "--metrics", TUMOR + "T_dedup_metrics.txt",
        TUMOR + "T_deduped.bam",
    ])
    print(command2, end='\n\n')
    if exec_scripts:
        subprocess.check_output(command2, shell=True)

# 4a. Indel realigner for normal sample

In [None]:
# Run the Realigner algorithm on the deduplicated normal BAM, passing both
# known-indel resources via -k.
command1 = " ".join([
    data['SENTIEON_INSTALL_DIR'] + "/bin/sentieon", "driver",
    "-r", data['fasta'],
    "-t", str(data['nt']),
    "-i", SAMPLE + "N_deduped.bam",
    "--algo", "Realigner",
    "-k", data['known_Mills_indels'],
    "-k", data['known_1000G_indels'],
    SAMPLE + "N_realigned.bam",
])
print(command1, end='\n\n')
if exec_scripts:
    subprocess.check_output(command1, shell=True)

# 4a. Indel realigner for tumor sample

In [None]:
# Run the Realigner algorithm on each deduplicated tumor BAM, passing both
# known-indel resources via -k.
for i in range(batchSize):
    TUMOR = batch[i][1]
    command1 = " ".join([
        data['SENTIEON_INSTALL_DIR'] + "/bin/sentieon", "driver",
        "-r", data['fasta'],
        "-t", str(data['nt']),
        "-i", TUMOR + "T_deduped.bam",
        "--algo", "Realigner",
        "-k", data['known_Mills_indels'],
        "-k", data['known_1000G_indels'],
        TUMOR + "T_realigned.bam",
    ])
    print(command1, end='\n\n')
    if exec_scripts:
        subprocess.check_output(command1, shell=True)

# 5a. Base recalibration for normal sample

In [None]:
# QualCal over the realigned normal BAM; writes the recalibration table used
# by the following cells.
command1 = " ".join([
    data['SENTIEON_INSTALL_DIR'] + "/bin/sentieon", "driver",
    "-r", data['fasta'],
    "-t", str(data['nt']),
    "-i", SAMPLE + "N_realigned.bam",
    "--algo", "QualCal",
    "-k", data['dbsnp'],
    "-k", data['known_Mills_indels'],
    "-k", data['known_1000G_indels'],
    SAMPLE + "N_recal_data.table",
])
print(command1, end='\n\n')
if exec_scripts:
    subprocess.check_output(command1, shell=True)

In [None]:
# Second QualCal run with the first table supplied through -q; writes the
# ".post" table that the plot step compares against the first one.
command2 = " ".join([
    data['SENTIEON_INSTALL_DIR'] + "/bin/sentieon", "driver",
    "-r", data['fasta'],
    "-t", str(data['nt']),
    "-i", SAMPLE + "N_realigned.bam",
    "-q", SAMPLE + "N_recal_data.table",
    "--algo", "QualCal",
    "-k", data['dbsnp'],
    "-k", data['known_Mills_indels'],
    "-k", data['known_1000G_indels'],
    SAMPLE + "N_recal_data.table.post",
])
print(command2, end='\n\n')
if exec_scripts:
    subprocess.check_output(command2, shell=True)

In [None]:
# QualCal --plot mode: combine the before/after tables of the normal sample
# into a CSV for the plotting command.
command3 = " ".join([
    data['SENTIEON_INSTALL_DIR'] + "/bin/sentieon", "driver",
    "-t", str(data['nt']),
    "--algo", "QualCal", "--plot",
    "--before", SAMPLE + "N_recal_data.table",
    "--after", SAMPLE + "N_recal_data.table.post",
    SAMPLE + "N_recal.csv",
])
print(command3, end='\n\n')
if exec_scripts:
    subprocess.check_output(command3, shell=True)

In [None]:
# Render the normal sample's recalibration CSV as a PDF report.
command4 = " ".join([
    data['SENTIEON_INSTALL_DIR'] + "/bin/sentieon", "plot", "QualCal",
    "-o", SAMPLE + "N_recal_plots.pdf",
    SAMPLE + "N_recal.csv",
])
print(command4, end='\n\n')
if exec_scripts:
    subprocess.check_output(command4, shell=True)

In [None]:
# Viewer for the normal sample's recalibration report; the page selector
# offers pages 1 and 2.
interact(
    multiPage,
    Sample=widgNor,
    page=[1, 2],
    file=fixed("_N_recal_plots.pdf"),
    normal=fixed(True),
    listSample=fixed([]),
);

In [None]:
# ReadWriter applies the recalibration table given via -q to the realigned
# normal BAM and writes <SAMPLE>N_recal.bam.
command5 = " ".join([
    data['SENTIEON_INSTALL_DIR'] + "/bin/sentieon", "driver",
    "-r", data['fasta'],
    "-t", str(data['nt']),
    "-i", SAMPLE + "N_realigned.bam",
    "-q", SAMPLE + "N_recal_data.table",
    "--algo", "ReadWriter",
    SAMPLE + "N_recal.bam",
])
print(command5, end='\n\n')
if exec_scripts:
    subprocess.check_output(command5, shell=True)

# 5a. Base recalibration for tumor sample

In [None]:
# QualCal over each realigned tumor BAM; writes the recalibration table used
# by the following cells.
for i in range(batchSize):
    TUMOR = batch[i][1]
    command1 = " ".join([
        data['SENTIEON_INSTALL_DIR'] + "/bin/sentieon", "driver",
        "-r", data['fasta'],
        "-t", str(data['nt']),
        "-i", TUMOR + "T_realigned.bam",
        "--algo", "QualCal",
        "-k", data['dbsnp'],
        "-k", data['known_Mills_indels'],
        "-k", data['known_1000G_indels'],
        TUMOR + "T_recal_data.table",
    ])
    print(command1, end='\n\n')
    if exec_scripts:
        subprocess.check_output(command1, shell=True)

In [None]:
# Second QualCal run per tumor with the first table supplied through -q;
# writes the ".post" table that the plot step compares against the first one.
for i in range(batchSize):
    TUMOR = batch[i][1]
    command2 = " ".join([
        data['SENTIEON_INSTALL_DIR'] + "/bin/sentieon", "driver",
        "-r", data['fasta'],
        "-t", str(data['nt']),
        "-i", TUMOR + "T_realigned.bam",
        "-q", TUMOR + "T_recal_data.table",
        "--algo", "QualCal",
        "-k", data['dbsnp'],
        "-k", data['known_Mills_indels'],
        "-k", data['known_1000G_indels'],
        TUMOR + "T_recal_data.table.post",
    ])
    print(command2, end='\n\n')
    if exec_scripts:
        subprocess.check_output(command2, shell=True)

In [None]:
# QualCal --plot mode per tumor: combine the before/after tables into a CSV
# for the plotting command.
for i in range(batchSize):
    TUMOR = batch[i][1]
    command3 = " ".join([
        data['SENTIEON_INSTALL_DIR'] + "/bin/sentieon", "driver",
        "-t", str(data['nt']),
        "--algo", "QualCal", "--plot",
        "--before", TUMOR + "T_recal_data.table",
        "--after", TUMOR + "T_recal_data.table.post",
        TUMOR + "T_recal.csv",
    ])
    print(command3, end='\n\n')
    if exec_scripts:
        subprocess.check_output(command3, shell=True)

In [None]:
# Render each tumor's recalibration CSV as a PDF report.
for i in range(batchSize):
    TUMOR = batch[i][1]
    command4 = " ".join([
        data['SENTIEON_INSTALL_DIR'] + "/bin/sentieon", "plot", "QualCal",
        "-o", TUMOR + "T_recal_plots.pdf",
        TUMOR + "T_recal.csv",
    ])
    print(command4, end='\n\n')
    if exec_scripts:
        subprocess.check_output(command4, shell=True)

In [None]:
# Viewer for the tumor recalibration reports, one tumor at a time; the page
# selector offers pages 1 and 2.
# NOTE(review): normal=fixed(True) is passed here although every other tumor
# viewer in this notebook passes normal=fixed(False) -- confirm it is intended.
interact(
    multiPage,
    Sample=widgTum,
    page=[1, 2],
    file=fixed("_T_recal_plots.pdf"),
    normal=fixed(True),
    listSample=fixed(tumors),
);

In [None]:
# ReadWriter applies the recalibration table given via -q to each realigned
# tumor BAM and writes <TUMOR>T_recal.bam.
for i in range(batchSize):
    TUMOR = batch[i][1]
    command5 = " ".join([
        data['SENTIEON_INSTALL_DIR'] + "/bin/sentieon", "driver",
        "-r", data['fasta'],
        "-t", str(data['nt']),
        "-i", TUMOR + "T_realigned.bam",
        "-q", TUMOR + "T_recal_data.table",
        "--algo", "ReadWriter",
        TUMOR + "T_recal.bam",
    ])
    print(command5, end='\n\n')
    if exec_scripts:
        subprocess.check_output(command5, shell=True)

# 7a. HC Variant caller (normal)

In [None]:
# Haplotyper on the recalibrated normal BAM, with dbSNP passed via -d and both
# confidence thresholds set to 30.
command1 = " ".join([
    data['SENTIEON_INSTALL_DIR'] + "/bin/sentieon", "driver",
    "-r", data['fasta'],
    "-t", str(data['nt']),
    "-i", SAMPLE + "N_recal.bam",
    "--algo", "Haplotyper",
    "-d", data['dbsnp'],
    "--emit_conf=30", "--call_conf=30",
    SAMPLE + "N-output-hc.vcf.gz",
])
print(command1, end='\n\n')
if exec_scripts:
    subprocess.check_output(command1, shell=True)

# 7a. HC Variant caller (tumor)

In [None]:
# Haplotyper on each recalibrated tumor BAM, with dbSNP passed via -d and both
# confidence thresholds set to 30.
for i in range(batchSize):
    TUMOR = batch[i][1]
    command1 = " ".join([
        data['SENTIEON_INSTALL_DIR'] + "/bin/sentieon", "driver",
        "-r", data['fasta'],
        "-t", str(data['nt']),
        "-i", TUMOR + "T_recal.bam",
        "--algo", "Haplotyper",
        "-d", data['dbsnp'],
        "--emit_conf=30", "--call_conf=30",
        TUMOR + "T-output-hc.vcf.gz",
    ])
    print(command1, end='\n\n')
    if exec_scripts:
        subprocess.check_output(command1, shell=True)

# 8a. Variant calling DNAscope (normal)

In [None]:
# DNAscope on the recalibrated normal BAM; writes a temporary VCF that the
# DNAModelApply step consumes.
command1 = " ".join([
    data['SENTIEON_INSTALL_DIR'] + "/bin/sentieon", "driver",
    "-t", str(data['nt']),
    "-r", data['fasta'],
    "-i", SAMPLE + "N_recal.bam",
    "--algo", "DNAscope",
    "-d", data['dbsnp'],
    "--model", data['ML_MODEL_N'],
    SAMPLE + "N-tmpDNAscope.vcf.gz",
])
print(command1, end='\n\n')
if exec_scripts:
    subprocess.check_output(command1, shell=True)

In [None]:
# DNAModelApply: apply the model to the temporary DNAscope VCF of the normal
# sample and write <SAMPLE>N-DNAscope.vcf.gz.
command2 = " ".join([
    data['SENTIEON_INSTALL_DIR'] + "/bin/sentieon", "driver",
    "-t", str(data['nt']),
    "-r", data['fasta'],
    "--algo", "DNAModelApply",
    "--model", data['ML_MODEL_N'],
    "-v", SAMPLE + "N-tmpDNAscope.vcf.gz",
    SAMPLE + "N-DNAscope.vcf.gz",
])
print(command2, end='\n\n')
if exec_scripts:
    subprocess.check_output(command2, shell=True)

In [None]:
# Soft-filter the normal DNAscope calls: records that do not satisfy the -i
# expression get the ML_FAIL filter label (-s), output is bgzipped VCF (-O z).
# The expression must be quoted: the command runs with shell=True, and an
# unquoted "> 0.81" is taken by the shell as a redirect of stdout to a file
# named "0.81", leaving bcftools with the bare expression "INFO/ML_PROB".
command3 = " ".join([
    data['bcfdir'], "filter",
    "-s", "ML_FAIL",
    "-i", "'INFO/ML_PROB > 0.81'",
    SAMPLE + "N-DNAscope.vcf.gz",
    "-O", "z",
    "-m", "x",
    "-o", SAMPLE + "N-filtDNAscope.vcf.gz",
])
print(command3+'\n')
if exec_scripts:
    subprocess.check_output(command3, shell=True)

# 8a. Variant calling DNAscope (tumor)

In [None]:
# DNAscope on each recalibrated tumor BAM (using the ML_MODEL_N option);
# writes a temporary VCF that the DNAModelApply step consumes.
for i in range(batchSize):
    TUMOR = batch[i][1]
    command1 = " ".join([
        data['SENTIEON_INSTALL_DIR'] + "/bin/sentieon", "driver",
        "-t", str(data['nt']),
        "-r", data['fasta'],
        "-i", TUMOR + "T_recal.bam",
        "--algo", "DNAscope",
        "-d", data['dbsnp'],
        "--model", data['ML_MODEL_N'],
        TUMOR + "T-tmpDNAscope.vcf.gz",
    ])
    print(command1, end='\n\n')
    if exec_scripts:
        subprocess.check_output(command1, shell=True)

In [None]:
# DNAModelApply per tumor: apply the model to the temporary DNAscope VCF and
# write <TUMOR>T-DNAscope.vcf.gz.
for i in range(batchSize):
    TUMOR = batch[i][1]
    command2 = " ".join([
        data['SENTIEON_INSTALL_DIR'] + "/bin/sentieon", "driver",
        "-t", str(data['nt']),
        "-r", data['fasta'],
        "--algo", "DNAModelApply",
        "--model", data['ML_MODEL_N'],
        "-v", TUMOR + "T-tmpDNAscope.vcf.gz",
        TUMOR + "T-DNAscope.vcf.gz",
    ])
    print(command2, end='\n\n')
    if exec_scripts:
        subprocess.check_output(command2, shell=True)

In [None]:
# Soft-filter each tumor's DNAscope calls: records that do not satisfy the -i
# expression get the ML_FAIL filter label (-s), output is bgzipped VCF (-O z).
# The expression must be quoted: the command runs with shell=True, and an
# unquoted "> 0.81" is taken by the shell as a redirect of stdout to a file
# named "0.81", leaving bcftools with the bare expression "INFO/ML_PROB".
for i in range(batchSize):
    TUMOR = batch[i][1]
    command3 = " ".join([
        data['bcfdir'], "filter",
        "-s", "ML_FAIL",
        "-i", "'INFO/ML_PROB > 0.81'",
        TUMOR + "T-DNAscope.vcf.gz",
        "-O", "z",
        "-m", "x",
        "-o", TUMOR + "T-filtDNAscope.vcf.gz",
    ])
    print(command3+'\n')
    if exec_scripts:
        subprocess.check_output(command3, shell=True)

# 9a. Variant Annotation (normal)

In [None]:
# Annotate the normal Haplotyper VCF with snpEff and compress the annotated
# stream with the bgzip executable configured in data['bgzipdir'].
command1 = " ".join([
    "/storage/gluster/vol1/bcbio/anaconda/bin/snpEff",
    "-Xms1000m", "-Xmx36400m",
    "-Djava.io.tmpdir=" + data['SENTIEON_TMPDIR'],
    "eff", "-noStats", "-t", "-noLog",
    "-dataDir", "/storage/gluster/vol1/bcbio/genomes/Hsapiens/hg19/snpeff",
    "-hgvs", "-noLof",
    "-i", "vcf", "-o", "vcf",
    "-noInteraction", "-noMotif", "-noNextProt", "-strict",
    "GRCh37.75",
    SAMPLE + "N-output-hc.vcf.gz",
    "|",
    data['bgzipdir'], "--threads", str(data['nt']), "-c",
    ">", SAMPLE + "N-output-hc.snpEff.vcf.gz",
])
print(command1, end='\n\n')
if exec_scripts:
    subprocess.check_output(command1, shell=True)

In [None]:
# Annotate the filtered normal DNAscope VCF with snpEff and compress the
# annotated stream with the bgzip executable configured in data['bgzipdir'].
command2 = " ".join([
    "/storage/gluster/vol1/bcbio/anaconda/bin/snpEff",
    "-Xms1000m", "-Xmx36400m",
    "-Djava.io.tmpdir=" + data['SENTIEON_TMPDIR'],
    "eff", "-noStats", "-t", "-noLog",
    "-dataDir", "/storage/gluster/vol1/bcbio/genomes/Hsapiens/hg19/snpeff",
    "-hgvs", "-noLof",
    "-i", "vcf", "-o", "vcf",
    "-noInteraction", "-noMotif", "-noNextProt", "-strict",
    "GRCh37.75",
    SAMPLE + "N-filtDNAscope.vcf.gz",
    "|",
    data['bgzipdir'], "--threads", str(data['nt']), "-c",
    ">", SAMPLE + "N-filtDNAscope.snpEff.vcf.gz",
])
print(command2, end='\n\n')
if exec_scripts:
    subprocess.check_output(command2, shell=True)

# 9a. Variant Annotation (tumor)

In [None]:
# Annotate each tumor Haplotyper VCF with snpEff and compress the annotated
# stream with the bgzip executable configured in data['bgzipdir'].
for i in range(batchSize):
    TUMOR = batch[i][1]
    command1 = " ".join([
        "/storage/gluster/vol1/bcbio/anaconda/bin/snpEff",
        "-Xms1000m", "-Xmx36400m",
        "-Djava.io.tmpdir=" + data['SENTIEON_TMPDIR'],
        "eff", "-noStats", "-t", "-noLog",
        "-dataDir", "/storage/gluster/vol1/bcbio/genomes/Hsapiens/hg19/snpeff",
        "-hgvs", "-noLof",
        "-i", "vcf", "-o", "vcf",
        "-noInteraction", "-noMotif", "-noNextProt", "-strict",
        "GRCh37.75",
        TUMOR + "T-output-hc.vcf.gz",
        "|",
        data['bgzipdir'], "--threads", str(data['nt']), "-c",
        ">", TUMOR + "T-output-hc.snpEff.vcf.gz",
    ])
    print(command1, end='\n\n')
    if exec_scripts:
        subprocess.check_output(command1, shell=True)

In [None]:
# Annotate each filtered tumor DNAscope VCF with snpEff and compress the
# annotated stream with the bgzip executable configured in data['bgzipdir'].
for i in range(batchSize):
    TUMOR = batch[i][1]
    command2 = " ".join([
        "/storage/gluster/vol1/bcbio/anaconda/bin/snpEff",
        "-Xms1000m", "-Xmx36400m",
        "-Djava.io.tmpdir=" + data['SENTIEON_TMPDIR'],
        "eff", "-noStats", "-t", "-noLog",
        "-dataDir", "/storage/gluster/vol1/bcbio/genomes/Hsapiens/hg19/snpeff",
        "-hgvs", "-noLof",
        "-i", "vcf", "-o", "vcf",
        "-noInteraction", "-noMotif", "-noNextProt", "-strict",
        "GRCh37.75",
        TUMOR + "T-filtDNAscope.vcf.gz",
        "|",
        data['bgzipdir'], "--threads", str(data['nt']), "-c",
        ">", TUMOR + "T-filtDNAscope.snpEff.vcf.gz",
    ])
    print(command2, end='\n\n')
    if exec_scripts:
        subprocess.check_output(command2, shell=True)

# 7b. Somatic Variant Calling TNseq

In [None]:
# TNsnv per sheet row: the tumor and normal recalibrated BAMs go in together,
# with panel-of-normals, COSMIC and dbSNP resources; writes call stats and a VCF.
for i in range(batchSize):
    SAMPLE, TUMOR = batch[i][0], batch[i][1]
    command1 = " ".join([
        data['SENTIEON_INSTALL_DIR'] + "/bin/sentieon", "driver",
        "-r", data['fasta'],
        "-t", str(data['nt']),
        "-i", TUMOR + "T_recal.bam",
        "-i", SAMPLE + "N_recal.bam",
        "--algo", "TNsnv",
        "--tumor_sample", TUMOR + "T",
        "--normal_sample", SAMPLE + "N",
        "--pon", data['panel_of_normal_TNsnv'],
        "--cosmic", data['cosmic_db'],
        "--dbsnp", data['dbsnp'],
        "--call_stats_out", TUMOR + "-call.stats",
        TUMOR + "-TNsnv.vcf.gz",
    ])
    print(command1, end='\n\n')
    if exec_scripts:
        subprocess.check_output(command1, shell=True)

In [None]:
# TNhaplotyper per sheet row: the tumor and normal recalibrated BAMs go in
# together, with panel-of-normals, COSMIC and dbSNP resources.
for i in range(batchSize):
    SAMPLE, TUMOR = batch[i][0], batch[i][1]
    command2 = " ".join([
        data['SENTIEON_INSTALL_DIR'] + "/bin/sentieon", "driver",
        "-r", data['fasta'],
        "-t", str(data['nt']),
        "-i", TUMOR + "T_recal.bam",
        "-i", SAMPLE + "N_recal.bam",
        "--algo", "TNhaplotyper",
        "--tumor_sample", TUMOR + "T",
        "--normal_sample", SAMPLE + "N",
        "--pon", data['panel_of_normal_TNhaplotyper'],
        "--cosmic", data['cosmic_db'],
        "--dbsnp", data['dbsnp'],
        TUMOR + "-TNhaplotyper.vcf.gz",
    ])
    print(command2, end='\n\n')
    if exec_scripts:
        subprocess.check_output(command2, shell=True)

# 8b. Somatic Variant calling TNscope

In [None]:
# TNscope per sheet row on the tumor/normal recalibrated BAM pair; writes a
# temporary VCF that the TNModelApply step consumes.
for i in range(batchSize):
    SAMPLE, TUMOR = batch[i][0], batch[i][1]
    command1 = " ".join([
        data['SENTIEON_INSTALL_DIR'] + "/bin/sentieon", "driver",
        "-r", data['fasta'],
        "-t", str(data['nt']),
        "-i", TUMOR + "T_recal.bam",
        "-i", SAMPLE + "N_recal.bam",
        "--algo", "TNscope",
        "--tumor_sample", TUMOR + "T",
        "--normal_sample", SAMPLE + "N",
        "--dbsnp", data['dbsnp'],
        "--clip_by_minbq", "1",
        "--max_error_per_read", "3",
        "--min_init_tumor_lod", "2.0",
        "--min_base_qual", "10",
        "--min_base_qual_asm", "10",
        "--min_tumor_allele_frac", "0.00005",
        TUMOR + "-tmpTNscope.vcf.gz",
    ])
    print(command1, end='\n\n')
    if exec_scripts:
        subprocess.check_output(command1, shell=True)

In [None]:
# TNModelApply per sheet row: apply the ML_MODEL_T model to the temporary
# TNscope VCF and write <TUMOR>-TNscope.vcf.gz.
for i in range(batchSize):
    # SAMPLE is not part of this command but is still updated per row.
    SAMPLE, TUMOR = batch[i][0], batch[i][1]
    command2 = " ".join([
        data['SENTIEON_INSTALL_DIR'] + "/bin/sentieon", "driver",
        "-t", str(data['nt']),
        "-r", data['fasta'],
        "--algo", "TNModelApply",
        "--model", data['ML_MODEL_T'],
        "-v", TUMOR + "-tmpTNscope.vcf.gz",
        TUMOR + "-TNscope.vcf.gz",
    ])
    print(command2, end='\n\n')
    if exec_scripts:
        subprocess.check_output(command2, shell=True)

In [None]:
# Soft-filter the TNscope calls of each row: records that do not satisfy the
# -i expression get the ML_FAIL filter label (-s), output is bgzipped VCF (-O z).
# The expression must be quoted: the command runs with shell=True, and an
# unquoted "> 0.81" is taken by the shell as a redirect of stdout to a file
# named "0.81", leaving bcftools with the bare expression "INFO/ML_PROB".
# The stray backslash before INFO (an invalid "\I" escape in the Python
# string) is removed as well.
for i in range(batchSize):
    # SAMPLE is not part of this command but is still updated per row.
    SAMPLE = batch[i][0]
    TUMOR = batch[i][1]
    command3 = " ".join([
        data['bcfdir'], "filter",
        "-s", "ML_FAIL",
        "-i", "'INFO/ML_PROB > 0.81'",
        TUMOR + "-TNscope.vcf.gz",
        "-O", "z",
        "-m", "x",
        "-o", TUMOR + "-filtTNscope.vcf.gz",
    ])
    print(command3+'\n')
    if exec_scripts:
        subprocess.check_output(command3, shell=True)

# 9b. Somatic Variant Annotation

In [None]:
# Annotate each TNsnv VCF with snpEff and compress the annotated stream with
# the bgzip executable configured in data['bgzipdir'].
for i in range(batchSize):
    # SAMPLE is not part of this command but is still updated per row.
    SAMPLE, TUMOR = batch[i][0], batch[i][1]
    command1 = " ".join([
        "/storage/gluster/vol1/bcbio/anaconda/bin/snpEff",
        "-Xms1000m", "-Xmx36400m",
        "-Djava.io.tmpdir=" + data['SENTIEON_TMPDIR'],
        "eff", "-noStats", "-t", "-noLog",
        "-dataDir", "/storage/gluster/vol1/bcbio/genomes/Hsapiens/hg19/snpeff",
        "-hgvs", "-noLof",
        "-i", "vcf", "-o", "vcf",
        "-noInteraction", "-noMotif", "-noNextProt", "-strict",
        "GRCh37.75",
        TUMOR + "-TNsnv.vcf.gz",
        "|",
        data['bgzipdir'], "--threads", str(data['nt']), "-c",
        ">", TUMOR + "-TNsnv.snpEff.vcf.gz",
    ])
    print(command1, end='\n\n')
    if exec_scripts:
        subprocess.check_output(command1, shell=True)

In [None]:
# Annotate each TNhaplotyper VCF with snpEff and compress the annotated stream
# with the bgzip executable configured in data['bgzipdir'].
for i in range(batchSize):
    # SAMPLE is not part of this command but is still updated per row.
    SAMPLE, TUMOR = batch[i][0], batch[i][1]
    command2 = " ".join([
        "/storage/gluster/vol1/bcbio/anaconda/bin/snpEff",
        "-Xms1000m", "-Xmx36400m",
        "-Djava.io.tmpdir=" + data['SENTIEON_TMPDIR'],
        "eff", "-noStats", "-t", "-noLog",
        "-dataDir", "/storage/gluster/vol1/bcbio/genomes/Hsapiens/hg19/snpeff",
        "-hgvs", "-noLof",
        "-i", "vcf", "-o", "vcf",
        "-noInteraction", "-noMotif", "-noNextProt", "-strict",
        "GRCh37.75",
        TUMOR + "-TNhaplotyper.vcf.gz",
        "|",
        data['bgzipdir'], "--threads", str(data['nt']), "-c",
        ">", TUMOR + "-TNhaplotyper.snpEff.vcf.gz",
    ])
    print(command2, end='\n\n')
    if exec_scripts:
        subprocess.check_output(command2, shell=True)

In [None]:
# Annotate each filtered TNscope VCF with snpEff and compress the annotated
# stream with the bgzip executable configured in data['bgzipdir'].
for i in range(batchSize):
    # SAMPLE is not part of this command but is still updated per row.
    SAMPLE, TUMOR = batch[i][0], batch[i][1]
    command3 = " ".join([
        "/storage/gluster/vol1/bcbio/anaconda/bin/snpEff",
        "-Xms1000m", "-Xmx36400m",
        "-Djava.io.tmpdir=" + data['SENTIEON_TMPDIR'],
        "eff", "-noStats", "-t", "-noLog",
        "-dataDir", "/storage/gluster/vol1/bcbio/genomes/Hsapiens/hg19/snpeff",
        "-hgvs", "-noLof",
        "-i", "vcf", "-o", "vcf",
        "-noInteraction", "-noMotif", "-noNextProt", "-strict",
        "GRCh37.75",
        TUMOR + "-filtTNscope.vcf.gz",
        "|",
        data['bgzipdir'], "--threads", str(data['nt']), "-c",
        ">", TUMOR + "-filtTNscope.snpEff.vcf.gz",
    ])
    print(command3, end='\n\n')
    if exec_scripts:
        subprocess.check_output(command3, shell=True)

In [None]:
#!jupyter nbconvert /home/adefalco/pipeline.ipynb --to html

In [None]:
#!python -m nbconvert /home/adefalco/pipeline.ipynb  --to ipynb --execute

In [None]:
#import nbinteract as nbi
#nbi.publish('AntonioDeFalco/testInteract/master', 'pipeline.ipynb')

In [None]:
# Export the notebook to reStructuredText with nbconvert.
# NOTE(review): the notebook path is hard-coded and absolute; adjust it when
# the notebook lives somewhere else.
!jupyter nbconvert /home/adefalco/pipeline.ipynb --to rst