## A novel approach to plasmid assembly

#### Contents:
Novel 

In [1]:
# Get directory
import os
import pandas as pd
import numpy as np
from Bio import SeqIO
import subprocess
import shutil
# Pipeline locations are read from the environment. os.getenv returns None when a
# variable is unset, so RUN_DIR and PRESENTATION_DIR must be exported before the
# kernel is started.
RUN_DIR = os.getenv("RUN_DIR")
PRESENTATION_DIR = os.getenv("PRESENTATION_DIR")

In [2]:
# Sanity check: show which run directory was picked up from the environment
print(RUN_DIR)

/data/Bioinfo/bioinfo-proj-alexis/2017_07_09_PLASMIDS_BARCODED


In [3]:
### Sum the number of nucleotides in every per-bin, per-barcode fastq file.
### Output a tab-separated file with the following columns:
### bin, sample, nucleotides
# Relative paths used by later cells (e.g. the pauvre plot) resolve against RUN_DIR.
os.chdir(RUN_DIR)
FASTQ_DIR = os.path.join(RUN_DIR, "fastq")

# Maps the barcode label embedded in a fastq file name to its sample name.
barcode2sample_dict = {"barcode01": "JB2",
                       "barcode02": "JB1",
                       "barcode11": "PTS1",
                       "barcode12": "PTS2",
                       "barcode06": "YHC6",
                       "barcode07": "YHC7",
                       "barcode08": "YHC8",
                       "barcode09": "YHC17",
                       "unclassified": "unclassified"}

# Only the per-bin files are counted: names containing "mux" or "all" are skipped.
fastq_files = [fastq_file for fastq_file in os.listdir(FASTQ_DIR)
              if fastq_file.endswith(".fastq")
              and "mux" not in fastq_file
              and "all" not in fastq_file]
csv_output_file = os.path.join(PRESENTATION_DIR, "data", "size_by_barcode.tsv")

# Collect one row per file and build the frame once at the end; growing a
# DataFrame row by row is quadratic and DataFrame.append no longer exists in
# current pandas.
bin_columns = ["bin", "sample", "nucleotides"]
bin_records = []
for fastq_file in fastq_files:
    # 0005_49335_plasmids.barcode07.0.fastq
    bin_id = fastq_file.split("_")[0]
    barcode = fastq_file.split(".")[1]
    # Get number of nucleotides (plain "r": the "U" mode flag was removed from open())
    with open(os.path.join(FASTQ_DIR, fastq_file), "r") as handle:
        num_nucleotides = sum(len(record.seq) for record in SeqIO.parse(handle, "fastq"))
    # An unknown barcode label raises KeyError here rather than being silently dropped.
    bin_records.append((bin_id, barcode2sample_dict[barcode], num_nucleotides))

bin_df = pd.DataFrame(bin_records, columns=bin_columns)
bin_df.to_csv(csv_output_file, index=False, sep="\t")

In [4]:
# Find the sample with the lowest total yield and draw a pauvre margin plot of its reads
grouped = bin_df.groupby("sample")
min_sample_name = grouped['nucleotides'].aggregate(np.sum).idxmin()
print("Sample with minimum yield is %s" % min_sample_name)

# Reverse lookup: first barcode mapped to that sample, or None when there is none
min_barcode_name = next((barcode for barcode, sample in barcode2sample_dict.items()
                         if sample == min_sample_name), None)
print(min_barcode_name)

# The plot is drawn from the barcode's concatenated ".all.fastq" file
aggregated_fastq_file = os.path.join(FASTQ_DIR, min_barcode_name + ".all.fastq")
pauvre_command_options = [
    "python3 $(which pauvre)",
    "marginplot",
    "--fastq %s" % aggregated_fastq_file,
    "--title 'Distribution of %s'" % min_sample_name,
]
pauvre_command = ' '.join(pauvre_command_options)
print(pauvre_command)

# shell=True is required for the $(which pauvre) substitution
pauvre_proc = subprocess.Popen(pauvre_command, shell=True,
                               stdout=subprocess.PIPE, stderr=subprocess.PIPE)
stdout, stderr = pauvre_proc.communicate()
print(stdout, stderr)

# The plot is looked up by a relative name, i.e. in the current working directory,
# and moved into the presentation folder under a fixed file name.
pauvre_output_plot = min_barcode_name + ".all.png"
pauvre_plot_presentation_file = os.path.join(PRESENTATION_DIR, "images", "pauvre_min.png")
shutil.move(pauvre_output_plot, pauvre_plot_presentation_file)

Sample with minimum yield is PTS2
barcode12
python3 $(which pauvre) marginplot --fastq /data/Bioinfo/bioinfo-proj-alexis/2017_07_09_PLASMIDS_BARCODED/fastq/barcode12.all.fastq --title 'Distribution of PTS2'


In [6]:
### Trim every per-barcode concatenated fastq with porechop.
### Outputs are named after the sample instead of the barcode.

# Barcoded files
concat_fastq_files = [fastq_file for fastq_file in os.listdir(FASTQ_DIR)
                     if fastq_file.startswith("barcode")]
trimmed_dir = os.path.join(RUN_DIR, "trimmed")
if not os.path.isdir(trimmed_dir):
    os.mkdir(trimmed_dir)

for fastq_file in concat_fastq_files:
    # e.g. barcode07.all.fastq -> YHC7.all.trimmed.fastq
    barcode_label = fastq_file.split(".")[0]
    output_fastq_file = barcode2sample_dict[barcode_label] + ".all.trimmed.fastq"
    porechop_command_options = [
        "porechop",
        "--input %s" % os.path.join(FASTQ_DIR, fastq_file),
        "--discard_middle",  # Remove any reads with adapters in the middle.
        # NOTE(review): this is a barcode-binning option; confirm it has any effect
        # here, since no barcode output directory is requested.
        "--require_two_barcodes",
        "--output %s" % os.path.join(trimmed_dir, output_fastq_file),
    ]
    porechop_command = ' '.join(porechop_command_options)
    # Run through subprocess and echo whatever porechop reported
    porechop_proc = subprocess.Popen(porechop_command, shell=True,
                                     stdout=subprocess.PIPE, stderr=subprocess.PIPE)
    stdout, stderr = porechop_proc.communicate()
    print(stdout, stderr)

('\n\x1b[1m\x1b[4mLooking for known adapter sets\x1b[0m\n                                        Best               \x1b[0m\n                                        read       Best    \x1b[0m\n                                        start      read end\x1b[0m\n  \x1b[4mSet                                   %ID        %ID     \x1b[0m\n  \x1b[32mSQK-NSK007                                92.9       73.9\x1b[0m\n  Rapid                                     63.8        0.0\n  SQK-MAP006                                74.2       75.0\n  SQK-MAP006 Short                          70.4       76.0\n  PCR adapters 1                            73.9       75.0\n  Barcode 1 (reverse)                       76.9       84.0\n  \x1b[32mBarcode 2 (reverse)                       84.0       95.8\x1b[0m\n  Barcode 3 (reverse)                       70.4       70.8\n  Barcode 4 (reverse)                       79.2       70.4\n  Barcode 5 (reverse)                       65.4       75.0\n  Barcode 6 (reverse)   

('\n\x1b[1m\x1b[4mLooking for known adapter sets\x1b[0m\n                                        Best               \x1b[0m\n                                        read       Best    \x1b[0m\n                                        start      read end\x1b[0m\n  \x1b[4mSet                                   %ID        %ID     \x1b[0m\n  \x1b[32mSQK-NSK007                                92.9       73.9\x1b[0m\n  Rapid                                     64.9        0.0\n  SQK-MAP006                                77.4       78.3\n  SQK-MAP006 Short                          71.4       72.0\n  PCR adapters 1                            72.7       75.0\n  Barcode 1 (reverse)                       72.0       80.8\n  \x1b[32mBarcode 2 (reverse)                      100.0      100.0\x1b[0m\n  Barcode 3 (reverse)                       70.8       72.4\n  Barcode 4 (reverse)                       74.1       72.0\n  Barcode 5 (reverse)                       72.0       76.9\n  Barcode 6 (reverse)   

('\n\x1b[1m\x1b[4mLooking for known adapter sets\x1b[0m\n                                        Best               \x1b[0m\n                                        read       Best    \x1b[0m\n                                        start      read end\x1b[0m\n  \x1b[4mSet                                   %ID        %ID     \x1b[0m\n  \x1b[32mSQK-NSK007                                96.4       73.9\x1b[0m\n  Rapid                                     64.3        0.0\n  SQK-MAP006                                71.9       72.7\n  SQK-MAP006 Short                          72.0       79.2\n  PCR adapters 1                            77.3       73.9\n  Barcode 1 (reverse)                       75.0       74.1\n  Barcode 2 (reverse)                       70.8       72.0\n  Barcode 3 (reverse)                       74.1       72.0\n  Barcode 4 (reverse)                       68.0       70.4\n  Barcode 5 (reverse)                       70.4       75.0\n  Barcode 6 (reverse)                  

('\n\x1b[1m\x1b[4mLooking for known adapter sets\x1b[0m\n                                        Best               \x1b[0m\n                                        read       Best    \x1b[0m\n                                        start      read end\x1b[0m\n  \x1b[4mSet                                   %ID        %ID     \x1b[0m\n  \x1b[32mSQK-NSK007                                92.9       72.7\x1b[0m\n  Rapid                                     65.5        0.0\n  SQK-MAP006                                72.4       75.0\n  SQK-MAP006 Short                          68.0       76.0\n  PCR adapters 1                            77.3       77.3\n  Barcode 1 (reverse)                       71.4       72.0\n  Barcode 2 (reverse)                       71.4       70.8\n  Barcode 3 (reverse)                       70.8       72.0\n  Barcode 4 (reverse)                       69.2       73.1\n  Barcode 5 (reverse)                       75.0       74.1\n  Barcode 6 (reverse)                  

('\n\x1b[1m\x1b[4mLooking for known adapter sets\x1b[0m\n                                        Best               \x1b[0m\n                                        read       Best    \x1b[0m\n                                        start      read end\x1b[0m\n  \x1b[4mSet                                   %ID        %ID     \x1b[0m\n  \x1b[32mSQK-NSK007                               100.0       77.3\x1b[0m\n  Rapid                                     65.5        0.0\n  SQK-MAP006                                76.7       78.3\n  SQK-MAP006 Short                          76.9       75.0\n  PCR adapters 1                            76.0       78.3\n  \x1b[32mBarcode 1 (reverse)                       83.3       92.3\x1b[0m\n  \x1b[32mBarcode 2 (reverse)                      100.0      100.0\x1b[0m\n  Barcode 3 (reverse)                       75.0       73.1\n  Barcode 4 (reverse)                       75.0       76.9\n  Barcode 5 (reverse)                       80.8       76.9\n  Barcode

('\n\x1b[1m\x1b[4mLooking for known adapter sets\x1b[0m\n                                        Best               \x1b[0m\n                                        read       Best    \x1b[0m\n                                        start      read end\x1b[0m\n  \x1b[4mSet                                   %ID        %ID     \x1b[0m\n  \x1b[32mSQK-NSK007                                92.9       81.8\x1b[0m\n  Rapid                                     65.0        0.0\n  SQK-MAP006                                77.4       76.0\n  SQK-MAP006 Short                          74.1       72.0\n  PCR adapters 1                            75.0       75.0\n  \x1b[32mBarcode 1 (reverse)                      100.0      100.0\x1b[0m\n  \x1b[32mBarcode 2 (reverse)                      100.0       84.6\x1b[0m\n  Barcode 3 (reverse)                       75.0       72.0\n  Barcode 4 (reverse)                       74.1       70.4\n  Barcode 5 (reverse)                       80.0       78.6\n  Barcode

('\n\x1b[1m\x1b[4mLooking for known adapter sets\x1b[0m\n                                        Best               \x1b[0m\n                                        read       Best    \x1b[0m\n                                        start      read end\x1b[0m\n  \x1b[4mSet                                   %ID        %ID     \x1b[0m\n  \x1b[32mSQK-NSK007                                92.9       75.0\x1b[0m\n  Rapid                                     61.9        0.0\n  SQK-MAP006                                71.9       76.0\n  SQK-MAP006 Short                          71.4       69.2\n  PCR adapters 1                            77.3       75.0\n  Barcode 1 (reverse)                       73.1       73.1\n  Barcode 2 (reverse)                       70.8       70.4\n  Barcode 3 (reverse)                       72.0       72.0\n  Barcode 4 (reverse)                       65.6       69.2\n  Barcode 5 (reverse)                       76.0       72.0\n  Barcode 6 (reverse)                  

('\n\x1b[1m\x1b[4mLooking for known adapter sets\x1b[0m\n                                        Best               \x1b[0m\n                                        read       Best    \x1b[0m\n                                        start      read end\x1b[0m\n  \x1b[4mSet                                   %ID        %ID     \x1b[0m\n  \x1b[32mSQK-NSK007                                92.9       72.0\x1b[0m\n  Rapid                                     60.7        0.0\n  SQK-MAP006                                71.0       72.0\n  SQK-MAP006 Short                          69.2       66.7\n  PCR adapters 1                            70.8       72.7\n  Barcode 1 (reverse)                       69.2       68.0\n  Barcode 2 (reverse)                       70.4       69.2\n  Barcode 3 (reverse)                       69.2       70.4\n  Barcode 4 (reverse)                       69.2       72.0\n  Barcode 5 (reverse)                       67.9       71.4\n  \x1b[32mBarcode 6 (reverse)          

In [7]:
# Estimate each sample's genome size as the integer mean length of its longest
# reads: roughly the top five percent, and always at least one read.
TOP_READ_FRACTION = 0.05
trimmed_fastq_files = [fastq_file for fastq_file in os.listdir(trimmed_dir)
                       if fastq_file.endswith(".fastq")
                       and "trimmed" in fastq_file
                       # A later cell trims the unclassified reads into this same
                       # directory; they are not a sample and must not be assembled
                       # when the notebook is run again.
                       and not fastq_file.startswith("unclassified.")]
genome_lengths = {}

for fastq_file in trimmed_fastq_files:
    print(fastq_file)
    # Only the read lengths are needed, so the records themselves are not kept.
    with open(os.path.join(trimmed_dir, fastq_file), "r") as handle:
        read_lengths = sorted((len(record.seq) for record in SeqIO.parse(handle, "fastq")),
                              reverse=True)
    if not read_lengths:
        # Nothing to average; an empty file would otherwise divide by zero below.
        print("%s: no reads, skipping" % fastq_file)
        continue

    # Same cut-off as counting reads until the count exceeds 5% of the total.
    number_top_reads = int(len(read_lengths) * TOP_READ_FRACTION) + 1
    read_list = read_lengths[:number_top_reads]
    # Floor division keeps the estimate an integer on both Python 2 and 3.
    mean = sum(read_list) // len(read_list)
    genome_lengths[fastq_file.split(".")[0]] = mean
    print("%s: top 5 mean: %d. Over %d reads. reads are: %s" % (fastq_file, mean, len(read_list), ' '.join(map(str, read_list))))

YHC17.all.trimmed.fastq
YHC17.all.trimmed.fastq: top 5 mean: 21705. Over 10 reads. reads are: 26923 22132 22090 22090 21669 21557 21060 20027 19884 19620
YHC6.all.trimmed.fastq
YHC6.all.trimmed.fastq: top 5 mean: 20112. Over 4 reads. reads are: 37138 14579 14408 14326
YHC7.all.trimmed.fastq
YHC7.all.trimmed.fastq: top 5 mean: 15924. Over 8 reads. reads are: 19173 18436 16460 15427 15298 15181 14069 13351
JB1.all.trimmed.fastq
JB1.all.trimmed.fastq: top 5 mean: 8894. Over 219 reads. reads are: 22273 21469 16487 15694 15277 13740 13286 13019 12166 12154 11921 11247 11232 10685 9669 9309 9150 9040 8985 8921 8877 8874 8843 8836 8817 8800 8795 8743 8736 8717 8707 8700 8700 8686 8678 8663 8642 8640 8635 8624 8607 8604 8588 8584 8583 8580 8580 8580 8579 8577 8573 8572 8566 8564 8563 8562 8560 8560 8556 8555 8553 8553 8549 8549 8547 8547 8547 8546 8546 8545 8543 8542 8540 8540 8540 8539 8538 8536 8535 8533 8533 8532 8531 8527 8526 8526 8526 8523 8522 8521 8520 8519 8518 8516 8515 8515 8512 851

In [8]:
# Assemble every sample with canu, passing its estimated genome length
canu_dir = os.path.join(RUN_DIR, "canu")
if not os.path.isdir(canu_dir):
    os.mkdir(canu_dir)

for sample, est_genome_length in genome_lengths.items():
    sample_canu_dir = os.path.join(canu_dir, sample)
    # Wipe any previous run's directory and recreate it empty.
    if os.path.isdir(sample_canu_dir):
        shutil.rmtree(sample_canu_dir)
    os.mkdir(sample_canu_dir)

    canu_command_options = [
        "canu",
        "-p %s" % sample,
        "-d %s" % sample_canu_dir,
        "genomeSize=%d" % est_genome_length,
        "useGrid=false",
        'stopOnReadQuality=false',
        "-nanopore-raw",
        "%s.all.trimmed.fastq" % os.path.join(trimmed_dir, sample),
        # The shell redirects canu's stderr into a per-sample log file.
        "2> %s" % os.path.join(sample_canu_dir, sample + ".stderr.log"),
    ]
    canu_command = ' '.join(canu_command_options)

    canu_proc = subprocess.Popen(canu_command, shell=True,
                                 stdout=subprocess.PIPE, stderr=subprocess.PIPE)
    stdout, stderr = canu_proc.communicate()
    print(stdout, stderr)

('-- Canu release v1.5\nGFA alignments updated.\n', '')
('-- Canu release v1.5\nGFA alignments updated.\n', '')
('-- Canu release v1.5\nGFA alignments updated.\n', '')
('-- Canu release v1.5\nGFA alignments updated.\n', '')
('-- Canu release v1.5\nGFA alignments updated.\n', '')
('-- Canu release v1.5\nGFA alignments updated.\n', '')
('-- Canu release v1.5\nGFA alignments updated.\n', '')
('-- Canu release v1.5\nGFA alignments updated.\n', '')


In [9]:
# Where canu left an empty contigs file, rebuild it from the unassembled
# sequences whose tigInfo coverage is greater than 1
samples = iter(genome_lengths)

canu_length_estimates = {}
for sample in samples:
    # All canu outputs of a sample share the prefix <canu_dir>/<sample>/<sample>
    sample_prefix = os.path.join(canu_dir, sample, sample)
    contigFile = sample_prefix + ".contigs.fasta"
    if os.stat(contigFile).st_size != 0:
        # Contigs were produced; nothing to recover for this sample
        continue

    # No contig was formed: candidates come from the unassembled fasta
    unassembled_fasta_file = sample_prefix + ".unassembled.fasta"
    records = SeqIO.parse(unassembled_fasta_file, "fasta")

    # The tigInfo table is tab-separated with a header row
    tigInfoFile = sample_prefix + ".contigs.layout.tigInfo"
    tigInfoDF = pd.read_csv(tigInfoFile, sep="\t", header=0)

    # Numeric tig IDs are rendered as "tig" plus eight zero-padded digits to match record IDs
    well_covered_tigs = tigInfoDF.loc[tigInfoDF["coverage"] > 1]
    covered_contigs = ["tig{:08d}".format(contig)
                       for contig in well_covered_tigs["#tigID"].tolist()]

    # Overwrite the empty contigs file with the selected records
    SeqIO.write([record for record in records if record.id in covered_contigs],
                contigFile, "fasta")

In [30]:
# Run circlator minimus2 on each sample's contigs, then rename the first
# record of the result after the sample
samples = iter(genome_lengths)
circlator_dir = os.path.join(RUN_DIR, "circlator")
if not os.path.isdir(circlator_dir):
    os.mkdir(circlator_dir)
for sample in samples:
    # Input: canu contigs. Output: files sharing a prefix inside a per-sample directory.
    sample_canu_dir = os.path.join(canu_dir, sample)
    contigFile = os.path.join(sample_canu_dir, sample+".contigs.fasta")
    circlator_sample_dir = os.path.join(circlator_dir, sample)
    circlator_output_prefix = os.path.join(circlator_sample_dir, sample)
    # Recreate the per-sample directory empty before every run
    if os.path.isdir(circlator_sample_dir):
        shutil.rmtree(circlator_sample_dir)
    os.mkdir(circlator_sample_dir)

    # Circlator command
    circlator_command = ' '.join(["circlator minimus2", contigFile, circlator_output_prefix])

    # Replace the first line (e.g. ">tig00000001.circularised") with the sample name.
    # sed edits the file in place and only touches line 1.
    circularised_fasta = circlator_output_prefix + ".circularise.fasta"
    sed_command = "sed -i \"1 s/^.*$/>%s.circularised/\" %s" % (sample, circularised_fasta)

    # "&&" means sed only runs when circlator exits successfully
    circlator_proc = subprocess.Popen(circlator_command + ' && ' + sed_command, shell=True,
                                      stdout=subprocess.PIPE, stderr=subprocess.PIPE)
    stdout, stderr = circlator_proc.communicate()
    print(stdout, stderr)

('', '')
('', '')
('', '')
('', '')
('', '')
('', '')
('', '')
('', '')


In [31]:
# Map canu's corrected and trimmed reads back onto each circularised draft with
# bwa mem, then sort and index the alignments with samtools
samples = iter(genome_lengths)
alignment_dir = os.path.join(RUN_DIR, "alignment")
if not os.path.isdir(alignment_dir):
    os.mkdir(alignment_dir)

for sample in samples:
    sample_canu_dir = os.path.join(canu_dir, sample)
    sample_alignment_dir = os.path.join(alignment_dir, sample)
    # Recreate the per-sample output directory empty
    if os.path.isdir(sample_alignment_dir):
        shutil.rmtree(sample_alignment_dir)
    os.mkdir(sample_alignment_dir)

    # Reference: the circularised draft. Reads: canu's trimmedReads output.
    circlator_sample_dir = os.path.join(circlator_dir, sample)
    circlator_contig_file = os.path.join(circlator_sample_dir, sample+".circularise.fasta")
    trimmed_reads = os.path.join(sample_canu_dir, sample+".trimmedReads.fasta.gz")

    # All alignment outputs share the prefix <alignment_dir>/<sample>/<sample>
    alignment_prefix = os.path.join(sample_alignment_dir, sample)
    sam_file = alignment_prefix + ".sam"
    bam_file = alignment_prefix + ".bam"
    bai_file = alignment_prefix + ".bai"

    # Step 1: build the bwa and samtools indexes for the draft reference
    bwa_index_command = "bwa index %s" % circlator_contig_file
    samtools_index_command = "samtools faidx %s" % circlator_contig_file
    index_proc = subprocess.Popen(bwa_index_command + ' && ' + samtools_index_command, shell=True,
                                  stdout=subprocess.PIPE, stderr=subprocess.PIPE)
    stdout, stderr = index_proc.communicate()
    print(stdout, stderr)

    # Step 2: align, convert to a sorted bam, index the bam; each stage runs
    # only if the previous one succeeded
    alignment_commands = ' && '.join([
        "bwa mem -x ont2d %s %s > %s" % (circlator_contig_file, trimmed_reads, sam_file),
        "samtools view -bS %s | samtools sort -o %s -" % (sam_file, bam_file),
        "samtools index %s %s" % (bam_file, bai_file),
    ])
    alignment_proc = subprocess.Popen(alignment_commands, shell=True,
                                      stdout=subprocess.PIPE, stderr=subprocess.PIPE)
    stdout, stderr = alignment_proc.communicate()
    print(stdout, stderr)

('', '[bwa_index] Pack FASTA... 0.00 sec\n[bwa_index] Construct BWT for the packed sequence...\n[bwa_index] 0.00 seconds elapse.\n[bwa_index] Update BWT... 0.00 sec\n[bwa_index] Pack forward-only FASTA... 0.00 sec\n[bwa_index] Construct SA from BWT and Occ... 0.01 sec\n[main] Version: 0.7.15-r1140\n[main] CMD: bwa index /data/Bioinfo/bioinfo-proj-alexis/2017_07_09_PLASMIDS_BARCODED/circlator/YHC8/YHC8.circularise.fasta\n[main] Real time: 0.022 sec; CPU: 0.019 sec\n')
('', '[M::bwa_idx_load_from_disk] read 0 ALT contigs\n[M::process] read 69 sequences (516009 bp)...\n[M::mem_process_seqs] Processed 69 reads in 3.809 CPU sec, 3.809 real sec\n[main] Version: 0.7.15-r1140\n[main] CMD: bwa mem -x ont2d /data/Bioinfo/bioinfo-proj-alexis/2017_07_09_PLASMIDS_BARCODED/circlator/YHC8/YHC8.circularise.fasta /data/Bioinfo/bioinfo-proj-alexis/2017_07_09_PLASMIDS_BARCODED/canu/YHC8/YHC8.trimmedReads.fasta.gz\n[main] Real time: 3.828 sec; CPU: 3.828 sec\n')
('', '[bwa_index] Pack FASTA... 0.00 sec\n[

In [32]:
# Test for circularisation, export to TSV
samples = iter(genome_lengths)
circlator_dir = os.path.join(RUN_DIR, "circlator")
tsv_output_file = os.path.join(PRESENTATION_DIR, "data", "genome_status.tsv")

# Rows are collected first and the frame is built once; DataFrame.append in a
# loop is quadratic and no longer exists in current pandas.
circularised_columns = ["sample", "genome_size", "circularised"]
circularised_records = []
for sample in samples:
    circlator_sample_dir = os.path.join(circlator_dir, sample)
    circlator_log_file = os.path.join(circlator_sample_dir, sample+".log")
    # Example line in log file: tig00000001 circularised: True
    # The log is parsed in-process: the last whitespace-separated token of every
    # line mentioning "circularised:" is that contig's status.
    try:
        with open(circlator_log_file) as log_handle:
            circularised_flags = [line.split()[-1] for line in log_handle
                                  if "circularised:" in line]
    except (IOError, OSError):
        # A missing or unreadable log counts as "not circularised".
        circularised_flags = []
    # A sample counts as circularised only when every reported contig is. This
    # also covers logs with several contigs, which previously always read False.
    is_circularised = bool(circularised_flags) and all(flag == "True" for flag in circularised_flags)
    print(is_circularised)
    circularised_records.append((sample, genome_lengths[sample], is_circularised))

circularised_df = pd.DataFrame(circularised_records, columns=circularised_columns)
circularised_df.to_csv(tsv_output_file, index=False, sep="\t")
print(circularised_df)

False
True
True
True
True
False
True
False
  sample genome_size circularised
0   YHC8       27458        False
1    JB2       12228         True
2    JB1        8894         True
3   PTS1       13987         True
4   PTS2       13553         True
5   YHC6       20112        False
6   YHC7       15924         True
7  YHC17       21705        False


In [36]:
# Use the unclassified reads and map them to the draft genomes
# Step 1: Create concatenated draft genome
samples = iter(genome_lengths)
random_dir = os.path.join(RUN_DIR, "unclassified")
if not os.path.isdir(random_dir):
    os.mkdir(random_dir)

# The first-pass drafts live under RUN_DIR/circlator. The path is resolved here
# instead of reading the circlator_dir global, because a later cell rebinds that
# global to the retry directory and re-running this cell would then pick up the
# wrong drafts.
draft_circlator_dir = os.path.join(RUN_DIR, "circlator")

# Concatenate references into all_genomes_files (rebuilt from scratch every run,
# since the per-sample files are appended with ">>")
all_draft_genomes_file = os.path.join(random_dir, "all_draft_genomes.fna")
if os.path.isfile(all_draft_genomes_file):
    os.remove(all_draft_genomes_file)
for sample in samples:
    circlator_sample_dir = os.path.join(draft_circlator_dir, sample)
    circlator_contig_file = os.path.join(circlator_sample_dir, sample+".circularise.fasta")
    cat_command = "cat %s >> %s" % (circlator_contig_file, all_draft_genomes_file)
    cat_proc = subprocess.Popen(cat_command, shell=True,
                                stdout=subprocess.PIPE, stderr=subprocess.PIPE)
    stdout, stderr = cat_proc.communicate()
    print(stdout, stderr)

# Create bwa and faidx
bwa_index_command = "bwa index %s" % all_draft_genomes_file
samtools_index_command = "samtools faidx %s" % all_draft_genomes_file
index_proc = subprocess.Popen(' && '.join([bwa_index_command, samtools_index_command]), shell=True,
                              stdout=subprocess.PIPE, stderr=subprocess.PIPE)
stdout, stderr = index_proc.communicate()

# Align all unclassified reads to the genome sets.
# (The variables are named "fasta" but both files are fastq.)
all_unclassified_fasta_files = os.path.join(FASTQ_DIR, "unclassified.all.fastq")
all_unclassified_trimmed_fasta_file = os.path.join(trimmed_dir, "unclassified.all.trimmed.fastq")
sam_file = os.path.join(random_dir, "unclassified.all.sam")
bam_file = os.path.join(random_dir, "unclassified.all.bam")
bai_file = os.path.join(random_dir, "unclassified.all.bai")

# But first we trim.
porechop_command_options = ["porechop"]
porechop_command_options.append("--input %s" % all_unclassified_fasta_files)
porechop_command_options.append("--discard_middle")  # Remove any reads with adapters in the middle.
# Write to exactly the path bwa reads below. The path already includes trimmed_dir,
# so joining it onto trimmed_dir again would diverge for a relative RUN_DIR.
porechop_command_options.append("--output %s" % all_unclassified_trimmed_fasta_file)
porechop_command = ' '.join(porechop_command_options)
porechop_proc = subprocess.Popen(porechop_command, shell=True,
                                 stdout=subprocess.PIPE, stderr=subprocess.PIPE)
stdout, stderr = porechop_proc.communicate()

# Now run the bwa and sam2bam command
bwa_align_command = "bwa mem -x ont2d %s %s > %s" % (all_draft_genomes_file, all_unclassified_trimmed_fasta_file, sam_file)

# Samtools view and sort command
samtools_view_and_sort_command = "samtools view -bS %s | samtools sort -o %s -" % (sam_file, bam_file)

# Create bam index
bam_index_command = "samtools index %s %s" % (bam_file, bai_file)
# Split bam file by reference sequence (one output bam per draft genome)
bamtools_split_command = "bamtools split -in %s -reference" % bam_file

# Run all commands sequentially; "&&" stops the chain at the first failure
alignment_commands = ' && '.join([bwa_align_command, samtools_view_and_sort_command,
                                  bam_index_command, bamtools_split_command])
alignment_proc = subprocess.Popen(alignment_commands, shell=True,
                                  stdout=subprocess.PIPE, stderr=subprocess.PIPE)
stdout, stderr = alignment_proc.communicate()

('', '')
('', '')
('', '')
('', '')
('', '')
('', '')
('', '')
('', '')


In [60]:
# Step 2: Convert these output bam files to fastq, pool them with each sample's
# own trimmed reads and assemble the pooled reads again with canu
bam_files = [bam_file for bam_file in os.listdir(random_dir)
             if bam_file.endswith(".bam") and
             "unmapped" not in bam_file and
             "REF" in bam_file]

# Create dictionary of bam_files
bam_files_by_sample = {}
for bam_file in bam_files:
    # Bam file: unclassified.all.REF_PTS2.circularised.bam
    print(bam_file)
    # Split on the first underscore only, so that a sample name which itself
    # contains underscores is kept whole instead of being cut at its first "_".
    sample = bam_file.split(".")[2].split("_", 1)[1]
    bam_files_by_sample[sample] = os.path.join(random_dir, bam_file)

for sample, bam_file in bam_files_by_sample.items():
    # Run bam2fastq; ">" truncates the target, so re-running starts from a clean file
    fastq_file = os.path.join(random_dir, "unclassified.%s.fastq" % sample)
    bam2fastq_command = "samtools bam2fq -O %s | seqtk seq > %s" % (bam_file, fastq_file)

    # Create concatenated fastq file that appends reads from trimmed directory
    original_trimmed_file = os.path.join(trimmed_dir, sample+".all.trimmed.fastq")
    cat_command = "cat %s >> %s" % (original_trimmed_file, fastq_file)

    # Run this again through canu, in a fresh per-sample directory
    sample_canu_dir = os.path.join(random_dir, "canu_%s" % sample)
    canu_prefix = "retry_canu_%s" % sample
    if os.path.isdir(sample_canu_dir):
        shutil.rmtree(sample_canu_dir)
    os.mkdir(sample_canu_dir)

    canu_command_options = [
        "canu",
        "-p %s" % canu_prefix,
        "-d %s" % sample_canu_dir,
        "genomeSize=%d" % genome_lengths[sample],
        "useGrid=false",
        'stopOnReadQuality=false',
        "-nanopore-raw",
        "%s" % fastq_file,
        # The shell redirects canu's stderr into a per-sample log file.
        "2> %s" % os.path.join(sample_canu_dir, sample + ".stderr.log"),
    ]
    canu_command = ' '.join(canu_command_options)
    rerun_pipeline = subprocess.Popen(' && '.join([bam2fastq_command, cat_command, canu_command]),
                                      shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
    stdout, stderr = rerun_pipeline.communicate()
    print(stdout, stderr)

    contigFile = os.path.join(sample_canu_dir, canu_prefix+".contigs.fasta")
    if os.stat(contigFile).st_size == 0:
        # canu wrote an empty contigs file: recover candidates from the unassembled fasta
        unassembled_fasta_file = os.path.join(sample_canu_dir, canu_prefix+".unassembled.fasta")
        records = SeqIO.parse(unassembled_fasta_file, "fasta")
        # Open up tigInfo File (tab-separated, with a header row)
        tigInfoFile = os.path.join(sample_canu_dir, canu_prefix+".contigs.layout.tigInfo")
        tigInfoDF = pd.read_csv(tigInfoFile, sep="\t", header=0)
        # Get contigs with coverage > 1; numeric IDs are rendered as "tig" plus
        # eight zero-padded digits to match the fasta record IDs
        covered_contigs = ["tig{:08d}".format(contig)
                           for contig in tigInfoDF.loc[tigInfoDF["coverage"] > 1]["#tigID"].tolist()]
        # Overwrite the empty contigs file with the selected records
        SeqIO.write([record for record in records if record.id in covered_contigs],
                    contigFile, "fasta")
    


unclassified.all.REF_JB2.circularised.bam
unclassified.all.REF_JB1.circularised.bam
unclassified.all.REF_YHC8.circularised.bam
unclassified.all.REF_PTS2.circularised.bam
unclassified.all.REF_PTS1.circularised.bam
unclassified.all.REF_YHC6.circularised.bam
unclassified.all.REF_YHC7.circularised.bam
unclassified.all.REF_YHC17.circularised.bam
('-- Canu release v1.5\nGFA alignments updated.\n', '[M::bam2fq_mainloop] discarded 0 singletons\n[M::bam2fq_mainloop] processed 181 reads\n')
('-- Canu release v1.5\nGFA alignments updated.\n', '[M::bam2fq_mainloop] discarded 0 singletons\n[M::bam2fq_mainloop] processed 431 reads\n')
('-- Canu release v1.5\nGFA alignments updated.\n', '[M::bam2fq_mainloop] discarded 0 singletons\n[M::bam2fq_mainloop] processed 1430 reads\n')
('-- Canu release v1.5\nGFA alignments updated.\n', '[M::bam2fq_mainloop] discarded 0 singletons\n[M::bam2fq_mainloop] processed 95 reads\n')
('-- Canu release v1.5\nGFA alignments updated.\n', '[M::bam2fq_mainloop] discarded 0

In [61]:
# Run circlator (minimus2 task) on each sample's contigs to circularise the
# genome and generate a consensus, then rename the first FASTA header after
# the sample.
# Reads `genome_lengths` and `random_dir` from earlier cells; defines
# `circlator_dir`, under which each sample gets its own output directory.
samples = list(genome_lengths)  # same keys/order as iterkeys(), but also valid on Python 3
circlator_dir = os.path.join(random_dir, "circlator")
if not os.path.isdir(circlator_dir):
    os.mkdir(circlator_dir)
for sample in samples:
    # Retrieve necessary files
    sample_canu_dir = os.path.join(random_dir, "canu_%s" % sample)
    contigFile = os.path.join(sample_canu_dir, "retry_canu_"+sample+".contigs.fasta")
    circlator_sample_dir = os.path.join(circlator_dir, sample)
    circlator_output_prefix = os.path.join(circlator_sample_dir, sample)
    # Recreate the per-sample directory empty (any previous output is removed)
    if os.path.isdir(circlator_sample_dir):
        shutil.rmtree(circlator_sample_dir)
    os.mkdir(circlator_sample_dir)

    # Commands are argument lists run without a shell, so paths containing
    # spaces or shell metacharacters are passed to the tools verbatim.
    circlator_command = ["circlator", "minimus2", contigFile, circlator_output_prefix]

    # Need to replace top line ">tig00000001.circularised" with actual name.
    # sed rewrites line 1 of the circularised FASTA in place.
    sed_command = ["sed", "-i", "1 s/^.*$/>%s.circularised/" % sample,
                   circlator_output_prefix+".circularise.fasta"]

    # Run sequentially; stop at the first failure (same short-circuit as `a && b`)
    # and report it instead of silently moving on to the next sample.
    stdout_chunks, stderr_chunks = [], []
    for command in (circlator_command, sed_command):
        circlator_proc = subprocess.Popen(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
        command_stdout, command_stderr = circlator_proc.communicate()
        stdout_chunks.append(command_stdout)
        stderr_chunks.append(command_stderr)
        if circlator_proc.returncode != 0:
            print("WARNING: '%s' exited with status %d for sample %s"
                  % (' '.join(command), circlator_proc.returncode, sample))
            break
    stdout, stderr = b"".join(stdout_chunks), b"".join(stderr_chunks)
    print(stdout, stderr)

('', '')
('', '')
('', '')
('', '')
('', '')
('', '')
('', '')
('', '')


In [62]:
# Index each sample's circularised draft reference and realign the Canu
# trimmed reads to it (bwa mem -> SAM -> sorted BAM -> BAM index).
# Reads `genome_lengths` and `random_dir` from earlier cells.
samples = list(genome_lengths)  # same keys/order as iterkeys(), but also valid on Python 3
# Defined here (as in the neighbouring cells) so this cell does not rely on
# `circlator_dir` having been left in the kernel by another cell.
circlator_dir = os.path.join(random_dir, "circlator")
# Create alignment dir
alignment_dir = os.path.join(random_dir, "alignment")
if not os.path.isdir(alignment_dir):
    os.mkdir(alignment_dir)
# Run alignment for each sample reads
for sample in samples:
    sample_canu_dir = os.path.join(random_dir, "canu_%s" % sample)
    # Recreate the per-sample alignment directory empty
    sample_alignment_dir = os.path.join(alignment_dir, sample)
    if os.path.isdir(sample_alignment_dir):
        shutil.rmtree(sample_alignment_dir)
    os.mkdir(sample_alignment_dir)

    # Draft reference: the circularised contig file in the sample's circlator directory
    circlator_sample_dir = os.path.join(circlator_dir, sample)
    circlator_contig_file = os.path.join(circlator_sample_dir, sample+".circularise.fasta")

    sam_file = os.path.join(sample_alignment_dir, sample+".sam")
    bam_file = os.path.join(sample_alignment_dir, sample+".bam")
    bai_file = os.path.join(sample_alignment_dir, sample+".bai")

    # Use the corrected and trimmed reads from Canu
    trimmed_reads = os.path.join(sample_canu_dir, "retry_canu_"+sample+".trimmedReads.fasta.gz")

    # Create the bwa and samtools indexes for the draft reference
    bwa_index_command = "bwa index %s" % circlator_contig_file
    samtools_index_command = "samtools faidx %s" % circlator_contig_file
    index_proc = subprocess.Popen(' && '.join([bwa_index_command, samtools_index_command]), shell=True,
                                  stdout=subprocess.PIPE, stderr=subprocess.PIPE)
    stdout, stderr = index_proc.communicate()
    print(stdout, stderr)
    if index_proc.returncode != 0:
        # Without an index the alignment cannot succeed, so skip it for this sample
        print("WARNING: indexing exited with status %d for sample %s; skipping alignment"
              % (index_proc.returncode, sample))
        continue

    # Now run the bwa and sam2bam command
    bwa_align_command = "bwa mem -x ont2d %s %s > %s" % (circlator_contig_file, trimmed_reads, sam_file)

    # Samtools view and sort command
    samtools_view_and_sort_command = "samtools view -bS %s | samtools sort -o %s -" % (sam_file, bam_file)

    # Create bam index
    bam_index_command = "samtools index %s %s" % (bam_file, bai_file)

    # Run all commands sequentially
    alignment_commands = ' && '.join([bwa_align_command, samtools_view_and_sort_command,
                                       bam_index_command])
    alignment_proc = subprocess.Popen(alignment_commands, shell=True,
                                      stdout=subprocess.PIPE, stderr=subprocess.PIPE)
    stdout, stderr = alignment_proc.communicate()
    print(stdout, stderr)
    if alignment_proc.returncode != 0:
        # NOTE: the status of the `view | sort` stage is that of `samtools sort`
        # (the last command of the pipe), so a failure in `samtools view` alone
        # may not be reported here.
        print("WARNING: alignment exited with status %d for sample %s"
              % (alignment_proc.returncode, sample))

('', '[bwa_index] Pack FASTA... 0.00 sec\n[bwa_index] Construct BWT for the packed sequence...\n[bwa_index] 0.00 seconds elapse.\n[bwa_index] Update BWT... 0.00 sec\n[bwa_index] Pack forward-only FASTA... 0.01 sec\n[bwa_index] Construct SA from BWT and Occ... 0.00 sec\n[main] Version: 0.7.15-r1140\n[main] CMD: bwa index /data/Bioinfo/bioinfo-proj-alexis/2017_07_09_PLASMIDS_BARCODED/unclassified/circlator/YHC8/YHC8.circularise.fasta\n[main] Real time: 0.045 sec; CPU: 0.023 sec\n')
('', '[M::bwa_idx_load_from_disk] read 0 ALT contigs\n[M::process] read 113 sequences (707415 bp)...\n[M::mem_process_seqs] Processed 113 reads in 6.151 CPU sec, 6.150 real sec\n[main] Version: 0.7.15-r1140\n[main] CMD: bwa mem -x ont2d /data/Bioinfo/bioinfo-proj-alexis/2017_07_09_PLASMIDS_BARCODED/unclassified/circlator/YHC8/YHC8.circularise.fasta /data/Bioinfo/bioinfo-proj-alexis/2017_07_09_PLASMIDS_BARCODED/unclassified/canu_YHC8/retry_canu_YHC8.trimmedReads.fasta.gz\n[main] Real time: 6.172 sec; CPU: 6.171

In [63]:
# Test for circularisation, export to TSV.
# For every sample, read the log in its circlator directory and record whether
# the assembly was reported as circularised, together with the genome size
# taken from `genome_lengths`. Writes one row per sample to `tsv_output_file`.
samples = list(genome_lengths)  # same keys/order as iterkeys(), but also valid on Python 3
circlator_dir = os.path.join(random_dir, "circlator")
tsv_output_file = os.path.join(PRESENTATION_DIR, "data", "using_unclassified_data_genome_status.tsv")
status_columns = ["sample", "genome_size", "circularised"]
status_records = []
for sample in samples:
    circlator_sample_dir = os.path.join(circlator_dir, sample)
    circlator_log_file = os.path.join(circlator_sample_dir, sample+".log")
    # Example line in log file: tig00000001 circularised: True
    # The last whitespace-separated field of each such line is the flag.
    contig_flags = []
    if os.path.isfile(circlator_log_file):
        with open(circlator_log_file) as log_handle:
            contig_flags = [line.split()[-1] == "True"
                            for line in log_handle if "circularised:" in line]
    else:
        # A missing log counts as "not circularised" (as before), but is now reported
        print("WARNING: no circlator log for sample %s: %s" % (sample, circlator_log_file))
    # True only if the log reports at least one contig and every reported contig
    # is circularised; identical to the previous result for single-contig logs.
    is_circularised = bool(contig_flags) and all(contig_flags)
    print(is_circularised)
    status_records.append([sample, genome_lengths[sample], is_circularised])
# Build the frame once from the collected rows instead of appending row by row
circularised_df = pd.DataFrame(status_records, columns=status_columns)
circularised_df.to_csv(tsv_output_file, index=False, sep="\t")
print(circularised_df)

True
True
True
True
True
False
True
False
  sample genome_size circularised
0   YHC8       27458         True
1    JB2       12228         True
2    JB1        8894         True
3   PTS1       13987         True
4   PTS2       13553         True
5   YHC6       20112        False
6   YHC7       15924         True
7  YHC17       21705        False
