# Improving the Graph

## Steps

* Get the contigs and calculate the coverage
    * Align the contigs to reads using Minimap2
    * Sort the aligments using Samtools
    * Obtain the dept of each contig position using Samtools depth

* Get the coverage of links in the graph
    * For each link, obtain the connection sequence by connecting the latter 50% of first contig with first 50% of the next contig
    * Obtain the coverage as in the same way above

## Extracting Contigs, Links and Connection Contigs from the GFA file

In [34]:
![ ! -e 'links' ] && mkdir 'links'
![ ! -e 'contigs' ] && mkdir 'contigs'
![ ! -e 'mappings' ] && mkdir 'mappings'


import gfapy as gf
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord
from Bio.Alphabet import DNAAlphabet
from Bio import SeqIO

gfa = gf.Gfa.from_file('./graphs/assembly_graph.gfa')

segments = {}
contig_lengths = {}


def get_aligning_sequences(s1, o1, s2, o2):
    seq1 = Seq(s1)
    seq2 = Seq(s2)

    if (o1 == '-'):
        seq1 = seq1.reverse_complement()
    
    if (o2 == '-'):
        seq2 = seq2.reverse_complement()

    return seq1, seq2

for x in gfa.segments:
    segments[x.name] = x
    contig_lengths[x.name] = len(x.sequence)
    record1 = SeqRecord(Seq(x.sequence, DNAAlphabet),
                        id=x.name, description="Contig name="+x.name)
    SeqIO.write(record1, "./contigs/" + x.name, "fasta")
    
    

for x in gfa.dovetails:
    seq1, seq2 = get_aligning_sequences(x.from_segment.sequence, x.from_orient, x.to_segment.sequence, x.to_orient)

    idstr = x.from_segment.name + x.from_orient + x.to_segment.name + x.to_orient
#   TODO check if we need to get half or a weighted amount considering the read lengths
    record = SeqRecord(seq1[round(-1*len(seq1)/2)::] + seq2[0:int(len(seq1)/2)],
                        id=idstr, description="")

    SeqIO.write(record, "./links/"+idstr, "fasta")


{'contig_1': 624999, 'contig_2': 1999806, 'contig_3': 1749098}


# Computing Coverage of Links and Contigs

## Function to Obtain Coverage of Contigs

In [85]:
from pathlib import Path

def compute_contig_coverage(folder):
    paths = Path(folder).glob('**/*')
    readspath = "../Assembly/Sampled Reads/"

    coverage_dict = {}

    for p in [str(p) for p in paths]:
        fname = p.split("/").pop()
        print("INFO::Inspecting path = " + p)

        !minimap2 -a "$p" "$readspath"sd_0004.fastq.gz > out.txt
        !samtools sort out.txt > out.sorted.txt
        !samtools depth out.sorted.txt > out.txt

        tot = !cut -f3 out.txt | paste -sd+ | bc
        lines = !wc -l out.txt

        tot = float(tot[0])
        lines = float(lines[0].split()[0])

        coverage = tot/lines
        !rm out.txt
        !rm out.sorted.txt

        coverage_dict[fname] = coverage
    return coverage_dict

In [68]:
# Running Coverage Calculation for Contigs

contig_coverage = compute_coverage("./contigs/")

print(contig_coverage)

INFO::Inspecting path = contigs/contig_3
[M::mm_idx_gen::0.043*1.01] collected minimizers
[M::mm_idx_gen::0.053*1.36] sorted minimizers
[M::main::0.053*1.36] loaded/built the index for 1 target sequence(s)
[M::mm_mapopt_update::0.056*1.34] mid_occ = 3
[M::mm_idx_stat] kmer size: 15; skip: 10; is_hpc: 0; #seq: 1
[M::mm_idx_stat::0.060*1.32] distinct minimizers: 325791 (99.89% are singletons); average occurrences: 1.001; average spacing: 5.363
[M::worker_pipeline::10.621*2.35] mapped 19248 sequences
[M::main] Version: 2.14-r886-dirty
[M::main] CMD: minimap2 -a contigs/contig_3 ../Assembly/Sampled Reads/sd_0004.fastq.gz
[M::main] Real time: 10.640 sec; CPU: 24.987 sec; Peak RSS: 0.493 GB
INFO::Inspecting path = contigs/contig_2
[M::mm_idx_gen::0.072*1.02] collected minimizers
[M::mm_idx_gen::0.083*1.29] sorted minimizers
[M::main::0.083*1.29] loaded/built the index for 1 target sequence(s)
[M::mm_mapopt_update::0.088*1.27] mid_occ = 3
[M::mm_idx_stat] kmer size: 15; skip: 10; is_hpc: 0; #

## Function to Obtain Coverage of Links

In [110]:
import re

def get_match_from_cigar(cigar):
    matches = re.findall(r'(\d+)M', cigar)
    misses = re.findall(r'(\d+)H', cigar)
    deletions = re.findall(r'(\d+)D', cigar)
    insertions = re.findall(r'(\d+)I', cigar)
    
    return sum(map(int, matches)), sum(map(int, misses)), sum(map(int, deletions)), sum(map(int, insertions))
    

def obtain_relevant_mappings(mapping_file, output_file, contig1_length, contig2_length):
    with open(output_file, 'w+') as outf:
        with open(mapping_file, 'r') as mapf:
            line_o = mapf.readline()
            line = line_o.split()
            # consume headers
            while line_o[0] == "@":
                outf.write(line_o)
                line_o = mapf.readline()
                line = line_o.split()
            while line_o:
                pos = int(line[3])

                if pos == 0: 
                    line_o = mapf.readline()
                    line = line_o.split()
                    continue # no match found
                else:
                    # match found
                    matches, misses, deletions, insertions = get_match_from_cigar(line[5])
                    
                    # ensure mappings including both links are taken
                    match_starts_contig1 = (pos-1) < contig1_length
                    match_ends_contig2 = matches + pos > contig1_length

                    # if matches are from both links it is a valid mapping
                    if match_starts_contig1 and match_ends_contig2:# and matches / (matches + misses + deletions + insertions) > 0.75:
                        outf.write(line_o)
                
                line_o = mapf.readline()
                line = line_o.split()
    
def get_coverage_from_file_name(fname):
    content = re.split('\+|\-',fname)
    valid_content = []
    
    for x in content:
        if len(x.strip()) > 0:
            valid_content.append(x.strip())
    
    return contig_coverage[valid_content[0]], contig_coverage[valid_content[1]]
    
def compute_link_coverage(folder):
    paths = Path(folder).glob('**/*')
    readspath = "../Assembly/Sampled Reads/"

    coverage_dict = {}

    for p in [str(p) for p in paths]:
        fname = p.split("/").pop()
        
        c1_len, c2_len = get_coverage_from_file_name(fname)
        
        print("INFO::Inspecting link = " + p)

        !minimap2 -a "$p" "$readspath"sd_0004.fastq.gz > unfiltered.out.txt
        
        obtain_relevant_mappings('unfiltered.out.txt', 'filtered.out.txt', c1_len, c2_len)
        
        !samtools sort filtered.out.txt > filtered.out.sorted.txt
        !samtools depth filtered.out.sorted.txt > out.txt

        tot = !cut -f3 out.txt | paste -sd+ | bc
        lines = !wc -l out.txt

        tot = float(tot[0])
        lines = float(lines[0].split()[0])

        coverage = tot/lines
        !rm out.txt
        !rm unfiltered.out.txt
        !rm filtered.out.txt
        !rm filtered.out.sorted.txt

        coverage_dict[fname] = coverage
    return coverage_dict

In [111]:
# Running Coverage Calculation for Contigs and Links

link_coverage = compute_link_coverage("./links/")

print(link_coverage)

INFO::Inspecting link = links/contig_1-contig_3-
[M::mm_idx_gen::0.017*1.04] collected minimizers
[M::mm_idx_gen::0.023*1.55] sorted minimizers
[M::main::0.023*1.55] loaded/built the index for 1 target sequence(s)
[M::mm_mapopt_update::0.026*1.50] mid_occ = 3
[M::mm_idx_stat] kmer size: 15; skip: 10; is_hpc: 0; #seq: 1
[M::mm_idx_stat::0.027*1.47] distinct minimizers: 116533 (99.96% are singletons); average occurrences: 1.000; average spacing: 5.361
[M::worker_pipeline::7.603*2.21] mapped 19248 sequences
[M::main] Version: 2.14-r886-dirty
[M::main] CMD: minimap2 -a links/contig_1-contig_3- ../Assembly/Sampled Reads/sd_0004.fastq.gz
[M::main] Real time: 7.606 sec; CPU: 16.801 sec; Peak RSS: 0.490 GB
DONE CORRECT MAPPING
INFO::Inspecting link = links/contig_1+contig_1+
[M::mm_idx_gen::0.017*1.04] collected minimizers
[M::mm_idx_gen::0.023*1.53] sorted minimizers
[M::main::0.023*1.53] loaded/built the index for 1 target sequence(s)
[M::mm_mapopt_update::0.025*1.49] mid_occ = 3
[M::mm_idx_

In [112]:
import pprint

pp = pprint.PrettyPrinter(indent=4)

pp.pprint(contig_coverage)
print()
pp.pprint(link_coverage)

{   'contig_1': 99.67630348208557,
    'contig_2': 51.34579254187656,
    'contig_3': 49.193007481570504}

{   'contig_1+contig_1+': 31.54664988110225,
    'contig_1+contig_2+': 31.54664988110225,
    'contig_1-contig_3-': 26.783238336499895,
    'contig_3+contig_2+': 14.922250671329545}
