# Improving the Graph

## Steps

* Get the contigs and calculate the coverage
    * Align the reads to the contigs using Minimap2
    * Sort the alignments using Samtools
    * Obtain the depth of each contig position using Samtools depth

* Get the coverage of links in the graph
    * For each link, obtain the connection sequence by connecting the latter part of the first contig with the first part of the next contig (the code below takes half a read length, `read_length / 2` bases, from each)
    * Obtain the coverage in the same way as above

## Extracting Contigs, Links and Connection Contigs from the GFA file

In [6]:
# All the imports
import gfapy as gf

# Python imports
from pathlib import Path
import re
import copy
from uuid import uuid1
from joblib import Parallel, delayed
import multiprocessing

# BioPython imports
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord
from Bio.Alphabet import DNAAlphabet
from Bio import SeqIO

# Scientific imports
import numpy as np
import matplotlib.pyplot as plt

# for generated genome G4
# project_name = "simG4"
# readspath = "../Assembly/Sampled Reads/sd_0004.fastq"
# graphpath = "/media/anuradhawick/data/Experiments/Assembly Graph/Assembly/FLYE/Out FLYE_4/assembly_graph.gfa"

# e coli
# readspath = "../Assembly/Sampled Reads EColi/sd_0001.fastq"
# graphpath = "/media/anuradhawick/data/Experiments/Assembly Graph/Assembly/FLYE/Out EColi/assembly_graph.gfa"

# paths for human chromosome 22 (5MB)
readspath = "../Assembly/Sampled_Reads_Chr22/sd_0001.fastq"
graphpath = "/media/anuradhawick/data/Experiments/Assembly_Graph/Assembly/FLYE/Out Chr22/assembly_graph.gfa"
project_name = "chr22"

# paths for Yeast Genome
# readspath = "../Assembly/Sampled_Reads_Yeast/sd_0001.fastq"
# graphpath = "/media/anuradhawick/data/Experiments/Assembly_Graph/Assembly/FLYE/Out Yeast/assembly_graph_refined.gfa"
# project_name = "yeast_refined"

# NOTE(review): this global is not referenced by the code visible in this
# section (compute_link_coverage has its own read_coverage=50 default).
# Presumably the sequencing depth of the sampled reads — TODO confirm.
read_coverage = 50
# Half of this value is taken from each contig when a link sequence is built,
# and compute_link_coverage assumes the same half-length for both sides.
# Presumably the expected read length in bases — TODO confirm.
read_length = 20000

In [5]:
# Start from a clean workspace: remove any existing project directory, then
# recreate it together with its links/ and contigs/ sub-directories.
![ -e "$project_name" ] && rm -r "$project_name"
![ ! -e "$project_name" ] && mkdir "$project_name"

![ ! -e "$project_name"/links ] && mkdir "$project_name"/links
![ ! -e "$project_name"/contigs ] && mkdir "$project_name"/contigs

# Parse the assembly graph (GFA) with gfapy.
gfa = gf.Gfa.from_file(graphpath)

# segments: segment name -> gfapy segment object
# contig_lengths: segment name -> sequence length
# Both are filled by the contig export loop below.
segments = {}
contig_lengths = {}

def get_aligning_sequences(s1, o1, s2, o2):
    """Return both sequences as Seq objects in their traversal orientation.

    Args:
        s1: sequence of the first contig.
        o1: orientation of the first contig; '-' selects the reverse complement.
        s2: sequence of the second contig.
        o2: orientation of the second contig; '-' selects the reverse complement.

    Returns:
        Tuple (first, second) of Seq objects. A sequence is reverse-complemented
        only when its orientation is '-'; any other value leaves it unchanged.
    """
    first, second = Seq(s1), Seq(s2)

    first = first.reverse_complement() if o1 == '-' else first
    second = second.reverse_complement() if o2 == '-' else second

    return first, second

# Index every segment of the graph by name, record its sequence length, and
# write its sequence to <project_name>/contigs/<segment name> in FASTA format.
for x in gfa.segments:
    segments[x.name] = x
    contig_lengths[x.name] = len(x.sequence)
    # NOTE(review): Bio.Alphabet was removed in Biopython 1.78, so passing
    # DNAAlphabet here presumably only works on older releases — confirm the
    # Biopython version this notebook is pinned to.
    record1 = SeqRecord(Seq(x.sequence, DNAAlphabet),
                        id=x.name, description="Contig name="+x.name)
    SeqIO.write(record1, project_name + "/contigs/" + x.name, "fasta")
    
    

# For every dovetail link of the graph, build a connection sequence from the
# end of the "from" contig and the start of the "to" contig, and write it to
# <project_name>/links/<idstr> in FASTA format.
for x in gfa.dovetails:
    # Orient both contigs the way the link traverses them.
    seq1, seq2 = get_aligning_sequences(x.from_segment.sequence, x.from_orient, x.to_segment.sequence, x.to_orient)

    # Record id and file name: <from name><from orient><to name><to orient>, e.g. c1+c2-
    idstr = x.from_segment.name + x.from_orient + x.to_segment.name + x.to_orient
#   TODO check if we need to get half or a weighted amount considering the read lengths
    seq1_portion = round(read_length / 2)
    seq2_portion = round(read_length / 2)
    
    # Last seq1_portion bases of the first contig followed by the first
    # seq2_portion bases of the second.
    # NOTE(review): a contig shorter than half a read length presumably
    # contributes its whole sequence here, while compute_link_coverage assumes
    # read_length / 2 bases on each side — confirm this is acceptable.
    record = SeqRecord(seq1[-1*seq1_portion::] + seq2[0:seq2_portion],
                        id=idstr, description="")

    SeqIO.write(record, project_name + "/links/"+idstr, "fasta")


# Computing Coverage of Links and Contigs

## Function to Obtain Coverage of Contigs and Links

In [9]:
def compute_contig_coverage(folder):
    paths = Path(folder).glob('**/*')

    coverage_dict = {}
    
    for p in [str(p) for p in paths]:
        print("INFO::Inspecting path = " + p)
        fname = p.split("/").pop()
                
        !minimap2 --secondary=no -a "$p" "$readspath" > out.txt
        !samtools sort out.txt > out.sorted.txt
        !samtools depth out.sorted.txt > out.txt

        tot = !cut -f3 out.txt | paste -sd+ | bc
        lines = !wc -l out.txt

        tot = float(tot[0])
        lines = float(lines[0].split()[0])
        
#         y = !cut -f3 out.txt        
        
#         y = np.array(y)
#         x = np.arange(len(y))
        
#         plt.plot(x,y)
#         plt.show()
#         plt.figure()

        coverage = tot/lines
        !rm out.txt
        !rm out.sorted.txt

        coverage_dict[fname] = coverage
        
        print("Coverage: " + str(coverage))
        
    return coverage_dict


def get_match_from_cigar(cigar):
    """Total the lengths of the M, H, D and I operations in a CIGAR string.

    Args:
        cigar: CIGAR string, e.g. "10M2I3D5H4M".

    Returns:
        Tuple (matches, misses, deletions, insertions) holding the summed
        lengths of the M, H, D and I operations respectively. An operation
        that does not occur contributes 0.
    """
    # One pattern per returned field, in return order.
    patterns = (r'(\d+)M', r'(\d+)H', r'(\d+)D', r'(\d+)I')

    return tuple(
        sum(int(length) for length in re.findall(pattern, cigar))
        for pattern in patterns
    )
    

def _reference_span(cigar):
    """Return how many reference bases the alignment described by `cigar` covers."""
    # Only M, D, N, = and X advance along the reference; insertions and
    # clipped bases exist in the read only.
    return sum(int(length) for length, _ in re.findall(r'(\d+)([MDN=X])', cigar))


def obtain_relevant_mappings(mapping_file, output_file, contig1_length, contig2_length):
    """Copy the SAM records that span the junction of a link sequence.

    Header lines (starting with "@") are copied unchanged. An alignment record
    is copied only when it starts within the first 75% of the first contig's
    part of the link and ends beyond the first 25% of the second contig's
    part. Unmapped records (POS == 0) and lines with fewer than six fields
    are dropped.

    The alignment end is derived from the reference-consuming CIGAR operations
    only, so insertions and hard clips no longer make a record look longer on
    the link than it is.

    Args:
        mapping_file: path of the SAM file to read.
        output_file: path of the filtered SAM file to write (overwritten).
        contig1_length: number of link bases that come from the first contig.
        contig2_length: number of link bases that come from the second contig.

    Returns:
        None. The result is written to `output_file`.
    """
    print ("INFO:: Filtering irrelavant maps")
    with open(output_file, 'w+') as outf, open(mapping_file, 'r') as mapf:
        for line_o in mapf:
            # startswith() is safe on empty input, unlike indexing line_o[0].
            if line_o.startswith("@"):
                outf.write(line_o)
                continue

            line = line_o.split()
            if len(line) < 6:
                continue  # blank or truncated line: no POS/CIGAR to inspect

            pos = int(line[3])
            if pos == 0:
                continue  # no match found

            # ensure mappings including both contigs of the link are taken
            match_starts_contig1 = (pos - 1) < contig1_length * 0.75
            match_ends_contig2 = pos + _reference_span(line[5]) > contig1_length + contig2_length * 0.25

            # if the alignment touches both contigs it is a valid mapping
            if match_starts_contig1 and match_ends_contig2:
                outf.write(line_o)
    print ("INFO:: Done filtering irrelavant maps")
    
    
def get_contig_names_from_file_name(fname):
    """Split a link file name into the contig names it joins.

    The name is split on every '+' and '-' orientation sign; surrounding
    whitespace is stripped and empty pieces are discarded. A contig name that
    itself contains '+' or '-' is therefore split as well.

    Args:
        fname: link file name such as "c1+c2-".

    Returns:
        List of contig names in order of appearance, e.g. ["c1", "c2"].
    """
    # Raw string: the previous '\+|\-' literal relied on invalid escape sequences.
    return [part.strip() for part in re.split(r'[+-]', fname) if part.strip()]
    
    
def compute_link_coverage(folder, read_coverage=50):
    """Compute the mean read depth across every link sequence under `folder`.

    For each link file the reads at the global `readspath` are aligned with
    minimap2, the alignments are filtered with obtain_relevant_mappings, and
    the depths printed by `samtools depth` for the remaining alignments are
    averaged.

    Relies on IPython shell magics (`!cmd`), so it only runs inside an
    IPython/Jupyter session, and uses unfiltered.out.txt, filtered.out.txt and
    filtered.out.sorted.txt in the current working directory as scratch files.

    Args:
        folder: directory containing one FASTA file per link.
        read_coverage: accepted but not used by the body.

    Returns:
        dict mapping the file name (last path component) to the mean depth;
        0 when samtools depth reports no positions for the link.
    """
    paths = Path(folder).glob('**/*')

    coverage_dict = {}
    
    for p in [str(p) for p in paths]:
        # Both halves of the link are taken to be half a read length long
        # (read_length is a notebook global).
        c1_len = round(read_length / 2)
        c2_len = round(read_length / 2)   
        fname = p.split("/").pop()
        
        print("INFO::Inspecting link = " + p)

        !minimap2 --secondary=no -a "$p" "$readspath" > unfiltered.out.txt
        
        # Keep only the alignments accepted by obtain_relevant_mappings.
        obtain_relevant_mappings('unfiltered.out.txt', 'filtered.out.txt', c1_len, c2_len)
                
        !samtools sort filtered.out.txt > filtered.out.sorted.txt
        
        # One output line per reported position; the third field is the depth.
        basewise_coverage = !samtools depth filtered.out.sorted.txt
        
        data = [int(x.split()[2]) for x in basewise_coverage]
        npd = np.array(data)

#         n, bins, patches = plt.hist(x=npd, bins='auto', color='#0504aa',
#                             alpha=0.7, rwidth=0.85)
#         plt.grid(axis='y', alpha=0.75)
#         plt.xlabel('Coverage')
#         plt.ylabel('Occurances')
#         plt.title('Coverage Distribution')
#         maxfreq = n.max()
#         # Set a clean upper y-axis limit.
#         plt.ylim(ymax=np.ceil(maxfreq / 10) * 10 if maxfreq % 10 else maxfreq + 10)
#         plt.show()
#         plt.figure()
        if len(npd) > 0:
            mean = npd.mean()# because we consider half's coverage for the whole contigs (correction)
            std = npd.std()
        else:
            # No alignment survived the filter: report zero coverage.
            mean, std = 0, 0
        
        print("Mean", mean, "Std", std)

        !rm unfiltered.out.txt
        !rm filtered.out.txt
        !rm filtered.out.sorted.txt

        # Only the mean is returned; std is printed above for inspection.
        coverage_dict[fname] = mean
        
    return coverage_dict

## Obtaining Coverage of Contigs and Links

In [10]:
# Run the coverage calculation for the contigs and then for the links


contig_coverage = compute_contig_coverage(project_name+"/contigs/")
# # contig_coverage = {   'c2': 50,
# #     'c1': 150,
# #     'c3': 49,
# #     'c4': 50
# #     }

link_coverage = compute_link_coverage(project_name+"/links/")
# link_coverage = {   'c2+c1+': 50,
#     'c1+c1+': 100,
#     'c1+c3+': 53,
#     'c3+c4': 50}

print(link_coverage)
print()
print(contig_coverage)

# Drop links whose mean coverage is exactly zero; iterate over a copy of the
# keys because the dict is modified inside the loop.
for key in list(link_coverage.keys()):
    if link_coverage[key] == 0:
        del link_coverage[key]
        
print("Link coverages after removal of insignificant links")
print(link_coverage)
print()

INFO::Inspecting path = chr22/contigs/contig_5
[M::mm_idx_gen::0.001*1.49] collected minimizers
[M::mm_idx_gen::0.001*2.06] sorted minimizers
[M::main::0.001*2.04] loaded/built the index for 1 target sequence(s)
[M::mm_mapopt_update::0.002*1.99] mid_occ = 3
[M::mm_idx_stat] kmer size: 15; skip: 10; is_hpc: 0; #seq: 1
[M::mm_idx_stat::0.002*1.95] distinct minimizers: 1290 (99.46% are singletons); average occurrences: 1.005; average spacing: 5.267
[M::worker_pipeline::41.487*2.88] mapped 10791 sequences
[M::main] Version: 2.14-r886-dirty
[M::main] CMD: minimap2 --secondary=no -a chr22/contigs/contig_5 ../Assembly/Sampled_Reads_Chr22/sd_0001.fastq
[M::main] Real time: 41.487 sec; CPU: 119.295 sec; Peak RSS: 0.530 GB
Coverage: 988.8228663446055
INFO::Inspecting path = chr22/contigs/contig_3
[M::mm_idx_gen::0.019*0.65] collected minimizers
[M::mm_idx_gen::0.035*0.60] sorted minimizers
[M::main::0.035*0.60] loaded/built the index for 1 target sequence(s)
[M::mm_mapopt_update::0.036*0.61] mid