# Improving the Graph

## Steps

* Get the contigs and calculate the coverage
    * Align the contigs to reads using Minimap2
    * Sort the aligments using Samtools
    * Obtain the dept of each contig position using Samtools depth

* Get the coverage of links in the graph
    * For each link, obtain the connection sequence by connecting the latter 50% of first contig with first 50% of the next contig
    * Obtain the coverage as in the same way above

## Extracting Contigs, Links and Connection Contigs from the GFA file

In [1]:
# All the imports
import gfapy as gf

# Python imports
from pathlib import Path
import re
import copy
from uuid import uuid1
from joblib import Parallel, delayed
import multiprocessing

# BioPython imports
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord
from Bio.Alphabet import DNAAlphabet
from Bio import SeqIO

# Scientific imports
import numpy as np
import matplotlib.pyplot as plt

# PySAM
import pysam
from collections import defaultdict

# for generated genome G4
# project_name = "simG4"
# readspath = "../Assembly/Sampled Reads/sd_0004.fastq"
# graphpath = "/media/anuradhawick/data/Experiments/Assembly Graph/Assembly/FLYE/Out FLYE_4/assembly_graph.gfa"

# e coli
# readspath = "../Assembly/Sampled Reads EColi/sd_0001.fastq"
# graphpath = "/media/anuradhawick/data/Experiments/Assembly Graph/Assembly/FLYE/Out EColi/assembly_graph.gfa"

# paths for human chromosome 22 (5MB)
# readspath = "../Assembly/Sampled_Reads_Chr22/sd_0001.fastq"
# graphpath = "/media/anuradhawick/data/Experiments/Assembly_Graph/Assembly/FLYE/Out Chr22/assembly_graph.gfa"
# project_name = "chr22"

# paths for Yeast Genome
# readspath = "../Assembly/Sampled_Reads_Yeast/sd_0001.fastq"
# graphpath = "/media/anuradhawick/data/Experiments/Assembly_Graph/Assembly/FLYE/Out Yeast/assembly_graph.gfa"
# project_name = "yeast"

readspath = "/home/anuradhawick/Desktop/pipeline_output/yeast_reads/all.fastq"
graphpath = "/home/anuradhawick/Desktop/pipeline_output/flye_yeast/assembly_graph.gfa"
project_name = "yeast_nci"

read_coverage = 50
read_length = 25000

In [2]:
![ -e "$project_name" ] && rm -r "$project_name"
![ ! -e "$project_name" ] && mkdir "$project_name"

![ ! -e "$project_name"/links ] && mkdir "$project_name"/links
![ ! -e "$project_name"/contigs ] && mkdir "$project_name"/contigs

gfa = gf.Gfa.from_file(graphpath)

segments = {}
contig_lengths = {}
contig_coverage = {}

def get_aligning_sequences(s1, o1, s2, o2):
    seq1 = Seq(s1)
    seq2 = Seq(s2)

    if (o1 == '-'):
        seq1 = seq1.reverse_complement()
    
    if (o2 == '-'):
        seq2 = seq2.reverse_complement()

    return seq1, seq2

for x in gfa.segments:
    contig_coverage[x.name] = x.get('dp')
    segments[x.name] = x
    contig_lengths[x.name] = len(x.sequence)
    record1 = SeqRecord(Seq(x.sequence, DNAAlphabet),
                        id=x.name, description="Contig name="+x.name)
    SeqIO.write(record1, project_name + "/contigs/" + x.name, "fasta")
    
    

for x in gfa.dovetails:
    seq1, seq2 = get_aligning_sequences(x.from_segment.sequence, x.from_orient, x.to_segment.sequence, x.to_orient)

    fname = x.from_segment.name + x.from_orient + x.to_segment.name + x.to_orient

    seq1_portion = round(read_length / 2)
    seq2_portion = round(read_length / 2)
    
    seq1_len = len(seq1[-1 * seq1_portion::])
    seq2_len = len(seq2[0:seq2_portion])
    
    idstr = fname + "_" + str(seq1_len) + ":" + str(seq2_len)
    
    record = SeqRecord(seq1[-1 * seq1_portion::] + seq2[0:seq2_portion], id=idstr, description="")

    SeqIO.write(record, project_name + "/links/" + fname, "fasta")

# Gathering all contigs to one place
!cat $project_name/contigs/contig* > $project_name/contigs/all.fasta
!cat $project_name/links/contig*contig* > $project_name/links/all.fasta

# Computing Coverage of Links 

In [3]:
![ -e "$project_name/link_mappings" ] && rm -r "$project_name/link_mappings"
![ ! -e "$project_name/link_mappings" ] && mkdir "$project_name/link_mappings"

!minimap2 -d "$project_name/links/links_mapping.bai" "$project_name/links/all.fasta"
!minimap2 --cs -a "$project_name/links/all.fasta" "$readspath" > "$project_name/link_mappings/links_mapping.sam"

[M::mm_idx_gen::0.055*1.04] collected minimizers
[M::mm_idx_gen::0.063*1.26] sorted minimizers
[M::main::0.067*1.25] loaded/built the index for 106 target sequence(s)
[M::mm_idx_stat] kmer size: 15; skip: 10; is_hpc: 0; #seq: 106
[M::mm_idx_stat::0.068*1.24] distinct minimizers: 74085 (68.29% are singletons); average occurrences: 2.914; average spacing: 5.372
[M::main] Version: 2.14-r886-dirty
[M::main] CMD: minimap2 -d yeast_nci/links/links_mapping.bai yeast_nci/links/all.fasta
[M::main] Real time: 0.070 sec; CPU: 0.087 sec; Peak RSS: 0.013 GB
[M::mm_idx_gen::0.058*1.03] collected minimizers
[M::mm_idx_gen::0.065*1.25] sorted minimizers
[M::main::0.066*1.25] loaded/built the index for 106 target sequence(s)
[M::mm_mapopt_update::0.067*1.25] mid_occ = 63
[M::mm_idx_stat] kmer size: 15; skip: 10; is_hpc: 0; #seq: 106
[M::mm_idx_stat::0.068*1.24] distinct minimizers: 74085 (68.29% are singletons); average occurrences: 2.914; average spacing: 5.372
[M::worker_pipeline::26.142*2.90] mapped

In [4]:
# read and edit SAM file here
samfile = pysam.AlignmentFile(project_name + "/link_mappings/links_mapping.sam", "r")
contained_filtered = defaultdict(set) # dictionary of sets
contained_seqs = defaultdict(set)

new_sam = open(project_name + "/link_mappings/links_mapping.filtered.sam", "w+")
new_sam.write(str(samfile.header))

for aln in samfile:
    name = aln.reference_name
    if not aln.is_unmapped:
        lengths = list(map(int, name.split("_").pop().split(":")))
        start = aln.reference_start
        end = aln.reference_end
        
        if start < lengths[0] and end > lengths[0]:
            new_sam.write(aln.to_string() + "\n")

new_sam.close()

In [5]:

!samtools view -Sb "$project_name/link_mappings/links_mapping.filtered.sam" > "$project_name/link_mappings/links_mapping.bam"
!samtools sort "$project_name/link_mappings/links_mapping.bam" > "$project_name/link_mappings/links_mapping.sorted.bam"
!samtools index "$project_name/link_mappings/links_mapping.sorted.bam"

!bedtools genomecov -bg -ibam "$project_name/link_mappings/links_mapping.sorted.bam" -g "$project_name/links/all.fasta" > "$project_name/link_mappings/coverage_map.bam"

## Obtaining Coverage of Links

In [9]:
link_coverage_temp = defaultdict(list)
link_coverage = defaultdict(int)

with open(project_name+ "/link_mappings/coverage_map.bam", "r") as cmap:
    line = cmap.readline()
    
    while line:
        name, start, end, coverage = line.split()
        name = re.sub(r'_[0-9]*:[0-9]*', '', name)
        start, end, coverage = map(int, [start, end, coverage])
   
        link_coverage_temp[name].append([end - start, coverage])
        line = cmap.readline()

for key, val in link_coverage_temp.items():
    sig_l = 0
    lc = 0
    for l, c in val:
        lc += l * c
        sig_l += l
    link_coverage[key] = round(float(lc) / float(sig_l))

print(link_coverage)
print()
print(contig_coverage)

defaultdict(<class 'int'>, {'contig_1-contig_34-': 39, 'contig_11+contig_20+': 50, 'contig_11-contig_17+': 25, 'contig_12+contig_24-': 15, 'contig_12-contig_30-': 25, 'contig_13+contig_26-': 43, 'contig_13-contig_26-': 41, 'contig_14+contig_27-': 65, 'contig_14-contig_18+': 14, 'contig_16+contig_10+': 31, 'contig_16+contig_7+': 30, 'contig_18+contig_28-': 2, 'contig_18+contig_37-': 56, 'contig_18+contig_38-': 1, 'contig_18-contig_16+': 16, 'contig_18-contig_24+': 10, 'contig_19+contig_20+': 40, 'contig_2+contig_34-': 33, 'contig_20+contig_30-': 11, 'contig_21-contig_33+': 17, 'contig_22-contig_35+': 45, 'contig_22-contig_35-': 390, 'contig_22-contig_36+': 45, 'contig_23+contig_23+': 11, 'contig_23+contig_35+': 187, 'contig_23+contig_35-': 82, 'contig_23+contig_36+': 40, 'contig_23-contig_25+': 44, 'contig_23-contig_35+': 74, 'contig_23-contig_35-': 11, 'contig_23-contig_36+': 46, 'contig_25+contig_23+': 5, 'contig_25+contig_25+': 15, 'contig_25+contig_25-': 2, 'contig_25+contig_35+': 6

# Filter contig containments

### Get contigs from the GFA

In [7]:
![ -e "$project_name/self_mappings" ] && rm -r "$project_name/self_mappings"
![ ! -e "$project_name/self_mappings" ] && mkdir "$project_name/self_mappings"

!cp "$project_name/contigs/all.fasta" "$project_name/contigs/all_ref.fasta"

!minimap2 -d "$project_name/self_mappings/mapping.bai" "$project_name/contigs/all_ref.fasta"
!minimap2 --cs -a "$project_name/contigs/all_ref.fasta" "$project_name/contigs/all.fasta" > "$project_name/self_mappings/mapping.sam"
!samtools view -Sb "$project_name/self_mappings/mapping.sam" > "$project_name/self_mappings/mapping.bam"
!samtools sort "$project_name/self_mappings/mapping.bam" > "$project_name/self_mappings/mapping.sorted.bam"
!samtools index "$project_name/self_mappings/mapping.sorted.bam"

[M::mm_idx_gen::0.283*1.00] collected minimizers
[M::mm_idx_gen::0.352*1.40] sorted minimizers
[M::main::0.422*1.33] loaded/built the index for 38 target sequence(s)
[M::mm_idx_stat] kmer size: 15; skip: 10; is_hpc: 0; #seq: 38
[M::mm_idx_stat::0.441*1.32] distinct minimizers: 2076414 (96.29% are singletons); average occurrences: 1.073; average spacing: 5.339
[M::main] Version: 2.14-r886-dirty
[M::main] CMD: minimap2 -d yeast_nci/self_mappings/mapping.bai yeast_nci/contigs/all_ref.fasta
[M::main] Real time: 0.446 sec; CPU: 0.585 sec; Peak RSS: 0.114 GB
[M::mm_idx_gen::0.252*1.00] collected minimizers
[M::mm_idx_gen::0.322*1.43] sorted minimizers
[M::main::0.322*1.43] loaded/built the index for 38 target sequence(s)
[M::mm_mapopt_update::0.349*1.40] mid_occ = 33
[M::mm_idx_stat] kmer size: 15; skip: 10; is_hpc: 0; #seq: 38
[M::mm_idx_stat::0.370*1.38] distinct minimizers: 2076414 (96.29% are singletons); average occurrences: 1.073; average spacing: 5.339
[M::worker_pipeline::1.476*2.33]

In [8]:
samfile = pysam.AlignmentFile(project_name + "/self_mappings/mapping.sorted.bam", "rb")
contained_filtered = defaultdict(set) # dictionary of sets
contained_seqs = defaultdict(set)

for aln in samfile:
    if aln.reference_name == aln.query_name:
        contained_seqs[aln.reference_name].add(None)
        contained_filtered[aln.reference_name].add(None)
    else:
        # check if query is within refernece
        ref_start = aln.reference_start
        ref_end = aln.reference_end
        q_len = aln.query_length
        
        if ref_start + q_len <= ref_end:
            contained_seqs[aln.reference_name].add(aln.query_name)
            contained_filtered[aln.reference_name].add(aln.query_name)

removed = True

while (removed):
    removable = []    
    for key in list(contained_filtered.keys()):
        removable.extend(list(contained_filtered[key]))
    
    removed = False

    for key in removable:
        if key in contained_filtered:
            del contained_filtered[key]
            removed = True
    
significant_contigs = list(contained_filtered.keys())
all_contigs = list(contained_seqs.keys())
removed_contigs = set(all_contigs) - set(significant_contigs)

![ -e "$project_name/valid_contigs" ] && rm -r "$project_name/valid_contigs"
![ ! -e "$project_name/valid_contigs" ] && mkdir "$project_name/valid_contigs"

cmd = "cat "

for c in significant_contigs:
    p = project_name + "/contigs/" + c + " "
    cmd += p

cmd += " > " + project_name + "/valid_contigs/all_valid.fasta"


!$cmd

print(significant_contigs)
print(all_contigs)
print(removed_contigs)

['contig_1', 'contig_10', 'contig_11', 'contig_12', 'contig_13', 'contig_14', 'contig_15', 'contig_16', 'contig_17', 'contig_18', 'contig_19', 'contig_2', 'contig_20', 'contig_21', 'contig_22', 'contig_24', 'contig_26', 'contig_27', 'contig_28', 'contig_3', 'contig_30', 'contig_31', 'contig_32', 'contig_33', 'contig_34', 'contig_37', 'contig_38', 'contig_4', 'contig_5', 'contig_6', 'contig_7', 'contig_8', 'contig_9']
['contig_1', 'contig_10', 'contig_11', 'contig_12', 'contig_13', 'contig_14', 'contig_15', 'contig_16', 'contig_17', 'contig_18', 'contig_19', 'contig_2', 'contig_20', 'contig_21', 'contig_22', 'contig_23', 'contig_24', 'contig_25', 'contig_26', 'contig_27', 'contig_28', 'contig_29', 'contig_3', 'contig_30', 'contig_31', 'contig_32', 'contig_33', 'contig_34', 'contig_35', 'contig_36', 'contig_37', 'contig_38', 'contig_4', 'contig_5', 'contig_6', 'contig_7', 'contig_8', 'contig_9']
{'contig_23', 'contig_29', 'contig_35', 'contig_25', 'contig_36'}
