# https://www.nature.com/articles/ncomms9864
- Goal: extract candidate chimeric sequences, i.e. the parts of each read that flank an aligned miRNA.

In [1]:
from collections import defaultdict
import os
import glob
from Bio import SeqIO
from Bio.Seq import Seq
from Bio.Alphabet import generic_dna
from tqdm import tnrange, tqdm_notebook

In [2]:
# Alignment table that the cells below read line by line as tab-separated rows.
# NOTE(review): despite the .sam extension, the parsing code unpacks each row
# as eight bowtie-style columns -- confirm the file really has that layout.
fn = '/home/bay001/projects/codebase/clash_seq_pipeline/tests/wf_clash_seq/wf_clash_seq/results/miRNA.sam'
# FASTA file parsed below to look up the full sequence of a read by its record id.
fa_file = '/home/bay001/projects/codebase/clash_seq_pipeline/tests/wf_clash_seq/wf_clash_seq/results/SRR2413156.filtered.collapsed.3trim.5trim.fa'

In [3]:
# Maps miRNA name -> {read name -> sequence column of that alignment row}.
mir_reads = defaultdict(dict)

with open(fn, 'r') as f:
    for line in f:
        # Every row must split into exactly eight tab-separated fields,
        # otherwise the unpacking raises ValueError. Only the 1st (miRNA),
        # 3rd (read name) and 5th (sequence) fields are stored; `strand` is
        # unpacked but not used in this cell.
        mir, strand, read_name, _, sequence, _, _, _ = line.split('\t')
        mir_reads[mir][read_name] = sequence

In [4]:
# Print the {read name: sequence} table of every miRNA that was recorded
# against more than one read.
# .items() is used instead of .iteritems() because the latter does not exist
# on Python 3 dictionaries; .items() works on both Python 2 and Python 3.
for key, value in mir_reads.items():
    if len(value) > 1:
        print(value)

In [11]:
def get_reference_seq_from_query(row):
    """
    Rebuild the reference sequence for one row of bowtie standard output.

    The row is split into eight tab-separated columns. Starting from the
    sequence column, every mismatch descriptor in the last column
    (comma-separated, each formatted ``offset:ref>query``) is undone by
    writing the ``ref`` base back at the described position.

    For "-" strand rows the offset is counted from the opposite end of the
    sequence (``len - 1 - offset``); the sequence itself is NOT
    reverse-complemented here.

    If the base found in the sequence column at a mismatch position differs
    from the descriptor's ``query`` base, the inconsistency is printed and
    processing continues.

    :param row: one line of bowtie output, with or without a trailing
        line terminator ("\\n" or "\\r\\n")
    :return: tuple ``(ref_name, rseq, strand)``: the third column, the
        reconstructed sequence, and the second column of the row
    :raises ValueError: if the row does not have exactly eight columns, or a
        mismatch descriptor is not of the form ``offset:ref>query``
    :raises IndexError: if a mismatch position lies outside the sequence
    """
    # Strip "\r" as well as "\n": with only "\n" removed, a CRLF-terminated
    # row with an empty mismatch column would yield "\r" and fail to parse.
    fields = row.rstrip('\r\n').split('\t')
    # Columns 1, 4, 6 and 7 are not needed for the reconstruction.
    _, strand, ref_name, _, qseq, _, _, mutation_string = fields

    rseq = qseq
    mutations = mutation_string.split(',') if mutation_string else []
    for mutation in mutations:
        offset, change = mutation.split(':')
        ref, query = change.split('>')

        if strand == "-":
            # Offsets of "-" rows are mirrored onto the sequence column.
            pos0base = (len(rseq) - 1) - int(offset)
        else:
            pos0base = int(offset)

        rseq = rseq[:pos0base] + ref + rseq[pos0base + 1:]
        # Explicit comparison instead of `assert` inside try/except, so the
        # diagnostic is still produced when Python runs with -O.
        if qseq[pos0base] != query:
            print(qseq, strand, pos0base, ref, query)

    return ref_name, rseq, strand

def get_name2seq_dict(fa_file, names):
    """
    Collect the sequences of selected records from a FASTA file.

    :param fa_file: path to the FASTA file
    :param names: iterable of record ids to keep; a set/frozenset is used
        as-is, anything else is copied into a set once so that each
        membership test is constant-time
    :return: dictionary ``{record id: sequence string}`` for every record
        whose id is in ``names``; ids that never occur in the file are simply
        absent, and if an id occurs more than once the last record wins
    """
    wanted = names if isinstance(names, (set, frozenset)) else set(names)
    # Plain "r" mode: the "U" flag is deprecated on Python 3 and rejected by
    # Python 3.11+. The context manager closes the file even if parsing fails.
    with open(fa_file, "r") as handle:
        return {
            record.id: str(record.seq)
            for record in SeqIO.parse(handle, "fasta")
            if record.id in wanted
        }

In [12]:
def get_rnames_and_rseq_fragments_from_bowtie_output(fn):
    """
    Parse a bowtie output file into one alignment fragment per read.

    Every line is handed to ``get_reference_seq_from_query``. The name it
    returns is cut at its last "#" (a name containing no "#" becomes the
    empty string) and used as the key; a later line that produces the same
    key replaces the earlier entry.

    :param fn: path to the bowtie output file
    :return: defaultdict mapping the truncated read name to
        ``{"fragment": reconstructed sequence, "strand": strand}``
    """
    fragments_by_read = defaultdict(dict)
    with open(fn, 'r') as bowtie_rows:
        for row in bowtie_rows:
            full_name, fragment, orientation = get_reference_seq_from_query(row)
            # Drop the last "#"-delimited field of the name.
            # May need to change based on read names.
            read_key = full_name.rpartition("#")[0]
            fragments_by_read[read_key] = {"fragment": fragment, "strand": orientation}
    return fragments_by_read

In [7]:
# Build the alignment table first: `rnames` is not assigned anywhere earlier
# in this file, so reading it here would raise NameError when the file is run
# from top to bottom.
rnames = get_rnames_and_rseq_fragments_from_bowtie_output(fn)
# Full sequence of every read that has an alignment, keyed by read name.
name2seq = get_name2seq_dict(fa_file, set(rnames.keys()))

In [30]:
def write_candidate_chimeric_targets_to_file(rnames, name2seq, output_file, min_seq_len=18):
    """
    Write the read sequence flanking each aligned fragment to a FASTA file.

    For every read in ``rnames`` the aligned fragment is located in the full
    read sequence (first occurrence). The part before it and the part after
    it are labelled by strand: on "+" the part before is "upstream" and the
    part after is "downstream"; on "-" the labels are swapped. Sequences are
    written exactly as sliced (no reverse-complementing). A flank is written
    only if it is strictly longer than ``min_seq_len``; headers have the form
    ``>{rname}_{strand}_downstream`` and ``>{rname}_{strand}_upstream``, with
    the downstream record first.

    :param rnames: mapping of read name to
        ``{"fragment": aligned sequence, "strand": "+" or "-"}``
    :param name2seq: mapping of read name to the full read sequence
    :param output_file: path of the FASTA file to create (overwritten)
    :param min_seq_len: a flank must be longer than this many characters
    :return: None normally; 1 as soon as an entry with a strand other than
        "+" or "-" is met (records written before it remain in the file)
    :raises KeyError: if a read name is missing from ``name2seq``
    :raises AssertionError: if a fragment does not occur in its full read
    """
    with open(output_file, 'w') as o:
        # .items() exists on both Python 2 and Python 3 mappings, whereas
        # .iteritems() is Python 2 only.
        for rname, d in rnames.items():
            rseq = d['fragment']
            strand = d['strand']
            fullseq = name2seq[rname]

            start = fullseq.find(rseq)
            if start == -1:
                # Raised explicitly rather than via `assert` so the check is
                # not stripped under `python -O`; the exception type is the
                # one the original assertion produced.
                raise AssertionError(
                    "cannot find the fragment of read {!r} in its full "
                    "sequence, mutation must be wrong".format(rname)
                )
            lo = fullseq[:start]
            hi = fullseq[start + len(rseq):]

            if strand == "-":
                downstream_seq = lo
                upstream_seq = hi
            elif strand == "+":
                downstream_seq = hi
                upstream_seq = lo
            else:
                # Unknown strand: stop and signal it through the return value.
                return 1

            if len(downstream_seq) > min_seq_len:
                o.write(">{}_{}_downstream\n{}\n".format(rname, strand, downstream_seq))
            if len(upstream_seq) > min_seq_len:
                o.write(">{}_{}_upstream\n{}\n".format(rname, strand, upstream_seq))
                

In [31]:
# FASTA file that receives the candidate flanking sequences.
out_file = '/home/bay001/projects/codebase/clash_seq_pipeline/tests/wf_clash_seq/wf_clash_seq/results/candidate_chimerics.fa'
# min_seq_len is left at its default of 18.
write_candidate_chimeric_targets_to_file(rnames, name2seq, out_file)

In [None]:
# End-to-end run: parse the alignments, fetch the full sequence of every
# aligned read, then write the flanking candidates.
rnames = get_rnames_and_rseq_fragments_from_bowtie_output(fn)
name2seq = get_name2seq_dict(fa_file, set(rnames.keys()))
# The writer has three required parameters; calling it with no arguments
# raises TypeError, so pass the tables built above and the output path.
write_candidate_chimeric_targets_to_file(rnames, name2seq, out_file)