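# Scans the hg19 genome for every occurrence of a given motif on the forward strand and, by searching for its reverse complement, on the reverse strand; hits are written as one BED file per motif and sequence record.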

In [1]:
from Bio import SeqIO
import pandas as pd
import numpy as np
import os
import glob
import concurrent.futures
from tqdm import tnrange, tqdm_notebook

In [2]:
# Genome FASTA file that is parsed and scanned for motif occurrences.
input_fa = '/projects/ps-yeolab3/bay001/annotations/hg19/hg19.fa'
# Directory that receives the per-record BED files written by get_positions().
output_dir = '/home/bay001/projects/kris_apobec_20200121/permanent_data/final_analysis/01_SAILOR_bulk_rnaseq/outputs/motif_coords/'

In [3]:
motif = 'TTTTC'

def get_positions(record, motif=motif, output_dir=output_dir, strand='+'):
    """Write a BED file listing every occurrence of ``motif`` in ``record``.

    The record's sequence is upper-cased before matching, so soft-masked
    (lower-case) bases are searched as well. ``motif`` itself is compared
    as given. Overlapping occurrences are all reported.

    Parameters
    ----------
    record
        Sequence record exposing ``.seq`` and ``.name`` (as yielded by
        ``SeqIO.parse``).
    motif : str
        Motif to search for.
    output_dir : str
        Directory in which ``<motif>.<record.name>.bed`` is written.
    strand : str
        Value written to the strand column of every BED line.

    Returns
    -------
    None
        Results are only written to disk.
    """
    # Upper-case once instead of per window; this holds one extra copy of the
    # sequence in memory but removes a slice + upper() for every base.
    sequence = str(record.seq).upper()
    width = len(motif)
    positions = []
    progress = tnrange(len(sequence))
    scanned = 0
    # str.find() inspects every start position, including the final window
    # that the previous `i < len(seq) - len(motif)` bound skipped.
    start = sequence.find(motif)
    while start != -1:
        positions.append("{}\t{}\t{}\t{}\t{}\t{}\n".format(
            record.name, start, start + width, motif, '0', strand
        ))
        progress.update(start - scanned)
        scanned = start
        # Advance by one base only, so overlapping matches are still found.
        start = sequence.find(motif, start + 1)
    progress.update(len(sequence) - scanned)
    progress.close()
    with open(os.path.join(output_dir, "{}.{}.bed".format(motif, record.name)), 'w') as o:
        o.writelines(positions)

# Scan every FASTA record in parallel, one worker task per record.
records = SeqIO.parse(input_fa, "fasta")
with concurrent.futures.ProcessPoolExecutor() as executor:
    # executor.map() consumes `records` as soon as it is called, so zipping the
    # same iterator with the results produced an empty loop and never
    # retrieved the results. Iterating the results directly re-raises any
    # exception that occurred in a worker.
    for positions in executor.map(get_positions, records):
        print(positions)

In [4]:
motif = 'GAAAA'

def get_positions(record, motif=motif, output_dir=output_dir, strand='-'):
    """Write a BED file listing every occurrence of ``motif`` in ``record``.

    The record's sequence is upper-cased before matching, so soft-masked
    (lower-case) bases are searched as well. ``motif`` itself is compared
    as given. Overlapping occurrences are all reported. Hits are labelled
    with ``strand`` ('-' by default in this cell).

    Parameters
    ----------
    record
        Sequence record exposing ``.seq`` and ``.name`` (as yielded by
        ``SeqIO.parse``).
    motif : str
        Motif to search for.
    output_dir : str
        Directory in which ``<motif>.<record.name>.bed`` is written.
    strand : str
        Value written to the strand column of every BED line.

    Returns
    -------
    None
        Results are only written to disk.
    """
    # Upper-case once instead of per window; this holds one extra copy of the
    # sequence in memory but removes a slice + upper() for every base.
    sequence = str(record.seq).upper()
    width = len(motif)
    positions = []
    progress = tnrange(len(sequence))
    scanned = 0
    # str.find() inspects every start position, including the final window
    # that the previous `i < len(seq) - len(motif)` bound skipped.
    start = sequence.find(motif)
    while start != -1:
        positions.append("{}\t{}\t{}\t{}\t{}\t{}\n".format(
            record.name, start, start + width, motif, '0', strand
        ))
        progress.update(start - scanned)
        scanned = start
        # Advance by one base only, so overlapping matches are still found.
        start = sequence.find(motif, start + 1)
    progress.update(len(sequence) - scanned)
    progress.close()
    with open(os.path.join(output_dir, "{}.{}.bed".format(motif, record.name)), 'w') as o:
        o.writelines(positions)

# Scan every FASTA record in parallel, one worker task per record.
records = SeqIO.parse(input_fa, "fasta")
with concurrent.futures.ProcessPoolExecutor() as executor:
    # executor.map() consumes `records` as soon as it is called, so zipping the
    # same iterator with the results produced an empty loop and never
    # retrieved the results. Iterating the results directly re-raises any
    # exception that occurred in a worker.
    for positions in executor.map(get_positions, records):
        print(positions)