# This notebook generates commands for de-novo motif finding using individual barcodes
- Unlike notebooks 17/18, we do not assign editC scores or apply any filtering here, because the number of edits per barcode is low.

In [1]:
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns
import glob
import os
import pandas as pd
import numpy as np
import os
import pybedtools
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
from qtools import Submitter
from tqdm import tnrange, tqdm_notebook

from scipy.special import betainc

In [2]:
# Project root for this analysis; every other directory below lives underneath it.
input_dir = '/home/bay001/projects/kris_apobec_20200121/permanent_data2/04_scRNA_RBFOX2_TIA/'
# Destination for the motif-analysis outputs.
output_dir = os.path.join(input_dir, 'motif_outputs')
# Per-barcode BED files that serve as input to this notebook.
bedfile_dir = os.path.join(input_dir, 'sailor_outputs_individual_barcodes_merged_bedfiles')
# Scratch space for the chr-prefixed and slopped/merged BED files.
tmp_dir = os.path.join(input_dir, 'tmp', 'individual_barcodes_fx_for_motif')

In [3]:
# Genome annotation files (hg19). Each path is defined exactly once here.
genome_fasta = '/projects/ps-yeolab3/bay001/annotations/hg19/hg19.fa'
# Chromosome sizes file, passed to the slop step as its `g` argument.
g = '/projects/ps-yeolab3/bay001/annotations/hg19/hg19.chrom.sizes'

# Cell barcodes to process; one BED file is expected per barcode.
barcodes = [
    'CAACAACCAAGCGCTC-1',
    'TAACGACAGTGAGGCT-1',
    'AAAGGTAGTGGATGAC-1',
    'GCGGATCAGGTCACCC-1',
    'AACCCAACATGAGTAA-1',
    'TACTTACAGTAGTCCT-1',
    'GGTTGTAGTAGTCCTA-1',
    'TCCTCCCAGTCGAATA-1',
    'TCGAAGTGTAAGAACT-1',
    'ATCACAGTCCCTAGGG-1',
]

# Value handed to the slop step as its `b` argument (padding around each site).
slop = 25

In [4]:
def barcode_to_bedfile(barcode):
    """
    Return the expected BED file path for a barcode inside ``bedfile_dir``.

    The path is ``<bedfile_dir>/possorted_genome_bam_MD-<barcode>.bed``.
    If that file does not exist, its path is printed so the missing input
    is visible in the cell output; the path is returned either way.

    :param barcode: barcode string used to fill in the file name.
    :return: the constructed path (not guaranteed to exist).
    """
    bed = os.path.join(bedfile_dir, "possorted_genome_bam_MD-{}.bed".format(barcode))
    # Explicit existence check instead of try/assert/except: assert statements
    # are removed when Python runs with -O, which would silently drop this report.
    if not os.path.exists(bed):
        print(bed)
    return bed

def fx_bed(bed, output_bed):
    """
    Copy a headerless, tab-separated 6-column BED file, prepending 'chr' to
    every chromosome name.

    :param bed: path of the input BED file (columns: chrom, start, end,
        name, score, strand).
    :param output_bed: path the rewritten BED file is saved to (tab-separated,
        no header, no index).
    """
    # Force chrom to be read as text. Left to type inference, a file that
    # only contains numeric contigs (e.g. 1, 2, 17) is parsed as integers and
    # the string concatenation below raises a TypeError.
    df = pd.read_csv(
        bed,
        sep='\t',
        names=['chrom','start','end','name','score','strand'],
        dtype={'chrom': str},
    )
    df['fxchrom'] = 'chr' + df['chrom']
    df[['fxchrom','start','end','name','score','strand']].to_csv(output_bed, sep='\t', header=False, index=False)
    
def slop_and_merge(bed, output_bed, slop, g):
    """
    Sort a BED file, slop its intervals, merge the result and save it.

    :param bed: path of the input BED file.
    :param output_bed: path the merged intervals are saved to.
    :param slop: value passed to bedtools slop as ``b``.
    :param g: genome file passed to bedtools slop as ``g``.
    """
    sorted_intervals = pybedtools.BedTool(bed).sort()
    padded_intervals = sorted_intervals.slop(b=slop, g=g)
    # Columns 4 and 5 are summarised with 'count', column 6 with 'distinct'.
    merged_intervals = padded_intervals.merge(
        c=(4,5,6),
        o=('count','count','distinct'),
    )
    merged_intervals.saveas(output_bed)

# Create the scratch directory up front so a fresh run does not fail when the
# first intermediate BED file is written.
os.makedirs(tmp_dir, exist_ok=True)

# For every barcode: add the 'chr' prefix to its BED file, then slop and merge.
for barcode in barcodes:
    bed = barcode_to_bedfile(barcode)
    # File name without directory or '.bed' extension, shared by both outputs.
    bed_stem = os.path.splitext(os.path.basename(bed))[0]
    output_bed = os.path.join(tmp_dir, bed_stem + ".fx.bed")
    output_slopped_bed = os.path.join(tmp_dir, bed_stem + ".fx.slop.bed")

    fx_bed(bed, output_bed)
    slop_and_merge(output_bed, output_slopped_bed, slop, g)

In [5]:
# Gather every slopped/merged BED file written to tmp_dir, in sorted order.
slop_bed_pattern = os.path.join(tmp_dir, '*.fx.slop.bed')
all_bedfiles = glob.glob(slop_bed_pattern)
all_bedfiles.sort()
# Sanity check: show how many files were found, then preview the first three.
print(len(all_bedfiles))
all_bedfiles[:3]

10


['/home/bay001/projects/kris_apobec_20200121/permanent_data2/04_scRNA_RBFOX2_TIA/tmp/individual_barcodes_fx_for_motif/possorted_genome_bam_MD-AAAGGTAGTGGATGAC-1.fx.slop.bed',
 '/home/bay001/projects/kris_apobec_20200121/permanent_data2/04_scRNA_RBFOX2_TIA/tmp/individual_barcodes_fx_for_motif/possorted_genome_bam_MD-AACCCAACATGAGTAA-1.fx.slop.bed',
 '/home/bay001/projects/kris_apobec_20200121/permanent_data2/04_scRNA_RBFOX2_TIA/tmp/individual_barcodes_fx_for_motif/possorted_genome_bam_MD-ATCACAGTCCCTAGGG-1.fx.slop.bed']

In [6]:
genome_fasta = '/projects/ps-yeolab3/bay001/annotations/hg19/hg19.fa'

# Build one analyze_motifs command per BED file whose output path does not exist yet.
cmds = []
for bed in all_bedfiles:
    # Output prefix: <output_dir>/<BED file name without its final extension>.
    homer_prefix = os.path.join(output_dir, os.path.splitext(os.path.basename(bed))[0])
    if os.path.exists(homer_prefix):
        # Output already present for this BED file; do not queue it again.
        continue
    out_homer_dir = homer_prefix
    out_file = out_homer_dir + '.svg'
    out_pickle_file = out_homer_dir + '.pickle'
    cmd_parts = [
        'module load eclipanalysis/0.0.3a;analyze_motifs',
        '--peaks {}'.format(bed),
        '--out_pickle_file {}'.format(out_pickle_file),
        '--out_file {}'.format(out_file),
        '--species hg19',
        '--out_homer_dir {}'.format(out_homer_dir),
        '--genome_fasta {}'.format(genome_fasta),
    ]
    # Every piece is followed by a single space, including the last one.
    cmd = ' '.join(cmd_parts) + ' '
    cmds.append(cmd)

In [8]:
# Hand the generated commands to qtools under the job name 'motif_analysis'.
# NOTE(review): with submit=True this cell appears to submit the job as soon
# as it runs - confirm before re-executing.
job_options = dict(
    nodes=1,
    ppn=1,
    array=True,
    submit=True,
    walltime='40:00:00',
)
Submitter(cmds, 'motif_analysis', **job_options)

Writing 10 tasks as an array-job.
Wrote commands to motif_analysis.sh.
Submitted script to queue home.
 Job ID: 21965495


<qtools.submitter.Submitter at 0x2ac41a07a590>