# Run motif finding on confidence-filtered edit windows

In [1]:
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns
import glob
import os
import pandas as pd
import numpy as np
import os
import pybedtools
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
from qtools import Submitter
from tqdm import tnrange, tqdm_notebook

In [2]:
# Directory that is searched for the input '*.fx.bed' files.
input_dir = '/home/bay001/projects/kris_apobec_20200121/permanent_data2/04_scRNA_RBFOX2_TIA/sailor_outputs_groups/combined_outputs_w_cov_info'
# Directory that receives the windowed BED files and the motif-analysis outputs.
output_dir = '/home/bay001/projects/kris_apobec_20200121/permanent_data2/04_scRNA_RBFOX2_TIA/motif_outputs'

In [3]:
# Chromosome sizes file handed to bedtools slop as its genome (g=) argument.
chrom_sizes = '/projects/ps-yeolab3/bay001/annotations/hg19/hg19.chrom.sizes'

In [4]:
# Collect every '*.fx.bed' file in the input directory, in sorted order,
# then show how many were found and a small sample of the paths.
bed_pattern = os.path.join(input_dir, '*.fx.bed')
all_beds = sorted(glob.glob(bed_pattern))
print(len(all_beds))
all_beds[:3]

19


['/home/bay001/projects/kris_apobec_20200121/permanent_data2/04_scRNA_RBFOX2_TIA/sailor_outputs_groups/combined_outputs_w_cov_info/RBFOX2_ORFS.barcodes_RBFOX2-TIA1_subset.fx.bed',
 '/home/bay001/projects/kris_apobec_20200121/permanent_data2/04_scRNA_RBFOX2_TIA/sailor_outputs_groups/combined_outputs_w_cov_info/RBFOX2_ORFS.barcodes_RBFOX2-TIA1_subset_more.fx.bed',
 '/home/bay001/projects/kris_apobec_20200121/permanent_data2/04_scRNA_RBFOX2_TIA/sailor_outputs_groups/combined_outputs_w_cov_info/RBFOX2_ORFS.barcodes_RBFOX2-TIA1_subset_noFeatureCells.fx.bed']

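# Filter each BED file by confidence, extend the sites into windows, and merge overlapping windows (read one of the outputs in and check it on IGV)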

In [5]:
# Filtering / windowing parameters.
conf = 0.999  # rows are kept when their 'name' column is >= this value
flank_size = 25  # bases added to both sides of every interval by slop
window_size = flank_size * 2 + 1  # only used to label the output file names

# Make sure the destination exists before anything is written into it.
if not os.path.isdir(output_dir):
    os.makedirs(output_dir)

for bed in all_beds:
    # NOTE: the input stem and "windows_" are joined without a separator
    # (e.g. "<name>.fxwindows_51.conf0.999.bed"). Kept as-is so that file
    # names stay identical to those written by earlier runs.
    output_windows = os.path.join(
        output_dir,
        os.path.splitext(os.path.basename(bed))[0] + "windows_{}.conf{}.bed".format(window_size, conf)
    )
    df = pd.read_csv(bed, sep='\t', names=['chrom','start','end','name','score','strand'])
    # NOTE(review): presumably the 4th column of these .fx.bed files holds the
    # confidence value - confirm against the upstream formatting step.
    df = df[df['name'] >= conf]
    if df.empty:
        # Nothing passed the cutoff: an empty window file would only lead to a
        # motif job with no input, so skip this file and say so.
        print("No sites with conf >= {} in {}; skipping.".format(conf, bed))
        continue
    bedtool = pybedtools.BedTool.from_dataframe(df)
    # Extend each site by flank_size on both sides, then sort so that
    # merge sees overlapping windows next to each other.
    bedtool = bedtool.slop(b=flank_size, g=chrom_sizes).sort()
    # NOTE(review): this merge is not strand-aware, so overlapping windows from
    # opposite strands are combined and column 6 becomes a list of the distinct
    # strands - confirm this is what the motif step expects.
    bedtool = bedtool.merge(c=(4,5,6), o=('collapse','collapse','distinct'))
    bedtool.saveas(output_windows)
    # Every step above leaves a temp file behind; remove them once the result
    # is saved so they do not pile up across input files. This clears all
    # pybedtools temp files created in the current session.
    pybedtools.cleanup()

In [6]:
# Gather the BED files in the output directory whose names end with the
# current confidence cutoff, in sorted order, and report how many there are.
windows_pattern = os.path.join(output_dir, '*conf{}.bed'.format(conf))
all_bedfiles = sorted(glob.glob(windows_pattern))
len(all_bedfiles)

19

In [7]:
# Build one analyze_motifs command for every window BED file whose output
# path (the BED path without its final extension, inside output_dir) does
# not exist yet.
genome_fasta = '/projects/ps-yeolab3/bay001/annotations/hg19/hg19.fa'
cmds = []
for bed in all_bedfiles:
    # All outputs of a job share this prefix.
    out_homer_dir = os.path.join(output_dir, os.path.splitext(os.path.basename(bed))[0])
    if os.path.exists(out_homer_dir):
        # Output path already present: do not queue this file again.
        continue
    out_file = out_homer_dir + '.svg'
    out_pickle_file = out_homer_dir + '.pickle'
    cmd_parts = [
        'module load eclipanalysis/0.0.3a;analyze_motifs',
        '--peaks {}'.format(bed),
        '--out_pickle_file {}'.format(out_pickle_file),
        '--out_file {}'.format(out_file),
        '--species hg19',
        '--out_homer_dir {}'.format(out_homer_dir),
        '--genome_fasta {}'.format(genome_fasta),
    ]
    # Parts are space-separated and the command ends with a trailing space.
    cmd = ' '.join(cmd_parts) + ' '
    cmds.append(cmd)

In [8]:
# Display the generated commands for inspection.
cmds

['module load eclipanalysis/0.0.3a;analyze_motifs --peaks /home/bay001/projects/kris_apobec_20200121/permanent_data2/04_scRNA_RBFOX2_TIA/motif_outputs/RBFOX2_ORFS.barcodes_RBFOX2-TIA1_subset.fxwindows_51.conf0.999.bed --out_pickle_file /home/bay001/projects/kris_apobec_20200121/permanent_data2/04_scRNA_RBFOX2_TIA/motif_outputs/RBFOX2_ORFS.barcodes_RBFOX2-TIA1_subset.fxwindows_51.conf0.999.pickle --out_file /home/bay001/projects/kris_apobec_20200121/permanent_data2/04_scRNA_RBFOX2_TIA/motif_outputs/RBFOX2_ORFS.barcodes_RBFOX2-TIA1_subset.fxwindows_51.conf0.999.svg --species hg19 --out_homer_dir /home/bay001/projects/kris_apobec_20200121/permanent_data2/04_scRNA_RBFOX2_TIA/motif_outputs/RBFOX2_ORFS.barcodes_RBFOX2-TIA1_subset.fxwindows_51.conf0.999 --genome_fasta /projects/ps-yeolab3/bay001/annotations/hg19/hg19.fa ',
 'module load eclipanalysis/0.0.3a;analyze_motifs --peaks /home/bay001/projects/kris_apobec_20200121/permanent_data2/04_scRNA_RBFOX2_TIA/motif_outputs/RBFOX2_ORFS.barcodes_

In [9]:
# Hand the command list to qtools' Submitter under the job name
# 'motif_analysis'. The options are collected first and unpacked into the
# call; the call stays the last expression so its result is displayed.
submit_options = dict(
    sh='motif_analysis.sh',
    nodes=1,
    ppn=1,
    array=True,
    submit=True,
    walltime='12:00:00',
)
Submitter(cmds, 'motif_analysis', **submit_options)

Writing 2 tasks as an array-job.
Wrote commands to motif_analysis.sh.
Submitted script to queue home.
 Job ID: 21389173


<qtools.submitter.Submitter at 0x2b97d28bef50>