In [435]:
import os
from glob import glob

import pandas as pd
import pybedtools

# Load peak files

In [436]:
# Gather every cleaned peak table, in a stable (sorted) order.
peak_filepaths = glob('6_overlapped_confident_peaks/*cleaned.tsv')
peak_filepaths.sort()

# Key each table by its file id: the text after the last '/' and before '.tsv'.
file_id_to_df = {}
for peak_filepath in peak_filepaths:
    print(peak_filepath)
    file_id = peak_filepath.rpartition('/')[2].partition('.tsv')[0]
    print('\t', file_id)
    file_id_to_df[file_id] = pd.read_csv(peak_filepath, index_col=0, sep='\t')

6_overlapped_confident_peaks/RBFOX2_ai_FOX2-8e_cleaned.tsv
	 RBFOX2_ai_FOX2-8e_cleaned
6_overlapped_confident_peaks/RBFOX2_ai_FOX2-APOBEC1_cleaned.tsv
	 RBFOX2_ai_FOX2-APOBEC1_cleaned
6_overlapped_confident_peaks/RBFOX2_ai_FOX2-MinA_cleaned.tsv
	 RBFOX2_ai_FOX2-MinA_cleaned
6_overlapped_confident_peaks/RBFOX2_ai_FOX2-P0078_cleaned.tsv
	 RBFOX2_ai_FOX2-P0078_cleaned
6_overlapped_confident_peaks/RBFOX2_both_FOX2-P0078_cleaned.tsv
	 RBFOX2_both_FOX2-P0078_cleaned
6_overlapped_confident_peaks/RBFOX2_ct_FOX2-8e_cleaned.tsv
	 RBFOX2_ct_FOX2-8e_cleaned
6_overlapped_confident_peaks/RBFOX2_ct_FOX2-APOBEC1_cleaned.tsv
	 RBFOX2_ct_FOX2-APOBEC1_cleaned
6_overlapped_confident_peaks/RBFOX2_ct_FOX2-MinA_cleaned.tsv
	 RBFOX2_ct_FOX2-MinA_cleaned
6_overlapped_confident_peaks/RBFOX2_ct_FOX2-P0078_cleaned.tsv
	 RBFOX2_ct_FOX2-P0078_cleaned
6_overlapped_confident_peaks/SLBP_ai_SLBP-8e_cleaned.tsv
	 SLBP_ai_SLBP-8e_cleaned
6_overlapped_confident_peaks/SLBP_ai_SLBP-APOBEC1_cleaned.tsv
	 SLBP_ai_SLBP-APOBEC1

# Make bedtools from peak files

In [437]:
# Turn each peak table into a BedTool carrying only coordinates and strand.
# No score filtering happens at this step (a `score > .99` cut was previously
# present here only as commented-out code).
bed_columns = ['chrom', 'start', 'end', 'strand']

file_id_to_bedtool = {}
for file_id, df in file_id_to_df.items():
    print(file_id)
    bedtool = pybedtools.BedTool.from_dataframe(df.loc[:, bed_columns])
    file_id_to_bedtool[file_id] = bedtool
    print('\t', len(bedtool))

RBFOX2_ai_FOX2-8e_cleaned
	 7882
RBFOX2_ai_FOX2-APOBEC1_cleaned
	 540
RBFOX2_ai_FOX2-MinA_cleaned
	 736
RBFOX2_ai_FOX2-P0078_cleaned
	 3236
RBFOX2_both_FOX2-P0078_cleaned
	 4003
RBFOX2_ct_FOX2-8e_cleaned
	 729
RBFOX2_ct_FOX2-APOBEC1_cleaned
	 1897
RBFOX2_ct_FOX2-MinA_cleaned
	 338
RBFOX2_ct_FOX2-P0078_cleaned
	 1151
SLBP_ai_SLBP-8e_cleaned
	 1868
SLBP_ai_SLBP-APOBEC1_cleaned
	 1091
SLBP_ai_SLBP-MinA_cleaned
	 1271
SLBP_ai_SLBP-P0078_cleaned
	 1180
SLBP_both_SLBP-P0078_cleaned
	 1581
SLBP_ct_SLBP-8e_cleaned
	 402
SLBP_ct_SLBP-APOBEC1_cleaned
	 885
SLBP_ct_SLBP-MinA_cleaned
	 398
SLBP_ct_SLBP-P0078_cleaned
	 462


# Load regions in which to shuffle STAMP peaks

In [438]:
# Region map BED: the regions within which STAMP peaks are later shuffled.
REGION_MAP_BED_PATH = '/projects/ps-yeolab3/ekofman/ReferenceData/peakcalling_regions/hg38/hg38_cellranger_region_map.bed'
region_map_bedtool = pybedtools.BedTool(REGION_MAP_BED_PATH)
len(region_map_bedtool)

620869

# Figure out which region each peak was in

In [439]:
def add_index(r):
    """Build a 'chrom:start-end(strand)' label from a row-like object.

    `r` only needs `chrom`, `start`, `end` and `strand` attributes.
    """
    return f'{r.chrom}:{r.start}-{r.end}({r.strand})'

In [440]:
# Column labels, spelled out one per line.
# NOTE(review): presumably this mirrors the column layout of the cleaned peak
# TSVs — confirm against the files.
names = [
    'chrom',
    'start',
    'end',
    'edit_fraction',
    'strand',
    'target_bases',
    'edited_bases',
    'num_edited_reads',
    'total_reads_in_region',
    'fraction_reads_edited',
    'mean_depth',
    'num_substrate_bases',
    'subregion_coverage',
    'subregion_conversions',
    'region_coverage',
    'region_conversions',
    'gene_coverage',
    'gene_conversions',
    'score',
]
names


['chrom',
 'start',
 'end',
 'edit_fraction',
 'strand',
 'target_bases',
 'edited_bases',
 'num_edited_reads',
 'total_reads_in_region',
 'fraction_reads_edited',
 'mean_depth',
 'num_substrate_bases',
 'subregion_coverage',
 'subregion_conversions',
 'region_coverage',
 'region_conversions',
 'gene_coverage',
 'gene_conversions',
 'score']

In [441]:
file_id_to_intersected = {}

# Labels for the intersect output (-wa -wb): the region-map fields followed by
# the peak fields. The peak BedTools are built from exactly four columns
# (chrom, start, end, strand), so exactly four peak labels are supplied; a
# longer label list would shift `strand` onto an empty padding column.
# NOTE(review): assumes the region map BED has exactly six columns — confirm.
region_columns = ['chrom_region', 'start_region', 'end_region', 'symbol_region', 'label_region', 'strand_region']
peak_columns = ['chrom', 'start', 'end', 'strand']

for file_id, bedtool in file_id_to_bedtool.items():
    print(file_id)
    intersected = region_map_bedtool.intersect(bedtool, wa=True, wb=True).to_dataframe(names=region_columns + peak_columns)

    # Index every row by its peak label so peaks reported against several
    # regions can be collapsed below.
    intersected.index = intersected.apply(add_index, axis=1)

    # Keep only peaks lying entirely inside the region they were matched to.
    # .copy() makes the filtered frame independent so adding a column is safe.
    fully_contained = (intersected.start >= intersected.start_region) & (intersected.end <= intersected.end_region)
    intersected = intersected[fully_contained].copy()
    intersected['peak_size'] = intersected['end'] - intersected['start']

    # One row per peak: keep the first region a peak was matched to.
    intersected = intersected[~intersected.index.duplicated(keep='first')]
    print('\t', len(intersected), len(bedtool))
    file_id_to_intersected[file_id] = intersected
    

RBFOX2_ai_FOX2-8e_cleaned
	 7838 7882
RBFOX2_ai_FOX2-APOBEC1_cleaned
	 540 540
RBFOX2_ai_FOX2-MinA_cleaned
	 732 736
RBFOX2_ai_FOX2-P0078_cleaned
	 3232 3236
RBFOX2_both_FOX2-P0078_cleaned
	 3997 4003
RBFOX2_ct_FOX2-8e_cleaned
	 727 729
RBFOX2_ct_FOX2-APOBEC1_cleaned
	 1895 1897
RBFOX2_ct_FOX2-MinA_cleaned
	 337 338
RBFOX2_ct_FOX2-P0078_cleaned
	 1149 1151
SLBP_ai_SLBP-8e_cleaned
	 1854 1868
SLBP_ai_SLBP-APOBEC1_cleaned
	 1080 1091
SLBP_ai_SLBP-MinA_cleaned
	 1264 1271
SLBP_ai_SLBP-P0078_cleaned
	 1174 1180
SLBP_both_SLBP-P0078_cleaned
	 1573 1581
SLBP_ct_SLBP-8e_cleaned
	 402 402
SLBP_ct_SLBP-APOBEC1_cleaned
	 880 885
SLBP_ct_SLBP-MinA_cleaned
	 397 398
SLBP_ct_SLBP-P0078_cleaned
	 459 462


# Permute the STAMP peaks within their respective region

In [442]:
import random


NUM_RAND_REGIONS = 20

def get_randomized_region(chrom, start, end, strand, peak_size, num_regions=None, rng=None):
    """Draw random placements of a `peak_size`-wide window inside [start, end].

    Parameters
    ----------
    chrom, strand : values written verbatim into each label.
    start, end : bounds of the enclosing region; every drawn window satisfies
        start <= window_start and window_start + peak_size <= end.
    peak_size : width of the window to place.
    num_regions : number of placements to draw; defaults to NUM_RAND_REGIONS.
    rng : optional object exposing `randint(a, b)` (e.g. `random.Random(seed)`)
        so draws can be made reproducible; defaults to the `random` module.

    Returns
    -------
    list of str
        Labels of the form 'chrom:start-end(strand)'.

    Raises
    ------
    ValueError
        Propagated from `randint` when the window is wider than the region.
    """
    if num_regions is None:
        num_regions = NUM_RAND_REGIONS
    source = random if rng is None else rng

    # Latest start that still keeps the whole window inside the region;
    # randint is inclusive on both ends.
    latest_start = end - peak_size

    random_regions = []
    for _ in range(num_regions):
        random_start = source.randint(start, latest_start)
        random_regions.append('{}:{}-{}({})'.format(chrom, random_start, random_start + peak_size, strand))

    return random_regions
    

In [443]:
def get_random_regions(chrom, start, end, peak_start, peak_end, strand):
    """Randomly re-place a peak within the region bounded by `start` and `end`.

    The peak width is `peak_end - peak_start`; the drawing itself is delegated
    to `get_randomized_region`, whose list of labels is returned unchanged.
    """
    return get_randomized_region(chrom, start, end, strand, peak_end - peak_start)
    

In [444]:
# Show the file ids that have an intersected table.
file_id_to_intersected.keys()

dict_keys(['RBFOX2_ai_FOX2-8e_cleaned', 'RBFOX2_ai_FOX2-APOBEC1_cleaned', 'RBFOX2_ai_FOX2-MinA_cleaned', 'RBFOX2_ai_FOX2-P0078_cleaned', 'RBFOX2_both_FOX2-P0078_cleaned', 'RBFOX2_ct_FOX2-8e_cleaned', 'RBFOX2_ct_FOX2-APOBEC1_cleaned', 'RBFOX2_ct_FOX2-MinA_cleaned', 'RBFOX2_ct_FOX2-P0078_cleaned', 'SLBP_ai_SLBP-8e_cleaned', 'SLBP_ai_SLBP-APOBEC1_cleaned', 'SLBP_ai_SLBP-MinA_cleaned', 'SLBP_ai_SLBP-P0078_cleaned', 'SLBP_both_SLBP-P0078_cleaned', 'SLBP_ct_SLBP-8e_cleaned', 'SLBP_ct_SLBP-APOBEC1_cleaned', 'SLBP_ct_SLBP-MinA_cleaned', 'SLBP_ct_SLBP-P0078_cleaned'])

In [445]:
# For every sample, build a table with one row per peak and one column per
# random placement of that peak inside its region.
file_id_to_shuffle_dfs = {}

for file_id, intersected in file_id_to_intersected.items():
    print(file_id)

    all_regions, all_indices = [], []

    for index, row in intersected.iterrows():
        chrom = row.loc['chrom_region']
        start, end = row.loc['start_region'], row.loc['end_region']
        peak_start, peak_end = row.loc['start'], row.loc['end']
        strand = row.loc['strand_region']

        try:
            n_regions = get_random_regions(chrom, start, end, peak_start, peak_end, strand)
        except Exception as e:
            # Report the offending peak and move on to the next one.
            print(chrom, start, end, peak_start, peak_end, strand)
            print(e)
        else:
            all_regions.append(n_regions)
            all_indices.append(index)

    shuffle_df = pd.DataFrame(all_regions, index=all_indices)
    is_repeat = shuffle_df.index.duplicated(keep='first')
    shuffle_df = shuffle_df.loc[~is_repeat]

    print('\t', len(shuffle_df), len(shuffle_df.columns))
    file_id_to_shuffle_dfs[file_id] = shuffle_df

RBFOX2_ai_FOX2-8e_cleaned
	 7838 20
RBFOX2_ai_FOX2-APOBEC1_cleaned
	 540 20
RBFOX2_ai_FOX2-MinA_cleaned
	 732 20
RBFOX2_ai_FOX2-P0078_cleaned
	 3232 20
RBFOX2_both_FOX2-P0078_cleaned
	 3997 20
RBFOX2_ct_FOX2-8e_cleaned
	 727 20
RBFOX2_ct_FOX2-APOBEC1_cleaned
	 1895 20
RBFOX2_ct_FOX2-MinA_cleaned
	 337 20
RBFOX2_ct_FOX2-P0078_cleaned
	 1149 20
SLBP_ai_SLBP-8e_cleaned
	 1854 20
SLBP_ai_SLBP-APOBEC1_cleaned
	 1080 20
SLBP_ai_SLBP-MinA_cleaned
	 1264 20
SLBP_ai_SLBP-P0078_cleaned
	 1174 20
SLBP_both_SLBP-P0078_cleaned
	 1573 20
SLBP_ct_SLBP-8e_cleaned
	 402 20
SLBP_ct_SLBP-APOBEC1_cleaned
	 880 20
SLBP_ct_SLBP-MinA_cleaned
	 397 20
SLBP_ct_SLBP-P0078_cleaned
	 459 20


# From aggregate dfs containing shuffled information, extract each column as a random bed file and save

In [446]:
def expand_string_to_df(label):
    """Split a 'chrom:start-end(strand)' label into its four fields.

    The label is parsed from the right, so chromosome/contig names that
    themselves contain ':' or '-' are handled correctly.

    Returns
    -------
    tuple of str
        (chrom, start, end, strand); start and end are returned as strings.

    Raises
    ------
    ValueError
        If the label does not have the 'chrom:start-end(strand)' shape.
    """
    # Everything after the last '(' is the strand plus the closing ')'.
    coordinates, paren, strand_part = label.rpartition('(')
    # The last ':' separates the chromosome name from the 'start-end' span.
    chrom, colon, span = coordinates.rpartition(':')
    start, dash, end = span.partition('-')

    if not (paren and colon and dash and strand_part.endswith(')')):
        raise ValueError('Malformed region label: {!r}'.format(label))

    strand = strand_part[:-1]
    return chrom, start, end, strand


# Every column of a shuffle table is one complete random instance; expand its
# labels into a chrom/start/end/strand table.
file_id_to_list_of_random_instance_dfs = {}

for file_id, shuffle_df in file_id_to_shuffle_dfs.items():
    print(file_id)
    n_instances = len(shuffle_df.columns)
    random_instance_dfs = []

    for random_instance_index in shuffle_df.columns:
        print('{}/{}'.format(random_instance_index, n_instances))
        parsed_labels = shuffle_df[random_instance_index].apply(expand_string_to_df)
        # zip(*) regroups the per-label tuples by field; .T puts labels back on rows.
        random_instance_df = pd.DataFrame(zip(*parsed_labels)).T
        random_instance_df.columns = ['chrom', 'start', 'end', 'strand']
        random_instance_dfs.append(random_instance_df)

    file_id_to_list_of_random_instance_dfs[file_id] = random_instance_dfs

RBFOX2_ai_FOX2-8e_cleaned
0/20
1/20
2/20
3/20
4/20
5/20
6/20
7/20
8/20
9/20
10/20
11/20
12/20
13/20
14/20
15/20
16/20
17/20
18/20
19/20
RBFOX2_ai_FOX2-APOBEC1_cleaned
0/20
1/20
2/20
3/20
4/20
5/20
6/20
7/20
8/20
9/20
10/20
11/20
12/20
13/20
14/20
15/20
16/20
17/20
18/20
19/20
RBFOX2_ai_FOX2-MinA_cleaned
0/20
1/20
2/20
3/20
4/20
5/20
6/20
7/20
8/20
9/20
10/20
11/20
12/20
13/20
14/20
15/20
16/20
17/20
18/20
19/20
RBFOX2_ai_FOX2-P0078_cleaned
0/20
1/20
2/20
3/20
4/20
5/20
6/20
7/20
8/20
9/20
10/20
11/20
12/20
13/20
14/20
15/20
16/20
17/20
18/20
19/20
RBFOX2_both_FOX2-P0078_cleaned
0/20
1/20
2/20
3/20
4/20
5/20
6/20
7/20
8/20
9/20
10/20
11/20
12/20
13/20
14/20
15/20
16/20
17/20
18/20
19/20
RBFOX2_ct_FOX2-8e_cleaned
0/20
1/20
2/20
3/20
4/20
5/20
6/20
7/20
8/20
9/20
10/20
11/20
12/20
13/20
14/20
15/20
16/20
17/20
18/20
19/20
RBFOX2_ct_FOX2-APOBEC1_cleaned
0/20
1/20
2/20
3/20
4/20
5/20
6/20
7/20
8/20
9/20
10/20
11/20
12/20
13/20
14/20
15/20
16/20
17/20
18/20
19/20
RBFOX2_ct_FOX2-MinA_cleaned


# Make directories into which to put shuffles

In [447]:
import os

# Write every random instance to <shuffle root>/<file_id>/<file_id>_shuffle<i>.tsv
for file_id, list_of_random_instance_dfs in file_id_to_list_of_random_instance_dfs.items():
    print(file_id)
    dir_name = '/projects/ps-yeolab3/ekofman/Hugo/Full_RBFOX2_and_SLBP/7_shuffled_confident_peaks/{}'.format(file_id)

    # exist_ok=True makes re-runs quiet, while real failures (e.g. missing
    # permissions) surface here instead of being printed and then failing
    # again at the first write below.
    os.makedirs(dir_name, exist_ok=True)

    # Save beds
    for i, df in enumerate(list_of_random_instance_dfs):
        df.to_csv('{}/{}_shuffle{}.tsv'.format(dir_name, file_id, i), index=False, sep='\t')

RBFOX2_ai_FOX2-8e_cleaned
RBFOX2_ai_FOX2-APOBEC1_cleaned
RBFOX2_ai_FOX2-MinA_cleaned
RBFOX2_ai_FOX2-P0078_cleaned
RBFOX2_both_FOX2-P0078_cleaned
RBFOX2_ct_FOX2-8e_cleaned
RBFOX2_ct_FOX2-APOBEC1_cleaned
RBFOX2_ct_FOX2-MinA_cleaned
RBFOX2_ct_FOX2-P0078_cleaned
SLBP_ai_SLBP-8e_cleaned
SLBP_ai_SLBP-APOBEC1_cleaned
SLBP_ai_SLBP-MinA_cleaned
SLBP_ai_SLBP-P0078_cleaned
SLBP_both_SLBP-P0078_cleaned
SLBP_ct_SLBP-8e_cleaned
SLBP_ct_SLBP-APOBEC1_cleaned
SLBP_ct_SLBP-MinA_cleaned
SLBP_ct_SLBP-P0078_cleaned


In [448]:
# List everything currently present under the shuffle output directory.
sorted(glob('/projects/ps-yeolab3/ekofman/Hugo/Full_RBFOX2_and_SLBP/7_shuffled_confident_peaks/*'))

['/projects/ps-yeolab3/ekofman/Hugo/Full_RBFOX2_and_SLBP/7_shuffled_confident_peaks/FOX_motif_presence_in_shuffles.tsv',
 '/projects/ps-yeolab3/ekofman/Hugo/Full_RBFOX2_and_SLBP/7_shuffled_confident_peaks/RBFOX2_ai_8e_alone',
 '/projects/ps-yeolab3/ekofman/Hugo/Full_RBFOX2_and_SLBP/7_shuffled_confident_peaks/RBFOX2_ai_APOBEC1_alone',
 '/projects/ps-yeolab3/ekofman/Hugo/Full_RBFOX2_and_SLBP/7_shuffled_confident_peaks/RBFOX2_ai_FOX2-8e',
 '/projects/ps-yeolab3/ekofman/Hugo/Full_RBFOX2_and_SLBP/7_shuffled_confident_peaks/RBFOX2_ai_FOX2-8e_cleaned',
 '/projects/ps-yeolab3/ekofman/Hugo/Full_RBFOX2_and_SLBP/7_shuffled_confident_peaks/RBFOX2_ai_FOX2-APOBEC1',
 '/projects/ps-yeolab3/ekofman/Hugo/Full_RBFOX2_and_SLBP/7_shuffled_confident_peaks/RBFOX2_ai_FOX2-APOBEC1_cleaned',
 '/projects/ps-yeolab3/ekofman/Hugo/Full_RBFOX2_and_SLBP/7_shuffled_confident_peaks/RBFOX2_ai_FOX2-MinA',
 '/projects/ps-yeolab3/ekofman/Hugo/Full_RBFOX2_and_SLBP/7_shuffled_confident_peaks/RBFOX2_ai_FOX2-MinA_cleaned',
 '

# Add sequences and motif presence

In [449]:
# Collect the raw shuffle tables, leaving out files whose path marks them as
# derived outputs ('with_sequence' / 'motif_presence').
shuffled_peak_filepaths = []
for f in glob('/projects/ps-yeolab3/ekofman/Hugo/Full_RBFOX2_and_SLBP/7_shuffled_confident_peaks/*/*cleaned*'):
    if 'with_sequence' in f or 'motif_presence' in f:
        continue
    shuffled_peak_filepaths.append(f)
len(shuffled_peak_filepaths)

360

In [450]:
# Peek at the first three shuffle file paths (glob order, not sorted).
shuffled_peak_filepaths[0:3]

['/projects/ps-yeolab3/ekofman/Hugo/Full_RBFOX2_and_SLBP/7_shuffled_confident_peaks/RBFOX2_ai_FOX2-MinA_cleaned/RBFOX2_ai_FOX2-MinA_cleaned_shuffle0.tsv',
 '/projects/ps-yeolab3/ekofman/Hugo/Full_RBFOX2_and_SLBP/7_shuffled_confident_peaks/RBFOX2_ai_FOX2-MinA_cleaned/RBFOX2_ai_FOX2-MinA_cleaned_shuffle6.tsv',
 '/projects/ps-yeolab3/ekofman/Hugo/Full_RBFOX2_and_SLBP/7_shuffled_confident_peaks/RBFOX2_ai_FOX2-MinA_cleaned/RBFOX2_ai_FOX2-MinA_cleaned_shuffle7.tsv']

In [451]:
from pyfaidx import Fasta
import re
# Reference genome FASTA used for the sequence lookups in get_sequence /
# get_extended_sequence.
fasta = '/projects/ps-yeolab3/ekofman/ReferenceData/hg38/cellranger-GRCh38-3.0.0/fasta/genome.fa'
# NOTE(review): rebuild=False presumably keeps an existing .fai index rather
# than regenerating it — confirm against the pyfaidx documentation.
FA = Fasta(fasta, rebuild=False)
import math

def get_sequence(r):
    """Return the upper-cased reference sequence spanning one peak.

    `r` must expose `chrom`, `start` and `end`. The sequence is read from the
    module-level `FA` handle by slicing `FA[chrom][start:end]`; the row's
    strand is not consulted, so the result is always as stored in the FASTA.
    """
    return FA[str(r.chrom)][r.start:r.end].seq.upper()

def get_extended_sequence(r, flank=150):
    """Return the upper-cased reference sequence around a peak's midpoint.

    The window spans `flank` bases on each side of the midpoint of
    [r.start, r.end) (default 150, i.e. a 300-base window). The row's strand
    is not consulted, so the result is always as stored in the FASTA.

    The window start is clamped at 0: a negative slice start is not a valid
    genomic coordinate and would otherwise be handed to the FASTA slice
    as-is. NOTE(review): pyfaidx appears to treat negative indices relative
    to the sequence end — confirm. For such peaks the returned window is
    shorter than 2 * flank and no longer centred on the midpoint.
    """
    chrom = str(r.chrom)

    midpoint = r.start + (int((r.end - r.start)/2))
    start = max(0, midpoint - flank)
    end = midpoint + flank

    return FA[chrom][start:end].seq.upper()


In [452]:

# Add the peak sequence and the extended (midpoint-centred) sequence to every
# shuffle table and write the result next to the input file.
for peak_filepath in sorted(shuffled_peak_filepaths):
    print('Processing {}...'.format(peak_filepath))
    peak_df = pd.read_csv(peak_filepath, sep='\t')

    # os.path.split avoids both the doubled '/' and the fragility of
    # splitting the full path on the file name.
    folder, filename = os.path.split(peak_filepath)
    print('\tfilename is {}'.format(filename))
    print('\tfolder is {}'.format(folder))

    # Same stem as the input, with the extension swapped for '.with_sequence.bed'.
    output_filepath = os.path.join(folder, '{}.with_sequence.bed'.format(os.path.splitext(filename)[0]))
    print(output_filepath)

    peak_df['sequence(+)'] = peak_df.apply(get_sequence, axis=1)
    peak_df['extended_sequence(+)'] = peak_df.apply(get_extended_sequence, axis=1)

    print('\tAssigned sequences')
    peak_df.to_csv(output_filepath, sep='\t', index=False)
    print('\tOutput file at {}'.format(output_filepath))

Processing /projects/ps-yeolab3/ekofman/Hugo/Full_RBFOX2_and_SLBP/7_shuffled_confident_peaks/RBFOX2_ai_FOX2-8e_cleaned/RBFOX2_ai_FOX2-8e_cleaned_shuffle0.tsv...
	filename is RBFOX2_ai_FOX2-8e_cleaned_shuffle0.tsv
	folder is /projects/ps-yeolab3/ekofman/Hugo/Full_RBFOX2_and_SLBP/7_shuffled_confident_peaks/RBFOX2_ai_FOX2-8e_cleaned/
/projects/ps-yeolab3/ekofman/Hugo/Full_RBFOX2_and_SLBP/7_shuffled_confident_peaks/RBFOX2_ai_FOX2-8e_cleaned//RBFOX2_ai_FOX2-8e_cleaned_shuffle0.with_sequence.bed
	Assigned sequences
	Output file at /projects/ps-yeolab3/ekofman/Hugo/Full_RBFOX2_and_SLBP/7_shuffled_confident_peaks/RBFOX2_ai_FOX2-8e_cleaned//RBFOX2_ai_FOX2-8e_cleaned_shuffle0.with_sequence.bed
Processing /projects/ps-yeolab3/ekofman/Hugo/Full_RBFOX2_and_SLBP/7_shuffled_confident_peaks/RBFOX2_ai_FOX2-8e_cleaned/RBFOX2_ai_FOX2-8e_cleaned_shuffle1.tsv...
	filename is RBFOX2_ai_FOX2-8e_cleaned_shuffle1.tsv
	folder is /projects/ps-yeolab3/ekofman/Hugo/Full_RBFOX2_and_SLBP/7_shuffled_confident_peaks/R

# Determine motif presence in shuffled sequence-assigned .bed files 

In [453]:


complement = {'A': 'T', 'C': 'G', 'G': 'C', 'T': 'A'}
def reverse_complement(seq):
    """Return the reverse complement of `seq`.

    Characters missing from `complement` (anything other than upper-case
    A/C/G/T) are carried over unchanged.
    """
    swapped = []
    for base in reversed(seq):
        swapped.append(complement.get(base, base))
    return "".join(swapped)

def reverse(seq):
    """Return `seq` read back to front (no complementing)."""
    back_to_front = slice(None, None, -1)
    return seq[back_to_front]

def motif_present(sequence, motif, strand):
    """Report whether `motif` occurs in `sequence` for the given strand.

    For '+' the motif itself is searched; for '-' its reverse complement is
    searched instead. Any other strand value yields None.
    """
    if strand == '-':
        return reverse_complement(motif) in sequence
    if strand == '+':
        return motif in sequence
    return None

def motif_distance_from_center(sequence, motif, strand):
    """Return a signed offset of the motif occurrence nearest the sequence centre.

    For strand '+' the motif itself is searched; for '-' its reverse
    complement is searched. The sequence is cut into a first half
    (`sequence[0:midpoint]`) and a second half that begins `len(motif)`
    characters before the midpoint, so the two windows overlap. An offset is
    computed in each half where the search term occurs, and the one with the
    smaller absolute value is returned (the second-half value wins ties).
    First-half offsets are zero or negative.

    Returns None when the search term does not occur in `sequence`.

    NOTE(review): for any strand other than '+' or '-', `search_term` is never
    assigned and the membership test below raises UnboundLocalError.
    """
    #print(sequence)
    if strand == '+':
        search_term = motif
    elif strand == '-':
        search_term = reverse_complement(motif)
    
    if search_term in sequence:
        # 1000 acts as a "not found in this half" sentinel.
        # NOTE(review): this presumes real offsets are always smaller than 1000 — confirm for long inputs.
        closest_position_in_first_half = 1000
        closest_position_in_second_half = 1000
        
        midpoint = math.ceil(len(sequence)/2)
        #print('\tmidpoint is {}'.format(midpoint))
        # The second window starts len(motif) characters before the midpoint.
        start_of_second_half = midpoint-len(motif)

        sequence_first_half = sequence[0:midpoint]
        sequence_second_half = sequence[start_of_second_half:]

        #print('\tSplit:', sequence_first_half, sequence_second_half)
        if search_term in sequence_first_half:
            # Searching the reversed half for the reversed term finds the
            # occurrence closest to the end of the first half.
            flipped_sequence = reverse(sequence_first_half)
            flipped_search_term = reverse(search_term)
            
            #print('searching for {} in {}'.format(flipped_search_term, flipped_sequence))
            # Negated index in the reversed half, shifted by half the motif length.
            closest_position_in_first_half = - flipped_sequence.index(flipped_search_term) - math.floor(len(motif)/2)
            
        if search_term in sequence_second_half:
            # Index of the first occurrence in the second window, shifted by half the motif length.
            closest_position_in_second_half = sequence_second_half.index(search_term) - math.floor(len(motif)/2)

        # Return whichever offset has the smaller magnitude; ties go to the second half.
        if min(abs(closest_position_in_second_half), abs(closest_position_in_first_half)) == abs(closest_position_in_second_half):
            return closest_position_in_second_half
        else:
            return closest_position_in_first_half
    else:
        return None
    
# Motifs scored for presence. Only the first entry is active; the remaining
# candidates are kept on the line as a trailing comment.
motifs = ['TGCATG']#, 'GAATG', 'GTTTG', 'GTGTG', 'GTATG', 'GCTTG', 'GCCTG']

def add_sequence_presences(r, window_size=70):
    """Score one row for motif presence and distance from the window centre.

    Reads `extended_sequence(+)`, `strand` and `sequence(+)` from the row.
    Returns a pair: presence of the first entry of `motifs` in the peak
    sequence, and the 'TGCATG' offset computed on the extended sequence.
    `window_size` is accepted but not used.
    """
    extended_sequence = r['extended_sequence(+)']
    row_strand = r.strand
    center_offset = motif_distance_from_center(extended_sequence, 'TGCATG', row_strand)

    peak_sequence = r['sequence(+)']
    presence_by_motif = {}
    for candidate in motifs:
        presence_by_motif[candidate] = motif_present(peak_sequence, candidate, row_strand)

    return presence_by_motif.get(motifs[0]), center_offset

def calculate_fractions(p_df, motif_list=None):
    """Count, per motif, the rows where the motif is present and their share.

    Parameters
    ----------
    p_df : table with one boolean-like column per motif.
    motif_list : motifs (column names) to summarise; defaults to the
        module-level `motifs` list.

    Returns
    -------
    (fractions_dict, counts_dict)
        Two dicts keyed by motif. For an empty table every fraction is NaN
        rather than a division by zero.
    """
    if motif_list is None:
        motif_list = motifs

    n_rows = len(p_df)
    fractions_dict = {}
    counts_dict = {}

    for motif in motif_list:
        motif_present_count = p_df[motif].sum()
        counts_dict[motif] = motif_present_count
        # Guard the empty table explicitly so the result is a well-defined NaN.
        fractions_dict[motif] = motif_present_count / n_rows if n_rows else float('nan')

    return fractions_dict, counts_dict

### tests
# Ad-hoc checks of the helpers above; the values in the trailing comments are
# the ones recorded in this cell's output.
print(motif_present('TGCATG', 'TGCATG', '+'))  # True
print(motif_present('TGCATG', 'TGCATG', '-'))  # False
print(motif_present('TGCATG', 'CATGCA', '-'))  # True

print(reverse('GCATG'))  # GTACG

print(motif_distance_from_center('GCATGZZZZZZZZZZ', 'GCATG', '+'))  # -5
print(motif_distance_from_center('ZZZZZGCATGZZZZZ', 'GCATG', '+'))  # 0
print(motif_distance_from_center('ZZZZZZGCATGZZZZ', 'GCATG', '+'))  # 1
print(motif_distance_from_center('ZZZZZZZGCATGZZZ', 'GCATG', '+'))  # 2
print(motif_distance_from_center('ZZZZZZZZZZGCATG', 'GCATG', '+'))  # 5
print(motif_distance_from_center('GCATGZGCATGZZZZZ', 'BAB', '+'))  # None (motif absent)
print(motif_distance_from_center('ZZZGCATGZZZGCATG', 'GCATG', '+'))  # -2
print(motif_distance_from_center('AAGCATGCATGCATGCATGCATGA', 'GCATG', '+'))  # 1



True
False
True
GTACG
-5
0
1
2
5
None
-2
1


In [454]:
# Collect the sequence-annotated shuffle tables ('*with_sequence.bed') from every sample folder.
shuffled_peak_with_sequence_filepaths = glob('/projects/ps-yeolab3/ekofman/Hugo/Full_RBFOX2_and_SLBP/7_shuffled_confident_peaks/*/*cleaned*with_sequence.bed')




In [455]:
# Number of sequence-annotated shuffle tables found.
len(shuffled_peak_with_sequence_filepaths)

360

In [456]:
# Per-sample results are gathered in plain dicts and turned into frames once
# at the end, instead of re-joining a growing frame on every iteration.
fractions_by_sample = {}
counts_by_sample = {}

num_peak_per_sample_id = {}

for filepath in shuffled_peak_with_sequence_filepaths:
    # Extract sample ID from filepath
    folder, basename = os.path.split(filepath)
    sample_id = basename.split('.with_sequence.bed')[0]

    # Samples whose id contains 'SLBP' are skipped.
    if 'SLBP' in sample_id:
        continue

    print('\t', folder, sample_id)

    output_filename = os.path.join(folder, '{}.motif_presence.bed'.format(sample_id))
    print('\t\t', output_filename)

    print('\t...Reading...')
    p_df = pd.read_csv(filepath, sep='\t')
    num_peak_per_sample_id[sample_id] = len(p_df)

    # Calculate sequence presence: each row yields (presence, distance); zip(*)
    # regroups those pairs into the two new columns.
    p_df[motifs[0]], p_df['TGCATG_dist_from_center'] = zip(*p_df.apply(add_sequence_presences, axis=1))

    print('\tOutputting file {}'.format(output_filename))
    p_df.to_csv(output_filename, sep='\t', index=False, header=True)

    fractions_by_sample[sample_id], counts_by_sample[sample_id] = calculate_fractions(p_df)
    print('\t', len(fractions_by_sample))

# Rows are motifs, columns are sample ids (in processing order).
all_fractions = pd.DataFrame(fractions_by_sample)
all_counts = pd.DataFrame(counts_by_sample)

# NOTE(review): this path is relative to the working directory, unlike the
# absolute paths used elsewhere in the notebook — confirm that is intended.
all_fractions[sorted(all_fractions.columns)].to_csv('7_shuffled_confident_peaks/FOX_cleaned_motif_presence_in_shuffles.tsv', sep='\t')



	 /projects/ps-yeolab3/ekofman/Hugo/Full_RBFOX2_and_SLBP/7_shuffled_confident_peaks/RBFOX2_ai_FOX2-MinA_cleaned/ RBFOX2_ai_FOX2-MinA_cleaned_shuffle7
		 /projects/ps-yeolab3/ekofman/Hugo/Full_RBFOX2_and_SLBP/7_shuffled_confident_peaks/RBFOX2_ai_FOX2-MinA_cleaned//RBFOX2_ai_FOX2-MinA_cleaned_shuffle7.motif_presence.bed
	...Reading...
	Outputting file /projects/ps-yeolab3/ekofman/Hugo/Full_RBFOX2_and_SLBP/7_shuffled_confident_peaks/RBFOX2_ai_FOX2-MinA_cleaned//RBFOX2_ai_FOX2-MinA_cleaned_shuffle7.motif_presence.bed
	 1
	 /projects/ps-yeolab3/ekofman/Hugo/Full_RBFOX2_and_SLBP/7_shuffled_confident_peaks/RBFOX2_ai_FOX2-MinA_cleaned/ RBFOX2_ai_FOX2-MinA_cleaned_shuffle15
		 /projects/ps-yeolab3/ekofman/Hugo/Full_RBFOX2_and_SLBP/7_shuffled_confident_peaks/RBFOX2_ai_FOX2-MinA_cleaned//RBFOX2_ai_FOX2-MinA_cleaned_shuffle15.motif_presence.bed
	...Reading...
	Outputting file /projects/ps-yeolab3/ekofman/Hugo/Full_RBFOX2_and_SLBP/7_shuffled_confident_peaks/RBFOX2_ai_FOX2-MinA_cleaned//RBFOX2_ai_FO

In [457]:
# Display the motif-fraction table (one row per motif, one column per shuffle).
all_fractions

Unnamed: 0,RBFOX2_ai_FOX2-MinA_cleaned_shuffle7,RBFOX2_ai_FOX2-MinA_cleaned_shuffle15,RBFOX2_ai_FOX2-MinA_cleaned_shuffle19,RBFOX2_ai_FOX2-MinA_cleaned_shuffle4,RBFOX2_ai_FOX2-MinA_cleaned_shuffle0,RBFOX2_ai_FOX2-MinA_cleaned_shuffle18,RBFOX2_ai_FOX2-MinA_cleaned_shuffle10,RBFOX2_ai_FOX2-MinA_cleaned_shuffle16,RBFOX2_ai_FOX2-MinA_cleaned_shuffle2,RBFOX2_ai_FOX2-MinA_cleaned_shuffle5,...,RBFOX2_ai_FOX2-P0078_cleaned_shuffle16,RBFOX2_ai_FOX2-P0078_cleaned_shuffle4,RBFOX2_ai_FOX2-P0078_cleaned_shuffle2,RBFOX2_ai_FOX2-P0078_cleaned_shuffle13,RBFOX2_ai_FOX2-P0078_cleaned_shuffle9,RBFOX2_ai_FOX2-P0078_cleaned_shuffle7,RBFOX2_ai_FOX2-P0078_cleaned_shuffle18,RBFOX2_ai_FOX2-P0078_cleaned_shuffle11,RBFOX2_ai_FOX2-P0078_cleaned_shuffle8,RBFOX2_ai_FOX2-P0078_cleaned_shuffle6
TGCATG,0.047814,0.045082,0.062842,0.056011,0.051913,0.051913,0.054645,0.056011,0.038251,0.043716,...,0.035582,0.034653,0.031869,0.033106,0.034035,0.032488,0.035891,0.035582,0.034653,0.029084


In [433]:
# NOTE(review): this cell's execution counter (433) is lower than that of the
# cells above, so it was last run before them — re-run it at the end of a session.
# pybedtools.cleanup() is documented to remove the temp files pybedtools created.
pybedtools.cleanup()