In [1]:
import os
import numpy as np
import scipy as sp
import sklearn as sk
import pandas as pd
import operator
from operator import sub
from decimal import Decimal
import statistics
from statistics import mean

# Import a list of the LUTI genes

In [33]:

# Load the tab-separated LUTI gene table; later cells append columns to it
genes = pd.read_csv("luti_genes.txt", sep="\t")


# Extract the fold-enrichment-over-input values from the ChIP bigwig files

The values were extracted for the 50 bp upstream and the 500 bp downstream of the TSS of every gene, and specifically of the PROX promoter at loci with LUTI RNAs.

This step was performed in bash, outside the notebook, with the following command:

bash extract_bases_for_enrichment_analysis.sh

# Sum the extracted fold-enrichment values

In [34]:

# setup a function

def f(directory, target=None):
    """Sum the per-row fold-enrichment values of every ``.txt`` file in a folder.

    Each ``.txt`` file in ``directory`` is read as a header-less,
    tab-separated table. The 8th column (index 7) of every row is treated as
    a comma-separated list of numbers; the numbers are summed per row and the
    resulting list is stored in ``target`` as a new column named
    ``<file name up to the first '.'>_sum``.

    Parameters
    ----------
    directory : str
        Folder containing the extracted files. A trailing path separator is
        optional.
    target : pandas.DataFrame, optional
        Frame that receives the new columns. When omitted, the notebook-level
        ``genes`` frame is used, which is what the original one-argument
        call did.

    Returns
    -------
    None
        ``target`` is modified in place.

    Notes
    -----
    The sums are assigned positionally, so every file needs exactly one row
    per row of ``target`` (pandas raises ``ValueError`` otherwise).
    NOTE(review): this also assumes the rows of each file are in the same
    order as the rows of ``target`` -- confirm against the bash extraction.
    """
    if target is None:
        target = genes

    # Sorted so that the order of the new columns does not depend on the
    # arbitrary order in which the file system lists the directory.
    for filename in sorted(os.listdir(directory)):
        if not filename.endswith(".txt"):
            continue
        # os.path.join works with or without a trailing separator on
        # `directory`. Column 7 is forced to text so that a file whose rows
        # each hold a single value is not parsed as floats, which have no
        # .split().
        df = pd.read_csv(os.path.join(directory, filename), sep='\t',
                         header=None, dtype={7: str})
        lss = [np.sum([float(x) for x in cell.split(',')]) for cell in df[7]]
        target[filename.split('.')[0] + '_sum'] = lss
            


In [35]:

# Add one '<file>_sum' column to `genes` per extracted file: first the files in
# the up50 folder, then the files in the down500 folder

for directory in ("chromatin_mods_FE_files/up50/prox/",
                  "chromatin_mods_FE_files/down500/prox/"):
    f(directory)


# Work on a copy indexed by the 'parent' column so `genes` itself stays untouched
sums = genes.copy().set_index('parent')

# Calculating the fold change over the region between time points

In [43]:


# Per sample: ratio of the summed signal 50 bp upstream to the summed signal
# 500 bp downstream of the ORF TSS

# a sample prefix is the column name with the 12-character 'proxup50_sum' tail cut off
prefixes = [name[:-12] for name in genes.columns if 'proxup50' in name]
col1 = np.array([genes[prefix + 'proxup50_sum'] for prefix in prefixes])
col2 = np.array([genes[prefix + 'proxdown500_sum'] for prefix in prefixes])
new_names = [prefix + '-ratio' for prefix in prefixes]
for row, ratio_name in enumerate(new_names):
    # row `row` of col1/col2 holds the upstream/downstream sums of one sample
    sums[ratio_name] = col1[row] / col2[row]
    
    
# Fold change between time points: the 4h ratio divided by the matching 2h ratio

# every 2h sample is paired with the sample of the same name at 4h
prefixes_2h = [prefix for prefix in prefixes if '2h' in prefix]
prefixes_4h = [prefix.replace('2h', '4h') for prefix in prefixes_2h]
col3 = np.array([sums[prefix + '-ratio'] for prefix in prefixes_2h])
col4 = np.array([sums[prefix + '-ratio'] for prefix in prefixes_4h])
new_names = [prefix + 'foldchange' for prefix in prefixes_4h]
for row, fc_name in enumerate(new_names):
    # numerator is the 4h ratio (col4), denominator the 2h ratio (col3)
    sums[fc_name] = col4[row] / col3[row]
    
    
# Summarise the fold change per modification: mean and standard deviation
# across the per-sample fold-change columns, then write the table to disk

fc = sums.filter(regex='foldchange')
fc = fc.reindex(sorted(fc.columns), axis=1)

mods = ['H3K36me3', 'H3K4me2_4h_ab']

# Snapshot of the per-sample columns, taken before any summary column exists.
# The summary column names also contain the modification name, so filtering
# `fc` itself after adding '<mod>_mean_fold_change' would feed that mean back
# into the standard deviation and underestimate it.
replicate_fc = fc.copy()

for xyz in mods:
    replicates = replicate_fc.filter(regex=xyz)
    fc[xyz + '_mean_fold_change'] = replicates.mean(axis=1)
    fc[xyz + '_stdev'] = replicates.std(axis=1)

# Drop the per-sample columns, keeping only the summaries.
# NOTE(review): only columns whose name contains the literal '-foldchange' are
# removed, which presumes the sample prefixes end in '-' -- confirm against
# the file names in chromatin_mods_FE_files.
fc = fc.drop(columns=list(fc.filter(regex='-foldchange')))

fc.to_csv("mod_enrichment_rel_to_downstream.txt", sep="\t")

