In [210]:
import pandas as pd
import itertools as it
from collections import defaultdict,Counter
import numpy as np
import os
import csv
from mlbootstrap import bootstrap
import json

## Load Loop-Motif Intersection Data (Script 1)

This script is run once per sample and outputs a TSV of the results.

In [234]:
# Column layout of the loop/motif intersection table (the input file has no header row).
columns = [
    'loop_anchor1_chr', 'loop_anchor1_start', 'loop_anchor1_end',
    'loop_anchor2_chr', 'loop_anchor2_start', 'loop_anchor2_end',
    'motif_chr', 'motif_start', 'motif_end', 'motif_id', 'motif_name',
]

# Read the tab-separated table, then drop rows that are identical in every column.
# Rows that differ in any column are kept, so several motifs inside one anchor survive.
df = (
    pd.read_csv(
        r"C:\Users\Romeo Ignacio\Downloads\loops_overlap_motifs.txt",
        sep="\t",
        header=None,
        names=columns,
    )
    .drop_duplicates()
    .reset_index(drop=True)
)

In [235]:
# Flag, for every row, whether the motif interval lies entirely inside anchor 1 / anchor 2.
# Only start/end coordinates are compared; the chromosome columns are not checked here.
motif_in_anchor1 = (
    df['loop_anchor1_start'].le(df['motif_start'])
    & df['motif_end'].le(df['loop_anchor1_end'])
)
motif_in_anchor2 = (
    df['loop_anchor2_start'].le(df['motif_start'])
    & df['motif_end'].le(df['loop_anchor2_end'])
)

# Attach the flags (plus the constant anchor_id column), then remove rows that became identical.
df = (
    df
    .assign(anchor_1=motif_in_anchor1, anchor_2=motif_in_anchor2, anchor_id=1)
    .drop_duplicates()
    .reset_index(drop=True)
)

# Build string identifiers:
#   anchorN_id = "<anchorN chr>:<anchorN start>"
#   loop_id    = "<anchor1 chr>:<anchor1 start>:<anchor2 start>"
anchor1_start_str = df['loop_anchor1_start'].astype(str)
anchor2_start_str = df['loop_anchor2_start'].astype(str)
df = df.assign(
    anchor1_id=df['loop_anchor1_chr'] + ':' + anchor1_start_str,
    anchor2_id=df['loop_anchor2_chr'] + ':' + anchor2_start_str,
    loop_id=df['loop_anchor1_chr'] + ':' + anchor1_start_str + ':' + anchor2_start_str,
)

# Keep the first row for each (loop_id, anchor2_id, motif_id) combination.
# The index is intentionally left as-is (not reset) after this step.
df = df.drop_duplicates(subset=['loop_id', 'anchor2_id', 'motif_id'])

## Count Motif-Pairs in Our Dataset (script 1)

In [238]:


# Observed counts: for every loop, each (anchor-1 motif name, anchor-2 motif name)
# combination adds one to that pair's tally.
motif_pair_counter = Counter()

for loop_id, loop_df in df.groupby('loop_id'):
    # Motif names sitting in each anchor of this loop
    motifs1 = list(loop_df.loc[loop_df.anchor_1, 'motif_name'])
    motifs2 = list(loop_df.loc[loop_df.anchor_2, 'motif_name'])

    # A loop only contributes when both anchors contain at least one motif
    if not (motifs1 and motifs2):
        continue

    # Cartesian product of the two anchors' motifs; Counter.update adds 1 per occurrence
    motif_pair_counter.update(it.product(motifs1, motifs2))


In [239]:
# Write the observed count of each motif pair to its own folder:
#   <dirpath>/<folder_name>/simulations.txt
# The file holds a header row ('Motif_Pair', 'Observed') followed by one data row.
dirpath = r"H:\Genetics Project\Statistics\Pair_Files_Simu"

# Turns str(pair), e.g. "('KLF17', 'FLI1::FOXI1')", into a folder name such as
# "KLF17_FLI1..FOXI1": quotes, parentheses and commas are dropped, spaces become
# underscores and colons become dots.
folder_name_table = str.maketrans({"'": None, "(": None, ")": None, ",": None, " ": "_", ":": "."})

# t starts at 1 and the loop stops once it reaches 100, so at most 99 pairs are written.
t = 1
for pair, observed in motif_pair_counter.items():
    folder_name = str(pair).translate(folder_name_table)
    pair_dir = os.path.join(dirpath, folder_name)

    # exist_ok=True creates the folder when missing and is a no-op when it already exists,
    # avoiding the separate exists-then-create check.
    os.makedirs(pair_dir, exist_ok=True)

    # Save the observed count; mode 'w' overwrites any file left by a previous run.
    with open(os.path.join(pair_dir, 'simulations.txt'), 'w', newline='') as csv_file:
        writer = csv.writer(csv_file)
        writer.writerow(['Motif_Pair', 'Observed'])
        writer.writerow([pair, observed])

    t += 1
    print(pair)
    if t == 100:
        break

('ZNF768', 'RREB1')
('SP8', 'RREB1')
('SP9', 'RREB1')
('SP3', 'RREB1')
('PATZ1', 'RREB1')
('HES1', 'RREB1')
('KLF17', 'RREB1')
('MAZ', 'RREB1')
('ZNF189', 'RREB1')
('KLF17', 'EHF')
('KLF17', 'FLI1::FOXI1')
('KLF17', 'FOXJ2::ELF1')
('KLF17', 'FOXO1::ELF1')
('KLF17', 'FOXO1::ELK3')
('KLF17', 'FOXO1::FLI1')
('KLF17', 'ZNF449')
('KLF17', 'TFEB')
('KLF17', 'KLF4')
('KLF17', 'KLF5')
('KLF17', 'KLF15')
('KLF17', 'ZNF454')
('KLF17', 'CTCF')
('KLF17', 'CTCFL')
('KLF17', 'ZKSCAN5')
('KLF17', 'ZNF263')
('KLF17', 'ZNF281')
('KLF17', 'ZNF148')
('KLF17', 'SP5')
('KLF17', 'PRDM9')
('KLF17', 'NFIB')
('KLF17', 'ZNF530')
('KLF17', 'ZNF331')
('KLF17', 'REST')
('KLF17', 'MEF2A')
('KLF17', 'MEF2B')
('KLF17', 'MEF2D')
('KLF17', 'ESR2')
('KLF17', 'TFEC')
('KLF17', 'ZNF701')
('KLF17', 'MAZ')
('KLF17', 'ZNF320')
('KLF17', 'ZNF93')
('ZNF669', 'EHF')
('ZNF669', 'FLI1::FOXI1')
('ZNF669', 'FOXJ2::ELF1')
('ZNF669', 'FOXO1::ELF1')
('ZNF669', 'FOXO1::ELK3')
('ZNF669', 'FOXO1::FLI1')
('ZNF669', 'ZNF449')
('ZNF669', 'T

## Bootstrap Test (Genome-Wide) (Script 2)

Run once per simulation, using the results of Script 1 as input. The output is a column of data in which each entry is a simulated count.

In [298]:
## Inputs needed: the motif dataframe, the number of simulations, and the directory path.
# NOTE: bootstrap_df is an alias of df (not a copy), so the Random_Motif column
# written below is added to df as well.
bootstrap_df = df
sims = 2
dirpath = r"H:\Genetics Project\Statistics\Pair_Files_Simu"

# results[k] holds the Counter of motif-pair counts produced by simulation k+1.
results = []
# Not used in this cell; kept in case other cells reference them.
i = 0
all_loop_pairs = []

# The number of rows does not change between simulations, so compute it once.
n = len(bootstrap_df["motif_name"])

## Create simulations: the loop bound follows `sims` instead of a hard-coded value.
# NOTE(review): no random seed is set, so every run produces different draws.
for sim in range(1, sims + 1):
    # Draw n motif names with replacement from the observed motif_name column
    bootstrap_df["Random_Motif"] = np.random.choice(bootstrap_df['motif_name'], size=n, replace=True)

    # Pair tallies for this simulation
    sim_motif_pair_counter = Counter()

    # Anchor membership comes from the observed anchor_1 / anchor_2 flags;
    # only the motif names are randomized.
    for loop_id, loop_df in bootstrap_df.groupby('loop_id'):
        anchor1_df = loop_df.loc[loop_df.anchor_1 == True]
        anchor2_df = loop_df.loc[loop_df.anchor_2 == True]

        # Skip loops in which one of the anchors has no motif
        if anchor1_df.shape[0] == 0 or anchor2_df.shape[0] == 0:
            continue

        motifs1 = list(anchor1_df.Random_Motif)
        motifs2 = list(anchor2_df.Random_Motif)

        # Count every (anchor-1 motif, anchor-2 motif) combination once
        sim_motif_pair_counter.update(it.product(motifs1, motifs2))

    results.append(sim_motif_pair_counter)
    print(sim)

1
2


In [240]:
# Append one column per simulation ("sim0", "sim1", ...) to each pair's simulations.txt.
# Each file is read once, extended with all simulated counts, and written back once,
# instead of being re-read and rewritten for every simulation.
# NOTE(review): re-running this cell appends the same sim columns again; the files are
# only reset when the cell that writes the observed counts is re-run.
folder_name_table = str.maketrans({"'": None, "(": None, ")": None, ",": None, " ": "_", ":": "."})

# t starts at 1 and the loop stops once it reaches 100, so at most 99 pairs are updated.
t = 1
for pair in motif_pair_counter:
    # Same folder-name cleaning as used when the observed files were written
    folder_name = str(pair).translate(folder_name_table)
    sim_file = os.path.join(dirpath, folder_name, 'simulations.txt')

    with open(sim_file, 'r') as infile:
        data = list(csv.reader(infile))

    for sim in range(sims):
        # data[0] is the header row, data[1] the single row of counts
        data[0].append("sim" + str(sim))
        # A Counter returns 0 for pairs that never occurred in this simulation
        data[1].append(results[sim][pair])

    # Rewrite the file with the extended header and count rows
    with open(sim_file, 'w', newline='') as outfile:
        writer = csv.writer(outfile)
        writer.writerow(data[0])
        writer.writerow(data[1])

    t += 1
    print(t)
    if t == 100:
        break
            

2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100


## Aggregate the simulation data (Script 3)

In [303]:
## Loop through every pair folder and aggregate its simulations.txt into p_values.txt

sig_level = 0.95
dirpath = r"H:\Genetics Project\Statistics\Pair_Files_Simu/"
savepath = r"H:\Genetics Project\Statistics\Results"

outputpath = os.path.join(savepath, "p_values.txt")

# Mode 'w' truncates any previous results file, so no separate delete step is needed.
# The output stays open for the whole loop instead of being reopened for every folder.
with open(outputpath, 'w', newline='') as csv_file:
    writer = csv.writer(csv_file)
    writer.writerow(['Motif_Pair','Observed','P-value','Significant'])

    for folder_name in os.listdir(dirpath):
        print(folder_name)

        sim_file = os.path.join(dirpath, folder_name, 'simulations.txt')
        # Skip directory entries that do not hold a simulations file (e.g. stray files)
        if not os.path.isfile(sim_file):
            print("  skipped: no simulations.txt found")
            continue

        with open(sim_file, 'r') as infile:
            data = list(csv.reader(infile))

        # data[1] = [pair label, observed count, sim count, sim count, ...]
        listnum = [int(num) for num in data[1][1:]]
        # Without at least one simulated count the fraction below is undefined
        if len(listnum) < 2:
            print("  skipped: no simulated counts")
            continue

        observed = listnum[0]
        simulated = listnum[1:]
        # Local name on purpose: the global `sims` setting must not be overwritten here
        n_sims = len(simulated)

        # Fraction of simulated counts that do not exceed the observed count.
        # NOTE(review): this is written to the 'P-value' column and flagged significant
        # when it is >= sig_level, i.e. it behaves like an empirical percentile rather
        # than a conventional p-value - confirm this is the intended statistic.
        p_value = sum(1 for num in simulated if num <= observed) / n_sims
        sig = p_value >= sig_level

        # "Yes" when significant, "No" otherwise
        message = "Yes" if sig else "No"
        writer.writerow([data[1][0], data[1][1], p_value, message])



AR_CTCF
AR_CTCFL
AR_EHF
AR_FLI1..FOXI1
AR_FOXJ2..ELF1
AR_FOXO1..ELF1
AR_FOXO1..ELK3
AR_FOXO1..FLI1
AR_KLF15
AR_KLF4
AR_KLF5
AR_NFIB
AR_PRDM9
AR_SP5
AR_TFEB
AR_ZKSCAN5
AR_ZNF148
AR_ZNF263
AR_ZNF281
AR_ZNF331
AR_ZNF449
AR_ZNF454
AR_ZNF530
HES1_RREB1
KLF17_CTCF
KLF17_CTCFL
KLF17_EHF
KLF17_ESR2
KLF17_FLI1..FOXI1
KLF17_FOXJ2..ELF1
KLF17_FOXO1..ELF1
KLF17_FOXO1..ELK3
KLF17_FOXO1..FLI1
KLF17_KLF15
KLF17_KLF4
KLF17_KLF5
KLF17_MAZ
KLF17_MEF2A
KLF17_MEF2B
KLF17_MEF2D
KLF17_NFIB
KLF17_PRDM9
KLF17_REST
KLF17_RREB1
KLF17_SP5
KLF17_TFEB
KLF17_TFEC
KLF17_ZKSCAN5
KLF17_ZNF148
KLF17_ZNF263
KLF17_ZNF281
KLF17_ZNF320
KLF17_ZNF331
KLF17_ZNF449
KLF17_ZNF454
KLF17_ZNF530
KLF17_ZNF701
KLF17_ZNF93
MAZ_RREB1
PATZ1_RREB1
SP3_RREB1
SP8_RREB1
SP9_RREB1
ZNF189_RREB1
ZNF669_CTCF
ZNF669_CTCFL
ZNF669_EHF
ZNF669_ESR2
ZNF669_FLI1..FOXI1
ZNF669_FOXJ2..ELF1
ZNF669_FOXO1..ELF1
ZNF669_FOXO1..ELK3
ZNF669_FOXO1..FLI1
ZNF669_KLF15
ZNF669_KLF4
ZNF669_KLF5
ZNF669_MAZ
ZNF669_MEF2A
ZNF669_MEF2B
ZNF669_MEF2D
ZNF669_NFIB
ZNF669_PRD

In [302]:
# Number of simulated counts in the last file processed (listnum[0] is the observed count)
len(listnum[1:])

2