# HOMER for finding motifs enriched at TAD boundaries

In [2]:
#module load homer/4.11

# 80% two tools overlap boudaries motif enrichment

### Input files:<br>
/scratch/ak8725/az_mrg/boundaries_unique_5kb_80_homer.bed<br>
/scratch/ak8725/az_mrg/boundaries_unique_2kb_80_homer.bed<br>

Genome file:<br>
/scratch/ak8725/az_mrg/azucena.fna

See presentation "Micro-C analysis April 2023"

findMotifsGenome.pl \<input\> /scratch/ak8725/az_mrg/azucena.fna homer -size given -len 10 -p 4

## Unmasked genome file used for background

### Output files are in /scratch/ak8725/HOMER<br>
homer_5kb_80<br>
homer_cotton_5kb_80<br>
homer_2kb_80<br>
homer_cotton_2kb_80

Parameters used:
1. -len 10 -size given
2. "cotton": -len 8,10,12 (default) -size 200 #from cotton paper

## Masked genome used as background (azucena - boundaries_80_2kb - boundaries_80_5kb)
### Creating background .bed file with 50000 5kb sequences

In [1]:
import random

# Read in the boundaries1.bed and boundaries2.bed files and store the coordinates in a list of tuples
boundaries = []
with open('/scratch/ak8725/az_mrg/boundaries_unique_5kb_80_homer.bed', 'r') as f:
    for line in f:
        fields = line.strip().split('\t')
        chrom = fields[0]
        start = int(fields[1])
        end = int(fields[2])
        boundaries.append((chrom, start, end))
        
with open('/scratch/ak8725/az_mrg/boundaries_unique_2kb_80_homer.bed', 'r') as f:
    for line in f:
        fields = line.strip().split('\t')
        chrom = fields[0]
        start = int(fields[1])
        end = int(fields[2])
        boundaries.append((chrom, start, end))

# Get the length of each chromosome
chrom_lengths = {}
with open('/scratch/ak8725/az_mrg/azucena.fna', 'r') as f:
    chrom = None
    for line in f:
        if line.startswith('>'):
            chrom = line.strip()[1:]
            chrom_lengths[chrom] = 0
        else:
            chrom_lengths[chrom] += len(line.strip())

# Generate a list of possible 5kb sequences that don't overlap with any of the boundaries
possible_seqs = []
for chrom, length in chrom_lengths.items():
    for i in range(length // 5000):
        start = i * 5000
        end = (i + 1) * 5000
        if all((chrom, start, end) not in b for b in boundaries):
            possible_seqs.append((chrom, start, end))

# Choose 5kb sequences randomly from the list of possible sequences
num_seqs = 50000 # set the number of sequences you want
chosen_seqs = random.sample(possible_seqs, num_seqs)

# Write the chosen sequences to a new .bed file
with open('homer_background_seqs.bed', 'w') as f:
    for seq in chosen_seqs:
        f.write('\t'.join([seq[0], str(seq[1]), str(seq[2])]) + '\n')

In [28]:
head /scratch/ak8725/az_mrg/HOMER/homer_background_seqs.bed

chr02	24030000	24035000
chr12	18525000	18530000
chr05	18180000	18185000
chr02	12540000	12545000
chr02	20740000	20745000
chr09	20465000	20470000
chr01	41380000	41385000
chr11	2965000	2970000
chr01	17955000	17960000
chr09	3210000	3215000


Now, use these background sequences to run Homer with two sets of parameters
1. -len 10 -size given
2. "cotton": -len 8,10,12 (default) -size 200 #from cotton paper

## boundaries_unique_2kb_80_homer.bed

## boundaries_unique_5kb_80_homer.bed

1 
findMotifsGenome.pl \<input\> /scratch/ak8725/az_mrg/azucena.fna -size given -len 10 -p 8 -bg /scratch/ak8725/az_mrg/HOMER/homer_background_seqs.bed \

2.
findMotifsGenome.pl \<input\> /scratch/ak8725/az_mrg/azucena.fna -size given -size 200 -p 8 -bg /scratch/ak8725/az_mrg/HOMER/homer_background_seqs.bed \

### Output files are in /scratch/ak8725/HOMER<br>
homer_5kb_80_bg<br>
homer_cotton_5kb_80_bg<br>
homer_2kb_80_bg<br>
homer_cotton_2kb_80_bg

In [32]:
sbatch homer5-cotton.sh
sbatch homer2-cotton.sh

Submitted batch job 31939715
Submitted batch job 31939716


# HiCExplorer boundaries motif enrichment

Input files: <br>
az_mrg/hicFindTADs/hicFindTADs2_out/az_1kb_boundaries.bed
az_mrg/hicFindTADs/hicFindTADs1_out/az_2kb_boundaries.bed
az_mrg/hicFindTADs/hicFindTADs13_out/az_5kb_boundaries.bed

### Creating backgorund bed file with 50000 5kb sequences from masked Azucena genome (no 1kb/2kb/5kb boundaries)

In [4]:
import random

# Read in the boundaries1.bed and boundaries2.bed files and store the coordinates in a list of tuples
boundaries = []
with open('/scratch/ak8725/az_mrg/hicFindTADs/hicFindTADs2_out/az_1kb_boundaries.bed', 'r') as f:
    for line in f:
        fields = line.strip().split('\t')
        chrom = fields[0]
        start = int(fields[1])
        end = int(fields[2])
        boundaries.append((chrom, start, end))
        
with open('/scratch/ak8725/az_mrg/hicFindTADs/hicFindTADs1_out/az_2kb_boundaries.bed', 'r') as f:
    for line in f:
        fields = line.strip().split('\t')
        chrom = fields[0]
        start = int(fields[1])
        end = int(fields[2])
        boundaries.append((chrom, start, end))

with open('/scratch/ak8725/az_mrg/hicFindTADs/hicFindTADs13_out/az_5kb_boundaries.bed', 'r') as f:
    for line in f:
        fields = line.strip().split('\t')
        chrom = fields[0]
        start = int(fields[1])
        end = int(fields[2])
        boundaries.append((chrom, start, end))

# Get the length of each chromosome
chrom_lengths = {}
with open('/scratch/ak8725/az_mrg/azucena.fna', 'r') as f:
    chrom = None
    for line in f:
        if line.startswith('>'):
            chrom = line.strip()[1:]
            chrom_lengths[chrom] = 0
        else:
            chrom_lengths[chrom] += len(line.strip())

# Generate a list of possible 1kb sequences that don't overlap with any of the boundaries
possible_seqs = []
for chrom, length in chrom_lengths.items():
    for i in range(length // 5000):
        start = i * 5000
        end = (i + 1) * 5000
        if all((chrom, start, end) not in b for b in boundaries):
            possible_seqs.append((chrom, start, end))

# Choose 5kb sequences randomly from the list of possible sequences
num_seqs = 50000 # set the number of sequences you want
chosen_seqs = random.sample(possible_seqs, num_seqs)

# Write the chosen sequences to a new .bed file
with open('/scratch/ak8725/az_mrg/HOMER/homer_hicexp_background_seqs.bed', 'w') as f:
    for seq in chosen_seqs:
        f.write('\t'.join([seq[0], str(seq[1]), str(seq[2])]) + '\n')

In [11]:
#background sequences for all HiCExplorer boundaries
!head /scratch/ak8725/az_mrg/HOMER/homer_hicexp_background_seqs.bed

chr01 	1785000	1790000
chr05 	13665000	13670000
chr09 	22560000	22565000
chr08 	13880000	13885000
chr12 	23175000	23180000
chr03 	11175000	11180000
chr01 	4010000	4015000
chr04 	16105000	16110000
chr12 	17005000	17010000
chr07 	23035000	23040000


In [12]:
!head ../az_mrg/hicFindTADs/hicFindTADs2_out/az_1kb_boundaries.bed

chr01	71500	72500	B00065	0.118303686824	.
chr01	83500	84500	B00077	-0.220582454764	.
chr01	174500	175500	B00151	0.068491605502	.
chr01	245500	246500	B00222	0.111921474297	.
chr01	302500	303500	B00270	-0.130352591732	.
chr01	316500	317500	B00284	0.025218289872	.
chr01	421500	422500	B00386	-0.126028470159	.
chr01	466500	467500	B00418	-0.191001059176	.
chr01	482500	483500	B00434	-0.203591428374	.
chr01	493500	494500	B00445	-0.072317785790	.


In [8]:
sbatch az_mrg/HOMER/homer.sh

sbatch: error: Unable to open file az_mrg/HOMER/homer.sh


: 1

Output folders:<br>
hicexp_1kb<br>
hicexp_2kb<br>
hicexp_5kb