# STREME motif identification 

Example script:<br>
streme --p boundaries_unique_5kb_80_homer.fna --dna --oc streme_5kb_80% --n streme_background.fa

Create fasta file with background sequences (from entire genome)

It is recommended to use at least two to three times as many background sequences as target sequences. I have 3636 boundaries at 2kb (80%), 1818 at 5kb (80%). I will use 12,000 background sequences for 2kb, 6000 for 5kb.

Choosing 12,000 (for 2kb) and 6,000 (for 5kb) random sequences to create the background files:<br>
streme_background_2kb.fa<br>
streme_background_5kb.fa

In [26]:
import random

fasta_file = "/scratch/ak8725/az_mrg/azucena.fna"
output_file = "/scratch/ak8725/STREME/streme_background_5kb.fa"
num_fragments = 6000
fragment_size = 6000


def parse_fasta(lines):
    """Parse FASTA text into a list of ``(header, sequence)`` records.

    ``lines`` is any iterable of text lines (for example an open file).
    A header is the full ``>`` line with the leading ``>`` removed.  Each
    record is kept separate, so the header text of later records never ends
    up inside a sequence.  Blank lines, and sequence lines that appear
    before the first header, are ignored.
    """
    records = []
    header = None
    chunks = []
    for raw in lines:
        line = raw.strip()
        if not line:
            continue
        if line.startswith(">"):
            if header is not None:
                records.append((header, "".join(chunks)))
            header = line.lstrip(">")
            chunks = []
        elif header is not None:
            chunks.append(line)
    if header is not None:
        records.append((header, "".join(chunks)))
    return records


def sample_fragments(records, num_fragments, fragment_size):
    """Draw ``num_fragments`` random windows of exactly ``fragment_size`` bases.

    Records shorter than ``fragment_size`` are skipped.  Each draw picks a
    record with probability proportional to its number of valid start
    positions and then a uniform start inside it, so every window is equally
    likely and no window crosses a record boundary.  Sampling is with
    replacement: windows may overlap or repeat.

    Returns a list of ``(fragment_id, sequence)`` tuples whose ids have the
    form ``<header>_fragment_<n>`` with ``n`` counting from 1.

    Raises ``ValueError`` if ``fragment_size`` is not positive or no record
    is long enough to hold one window.
    """
    if fragment_size < 1:
        raise ValueError(f"fragment_size must be positive, got {fragment_size}")
    eligible = [(header, seq) for header, seq in records if len(seq) >= fragment_size]
    if not eligible:
        raise ValueError(
            f"no FASTA record is at least {fragment_size} bases long; cannot sample"
        )
    weights = [len(seq) - fragment_size + 1 for _, seq in eligible]
    picks = random.choices(eligible, weights=weights, k=num_fragments)
    fragments = []
    for index, (header, seq) in enumerate(picks, start=1):
        # randint includes both endpoints, so the last valid start is
        # len(seq) - fragment_size (not + 1, which gave short fragments).
        start = random.randint(0, len(seq) - fragment_size)
        fragments.append((f"{header}_fragment_{index}", seq[start:start + fragment_size]))
    return fragments


# In a notebook __name__ is "__main__", so the cell still runs as before;
# the guard only keeps the file I/O from firing when the helpers are imported.
if __name__ == "__main__":
    with open(fasta_file, "r") as f:
        records = parse_fasta(f)

    fragments = sample_fragments(records, num_fragments, fragment_size)

    with open(output_file, "w") as f:
        for fragment_id, fragment_seq in fragments:
            f.write(f">{fragment_id}\n{fragment_seq}\n")

In [27]:
import random

fasta_file = "/scratch/ak8725/az_mrg/azucena.fna"
output_file = "/scratch/ak8725/STREME/streme_background_2kb.fa"
num_fragments = 12000
fragment_size = 6000


def parse_fasta(lines):
    """Parse FASTA text into a list of ``(header, sequence)`` records.

    ``lines`` is any iterable of text lines (for example an open file).
    A header is the full ``>`` line with the leading ``>`` removed.  Each
    record is kept separate, so the header text of later records never ends
    up inside a sequence.  Blank lines, and sequence lines that appear
    before the first header, are ignored.
    """
    records = []
    header = None
    chunks = []
    for raw in lines:
        line = raw.strip()
        if not line:
            continue
        if line.startswith(">"):
            if header is not None:
                records.append((header, "".join(chunks)))
            header = line.lstrip(">")
            chunks = []
        elif header is not None:
            chunks.append(line)
    if header is not None:
        records.append((header, "".join(chunks)))
    return records


def sample_fragments(records, num_fragments, fragment_size):
    """Draw ``num_fragments`` random windows of exactly ``fragment_size`` bases.

    Records shorter than ``fragment_size`` are skipped.  Each draw picks a
    record with probability proportional to its number of valid start
    positions and then a uniform start inside it, so every window is equally
    likely and no window crosses a record boundary.  Sampling is with
    replacement: windows may overlap or repeat.

    Returns a list of ``(fragment_id, sequence)`` tuples whose ids have the
    form ``<header>_fragment_<n>`` with ``n`` counting from 1.

    Raises ``ValueError`` if ``fragment_size`` is not positive or no record
    is long enough to hold one window.
    """
    if fragment_size < 1:
        raise ValueError(f"fragment_size must be positive, got {fragment_size}")
    eligible = [(header, seq) for header, seq in records if len(seq) >= fragment_size]
    if not eligible:
        raise ValueError(
            f"no FASTA record is at least {fragment_size} bases long; cannot sample"
        )
    weights = [len(seq) - fragment_size + 1 for _, seq in eligible]
    picks = random.choices(eligible, weights=weights, k=num_fragments)
    fragments = []
    for index, (header, seq) in enumerate(picks, start=1):
        # randint includes both endpoints, so the last valid start is
        # len(seq) - fragment_size (not + 1, which gave short fragments).
        start = random.randint(0, len(seq) - fragment_size)
        fragments.append((f"{header}_fragment_{index}", seq[start:start + fragment_size]))
    return fragments


# In a notebook __name__ is "__main__", so the cell still runs as before;
# the guard only keeps the file I/O from firing when the helpers are imported.
if __name__ == "__main__":
    with open(fasta_file, "r") as f:
        records = parse_fasta(f)

    fragments = sample_fragments(records, num_fragments, fragment_size)

    with open(output_file, "w") as f:
        for fragment_id, fragment_seq in fragments:
            f.write(f">{fragment_id}\n{fragment_seq}\n")

In [None]:
# Created FASTA files with the unique boundary sequences:
#   boundaries_unique_5kb_80_homer.fna
#   boundaries_unique_2kb_80_homer.fna

In [3]:
#prepare genome file (changing chromosome names to chr01)
CHROMOSOME_PREFIX = 'Oryza sativa Japonica Group cultivar Azucena chromosome '


def rename_header(line):
    """Return the rewritten FASTA header line for ``line`` (must start with '>').

    The long species prefix is replaced by ``chr`` on the whole header
    first, and only then is the header cut down to its first
    whitespace-delimited token.  The previous order (tokenise, then replace)
    could never match, because the prefix contains spaces and a single token
    does not.  Headers that do not contain the prefix simply keep their
    first token, exactly as before.

    NOTE(review): if the prefix is preceded by another token (e.g. an
    accession), that first token is still what gets kept - confirm the
    header layout of the input genome.
    """
    return line.strip().replace(CHROMOSOME_PREFIX, 'chr').split()[0] + '\n'


# In a notebook __name__ is "__main__", so the cell still runs as before;
# the guard only keeps the file I/O from firing when the helper is imported.
if __name__ == "__main__":
    with open('/scratch/ak8725/az_new/map_generation/azucena.fna', 'r') as fna_in, \
         open('/scratch/ak8725/az_mrg/azucena_chr.fna', 'w') as fna_out:
        for line in fna_in:
            # Header lines are shortened; sequence lines are copied unmodified.
            fna_out.write(rename_header(line) if line.startswith('>') else line)

In [13]:
#create an .fna file with boundary sequences 5kb, 80%


def load_genome(lines):
    """Parse FASTA lines into a ``{header: sequence}`` dict.

    The key is the full header line without its leading ``>``.  The whole
    genome is held in memory.  Blank lines and sequence lines that precede
    the first header are ignored, so an empty or oddly formatted file yields
    an empty/partial dict instead of a ``KeyError``.
    """
    chunks_by_name = {}
    current = None
    for raw in lines:
        line = raw.strip()
        if line.startswith('>'):
            current = line[1:]
            chunks_by_name[current] = []
        elif line and current is not None:
            chunks_by_name[current].append(line)
    return {name: ''.join(chunks) for name, chunks in chunks_by_name.items()}


def extract_regions(genome, bed_lines):
    """Yield ``(fasta_header, sequence)`` for every interval in a BED file.

    Columns used: chromosome, start, end, name (tab-separated; start/end are
    used directly as Python slice bounds).  The FASTA header is
    ``><chromosome>_<name>``.  Blank lines and ``#``/``track``/``browser``
    lines are skipped.  Intervals running past the end of a sequence are
    truncated by the slice, as before.

    Raises ``ValueError`` for rows with fewer than four columns or
    non-integer coordinates, and ``KeyError`` for an unknown chromosome.
    """
    for number, raw in enumerate(bed_lines, start=1):
        line = raw.strip()
        if not line or line.startswith(('#', 'track', 'browser')):
            continue
        fields = line.split('\t')
        if len(fields) < 4:
            raise ValueError(
                f'BED line {number}: expected at least 4 tab-separated columns, got {len(fields)}'
            )
        seq_id, fragment_id = fields[0], fields[3]
        start, end = int(fields[1]), int(fields[2])
        if seq_id not in genome:
            raise KeyError(f'BED line {number}: sequence {seq_id!r} not found in genome FASTA')
        yield f'>{seq_id}_{fragment_id}', genome[seq_id][start:end]


# In a notebook __name__ is "__main__", so the cell still runs as before;
# the guard only keeps the file I/O from firing when the helpers are imported.
if __name__ == "__main__":
    with open('/scratch/ak8725/az_mrg/azucena_chr.fna') as fna_file:
        genome = load_genome(fna_file)

    with open('/scratch/ak8725/az_mrg/boundaries_unique_5kb_80_homer.bed') as bed_file, \
         open('/scratch/ak8725/az_mrg/boundaries_unique_5kb_80_homer.fna', 'w') as out_file:
        for header, sequence in extract_regions(genome, bed_file):
            out_file.write(header + '\n' + sequence + '\n')

In [2]:
#create an .fna file with conserved/nonconserved boundary sequences for sonia


def load_genome(lines):
    """Parse FASTA lines into a ``{header: sequence}`` dict.

    The key is the full header line without its leading ``>``.  The whole
    genome is held in memory.  Blank lines and sequence lines that precede
    the first header are ignored, so an empty or oddly formatted file yields
    an empty/partial dict instead of a ``KeyError``.
    """
    chunks_by_name = {}
    current = None
    for raw in lines:
        line = raw.strip()
        if line.startswith('>'):
            current = line[1:]
            chunks_by_name[current] = []
        elif line and current is not None:
            chunks_by_name[current].append(line)
    return {name: ''.join(chunks) for name, chunks in chunks_by_name.items()}


def extract_regions(genome, bed_lines):
    """Yield ``(fasta_header, sequence)`` for every interval in a BED file.

    Columns used: chromosome, start, end (tab-separated; start/end are used
    directly as Python slice bounds).  The FASTA header is
    ``><chromosome>:<start>-<end>``.  Blank lines and
    ``#``/``track``/``browser`` lines are skipped.  Intervals running past
    the end of a sequence are truncated by the slice, as before.

    Raises ``ValueError`` for rows with fewer than three columns or
    non-integer coordinates, and ``KeyError`` for an unknown chromosome.
    """
    for number, raw in enumerate(bed_lines, start=1):
        line = raw.strip()
        if not line or line.startswith(('#', 'track', 'browser')):
            continue
        fields = line.split('\t')
        if len(fields) < 3:
            raise ValueError(
                f'BED line {number}: expected at least 3 tab-separated columns, got {len(fields)}'
            )
        seq_id = fields[0]
        start, end = int(fields[1]), int(fields[2])
        if seq_id not in genome:
            raise KeyError(f'BED line {number}: sequence {seq_id!r} not found in genome FASTA')
        yield f'>{seq_id}:{start}-{end}', genome[seq_id][start:end]


# In a notebook __name__ is "__main__", so the cell still runs as before;
# the guard only keeps the file I/O from firing when the helpers are imported.
if __name__ == "__main__":
    with open('/scratch/ak8725/NPB_new/map_generation/NPB.fna') as fna_file:
        genome = load_genome(fna_file)

    with open('/scratch/ak8725/NPB_new/conservedBounds.bed') as bed_file, \
         open('/scratch/ak8725/NPB_new/conservedBounds.fna', 'w') as out_file:
        for header, sequence in extract_regions(genome, bed_file):
            out_file.write(header + '\n' + sequence + '\n')

In [21]:
!head -3 /scratch/ak8725/az_mrg/boundaries_unique_2kb_80_homer.fna

>chr01_B1
CCTTGATGACTCATCATACAAATCTCTGTCAGCACCCATGAAATATGCCTGCCCAAGCCGTACCAACTCCATTGATATATACAAAGAGATCGGAATGATGACCTGATATACAATCACCGCCATGAGGAATGTAATGAATATCTGCATCCCCATGCCATAGTAATTGTAGTTCTTCCCTGTCGTGTAATCCTTCTCCCTAAAGAACTGGGTGAATTCCAAATCCCCACGGTGATTCAGTATCCAAATCCCCGCGAGCACACTTGCGGTCGTGCACATCCCAATGAGCATAATGGACAGTATGACAGTCTCCCGGTTCAACTGTGTCTCCAACCGGCTCCGTTTCGACGGCGCCCCGGAGCTATTAAGCATGACCTTGGTCTCCTTTCCAGCATACACAACAACCCCTATCGCCCAGGTGGTATTCTTGAGCTCACATCCACGGAGCACAATGTTCGATGGCCCTAGCGAGACACGCTTGCCATCAATCTCCAAATTCGCCTGAAATCCATAGATGTTCCTGTTTGGCCGCTCACAATGAAGGACACCGCCGATTCCACCGTCCTGCGAGAACCTCAGTTGCGTCTCCTGCTTAGCATACCTCGTCTTGAGGTTGGTCTCCCCGTCAAGATTGACGGTCTGGACGTGCGCGACGCCGGAGGGGTCGCTGGTGGCGAGGAGCACCATGTCGGCAGGGAGCGTCTCGCTGGAGGCGACGCGGACGACGTCCCCGACGCGGATGTGCTTCCATTTCTTGGGCGCGAACTCccccgcggcgggcggcgccagGAGGACCCTGGCGAGGCGGTTGTTCTCCTGGCGgtcggagcggtggcggcggaggtccTCGTAGGCGTCCTTGACGGCGGTGACGAAGAGGACGAAGGCGAGCGGGAGCACGGAGGCGCCGCGGCCGAAGACGGCGACCTGGGGGAGCTGGTTGAGCACGGTGATGGCGAGGAAGTAGACGTACGACAGCCGCCGGAACTGCTC

In [20]:
#create an .fna file with boundary sequences 2kb, 80%


def load_genome(lines):
    """Parse FASTA lines into a ``{header: sequence}`` dict.

    The key is the full header line without its leading ``>``.  The whole
    genome is held in memory.  Blank lines and sequence lines that precede
    the first header are ignored, so an empty or oddly formatted file yields
    an empty/partial dict instead of a ``KeyError``.
    """
    chunks_by_name = {}
    current = None
    for raw in lines:
        line = raw.strip()
        if line.startswith('>'):
            current = line[1:]
            chunks_by_name[current] = []
        elif line and current is not None:
            chunks_by_name[current].append(line)
    return {name: ''.join(chunks) for name, chunks in chunks_by_name.items()}


def extract_regions(genome, bed_lines):
    """Yield ``(fasta_header, sequence)`` for every interval in a BED file.

    Columns used: chromosome, start, end, name (tab-separated; start/end are
    used directly as Python slice bounds).  The FASTA header is
    ``><chromosome>_<name>``.  Blank lines and ``#``/``track``/``browser``
    lines are skipped.  Intervals running past the end of a sequence are
    truncated by the slice, as before.

    Raises ``ValueError`` for rows with fewer than four columns or
    non-integer coordinates, and ``KeyError`` for an unknown chromosome.
    """
    for number, raw in enumerate(bed_lines, start=1):
        line = raw.strip()
        if not line or line.startswith(('#', 'track', 'browser')):
            continue
        fields = line.split('\t')
        if len(fields) < 4:
            raise ValueError(
                f'BED line {number}: expected at least 4 tab-separated columns, got {len(fields)}'
            )
        seq_id, fragment_id = fields[0], fields[3]
        start, end = int(fields[1]), int(fields[2])
        if seq_id not in genome:
            raise KeyError(f'BED line {number}: sequence {seq_id!r} not found in genome FASTA')
        yield f'>{seq_id}_{fragment_id}', genome[seq_id][start:end]


# In a notebook __name__ is "__main__", so the cell still runs as before;
# the guard only keeps the file I/O from firing when the helpers are imported.
if __name__ == "__main__":
    with open('/scratch/ak8725/az_mrg/azucena_chr.fna') as fna_file:
        genome = load_genome(fna_file)

    with open('/scratch/ak8725/az_mrg/boundaries_unique_2kb_80_homer.bed') as bed_file, \
         open('/scratch/ak8725/az_mrg/boundaries_unique_2kb_80_homer.fna', 'w') as out_file:
        for header, sequence in extract_regions(genome, bed_file):
            out_file.write(header + '\n' + sequence + '\n')

In [5]:
#then run /scratch/ak8725/STREME/streme.sh

In [None]:
# Shell commands (not Python), one STREME run per boundary set - presumably the contents of streme.sh.
# NOTE(review): --p looks like the boundary (primary) FASTA, --n the random-background FASTA and
# --oc the output directory - confirm against the STREME documentation.
streme --oc streme_5kb_80% --n streme_background_5kb.fa --p /scratch/ak8725/az_mrg/boundaries_unique_5kb_80_homer.fna 
streme --oc streme_2kb_80% --n streme_background_2kb.fa --p /scratch/ak8725/az_mrg/boundaries_unique_2kb_80_homer.fna 

# HiCExplorer boundaries at different resolutions

In [2]:
#create .fna with 1kb boundary sequences


def load_genome(lines):
    """Parse FASTA lines into a ``{header: sequence}`` dict.

    The key is the full header line without its leading ``>``.  The whole
    genome is held in memory.  Blank lines and sequence lines that precede
    the first header are ignored, so an empty or oddly formatted file yields
    an empty/partial dict instead of a ``KeyError``.
    """
    chunks_by_name = {}
    current = None
    for raw in lines:
        line = raw.strip()
        if line.startswith('>'):
            current = line[1:]
            chunks_by_name[current] = []
        elif line and current is not None:
            chunks_by_name[current].append(line)
    return {name: ''.join(chunks) for name, chunks in chunks_by_name.items()}


def extract_regions(genome, bed_lines):
    """Yield ``(fasta_header, sequence)`` for every interval in a BED file.

    Columns used: chromosome, start, end, name (tab-separated; start/end are
    used directly as Python slice bounds).  The FASTA header is
    ``><chromosome>_<name>``.  Blank lines and ``#``/``track``/``browser``
    lines are skipped.  Intervals running past the end of a sequence are
    truncated by the slice, as before.

    Raises ``ValueError`` for rows with fewer than four columns or
    non-integer coordinates, and ``KeyError`` for an unknown chromosome.
    """
    for number, raw in enumerate(bed_lines, start=1):
        line = raw.strip()
        if not line or line.startswith(('#', 'track', 'browser')):
            continue
        fields = line.split('\t')
        if len(fields) < 4:
            raise ValueError(
                f'BED line {number}: expected at least 4 tab-separated columns, got {len(fields)}'
            )
        seq_id, fragment_id = fields[0], fields[3]
        start, end = int(fields[1]), int(fields[2])
        if seq_id not in genome:
            raise KeyError(f'BED line {number}: sequence {seq_id!r} not found in genome FASTA')
        yield f'>{seq_id}_{fragment_id}', genome[seq_id][start:end]


# In a notebook __name__ is "__main__", so the cell still runs as before;
# the guard only keeps the file I/O from firing when the helpers are imported.
if __name__ == "__main__":
    with open('/scratch/ak8725/az_mrg/azucena_chr.fna') as fna_file:
        genome = load_genome(fna_file)

    with open('/scratch/ak8725/az_mrg/hicFindTADs/hicFindTADs2_out/az_1kb_boundaries.bed') as bed_file, \
         open('/scratch/ak8725/az_mrg/hicexp_1kb_homer.fna', 'w') as out_file:
        for header, sequence in extract_regions(genome, bed_file):
            out_file.write(header + '\n' + sequence + '\n')

In [3]:
#create .fna with 2kb boundary sequences


def load_genome(lines):
    """Parse FASTA lines into a ``{header: sequence}`` dict.

    The key is the full header line without its leading ``>``.  The whole
    genome is held in memory.  Blank lines and sequence lines that precede
    the first header are ignored, so an empty or oddly formatted file yields
    an empty/partial dict instead of a ``KeyError``.
    """
    chunks_by_name = {}
    current = None
    for raw in lines:
        line = raw.strip()
        if line.startswith('>'):
            current = line[1:]
            chunks_by_name[current] = []
        elif line and current is not None:
            chunks_by_name[current].append(line)
    return {name: ''.join(chunks) for name, chunks in chunks_by_name.items()}


def extract_regions(genome, bed_lines):
    """Yield ``(fasta_header, sequence)`` for every interval in a BED file.

    Columns used: chromosome, start, end, name (tab-separated; start/end are
    used directly as Python slice bounds).  The FASTA header is
    ``><chromosome>_<name>``.  Blank lines and ``#``/``track``/``browser``
    lines are skipped.  Intervals running past the end of a sequence are
    truncated by the slice, as before.

    Raises ``ValueError`` for rows with fewer than four columns or
    non-integer coordinates, and ``KeyError`` for an unknown chromosome.
    """
    for number, raw in enumerate(bed_lines, start=1):
        line = raw.strip()
        if not line or line.startswith(('#', 'track', 'browser')):
            continue
        fields = line.split('\t')
        if len(fields) < 4:
            raise ValueError(
                f'BED line {number}: expected at least 4 tab-separated columns, got {len(fields)}'
            )
        seq_id, fragment_id = fields[0], fields[3]
        start, end = int(fields[1]), int(fields[2])
        if seq_id not in genome:
            raise KeyError(f'BED line {number}: sequence {seq_id!r} not found in genome FASTA')
        yield f'>{seq_id}_{fragment_id}', genome[seq_id][start:end]


# In a notebook __name__ is "__main__", so the cell still runs as before;
# the guard only keeps the file I/O from firing when the helpers are imported.
if __name__ == "__main__":
    with open('/scratch/ak8725/az_mrg/azucena_chr.fna') as fna_file:
        genome = load_genome(fna_file)

    with open('/scratch/ak8725/az_mrg/hicFindTADs/hicFindTADs1_out/az_2kb_boundaries.bed') as bed_file, \
         open('/scratch/ak8725/az_mrg/hicexp_2kb_homer.fna', 'w') as out_file:
        for header, sequence in extract_regions(genome, bed_file):
            out_file.write(header + '\n' + sequence + '\n')

In [4]:
#create .fna with 5kb boundary sequences


def load_genome(lines):
    """Parse FASTA lines into a ``{header: sequence}`` dict.

    The key is the full header line without its leading ``>``.  The whole
    genome is held in memory.  Blank lines and sequence lines that precede
    the first header are ignored, so an empty or oddly formatted file yields
    an empty/partial dict instead of a ``KeyError``.
    """
    chunks_by_name = {}
    current = None
    for raw in lines:
        line = raw.strip()
        if line.startswith('>'):
            current = line[1:]
            chunks_by_name[current] = []
        elif line and current is not None:
            chunks_by_name[current].append(line)
    return {name: ''.join(chunks) for name, chunks in chunks_by_name.items()}


def extract_regions(genome, bed_lines):
    """Yield ``(fasta_header, sequence)`` for every interval in a BED file.

    Columns used: chromosome, start, end, name (tab-separated; start/end are
    used directly as Python slice bounds).  The FASTA header is
    ``><chromosome>_<name>``.  Blank lines and ``#``/``track``/``browser``
    lines are skipped.  Intervals running past the end of a sequence are
    truncated by the slice, as before.

    Raises ``ValueError`` for rows with fewer than four columns or
    non-integer coordinates, and ``KeyError`` for an unknown chromosome.
    """
    for number, raw in enumerate(bed_lines, start=1):
        line = raw.strip()
        if not line or line.startswith(('#', 'track', 'browser')):
            continue
        fields = line.split('\t')
        if len(fields) < 4:
            raise ValueError(
                f'BED line {number}: expected at least 4 tab-separated columns, got {len(fields)}'
            )
        seq_id, fragment_id = fields[0], fields[3]
        start, end = int(fields[1]), int(fields[2])
        if seq_id not in genome:
            raise KeyError(f'BED line {number}: sequence {seq_id!r} not found in genome FASTA')
        yield f'>{seq_id}_{fragment_id}', genome[seq_id][start:end]


# In a notebook __name__ is "__main__", so the cell still runs as before;
# the guard only keeps the file I/O from firing when the helpers are imported.
if __name__ == "__main__":
    with open('/scratch/ak8725/az_mrg/azucena_chr.fna') as fna_file:
        genome = load_genome(fna_file)

    with open('/scratch/ak8725/az_mrg/hicFindTADs/hicFindTADs13_out/az_5kb_boundaries.bed') as bed_file, \
         open('/scratch/ak8725/az_mrg/hicexp_5kb_homer.fna', 'w') as out_file:
        for header, sequence in extract_regions(genome, bed_file):
            out_file.write(header + '\n' + sequence + '\n')

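Input files with boundary sequences (note: the cells above write `hicexp_1kb_homer.fna`, `hicexp_2kb_homer.fna` and `hicexp_5kb_homer.fna`; the names below assume those files were renamed afterwards — verify before use):<br>
    /scratch/ak8725/az_mrg/hicexp_1kb_boundaries.fna<br>
    /scratch/ak8725/az_mrg/hicexp_2kb_boundaries.fna<br>
    /scratch/ak8725/az_mrg/hicexp_5kb_boundaries.fna<br>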

In [8]:
#creating background sequences. I will use 20,000 seqs for background. Seq length = 6000

import random

fasta_file = "/scratch/ak8725/az_mrg/azucena.fna"
output_file = "/scratch/ak8725/az_mrg/STREME/streme_background_20k.fa"
num_fragments = 20000
fragment_size = 6000


def parse_fasta(lines):
    """Parse FASTA text into a list of ``(header, sequence)`` records.

    ``lines`` is any iterable of text lines (for example an open file).
    A header is the full ``>`` line with the leading ``>`` removed.  Each
    record is kept separate, so the header text of later records never ends
    up inside a sequence.  Blank lines, and sequence lines that appear
    before the first header, are ignored.
    """
    records = []
    header = None
    chunks = []
    for raw in lines:
        line = raw.strip()
        if not line:
            continue
        if line.startswith(">"):
            if header is not None:
                records.append((header, "".join(chunks)))
            header = line.lstrip(">")
            chunks = []
        elif header is not None:
            chunks.append(line)
    if header is not None:
        records.append((header, "".join(chunks)))
    return records


def sample_fragments(records, num_fragments, fragment_size):
    """Draw ``num_fragments`` random windows of exactly ``fragment_size`` bases.

    Records shorter than ``fragment_size`` are skipped.  Each draw picks a
    record with probability proportional to its number of valid start
    positions and then a uniform start inside it, so every window is equally
    likely and no window crosses a record boundary.  Sampling is with
    replacement: windows may overlap or repeat.

    Returns a list of ``(fragment_id, sequence)`` tuples whose ids have the
    form ``<header>_fragment_<n>`` with ``n`` counting from 1.

    Raises ``ValueError`` if ``fragment_size`` is not positive or no record
    is long enough to hold one window.
    """
    if fragment_size < 1:
        raise ValueError(f"fragment_size must be positive, got {fragment_size}")
    eligible = [(header, seq) for header, seq in records if len(seq) >= fragment_size]
    if not eligible:
        raise ValueError(
            f"no FASTA record is at least {fragment_size} bases long; cannot sample"
        )
    weights = [len(seq) - fragment_size + 1 for _, seq in eligible]
    picks = random.choices(eligible, weights=weights, k=num_fragments)
    fragments = []
    for index, (header, seq) in enumerate(picks, start=1):
        # randint includes both endpoints, so the last valid start is
        # len(seq) - fragment_size (not + 1, which gave short fragments).
        start = random.randint(0, len(seq) - fragment_size)
        fragments.append((f"{header}_fragment_{index}", seq[start:start + fragment_size]))
    return fragments


# In a notebook __name__ is "__main__", so the cell still runs as before;
# the guard only keeps the file I/O from firing when the helpers are imported.
if __name__ == "__main__":
    with open(fasta_file, "r") as f:
        records = parse_fasta(f)

    fragments = sample_fragments(records, num_fragments, fragment_size)

    with open(output_file, "w") as f:
        for fragment_id, fragment_seq in fragments:
            f.write(f">{fragment_id}\n{fragment_seq}\n")

In [1]:
#creating background sequences. I will use 12,000 seqs for background for 5kb query seqs. Seq length = 6000

import random

fasta_file = "/scratch/ak8725/az_mrg/azucena.fna"
output_file = "/scratch/ak8725/az_mrg/STREME/streme_background_12k.fa"
num_fragments = 12000
fragment_size = 6000


def parse_fasta(lines):
    """Parse FASTA text into a list of ``(header, sequence)`` records.

    ``lines`` is any iterable of text lines (for example an open file).
    A header is the full ``>`` line with the leading ``>`` removed.  Each
    record is kept separate, so the header text of later records never ends
    up inside a sequence.  Blank lines, and sequence lines that appear
    before the first header, are ignored.
    """
    records = []
    header = None
    chunks = []
    for raw in lines:
        line = raw.strip()
        if not line:
            continue
        if line.startswith(">"):
            if header is not None:
                records.append((header, "".join(chunks)))
            header = line.lstrip(">")
            chunks = []
        elif header is not None:
            chunks.append(line)
    if header is not None:
        records.append((header, "".join(chunks)))
    return records


def sample_fragments(records, num_fragments, fragment_size):
    """Draw ``num_fragments`` random windows of exactly ``fragment_size`` bases.

    Records shorter than ``fragment_size`` are skipped.  Each draw picks a
    record with probability proportional to its number of valid start
    positions and then a uniform start inside it, so every window is equally
    likely and no window crosses a record boundary.  Sampling is with
    replacement: windows may overlap or repeat.

    Returns a list of ``(fragment_id, sequence)`` tuples whose ids have the
    form ``<header>_fragment_<n>`` with ``n`` counting from 1.

    Raises ``ValueError`` if ``fragment_size`` is not positive or no record
    is long enough to hold one window.
    """
    if fragment_size < 1:
        raise ValueError(f"fragment_size must be positive, got {fragment_size}")
    eligible = [(header, seq) for header, seq in records if len(seq) >= fragment_size]
    if not eligible:
        raise ValueError(
            f"no FASTA record is at least {fragment_size} bases long; cannot sample"
        )
    weights = [len(seq) - fragment_size + 1 for _, seq in eligible]
    picks = random.choices(eligible, weights=weights, k=num_fragments)
    fragments = []
    for index, (header, seq) in enumerate(picks, start=1):
        # randint includes both endpoints, so the last valid start is
        # len(seq) - fragment_size (not + 1, which gave short fragments).
        start = random.randint(0, len(seq) - fragment_size)
        fragments.append((f"{header}_fragment_{index}", seq[start:start + fragment_size]))
    return fragments


# In a notebook __name__ is "__main__", so the cell still runs as before;
# the guard only keeps the file I/O from firing when the helpers are imported.
if __name__ == "__main__":
    with open(fasta_file, "r") as f:
        records = parse_fasta(f)

    fragments = sample_fragments(records, num_fragments, fragment_size)

    with open(output_file, "w") as f:
        for fragment_id, fragment_seq in fragments:
            f.write(f">{fragment_id}\n{fragment_seq}\n")