In [None]:
# Saving consensus sequences to MEME file

In [8]:
import numpy as np
from scripts.functions import load_pwms, preprocess
import pickle

In [1]:
# Load the raw motifs and run the project's preprocessing step.
# NOTE(review): load_pwms / preprocess live in scripts.functions and are not
# shown here; downstream cells iterate `data` as a sequence of matrices and
# use `matrix_ids` as the motif names when writing the MEME file.
motifs = load_pwms()
(
    data,
    num_labels,
    matrix_ids,
    num_to_class,
    class_to_num,
    main_classes,
) = preprocess(motifs)

In [32]:
# Fixed seed so that ties between equally scored rows are broken reproducibly.
rng = np.random.default_rng(5)

# For every matrix, build a 0/1 matrix of the same shape that marks, in each
# column, the row holding the column maximum (one randomly chosen row on ties).
consensus_motifs = []
for mat in data:
    width = mat.shape[1]
    onehot = np.zeros_like(mat, dtype=int)
    for pos in range(width):
        column = mat[:, pos]
        # Indices of all rows that attain the column maximum.
        tied_rows = np.flatnonzero(column == column.max())
        # One draw per column, in column order, keeps the seeded stream stable.
        onehot[rng.choice(tied_rows), pos] = 1
    consensus_motifs.append(onehot)

In [28]:
def _per_motif(value, default, count, label):
    """Expand None / a scalar / a sequence into a list with one entry per motif."""
    if value is None:
        return [default] * count
    if np.isscalar(value):
        return [value] * count
    values = list(value)
    if len(values) != count:
        # zip() would otherwise silently drop motifs (or ignore extra entries).
        raise ValueError(
            f"{label} has {len(values)} entries but there are {count} motifs"
        )
    return values


def _normalized_columns(mat, name, tol):
    """Return a float copy of ``mat`` whose columns each sum to 1."""
    # np.array(..., dtype=float) always copies, so the caller's matrix is never
    # modified, and integer inputs are not truncated by the division below.
    probs = np.array(mat, dtype=float)
    assert probs.ndim == 2 and probs.shape[0] == 4, \
        f"MATRIX {name} must have 4 rows (A,C,G,T)"
    col_sums = probs.sum(axis=0)
    unusable = ~(np.isfinite(col_sums) & (col_sums > 0))
    if unusable.any():
        raise ValueError(
            f"MATRIX {name} has columns that cannot be normalized "
            f"(non-positive or non-finite sum) at positions "
            f"{np.flatnonzero(unusable).tolist()}"
        )
    # Only rescale columns that are off by more than tol; others are kept as-is.
    off = np.abs(col_sums - 1.0) > tol
    probs[:, off] /= col_sums[off]
    return probs


def write_meme(bpmats, names, filename,
               bg_freqs=None,
               nsites=None,
               E_values=None,
               tol=1e-6):
    """
    Write a list of letter-probability matrices (bpmats) to a MEME-format file,
    normalizing columns to sum to 1. The input matrices are not modified.

    All matrices are validated and normalized before the output file is
    opened, so an invalid matrix does not leave a partially written file.

    Parameters:
    - bpmats: list of array-likes, each shape (4, w_i) with rows A,C,G,T.
              Integer matrices (e.g. counts or one-hot) are accepted.
    - names: list of motif names (same length as bpmats).
    - filename: output MEME file path (overwritten if it exists).
    - bg_freqs: dict of background frequencies, e.g. {'A':0.3,'C':0.2,'G':0.2,'T':0.3}.
                If None, assumes uniform 0.25 each.
    - nsites: list of nsites values per motif, or a scalar to apply to all.
              If None, defaults to 20.0 for every motif.
    - E_values: list of floats E-values per motif, or scalar for all.
                If None, defaults to 0.0 for all.
    - tol: a column is rescaled only if its sum differs from 1 by more than tol.

    Raises:
    - AssertionError: if bpmats and names differ in length, or a matrix is not
      2-D with 4 rows.
    - ValueError: if nsites / E_values is a sequence whose length differs from
      the number of motifs, or a matrix column has a non-positive or
      non-finite sum and therefore cannot be normalized.
    """
    assert len(bpmats) == len(names), "bpmats and names must have same length"

    # Default background frequencies
    if bg_freqs is None:
        bg_freqs = {'A':0.25, 'C':0.25, 'G':0.25, 'T':0.25}

    m = len(bpmats)
    nsites = _per_motif(nsites, 20.0, m, "nsites")
    E_values = _per_motif(E_values, 0.0, m, "E_values")

    # Validate and normalize everything up front, before touching the file.
    matrices = [_normalized_columns(mat, name, tol)
                for mat, name in zip(bpmats, names)]

    with open(filename, 'w') as fh:
        # Header
        fh.write("MEME version 4\n\n")
        fh.write("ALPHABET= ACGT\n\n")
        fh.write("strands: + -\n\n")
        fh.write("Background letter frequencies\n")
        fh.write(f"A {bg_freqs['A']:.6f} C {bg_freqs['C']:.6f} "
                 f"G {bg_freqs['G']:.6f} T {bg_freqs['T']:.6f}\n\n")

        # Write each motif
        for probs, name, n, E in zip(matrices, names, nsites, E_values):
            w = probs.shape[1]
            fh.write(f"MOTIF {name}\n")
            fh.write(f"letter-probability matrix: alength= 4 w= {w} "
                     f"nsites= {n} E= {E:.2g}\n")
            # One output line per motif position, in A C G T order
            for a, c, g, t in probs.T:
                fh.write(f"{a:1.6f} {c:1.6f} {g:1.6f} {t:1.6f}\n")
            fh.write("\n")

In [33]:
# Write the consensus matrices, named by their matrix IDs, to disk in MEME format.
write_meme(
    bpmats=consensus_motifs,
    names=matrix_ids,
    filename='consensus.meme',
)