# Information Content Analysis

Processes library metadata to calculate information content and occupancy for each library member, using code derived from Friedman et al. (eLife 2021).

In [None]:
from IPython.display import display, Markdown, HTML
from tqdm.notebook import tqdm

import numpy
import pandas
import scipy
import scipy.special

tqdm.pandas()

## Supporting Functions

In [None]:
# Courtesy of Ryan Friedman

def rev_comp(seq):
    """Return the reverse complement of a DNA sequence.

    Parameters
    ----------
    seq : str
        The original sequence, written with the uppercase letters A, C, G and T.

    Returns
    -------
    new_seq : str
        The reverse complement of ``seq``.

    Raises
    ------
    KeyError
        If ``seq`` contains any character other than A, C, G or T.
    """
    pairing = {"A": "T", "C": "G", "G": "C", "T": "A"}
    # Walk the sequence back to front, swapping every base for its partner.
    return "".join(pairing[base] for base in reversed(seq))

def peek(fin):
    """Return the next line of a file without consuming it.

    Parameters
    ----------
    fin : file input stream
        Must support ``tell`` and ``seek``.

    Returns
    -------
    line : str
        The upcoming line (an empty string at end of file). The stream position is the same after the call as
        before it.
    """
    bookmark = fin.tell()
    upcoming = fin.readline()
    # Rewind so the caller's next read starts exactly where it would have.
    fin.seek(bookmark)
    return upcoming


def gobble(fin):
    """Consume lines from the file until the next line starts a motif record or the file ends.

    Parameters
    ----------
    fin : file input stream

    Returns
    -------
    lines : str
        All consumed lines joined together, newline characters included. The line beginning with "MOTIF" (if any)
        is left unread.
    """
    swallowed = []
    upcoming = peek(fin)
    # Stop at EOF (empty string) or as soon as a motif header is next in line.
    while upcoming and not upcoming.startswith("MOTIF"):
        swallowed.append(fin.readline())
        upcoming = peek(fin)

    return "".join(swallowed)


def read_pwm_files(filename):
    """Given a MEME file, read in all PWMs. PWMs are stored as DataFrames, and the collection of PWMs is returned as
    a Series, where keys are primary motif identifiers and values are the DataFrames.

    Each motif record is consumed as follows: a line starting with "MOTIF" whose second whitespace-separated token
    is the identifier, two lines that are skipped (a blank line and the "letter-probability matrix: ..." header),
    and then the matrix rows, each of which must begin with a space. Anything between the last matrix row and the
    next "MOTIF" line (e.g. a URL) is discarded.

    Parameters
    ----------
    filename : str
        Name of the file to read in.

    Returns
    -------
    pwm_ser : pandas.Series
        The PWMs parsed from the file, indexed by motif identifier. Empty if the file contains no "MOTIF" line.

    Raises
    ------
    IndexError
        If a "MOTIF" line carries no identifier.
    """
    pwms = {}
    with open(filename) as fin:
        # Skip the header lines that precede the first motif.
        gobble(fin)

        # After gobble() the next line is either a MOTIF header or EOF (empty string), so a file without any
        # motifs simply yields an empty result instead of failing on the header parse.
        while peek(fin):
            # MOTIF <motif name> [alternate ID]
            motif_id = fin.readline().split()[1]

            # Empty line
            fin.readline()
            # "letter-probability matrix: [other info]"
            fin.readline()

            # Every line that starts with a space is a new position in the PWM. startswith() is used rather than
            # indexing so that reaching EOF right after the last row (peek returns "") ends the matrix cleanly.
            rows = []
            while peek(fin).startswith(" "):
                rows.append(fin.readline().split())

            pwms[motif_id] = pandas.DataFrame(rows, dtype=float, columns=["A", "C", "G", "T"])

            # Read up any extra info such as the URL
            gobble(fin)

    # dtype=object keeps the empty case well defined; DataFrame values are stored as objects anyway.
    pwm_ser = pandas.Series(pwms, dtype=object)
    return pwm_ser


def ewm_from_letter_prob(pwm_df, pseudocount=0.0001, rt=2.5):
    """Turn a letter probability matrix into an energy weight matrix (EWM).

    Every position is normalized by its largest (pseudocount-adjusted) letter probability, and the relative free
    energy is then ddG = -RT ln(p_b,i / p_c,i), where p_b,i is the probability of base b at position i and p_c,i is
    the probability of the consensus base at that position. The consensus base therefore gets an energy of zero.

    Parameters
    ----------
    pwm_df : pandas.DataFrame
        The letter probability matrix, where each row is a position of the motif and columns represent A, C, G, T.
        It is not modified.
    pseudocount : float
        Pseudocount value to add to every value to account for zeros in the PWM.
    rt : float
        The value of RT to use in the formula in kJ/mol.

    Returns
    -------
    ewm_df : pandas.DataFrame
        The weight matrix of free energies relative to the consensus sequence, with columns A, C, G, T.
    """
    # The addition builds a new frame, so the caller's matrix stays untouched.
    smoothed = pwm_df + pseudocount
    # Divide every row by its most frequent letter to get probabilities relative to the consensus base.
    relative = smoothed.div(smoothed.max(axis=1), axis=0)
    ewm_df = -rt * numpy.log(relative)
    ewm_df.columns = ["A", "C", "G", "T"]
    return ewm_df


def ewm_to_dict(ewm):
    """Convert a DataFrame representation of an EWM into nested dictionaries for faster indexing.

    Parameters
    ----------
    ewm : pandas.DataFrame
        Rows are motif positions, columns are letters.

    Returns
    -------
    ewm_dict : {int: {str: float}}
        Dictionary of dictionaries, where the outer keys are the row labels (positions), the inner keys are the
        column labels (letters), and the values are the entries of the matrix.
    """
    # orient="index" makes the row label the outer key, i.e. ewm_dict[position][letter].
    return ewm.to_dict(orient="index")


def read_pwm_to_ewm(filename, pseudocount=0.0001, rt=2.5):
    """Read a file of letter probability matrices, convert each one to an EWM, and convert every EWM DataFrame to
    a dictionary for faster indexing.

    Parameters
    ----------
    filename : str
        Name of the file to read in.
    pseudocount : float
        Pseudocount value to add to every value to account for zeros in the PWM.
    rt : float
        The value of RT to use in the formula in kJ/mol.

    Returns
    -------
    ewms : pandas.Series of {int: {str: float}}
        Indexed by motif identifier. Each value is a dictionary of dictionaries, where the outer keys are positions,
        the inner keys are letters, and the values are the entries of the energy matrix.
    """
    letter_prob_matrices = read_pwm_files(filename)
    # First pass: probabilities -> relative free energies, one DataFrame per motif.
    energy_matrices = letter_prob_matrices.apply(
        lambda pwm: ewm_from_letter_prob(pwm, pseudocount, rt)
    )
    # Second pass: DataFrame -> nested dict.
    return energy_matrices.apply(ewm_to_dict)


def energy_landscape(seq, ewm):
    """Scan both strands of a sequence with an energy matrix.

    Parameters
    ----------
    seq : str
        The sequence to scan.
    ewm : dict {int: {str: float}}
        Dictionary of dictionaries, where the outer keys are positions (0 .. motif length - 1), the inner keys are
        letters, and the values are delta delta G relative to the consensus sequence.

    Returns
    -------
    fscores, rscores: numpy.array, dtype=float
        The EWM scores for each subsequence on the forward and reverse strand. Both arrays have
        ``len(seq) - motif_len + 1`` entries and are indexed by the start position in ``seq``; they are empty when
        the sequence is shorter than the motif.
    """
    motif_len = len(ewm)
    # Number of positions where the motif can be scored. A motif longer than the sequence fits nowhere, so clamp at
    # zero instead of asking numpy for an array of negative length.
    n_scores = max(len(seq) - motif_len + 1, 0)
    fscores = numpy.zeros(n_scores)
    # Reverse complement scores
    rscores = numpy.zeros(n_scores)
    r_seq = rev_comp(seq)

    # Calculate the score for the forward and reverse k-mer at every position
    for pos in range(n_scores):
        f_kmer = seq[pos:pos + motif_len]
        r_kmer = r_seq[pos:pos + motif_len]

        # Initialize energy score
        fscore = 0
        rscore = 0

        # Plain index loop with sequential accumulation; kept deliberately (it was chosen for speed over enumerate).
        for i in range(motif_len):
            fscore += ewm[i][f_kmer[i]]
            rscore += ewm[i][r_kmer[i]]

        fscores[pos] = fscore
        rscores[pos] = rscore

    # rscores was filled while walking the reverse complement, so flip it to make index j refer to the window that
    # starts at position j of the original sequence (i.e. just the complement, not the reverse complement).
    rscores = rscores[::-1]

    return fscores, rscores


def occupancy_landscape(seq, ewm, mu):
    """Scan a sequence with the energy matrix and convert the score of every k-mer subsequence, on the forward and
    reverse strand, into a predicted occupancy at chemical potential mu.

    The conversion applied to each energy score E is 1 / (1 + exp(E - mu)).

    Parameters
    ----------
    seq : str
        The sequence to scan.
    ewm : dict {int: {str: float}}
        Dictionary of dictionaries, where the outer keys are positions, the inner keys are letters, and the values
        are delta delta G relative to the consensus sequence.
    mu : int or float
        Chemical potential of the TF.

    Returns
    -------
    fscores, rscores: numpy.array, dtype=float
        The occupancy scores for each subsequence on the forward and reverse strand.
    """
    def bound_fraction(energies):
        # Low energies (relative to mu) map towards 1, high energies towards 0.
        return 1 / (1 + numpy.exp(energies - mu))

    forward_energies, reverse_energies = energy_landscape(seq, ewm)
    return bound_fraction(forward_energies), bound_fraction(reverse_energies)


def total_landscape(seq, ewms, mu):
    """Compute the occupancy landscape of every TF and combine the results into one DataFrame. Each landscape is
    zero-padded at its end so that all of them are as long as the sequence.

    Parameters
    ----------
    seq : str
        The DNA sequence.
    ewms : pandas.Series or dict {str: {int: {str: float}}}
        Keys/index are TF names and values are dictionary representations of the EWMs.
    mu : int or float
        TF chemical potential.

    Returns
    -------
    landscape : pandas.DataFrame, dtype=float
        The occupancy of each TF at each position in each orientation. Rows are positions, columns are named
        "<TF>_F" and "<TF>_R", and values are the predicted occupancy of a site starting at that position.
    """
    seq_len = len(seq)

    # First collect the raw (unpadded) forward/reverse tracks of every TF.
    raw_tracks = {}
    for name, ewm in ewms.items():
        forward_occ, reverse_occ = occupancy_landscape(seq, ewm, mu)
        raw_tracks[f"{name}_F"] = forward_occ
        raw_tracks[f"{name}_R"] = reverse_occ

    # A motif of length k only yields seq_len - k + 1 scores; fill the tail with zeros so all columns line up.
    padded_tracks = {
        label: numpy.pad(track, (0, seq_len - len(track)), mode="constant", constant_values=0)
        for label, track in raw_tracks.items()
    }

    return pandas.DataFrame(padded_tracks)


def total_occupancy(seq, ewms, mu):
    """For each TF, calculate its predicted occupancy over the sequence given the energy matrix and chemical
    potential. Then, summarize the information as the total occupancy of each TF over the entire sequence.

    Parameters
    ----------
    seq : str
        The DNA sequence.
    ewms : pandas.Series or dict {str: {int: {str: float}}}
        Keys/index are TF names and values are dictionary representations of the EWMs. Any mapping that offers
        ``keys()`` and ``items()`` (including dict subclasses) is accepted.
    mu : int or float
        TF chemical potential.

    Returns
    -------
    occ_profile : pandas.Series, dtype=float
        The total occupancy profile of each TF on the sequence, indexed by TF name.
    """
    occ_landscape = total_landscape(seq, ewms, mu)

    # keys() is available on both dicts and Series (where it returns the index), so no type check is needed and
    # dict subclasses are handled the same way as plain dicts.
    occ_profile = {
        # Add together every position of the forward and reverse strand for this TF.
        tf: occ_landscape[[f"{tf}_F", f"{tf}_R"]].sum().sum()
        for tf in ewms.keys()
    }

    occ_profile = pandas.Series(occ_profile)
    return occ_profile

def information_content(occupancies, diversity_cutoff=0.5, log=numpy.log2):
    """Given a list of TF occupancies, compute total occupancy, diversity, and information content.

    The information content is S = log W with W = N! / prod(N_i!), where N_i is the occupancy of TF i and N is their
    sum. Because occupancies are continuous, factorials are replaced by the Gamma function (Gamma(n + 1) = n!). The
    computation is carried out with log-Gamma so that large total occupancies do not overflow.

    Parameters
    ----------
    occupancies : pandas.Series
        Predicted occupancy for a collection of TFs on a given sequence. Values are expected to be non-negative.
    diversity_cutoff : float
        Cutoff to call a TF "occupied" on the sequence.
    log : Function handle
        Logarithm that fixes the unit of the result. Default is log2 so information content is in bits, natural log
        should be used for biophysical applications. It must be a true logarithm (e.g. numpy.log2, numpy.log,
        numpy.log10): it is only evaluated at e to obtain the change-of-base factor.

    Returns
    -------
    result : pandas.Series
        The total occupancy ("total_occupancy"), diversity ("diversity"), and information content ("info_content")
        of the provided sequence.
    """
    # Calculate total occupancy of all TFs on the sequence
    total_occ = occupancies.sum()
    # Count how many of the TFs are occupied, i.e. have motifs present in the sequence
    diversity = (occupancies > diversity_cutoff).sum()
    # ln W = lnGamma(N + 1) - sum_i lnGamma(N_i + 1). Evaluating Gamma directly overflows to inf once its argument
    # exceeds roughly 171, which turned the ratio into inf or nan.
    ln_microstates = (
        scipy.special.gammaln(total_occ + 1)
        - (occupancies + 1).apply(scipy.special.gammaln).sum()
    )
    # S = log_b W = ln W * log_b(e)
    info_content = ln_microstates * log(numpy.e)

    result = pandas.Series({
        "total_occupancy": total_occ,
        "diversity": diversity,
        "info_content": info_content
    })
    return result

## Processing

In [None]:
"""
Load TF EWMs
"""

# Parse the motif file into a Series of nested-dict energy matrices (default pseudocount and RT), then show it.
ewm_ser = read_pwm_to_ewm("photoreceptorAndEnrichedMotifs.meme")
display(ewm_ser)

In [None]:
"""
Load library metadata, and nearby genes TSS
"""

# Read the tab-separated library table and keep only the identifier and sequence columns.
# NOTE(review): the cell header mentions nearby-gene TSS data, but nothing of that kind is loaded here — confirm
# whether the header is stale.
library_metadata = pandas.read_csv("library_metadata.tsv", sep="\t")[["library_id", "CRE_sequence"]]
display(library_metadata)

In [None]:
# Chemical potential used for every TF when converting energies to occupancies.
mu = 9

# Column name -> list with one value per library member, in the row order of library_metadata.
raw_information_contents = {}
for sequence in tqdm(library_metadata["CRE_sequence"].str.upper()):
    occ_ser = total_occupancy(sequence, ewm_ser, mu)

    # One "predicted_occupancy_<motif>" column per motif, named after the text before the first underscore.
    # NOTE(review): motif ids that share that leading text would be appended to the same column — confirm the ids
    # are unique up to their first underscore.
    for motif, pred_occ in occ_ser.to_dict().items():
        short_motif = motif.split("_")[0]
        raw_information_contents.setdefault(f"predicted_occupancy_{short_motif}", []).append(pred_occ)

    # Summary statistics (total occupancy, diversity, information content) for this sequence.
    for stat_name, stat_value in information_content(occ_ser).to_dict().items():
        raw_information_contents.setdefault(stat_name, []).append(stat_value)

In [None]:
# Attach every collected column to the metadata table.
for column_name, column_values in raw_information_contents.items():
    library_metadata[column_name] = column_values

In [None]:
# Show the table with the newly added occupancy and information-content columns.
library_metadata

In [None]:
# Write everything except the raw sequences to a tab-separated file, without the row index.
library_metadata.drop(columns=["CRE_sequence"]).to_csv(
    "library_information_content.tsv",
    sep="\t",
    index=False,
)