# Roughwork

In [9]:
from Bio import SeqIO
import numpy as np
import math

# Read both synthetic FASTA sets, keeping each record as a plain Python string.
strong = [
    str(entry.seq)
    for entry in SeqIO.parse("synth_50_strong.fa", "fasta")
]
weak = [
    str(entry.seq)
    for entry in SeqIO.parse("synth_50_weak.fa", "fasta")
]


In [20]:

def profile(DNA, k, pos, log_odd=True):
    """
    Build the k-mer matrix and the position weight matrix for a set of sequences.

    One window of length `k` is cut from every sequence, starting at the
    offset given for that sequence in `pos`. Each column of the stacked
    windows is then turned into pseudocount-smoothed frequencies by
    `totalcount` (pseudocount fixed at 1).

    Parameters:
    DNA : sequence of str
        Input sequences; exactly one k-mer is taken from each.
    k : int
        Window (motif) length.
    pos : sequence of int
        Start offset of the window in each sequence; pos[i] belongs to DNA[i].
    log_odd : bool, optional
        If true (default), the weights are returned as base-4 log-odds
        against a uniform background, i.e. log4(4 * p). If false, the
        smoothed frequencies themselves are returned.

    Returns:
    x : np.ndarray
        Character matrix of shape (len(DNA), k); row i is the k-mer of DNA[i].
    W : np.ndarray
        Float matrix of shape (4, k); rows follow the order reported by
        `totalcount` (A, T, G, C).

    Raises:
    ValueError
        If the window requested for a sequence is not exactly `k` characters
        long (it runs past the end of the sequence).
    """
    n_seqs = len(DNA)  # one window per sequence
    x = np.zeros((n_seqs, k), dtype=str)  # matrix with k-mers
    W = np.zeros((4, k), dtype=float)  # position weight matrix
    for i in range(n_seqs):
        start = pos[i]
        window = DNA[i][start:start + k]
        # A short slice must be rejected explicitly: a 1-character slice would
        # otherwise be broadcast across the whole row without any error.
        if len(window) != k:
            raise ValueError(
                f"window of length {k} at position {start} does not fit in "
                f"sequence {i} (length {len(DNA[i])})"
            )
        x[i, :] = list(window)
    for j in range(k):
        _, W[:, j] = totalcount(x[:, j], pseudocount=1)
    if log_odd:
        # Log-odds versus a uniform 0.25 background, expressed in base 4.
        W = np.log(W * 4) / np.log(4)
    return x, W
def totalcount(x, pseudocount=1):
    """
    Count the nucleotides of one alignment column and smooth them into frequencies.

    Parameters:
    x : iterable of str
        The single-character symbols of one column.
    pseudocount : int or float, optional
        Value added to each of the four counts before normalising (default: 1).

    Returns:
    counts : np.ndarray
        Raw counts, shape (4,), in the order A, T, G, C.
    finalcount : np.ndarray
        Smoothed frequencies, shape (4,), same order; they sum to 1.
    """
    symbols = list(x)
    counts = np.array([symbols.count(nuc) for nuc in "ATGC"])
    # Normalise over the recognised bases only. Dividing by len(x) would let
    # any other symbol (e.g. 'N' or lowercase) drag the total below 1.
    finalcount = (counts + pseudocount) / (counts.sum() + 4 * pseudocount)
    return counts, finalcount


def totalcount2(column, pseudocount=1):
    """
    Computes the position weight values for the four nucleotides at a given position.

    Parameters:
    column : array-like of str
        The single-character nucleotides of one k-mer column. A numpy array,
        a list, or a plain string are all accepted.
    pseudocount : int, optional
        A small pseudocount added to every count to avoid zeros (default: 1).

    Returns:
    np.ndarray
        A 1D numpy array with the smoothed frequencies in the order
        ('A', 'T', 'G', 'C'); the values sum to 1.
    """
    nucleotides = "ATGC"
    # Coerce to a string array first: comparing a plain list (or str) with a
    # character gives a single False, which would silently zero every count.
    column = np.asarray(list(column), dtype=str)
    count = np.array([np.sum(column == nuc) for nuc in nucleotides])  # Count occurrences
    count = count + pseudocount  # Apply pseudocount
    return count / np.sum(count)  # Normalize to get probabilities


# Cut one 10-mer per sequence from `strong`, using one start offset per sequence.
a, b = profile(
    strong,
    10,
    [18, 35, 19, 16, 9, 35, 13, 8, 26, 19],
)

In [16]:
# Show the k-mer matrix returned by profile (one row per input sequence).
a

array([['C', 'G', 'A', 'A', 'A', 'T', 'G', 'A', 'T', 'A'],
       ['C', 'G', 'A', 'A', 'A', 'T', 'G', 'A', 'T', 'A'],
       ['C', 'G', 'A', 'A', 'A', 'T', 'G', 'A', 'T', 'A'],
       ['C', 'G', 'A', 'A', 'A', 'T', 'G', 'A', 'T', 'A'],
       ['C', 'G', 'A', 'A', 'A', 'T', 'G', 'A', 'T', 'A'],
       ['C', 'G', 'A', 'A', 'A', 'T', 'G', 'A', 'T', 'A'],
       ['C', 'G', 'A', 'A', 'A', 'T', 'G', 'A', 'T', 'A'],
       ['C', 'G', 'A', 'A', 'A', 'T', 'G', 'A', 'T', 'A'],
       ['C', 'G', 'A', 'A', 'A', 'T', 'G', 'A', 'T', 'A'],
       ['C', 'G', 'A', 'A', 'A', 'T', 'G', 'A', 'T', 'A']], dtype='<U1')

In [17]:
# Tally the second k-mer column (index 1); totalcount reports A, T, G, C order.
counts, d = totalcount(a[:, 1], pseudocount=1)
print(counts)  # raw counts
d  # smoothed frequencies, shown as the cell result

[ 0  0 10  0]


array([0.07142857, 0.07142857, 0.78571429, 0.07142857])

In [18]:
# Example usage: tally one hand-made nucleotide column.
column = np.array(list("ACGATGCA"))  # the bases A, C, G, A, T, G, C, A as a character array
weights = totalcount(column)  # a tuple: (raw counts, smoothed frequencies)
print(weights)  # both arrays are ordered A, T, G, C


(array([3, 1, 2, 2]), array([0.33333333, 0.16666667, 0.25      , 0.25      ]))


In [19]:
# NOTE(review): totalcount2 as defined above counts in "ATGC" order, which gives
# [4, 2, 3, 3] / 12 for this column. The output stored below is in A, C, G, T
# order and looks stale; re-run this cell to confirm.
totalcount2(column)

array([0.33333333, 0.25      , 0.25      , 0.16666667])