In [131]:
import numpy as np
import random
import pandas as pd

In [132]:
def generate_random_seq(n_nucleotides):
    """Build a random DNA string.

    Each position is drawn independently, with replacement, from the four
    letters A, T, C and G using the global ``random`` module state.

    Parameters
    ----------
    n_nucleotides : int
        length of the sequence to generate

    Returns
    -------
    str
        the generated sequence of ``n_nucleotides`` characters
    """
    alphabet = ["A", "T", "C", "G"]
    picks = random.choices(alphabet, k=n_nucleotides)
    return "".join(picks)


def generate_scoring_matrix(n_rows, n_cols):
    """Create an all-zero integer matrix of the requested shape.

    Parameters
    ----------
    n_rows : int
        number of rows
    n_cols : int
        number of columns

    Returns
    -------
    numpy.ndarray
        array of shape ``(n_rows, n_cols)`` with dtype ``int32``
    """
    shape = (n_rows, n_cols)
    return np.zeros(shape, dtype=np.int32)



In [122]:
# Seed the global RNG so the random sequences below are identical on every
# "Restart Kernel -> Run All"; change SEED to draw a different pair.
SEED = 42
random.seed(SEED)

# Demo sequence lengths (the query is the shorter of the two).
QUERY_LENGTH = 436
CONTIG_LENGTH = 532

query = generate_random_seq(QUERY_LENGTH)
contig = generate_random_seq(CONTIG_LENGTH)

In [123]:
# Zero-filled matrix with one row per query base and one column per contig base.
# NOTE(review): local_align allocates its own matrix with contig on the rows,
# query on the columns and an extra boundary row/column, so this one has a
# different shape and orientation -- confirm whether it is still needed.
scoring_matrix = generate_scoring_matrix(n_rows=len(query), n_cols=len(contig))

In [142]:
def match_scoring(nuc1, nuc2, match_score=10, mismatch_score=-5):
    """Score a single pair of aligned symbols.

    Parameters
    ----------
    nuc1, nuc2 : str
        the two symbols to compare (compared with ``==``, so case matters)
    match_score : int
        value returned when the symbols are equal
    mismatch_score : int
        value returned when the symbols differ

    Returns
    -------
    int
        ``match_score`` or ``mismatch_score``
    """
    return match_score if nuc1 == nuc2 else mismatch_score




def local_align(query, contig, gap=-5, match=10, mismatch=-5, csv_path="alignment.csv"):
    """Fill a local-alignment (Smith-Waterman style) score matrix.

    Rows of the matrix correspond to ``contig`` and columns to ``query``.
    Row 0 and column 0 are a zero boundary, so cell ``(i, j)`` scores the
    alignment ending at ``contig[i-1]`` / ``query[j-1]``. Every cell is
    floored at 0, which is what makes the alignment local.

    Parameters
    ----------
    query : str
        sequence laid out along the columns
    contig : str
        sequence laid out along the rows
    gap : int
        score added for moving left or up (a gap in either sequence)
    match : int
        score added on the diagonal when the two symbols are equal
    mismatch : int
        score added on the diagonal when the two symbols differ
    csv_path : str or None
        file the labelled matrix is written to as CSV; defaults to
        "alignment.csv" in the working directory. Pass ``None`` to skip
        writing.

    Returns
    -------
    best_score : int
        highest score found in the matrix (0 if there is no positive score)
    best_score_loc : tuple of int
        ``(row, column)`` of ``best_score``. Ties go to the cell visited
        last in row-major order, so when no cell scores above 0 this is
        the bottom-right cell (or ``(0, 0)`` if either sequence is empty).
    score_matrix : numpy.ndarray
        int32 array of shape ``(len(contig) + 1, len(query) + 1)``
    alignment_df : pandas.DataFrame
        ``score_matrix`` labelled with "*" for the boundary followed by the
        symbols of ``contig`` (index) and ``query`` (columns)
    """
    n_rows = len(contig) + 1
    n_cols = len(query) + 1
    score_matrix = np.zeros((n_rows, n_cols), dtype=np.int32)

    best_score = 0
    best_score_loc = (0, 0)

    # Row-major fill: the left, upper and diagonal neighbours of (i, j) are
    # always computed before (i, j) itself.
    for i in range(1, n_rows):
        for j in range(1, n_cols):
            # Use the caller's match/mismatch values (same rule as
            # match_scoring) instead of silently falling back to defaults.
            substitution = match if contig[i - 1] == query[j - 1] else mismatch

            score_matrix[i, j] = max(
                score_matrix[i, j - 1] + gap,
                score_matrix[i - 1, j] + gap,
                score_matrix[i - 1, j - 1] + substitution,
                0,
            )

            # ">=" keeps the most recently visited cell among equal scores.
            if score_matrix[i, j] >= best_score:
                best_score = score_matrix[i, j]
                best_score_loc = (i, j)

    # Label the matrix with the sequence symbols; "*" marks the boundary.
    alignment_df = pd.DataFrame(
        score_matrix,
        columns=["*"] + list(query),
        index=["*"] + list(contig),
    )
    if csv_path is not None:
        alignment_df.to_csv(csv_path)

    return best_score, best_score_loc, score_matrix, alignment_df

In [143]:
# local_align returns (best_score, best_score_loc, score_matrix, alignment_df);
# take the score matrix by position rather than unpacking into "_", which would
# shadow IPython's last-output variable.
matrix = local_align("acgt", "cg")[2]
matrix

array([[ 0,  0,  0,  0,  0],
       [ 0,  0, 10,  5,  0],
       [ 0,  0,  5, 20, 15]], dtype=int32)