In [15]:
def get_suffix_array(s):
    """
    Build the suffix array (0-indexed) of s by sorting its suffixes directly.

    This is the naive approach: the KS Algorithm is not required here. Each
    suffix is sliced out of s and the suffixes are compared lexicographically.

    Input:
        s: a string of the alphabet ['A', 'C', 'G', 'T'] already terminated by a unique delimiter '$'

    Output: list of suffix start indices, ordered by the suffix each one begins

    >>> get_suffix_array('GATAGACA$')
    [8, 7, 5, 3, 1, 6, 4, 0, 2]
    """
    # Suffixes of a single string all have different lengths, so no two of them
    # compare equal and the suffix text alone determines the ordering.
    return sorted(range(len(s)), key=lambda start: s[start:])

def get_bwt(s, sa):
    """
    Compute the BWT of s from its suffix array.

    Input:
        s: a string terminated by a unique delimiter '$'
        sa: the suffix array of s

    Output:
        L: BWT of s as a string
    """
    # Each output character is the one just before the corresponding suffix.
    # For the suffix starting at 0, index -1 wraps to the last character of s.
    return ''.join(s[start - 1] for start in sa)

def get_F(L):
    """
    Input: L = get_bwt(s)

    Output: F, first column in Pi_sorted, i.e. the characters of L in sorted
            order joined back into a string
    """
    ordered_chars = sorted(L)
    return ''.join(ordered_chars)

def get_M(F, alphabet=None):
    """
    Returns the helper data structure M (using the notation from class). M is a dictionary that maps
    each character to the index of its first occurrence in F.

    Input:
        F: first column in Pi_sorted (see get_F)
        alphabet: optional iterable of characters that must appear as keys of M;
                  when omitted, the module-level ALPHABET is used

    Output:
        M: dictionary where M[c] is the index of the first occurrence of "c" in F,
           or -1 if "c" is in the alphabet but does not occur in F
    """
    if alphabet is None:
        alphabet = ALPHABET
    M = {}
    for i, char in enumerate(F):
        # setdefault only stores the first index seen for each character
        M.setdefault(char, i)
    for char in alphabet:
        # Characters of the alphabet that never occurred in F are marked with -1
        M.setdefault(char, -1)
    return M

def get_occ(L):
    """
    Returns the helper data structure OCC (using the notation from class). OCC is a dictionary that maps
    each character of ALPHABET to a list of integers with one entry per position of L: OCC[c][i] is the
    number of occurrences of character "c" in the bwt string up to and including index i.
    """
    occ = {symbol: [] for symbol in ALPHABET}
    for symbol, counts in occ.items():
        # Running total of how many times this symbol has been seen so far in L
        seen = 0
        for ch in L:
            if ch == symbol:
                seen += 1
            counts.append(seen)
    return occ
def construct_L(M, occ):
    """
    Reconstruct the BWT string L from its OCC table.

    Input:
        M: first-occurrence dictionary (see get_M); not used by the
           reconstruction, kept so the call signature stays the same
        occ: dictionary mapping each character to its list of running
             occurrence counts over L (see get_occ)

    Output:
        L: the string described by occ
    """
    # The final count of each character is its total number of occurrences, so
    # the totals sum to the length of L. Empty lists contribute nothing.
    length = sum(counts[-1] for counts in occ.values() if counts)
    L = [''] * length
    for char, counts in occ.items():
        seen = 0
        for pos, count in enumerate(counts):
            # The running count rises exactly at the positions holding char
            if count > seen:
                L[pos] = char
                seen = count
    return ''.join(L)
# Characters tracked by get_M and get_occ: the sequence alphabet plus the '$' delimiter.
ALPHABET = list('ATCG$')

In [8]:
# Check get_suffix_array against the example in its docstring.
get_suffix_array('GATAGACA$')

[8, 7, 5, 3, 1, 6, 4, 0, 2]

In [None]:
for i in 'GATAG'

In [31]:
def construct_L(M, occ):
    """
    Reconstruct the BWT string L from its OCC table.

    Input:
        M: first-occurrence dictionary (see get_M); not used by the
           reconstruction, kept so the call signature stays the same
        occ: dictionary mapping each character to its list of running
             occurrence counts over L (see get_occ)

    Output:
        L: the string described by occ
    """
    # The final count of each character is its total number of occurrences, so
    # the totals sum to the length of L. Empty lists contribute nothing.
    length = sum(counts[-1] for counts in occ.values() if counts)
    L = [''] * length
    for char, counts in occ.items():
        seen = 0
        # A single pass per character replaces a list.index() rescan for every
        # occurrence; the running count rises exactly where char sits in L.
        for pos, count in enumerate(counts):
            if count > seen:
                L[pos] = char
                seen = count
    return ''.join(L)

In [32]:
# Sample '$'-terminated string and its BWT, computed via the suffix array.
s = 'GACTACGTAAC$'
L = get_bwt(s,get_suffix_array(s))

In [33]:
# Rebuild the BWT from its helper structures; the output should equal L.
construct_L(get_M(get_F(L)), get_occ(L))

'CTATGAAA$CGC'

In [27]:
# Show the BWT computed directly, for comparison with the reconstructed string.
L

'CTATGAAA$CGC'