## An alternative (more conservative and faster) mutation function


In [None]:
from itertools import combinations, product

def generate_mutations(seq, matrix, max_num_pw_mut):
    alp = matrix.alphabet
    candidates = []
    if max_num_pw_mut > len(seq):
        max_num_pw_mut = len(seq)
    # Create candidate symbols for each symbol in the sequence
    for i, aa in enumerate(seq):
        candidates.append([])
        # Consider substitutions with non-negative scores
        for c_aa in alp:
            if c_aa == aa: # ignore itself
                continue
            if matrix[aa][c_aa] >= -1e-4: # for floating point precision
                candidates[i].append(c_aa)

    # Select up to max_num_pw_mut points of mutation and generate mutated sequences
    cartesian_product_dict = {} # For dynamic programming purposes
    potential_mut_counts = range(1, max_num_pw_mut + 1)
    final_mutations = []
    for mut_count in potential_mut_counts:
        combs_of_mut_indices = list(combinations(range(len(seq)), mut_count))
        for mutation_indices in combs_of_mut_indices:
            # check if we generated all mutations for this particular subsequence before
            only_this_mut_string = ""
            for idx in mutation_indices:
                only_this_mut_string += seq[idx]
            if only_this_mut_string not in cartesian_product_dict:
                # generate all mutations of this particular subsequence
                only_this_mut_candidates = []
                for idx in mutation_indices:
                    only_this_mut_candidates.append(candidates[idx])
                cartesian_product_dict[only_this_mut_string] = list(product(*only_this_mut_candidates))
            # add all the calculated pointwise mutated strings to result
            for particular_mutation in cartesian_product_dict[only_this_mut_string]:
                curr_idx = 0
                fin_mut = ""
                for i, aa in enumerate(seq):
                    if i in mutation_indices:
                        fin_mut += particular_mutation[curr_idx]
                        curr_idx += 1
                    else:
                        fin_mut += aa
                final_mutations.append(fin_mut)
    return final_mutations
                



### Example output


In [None]:
# Demo cell: run generate_mutations against two standard substitution
# matrices loaded through Biopython.
from Bio.Align import substitution_matrices

# Load the matrices by name; blosum62 is reused by later cells.
blosum62 = substitution_matrices.load("BLOSUM62")
pam250 = substitution_matrices.load("PAM250")

# print(generate_mutations("HVY", blosum62, 5))
# max_num_pw_mut (5) exceeds the sequence length (3); the function clamps it.
print(generate_mutations("LLL", pam250, 5))
muts = generate_mutations("HYEVVVVEE", blosum62, 7)
# print(muts)
# Compare the total count with the distinct count to check for duplicates.
print(len(muts))
print(len(set(muts)))


### Example usage in training


In [None]:
# %%prun -s cumulative
# Start the vocabulary from a copy of the base alphabet so the original
# alphabet list is left untouched.
init_vocab = alphabet.copy()

# Perform 1000 merges; after each one, also merge every pending pair whose
# merged string is a conservative mutation of the newly merged word.
for i in range(1000):
    best_pair = merge_heap.pop()
    merge_pair(best_pair)
    merged_string = best_pair.merged()
    init_vocab.append(merged_string)

    # For the mutations:
    # generate_mutations returns plain strings (no scores), so iterate over
    # them directly instead of unpacking (string, score) tuples.
    mutations = generate_mutations(merged_string, blosum62, 5)
    for mutated_str in mutations:
        pairs_to_merge = merge_heap.merged_to_pair.get(mutated_str, [])
        # Only add the mutated word when some pending pair actually forms it.
        if len(pairs_to_merge) > 0:
            init_vocab.append(mutated_str)
        # NOTE(review): if remove_by_value mutates this same list, iterating
        # it here may skip entries — confirm against the heap implementation.
        for pair in pairs_to_merge:
            merge_heap.remove_by_value(pair)
            merge_pair(pair)

print(init_vocab)