# Processing mutations

The variant calls from the BreSeq pipeline are parsed and analysed. All strains with _mutX_ mutations are excluded as they accumulate SNPs at a very high rate, masking interesting mutations.

Mutations that are present in (nearly) all strains are also excluded, as they were most likely already present in the background strain prior to the evolution experiments.

In [206]:
import pandas as pd
import json

In [207]:
breseq_file = "../Data/Mutation_data/Strain_mutations_v1.txt"

# Read the breseq file into a {strain: set(mutations)} dictionary.
# Each line is tab-separated: the strain name followed by its mutation IDs.
strains = {}
with open(breseq_file) as infile:
    for line in infile:
        line = line.strip()
        if not line:
            # Blank line (e.g. at the end of the file): it would otherwise be
            # recorded as a strain named "" without any mutations.
            continue
        strain, *mutations = line.split("\t")
        strains[strain] = set(mutations)

# set().union(...) also works when no strains were read; set.union(*[]) raises.
print(len(strains), "strains with a total of", len(set().union(*strains.values())), "mutations")

242 strains with a total of 5991 mutations


In [208]:
# Build a mutation x strain presence table (1 where the strain carries the
# mutation) and sum each row to get the number of strains per mutation.
presence = pd.DataFrame(
    {name: dict.fromkeys(genotype, 1) for name, genotype in strains.items()}
)
mutation_counts = (
    presence.fillna(0)
    .sum(axis=1)
    .astype("int")
    .sort_values(ascending=False)
)

# Mutations found in more than 230 strains are treated as common.
common_mutations = set(mutation_counts[mutation_counts > 230].index)
print(len(common_mutations), "mutations are present in more than 230 strains")

remaining_counts = mutation_counts[~mutation_counts.index.isin(common_mutations)]
print(
    "The most common of the remaining mutations is present in",
    remaining_counts.max()
)

# Strip the common mutations from every strain genotype, together with
# "DEL-1299499-1199", which seemed like an erroneous call.
excluded_mutations = common_mutations | {"DEL-1299499-1199"}
strains = {strain: muts - excluded_mutations for strain, muts in strains.items()}

7 mutations are present in more than 230 strains
The most common of the remaining mutations is present in 67


In [210]:
# The strain 12PD6-9 had many low-coverage regions, most likely due to sequencing issues.
# These missing-coverage (MCDEL) calls are removed.
# The result is stored as a plain set so that it has the same type as the other
# genotypes in `strains`; set.union(*strains.values()) rejects a frozenset as its
# first argument.
strains["12PD6-9"] = {mut for mut in strains["12PD6-9"] if not mut.startswith("MCDEL")}

In [221]:
# Save the {strain: mutations} mapping as JSON.
# Sets are not JSON-serialisable, so each genotype is written as a list. The list is
# sorted because set iteration order for strings can differ between Python runs,
# which would otherwise make the file contents change from run to run.
with open("../Data/Mutation_data/All_strains_to_mutations.json", "w") as outfile:
    json.dump({strain: sorted(muts) for strain, muts in strains.items()}, outfile)

In [211]:
# Read the mutation -> affected genes mapping from disk.
# A mutation is mapped to every gene whose CDS contains it; an intergenic mutation
# is mapped to the closest neighbor gene which the mutation is upstream of.
with open("../Data/Mutation_data/Mutations_to_gene_names.json") as infile:
    raw_mapping = json.load(infile)

# JSON stores the genes as lists; convert them to sets for set arithmetic.
mutations_to_gene_names = {
    mutation: set(gene_list) for mutation, gene_list in raw_mapping.items()
}

In [213]:
# A {strain: mutated_genes} dictionary is constructed.
# set().union(...) is used rather than set.union(...) so that a strain without any
# mutations gets an empty gene set instead of raising a TypeError.
# A mutation that is missing from mutations_to_gene_names still raises a KeyError.
strain_to_gene_names = {
    strain: set().union(*(mutations_to_gene_names[mut] for mut in muts))
    for strain, muts in strains.items()
}

# Count how many strains have mutations in each gene
gene_counts = pd.DataFrame(
    {s: {m: 1 for m in muts} for s, muts in strain_to_gene_names.items()}
).fillna(0).sum(1).astype("int").sort_values(ascending=False)

# Find genes that are mutated in (almost) all strains (more than 240), meaning they
# were most likely already mutated in the background strain
common_gene_mutations = set(gene_counts[gene_counts > 240].index)

# Remove these (almost) universally mutated genes from every strain
strain_to_gene_names = {strain: genes - common_gene_mutations for strain, genes in strain_to_gene_names.items()}
print(len(set().union(*strain_to_gene_names.values())), "total genes are mutated")
print("Most commonly mutated gene is mutated in", gene_counts[~gene_counts.index.isin(common_gene_mutations)].max(), "strains")

# Gene sets are written as sorted lists so that the file contents do not depend on
# set iteration order, which can differ between Python runs.
with open("../Data/Mutation_data/All_strains_to_gene_names.json", "w") as outfile:
    json.dump({strain: sorted(genes) for strain, genes in strain_to_gene_names.items()}, outfile)

2811 total genes are mutated
Most commonly mutated gene is mutated in 88 strains


In [218]:
# Collect every strain with at least one mutated gene whose name starts with "mut"
# (the mutX genes); these strains are treated as hypermutators.
hypermutators = {
    name
    for name, gene_names in strain_to_gene_names.items()
    if any(gene_name.startswith("mut") for gene_name in gene_names)
}

# All remaining strains are the non-hypermutators.
non_hyper_mutators = {name for name in strains if name not in hypermutators}
print(len(non_hyper_mutators), "strains are not hypermutators.")

190 strains are not hypermutators.


In [219]:
# Save the mutated genes of the non-hypermutator strains as JSON.
# non_hyper_mutators and the gene collections are sets, whose iteration order for
# strings can differ between Python runs. Strains and genes are therefore written in
# sorted order so that the file is reproducible.
with open("../Data/Mutation_data/Strain_to_genes.json", "w") as outfile:
    json.dump(
        {strain: sorted(strain_to_gene_names[strain]) for strain in sorted(non_hyper_mutators)},
        outfile,
    )

In [None]:
# This should be moved to the mutation histogram plot notebook

def overlaps(start1, end1, start2, end2):
    """Return True if the ranges (start1, end1) and (start2, end2) overlap.

    Both end points are treated as inclusive, so two ranges that merely touch
    (e.g. end1 == start2) count as overlapping.
    """
    return start1 <= end2 and start2 <= end1
    
# Remove any MCDEL calls that overlap with a DEL call (duplicate MCDEL / DEL calls).
mcdels_to_remove = {}  # {strain: set of MCDEL IDs to drop}
for strain, muts in strains.items():
    dels = {}    # mut: (start, end) parsed from "DEL-<start>-<length>"
    mcdels = {}  # mut: (start, end) parsed from "MCDEL-<start>-<end>"
    for mut in muts:
        # Each ID is split only once; fields 1 and 2 hold the two numbers.
        fields = mut.split("-")
        if mut.startswith("DEL"):
            start = int(fields[1])
            # NOTE(review): end is start + length, which is one past the last deleted
            # base if the ID holds (position, size). Combined with the inclusive check
            # in overlaps(), a directly adjacent MCDEL also counts as overlapping.
            # Confirm this is intended.
            dels[mut] = (start, start + int(fields[2]))
        elif mut.startswith("MCDEL"):
            mcdels[mut] = (int(fields[1]), int(fields[2]))

    # An MCDEL is dropped as soon as it overlaps any DEL of the same strain.
    mcdels_to_remove[strain] = {
        mcdel
        for mcdel, (start1, end1) in mcdels.items()
        if any(overlaps(start1, end1, start2, end2) for start2, end2 in dels.values())
    }

strains = {strain: muts - mcdels_to_remove[strain] for strain, muts in strains.items()}

# set().union(...) works for an empty dictionary and regardless of whether the values
# are sets or frozensets; set.union(*...) raises a TypeError in those cases.
print("Removing", len(set().union(*mcdels_to_remove.values())), "MCDEL mutations")
print(len(set().union(*strains.values())), "unique mutations")