# Processing mutations

The variant calls from the BreSeq pipeline are parsed and analysed. All strains with _mutX_ mutations are excluded as they accumulate SNPs at a very high rate, masking interesting mutations.

Mutations that are present in all strains are also excluded as they have most likely also been in the background strain, prior to the evolution experiments.

In [729]:
import pandas as pd
import json
import copy

In [471]:
compounds = ["HMDA", "putrescine", "1,2-propanediol", "2,3-butanediol", "glutarate", "adipate", "hexanoate", "octanoate", "coumarate", "isobutyrate", "butanol"]

mapping = pd.read_csv("../Data/Mutation_data/Variant_calls/mapping.csv", header=None)
compound_mapping = {
    1: "butanol",
    2: "glutarate",
    3: "coumarate",
    5: "HMDA",
    6: "putrescine",
    7: "adipate",
    8: "isobutyrate",
    9: "hexanoate",
    10: "2,3-butanediol",
    11: "1,2-propanediol",
    12: "octanoate"
}
rev_compound_mapping = {v: k for k, v in compound_mapping.items()}

code_to_compound = {
    "BUT": "butanol",
    "GLUT": "glutarate",
    "COUM": "coumarate",
    "HMDA": "HMDA",
    "PUTR": "putrescine",
    "ADIP": "adipate",
    "IBUA": "isobutyrate",
    "HEXA": "hexanoate",
    "23BD": "2,3-butanediol",
    "12PD": "1,2-propanediol",
    "OCTA": "octanoate"
}
comp_to_code = {v: k for k, v in code_to_compound.items()}

In [621]:
def generate_mut_id(row):
    """
    Generate a mutation id from the information from BreSeq.
    """
    typ = row["Mutation Type"]
    pos = str(int(row["Position"].replace(",", "")))
    if typ == "SNP":
        last = row["Sequence Change"][-1]
    elif typ == "DEL":
        if "(" in row["Sequence Change"]:
            last = str(len(row["Sequence Change"].split(")")[0][1:]))
        else:
            last = str(int(row["Sequence Change"][1:-3].replace(",", "")))
    elif typ == "INS":
        if "(" in row["Sequence Change"]:
            last = row["Sequence Change"].split(")")[0][1:]
        else:
            last = row["Sequence Change"][1:]
    elif typ == "MOB":
        fields = row["Sequence Change"].split("\xa0")
        last = fields[0] + "-" + fields[2][1:]
    else:
        last = ""
    return "-".join((typ, pos, last))

def strain_code_to_name(strain_code, compound):
    """
    Convert the strain code from the breseq results to the real strain names using the mapping file.
    """
    compound_num = rev_compound_mapping[compound]
    fields = strain_code.split(" ")[-4:]
    fields = [f[1:] for f in fields]
    code = (int(fields[0]), int(fields[2]), int(fields[3]))
    strain = mapping[mapping[0] == compound_num].groupby((1, 2, 3)).first().loc[code][4]
    return strain
    

In [719]:
excluded_mutations = {
    'DEL-1299499-1199', # Randomly called across populations
}

def convert_to_int(string):
    if string[0] == "‑": # Weird non-standard minus sign
        string = "-" + string[1:] # Replace with normal minus sign
    return int(string)

def pick_gene(rel_coords):
    """Given relative coordinates of an intergenic mutation, pick the gene that is closest downstream"""
    if min(rel_coords) > 0:
        return None
    elif max(rel_coords) <= 0:
        if rel_coords[0] > rel_coords[1]:
            return 0
        else:
            return 0
    elif rel_coords[0] <= 0:
        return 0
    elif rel_coords[1] <= 0:
        return 1
    else:
        raise RuntimeError("What else is there?")

strain_to_muts = {}
mut_to_genes = {}
for comp in compounds:
    df = pd.read_csv("../Data/Mutation_data/Variant_calls/%s.csv" % comp)
    df["mut_id"] = df.apply(generate_mut_id, 1)
    
    for idx, row in df.iterrows():
        genes = row["Gene"]
        if "genes" in genes:
            genes = genes.split("genes")[-1] # Fix inconsistent format for large deletions
        genes = genes.split(", ")
        genes = [gene.strip("[] >") for gene in genes]
        change = row["Protein change"]
        if isinstance(change, str) and change.startswith("intergenic"):
            nums = change.split("(")[1][:-1].split("/")
            nums = convert_to_int(nums[0]), convert_to_int(nums[1])
            nearest = pick_gene(nums)
            if nearest is None:
                genes = []
            else:
                genes = [genes[nearest]]
                
        mut_id = row["mut_id"]
        mut_to_genes[mut_id] = genes
        
    for strain_code in df.columns[10:]:
        if strain_code == "mut_id":
            continue
        strain = strain_code_to_name(strain_code, comp)
        if strain.endswith("-rerun"):
            strain = strain[:-6]

        muts = list(df[df[strain_code] == 1]["mut_id"])
        strain_to_muts[strain] = set(muts) - excluded_mutations

In [757]:
print(
    "There are",
    len(strain_to_muts),
    "strains with a total of",
    len(set.union(*strain_to_muts.values())), "mutations."
)

There are 224 strains with a total of 3733 mutations.


In [740]:
strain_to_genes = {}

# Add genes that are specifically mutated (mutation only has 1 gene target)
for strain, muts in strain_to_muts.items():
    for mut in muts:
        if len(mut_to_genes[mut]) == 1:
            strain_to_genes.setdefault(strain, []).append(mut_to_genes[mut][0])
            
strain_to_genes = {k: set(v) for k, v in strain_to_genes.items()}

certain_mutations = {
    comp: set.union(*(v for k, v in strain_to_genes.items() if k.startswith(comp))) for comp in code_to_compound
}
strain_to_all_genes = copy.deepcopy(strain_to_genes)

# For mutations that affect multiple genes: only add genes that are already mutated in strains from that compound
# (or mutator genes)
for comp in code_to_compound:
    for strain in (s for s in strain_to_muts if s.startswith(comp)):
        for mut in strain_to_muts[strain]:
            if mut.startswith("DUP"):
                continue
            if len(mut_to_genes[mut]) > 1:
                for gene in mut_to_genes[mut]:
                    strain_to_all_genes[strain].add(gene)
                    if gene in certain_mutations[comp] or gene.startswith("mut"):
                        strain_to_genes[strain].add(gene)

In [758]:
print(
    "A total of",
    len(set.union(*strain_to_all_genes.values())),
    "genes are mutated."
)

A total of 2028 genes are mutated.


In [759]:
# Hypermutators are strains with mutations in a mutX gene
hypermutators = [s for s, genes in strain_to_genes.items() if "mut" in set(g[:3] for g in genes)]
non_hypermutators = set(strain_to_genes) - set(hypermutators)

In [764]:
print(
    "There are",
    len(non_hypermutators),
    "non-mutator strains with a total of",
    len(set.union(*[v for k, v in strain_to_muts.items() if k in non_hypermutators])),
    "mutations affecting",
    len(set.union(*[v for k, v in strain_to_all_genes.items() if k in non_hypermutators])),
    "genes."
)

There are 192 non-mutator strains with a total of 911 mutations affecting 366 genes.


In [749]:
with open("../Data/Mutation_data/Strain_to_genes.json", "w") as outfile:
    json.dump({k: list(v) for k, v in strain_to_genes.items() if k in non_hypermutators}, outfile)
    
with open("../Data/Mutation_data/Strain_to_all_genes.json", "w") as outfile:
    json.dump({k: list(v) for k, v in strain_to_all_genes.items() if k in non_hypermutators}, outfile)
    
with open("../Data/Mutation_data/All_strains_to_gene_names.json", "w") as outfile:
    json.dump({k: list(v) for k, v in strain_to_genes.items()}, outfile)
    
with open("../Data/Mutation_data/All_strains_to_mutations.json", "w") as outfile:
    json.dump({k: list(v) for k, v in strain_to_muts.items()}, outfile)
    
with open("../Data/Mutation_data/All_mutated_genes.txt", "w") as outfile:
    mutated_genes = set.union(*[strain_to_genes[s] for s in non_hypermutators])
    outfile.write("\n".join(mutated_genes))

In [753]:
print("Number of mutations in each of the strains classified as a hypermutator:")
for s in sorted(hypermutators):
    print(s, len(strain_to_genes[s]), sep=" \t")

Number of mutations in each of the strains classified as a hypermutator:
12PD1-10 	134
12PD1-2 	146
12PD1-4 	112
12PD2-8 	157
12PD2-9 	158
12PD3-10 	147
12PD3-7 	134
12PD3-8 	179
12PD4-8 	93
12PD4-9 	94
12PD5-1 	131
12PD5-3 	131
12PD7-5 	109
12PD7-6 	193
12PD8-10 	154
12PD8-6 	151
12PD8-7 	136
23BD3-3 	280
23BD3-4 	241
23BD3-9 	263
ADIP5-2 	215
ADIP5-6 	181
HMDA4-2 	45
HMDA4-6 	47
HMDA4-9 	44
HMDA6-3 	85
HMDA6-7 	87
IBUA3-2 	42
OCTA6-5 	106
OCTA6-6 	107
OCTA6-7 	109
