# Processing mutations

The variant calls from the BreSeq pipeline are parsed and analysed. All strains with _mutX_ mutations are excluded as they accumulate SNPs at a very high rate, masking interesting mutations.

Mutations that are present in all strains are also excluded, as they were most likely already present in the background strain prior to the evolution experiments.

In [143]:
import pandas as pd
import json

In [471]:
# Compound names; a later cell reads one variant-call table "<compound>.csv" per entry.
compounds = ["HMDA", "putrescine", "1,2-propanediol", "2,3-butanediol", "glutarate", "adipate", "hexanoate", "octanoate", "coumarate", "isobutyrate", "butanol"]

# Header-less strain lookup table (columns are labelled 0-4 by pandas).
# Judging from the rows displayed further down, column 0 is the compound number
# and column 4 the strain name; columns 1-3 are matched against the A/I/R numbers
# of a variant-call column header in strain_code_to_name.
mapping = pd.read_csv("../Data/Mutation_data/Variant_calls/mapping.csv", header=None)
# Compound number (column 0 of `mapping`) -> compound name.
# NOTE(review): there is no entry for number 4 - confirm this gap is intentional.
compound_mapping = {
    1: "butanol",
    2: "glutarate",
    3: "coumarate",
    5: "HMDA",
    6: "putrescine",
    7: "adipate",
    8: "isobutyrate",
    9: "hexanoate",
    10: "2,3-butanediol",
    11: "1,2-propanediol",
    12: "octanoate"
}
# Inverse lookup: compound name -> compound number.
rev_compound_mapping = {v: k for k, v in compound_mapping.items()}

# Strain-name code -> compound name (strain names look like "GLUT1-3" or "BUT4-9").
code_to_compound = {
    "BUT": "butanol",
    "GLUT": "glutarate",
    "COUM": "coumarate",
    "HMDA": "HMDA",
    "PUTR": "putrescine",
    "ADIP": "adipate",
    "IBUA": "isobutyrate",
    "HEXA": "hexanoate",
    "23BD": "2,3-butanediol",
    "12PD": "1,2-propanediol",
    "OCTA": "octanoate"
}
# Inverse lookup: compound name -> strain-name code.
comp_to_code = {v: k for k, v in code_to_compound.items()}

In [469]:
def mut_id(row):
    """Build a "<type>-<position>-<detail>" identifier for one variant call.

    ``row`` must provide "Mutation Type" and "Position" (a string; thousands
    separators are stripped) and, for SNP/DEL/INS/MOB calls, "Sequence Change".
    The detail part depends on the mutation type:

    * SNP: the last character of the sequence change.
    * DEL: the length of the parenthesised unit if there is one, otherwise the
      number between the first character and the last three characters.
    * INS: the parenthesised unit if there is one, otherwise everything after
      the first character.
    * MOB: the first field, a dash, and the third field without its first
      character (fields are separated by non-breaking spaces).
    * any other type: empty, so the identifier ends with a trailing dash.
    """
    kind = row["Mutation Type"]
    position = str(int(row["Position"].replace(",", "")))

    # Types without a dedicated rule never look at the sequence change.
    if kind not in ("SNP", "DEL", "INS", "MOB"):
        return "-".join((kind, position, ""))

    change = row["Sequence Change"]
    if kind == "SNP":
        detail = change[-1]
    elif kind == "MOB":
        parts = change.split("\xa0")
        detail = parts[0] + "-" + parts[2][1:]
    else:
        has_unit = "(" in change
        if kind == "DEL":
            if has_unit:
                detail = str(len(change.split(")")[0][1:]))
            else:
                detail = str(int(change[1:-3].replace(",", "")))
        else:  # INS
            detail = change.split(")")[0][1:] if has_unit else change[1:]
    return "-".join((kind, position, detail))

def strain_code_to_name(strain_code, compound):
    """Translate a variant-call column header into a strain name from ``mapping``.

    Parameters
    ----------
    strain_code : str
        Space-separated column header whose last four fields each consist of a
        one-character prefix followed by a value, e.g.
        "TOL coumarate A5 F50 I2 R1". The first, third and fourth of these
        values must be integers; the second one is ignored.
    compound : str
        Compound name; must be a key of ``rev_compound_mapping``.

    Returns
    -------
    The first non-null value in column 4 among the ``mapping`` rows whose
    column 0 equals the compound number and whose columns 1, 2, 3 equal the
    three integers taken from ``strain_code``.

    Raises
    ------
    KeyError
        If the compound is unknown or no ``mapping`` row matches.
    """
    compound_num = rev_compound_mapping[compound]
    fields = [field[1:] for field in strain_code.split(" ")[-4:]]
    code = (int(fields[0]), int(fields[2]), int(fields[3]))
    compound_rows = mapping[mapping[0] == compound_num]
    # The grouping keys must be a *list* of column labels: current pandas treats
    # a tuple as one single key instead of three separate columns.
    strain = compound_rows.groupby([1, 2, 3]).first().loc[code][4]
    return strain
    

In [574]:
# Collect, for every strain of every compound, the set of mutation ids whose
# call value in that strain's column equals 1.
strain_to_muts = {}
for comp in compounds:
    df = pd.read_csv("../Data/Mutation_data/Variant_calls/%s.csv" % comp)
    df["mut_id"] = df.apply(mut_id, axis=1)

    # Columns from index 10 onwards are treated as per-strain call columns;
    # the freshly appended "mut_id" column is not one of them.
    call_columns = (column for column in df.columns[10:] if column != "mut_id")
    for strain_code in call_columns:
        strain = strain_code_to_name(strain_code, comp)
        muts = set(df.loc[df[strain_code] == 1, "mut_id"])
        strain_to_muts[strain] = muts

In [497]:
strain_to_muts["GLUT4-4"]

KeyError: 'GLUT4-4'

In [496]:
mapping[mapping[0] == 2]

Unnamed: 0,0,1,2,3,4
20,2,1,1,1,GLUT1-3
21,2,1,2,1,GLUT1-9
22,2,1,3,1,GLUT1-10
23,2,2,1,1,GLUT2-1
24,2,2,2,1,GLUT2-9
25,2,2,3,1,GLUT2-10
26,2,2,3,2,GLUT2-10-rerun
27,2,3,1,1,GLUT3-5
28,2,3,2,1,GLUT3-7
29,2,3,2,2,GLUT3-7-rerun


In [577]:
# For each compound code, print the number of distinct mutations found in the
# new table and in the old table. A strain belongs to a code when the code
# occurs anywhere in the strain name.
for comp in code_to_compound:
    n_new = len(set.union(*(muts for name, muts in strain_to_muts.items() if comp in name)))
    n_old = len(set.union(*(muts for name, muts in old_strain_muts.items() if comp in name)))
    print(comp, n_new, n_old)

HEXA 54 66
IBUA 161 121
COUM 245 87
12PD 1710 1819
23BD 568 502
BUT 37 69
OCTA 197 222
GLUT 103 119
HMDA 262 266
ADIP 460 470
PUTR 84 99


In [549]:
len(set.union(*strain_to_muts.values()))

70

In [550]:
len(set.union(*old_strain_muts.values()))

119

In [536]:
len(strain_to_muts)

224

In [537]:
len(old_strain_muts)

224

In [480]:
# Sanity check: the third dash-separated field of every "DEL..." id must be an
# integer of at least 1.
for mut in set.union(*strain_to_muts.values()):
    if not mut.startswith("DEL"):
        continue
    deletion_length = int(mut.split("-")[2])
    assert deletion_length >= 1

In [558]:
# Reload the variant-call table of a single compound (selected by its code)
# for closer inspection in the following cells.
comp = "COUM"
df = pd.read_csv("../Data/Mutation_data/Variant_calls/%s.csv" % code_to_compound[comp])
# Add one identifier per row, computed by mut_id (axis 1 = row-wise).
df["mut_id"] = df.apply(mut_id, 1)

In [569]:
df[df["Mutation Type"] == "DUP"]

Unnamed: 0,Position,Mutation Type,Sequence Change,Gene,Function,Product,GO Process,GO Component,Protein change,TOL coumarate A0 F0 I1 R1,...,TOL coumarate A5 F50 I2 R1,TOL coumarate A5 F50 I3 R1,TOL coumarate A6 F50 I1 R1,TOL coumarate A6 F50 I2 R1,TOL coumarate A6 F50 I3 R1,TOL coumarate A7 F50 I1 R1,TOL coumarate A7 F50 I2 R1,TOL coumarate A8 F50 I1 R1,TOL coumarate A8 F50 I2 R1,mut_id
0,0,DUP,575 bp x2.0,"thrL, [thrA]",,,,,Duplication,,...,,,,,1.0,,,,,DUP-0-
2,4261,DUP,"2,144 bp x1.9","[thrC], yaaX, [yaaA]",,,,,Duplication,,...,,,,,1.0,,,,,DUP-4261-
3,7819,DUP,"7,180 bp x1.9","[yaaJ], talB, mog, satP, yaaW, yaaI, dna...",,,,,Duplication,,...,,,,,1.0,,,,,DUP-7819-
4,16964,DUP,"5,102 bp x2.1","nhaA, nhaR, insB1, insA, rpsT, yaaY, [ribF]",,,,,Duplication,,...,,,,,1.0,,,,,DUP-16964-
5,19520,DUP,"1,032 bp x2.7","[nhaR], insB1, insA",,,,,Duplication,,...,,,,1.0,,,,,,DUP-19520-
6,19809,DUP,729 bp x2.9,"insB1, insA",,,,,Duplication,,...,,,,,,,,,,DUP-19809-
7,27107,DUP,"2,626 bp x1.9","[ispH], rihC, dapB, [carA]",,,,,Duplication,,...,,,,,1.0,,,,,DUP-27107-
8,31546,DUP,"12,593 bp x1.9","[carB], caiF, caiE, caiD, caiC, caiB, ca...",,,,,Duplication,,...,,,,,1.0,,,,,DUP-31546-
9,46951,DUP,"4,223 bp x1.8","[yaaU], kefF, kefC, folA, [apaH]",,,,,Duplication,,...,,,,,1.0,,,,,DUP-46951-
10,53132,DUP,"7,937 bp x1.8","[pdxA], surA, lptD, djlA, yabP, rluA, [r...",,,,,Duplication,,...,,,,,1.0,,,,,DUP-53132-


In [560]:
# Rebuild strain_to_muts from the currently loaded table only, leaving out
# strains whose name contains "rerun".
strain_to_muts = {}
call_columns = (column for column in df.columns[10:] if column != "mut_id")
for strain_code in call_columns:
    strain = strain_code_to_name(strain_code, code_to_compound[comp])
    if "rerun" in strain:
        continue
    # Keep the ids of all rows whose call value for this strain equals 1.
    muts = set(df.loc[df[strain_code] == 1, "mut_id"])
    strain_to_muts[strain] = muts

In [561]:
def sanitize_mut(mut):
    """Rewrite an "MCDEL-<start>-<end>" id as "DEL-<start>-<length>".

    The length is ``end - start + 1``, i.e. both end points are counted.
    Ids that do not start with "MCDEL" are returned unchanged.
    """
    if not mut.startswith("MCDEL"):
        return mut
    _, start, end = mut.split("-")
    length = int(end) - int(start) + 1
    return "-".join(("DEL", start, str(length)))

In [576]:
# Load the previous ("v2") strain -> mutations table. Each line is tab
# separated: the strain name first, then its mutation ids, which are passed
# through sanitize_mut.
old_strain_muts = {}
with open("../Data/Mutation_data/Strain_mutations_v2.txt") as infile:
    for line in infile:
        strain, *muts = line.strip().split("\t")
        old_strain_muts[strain] = {sanitize_mut(mut) for mut in muts}

# Mutations shared by every strain of the old table.
common_muts = set.intersection(*old_strain_muts.values())
# Optional steps, currently disabled: drop the shared mutations and restrict
# the table to the strains of the current compound.
#for strain in list(old_strain_muts):
#    old_strain_muts[strain] = old_strain_muts[strain] - common_muts
#old_strain_muts = {k: v for k, v in old_strain_muts.items() if k.startswith(comp)}

In [564]:
set.intersection(*old_strain_muts.values())

{'DEL-1978503-776',
 'DEL-2173361-2',
 'DEL-257908-776',
 'INS-3560455-G',
 'INS-4296380-CG'}

In [565]:
def permute_dict(di):
    """Return a new dict with the keys of ``di`` and its values randomly reassigned.

    The key order is kept; the values are shuffled among the keys. ``di``
    itself is not modified. An empty dict yields an empty dict.
    """
    # `random` is not imported in the visible import cell, so import it here
    # to keep the function usable on its own.
    import random

    shuffled_values = list(di.values())
    random.shuffle(shuffled_values)
    return dict(zip(di, shuffled_values))

#strain_to_muts = permute_dict(strain_to_muts)

In [567]:
# Print a matrix with one row per new strain and one column per old strain.
# Each cell is the number of new calls of the row strain minus the number of
# old calls of the column strain; "|" marks the diagonal.
# Alternative cell values that were tried:
# print("%2d" % (100*len(strain_to_muts[strain1] & old_strain_muts[strain2]) / len(strain_to_muts[strain1] | old_strain_muts[strain2])), end=" ")
# print("%2d" % (len(strain_to_muts[strain1] & old_strain_muts[strain2])), end=" ")
strain_names = list(strain_to_muts)
for strain1 in strain_names:
    n_new = len(strain_to_muts[strain1])
    for strain2 in strain_names:
        if strain2 == strain1:
            print("|", end="")
        difference = n_new - len(old_strain_muts[strain2])
        print("%2d" % difference, end=" ")
    print("")

|-8 -4 -3 -8 -7 -8 -5 -5 -3 -4 -5 -5 -4 -4 -5 -6 -7 -6 -6 -8 
-10 |-6 -5 -10 -9 -10 -7 -7 -5 -6 -7 -7 -6 -6 -7 -8 -9 -8 -8 -10 
-8 -4 |-3 -8 -7 -8 -5 -5 -3 -4 -5 -5 -4 -4 -5 -6 -7 -6 -6 -8 
-7 -3 -2 |-7 -6 -7 -4 -4 -2 -3 -4 -4 -3 -3 -4 -5 -6 -5 -5 -7 
41 45 46 41 |42 41 44 44 46 45 44 44 45 45 44 43 42 43 43 41 
-6 -2 -1 -6 -5 |-6 -3 -3 -1 -2 -3 -3 -2 -2 -3 -4 -5 -4 -4 -6 
-9 -5 -4 -9 -8 -9 |-6 -6 -4 -5 -6 -6 -5 -5 -6 -7 -8 -7 -7 -9 
-8 -4 -3 -8 -7 -8 -5 |-5 -3 -4 -5 -5 -4 -4 -5 -6 -7 -6 -6 -8 
-10 -6 -5 -10 -9 -10 -7 -7 |-5 -6 -7 -7 -6 -6 -7 -8 -9 -8 -8 -10 
-10 -6 -5 -10 -9 -10 -7 -7 -5 |-6 -7 -7 -6 -6 -7 -8 -9 -8 -8 -10 
-10 -6 -5 -10 -9 -10 -7 -7 -5 -6 |-7 -7 -6 -6 -7 -8 -9 -8 -8 -10 
-7 -3 -2 -7 -6 -7 -4 -4 -2 -3 -4 |-4 -3 -3 -4 -5 -6 -5 -5 -7 
-10 -6 -5 -10 -9 -10 -7 -7 -5 -6 -7 -7 |-6 -6 -7 -8 -9 -8 -8 -10 
120 124 125 120 121 120 123 123 125 124 123 123 124 |124 123 122 121 122 122 120 
-7 -3 -2 -7 -6 -7 -4 -4 -2 -3 -4 -4 -3 -3 |-4 -5 -6 -5 -5 -7 
-8 -4 -3 -8 -7 -8 -5 -5 -3 -4 

In [568]:
# For every strain print three sorted groups of mutation ids: those in both
# tables, those only in the new table, and those only in the old table.
for strain in strain_to_muts:
    print(strain)
    new_calls = set(strain_to_muts[strain])
    old_calls = set(old_strain_muts[strain])
    print("   ", "\n    ".join(sorted(new_calls & old_calls)))
    print("")
    print("     ", "\n      ".join(sorted(new_calls - old_calls)))
    print("")
    print("     ", "\n      ".join(sorted(old_calls - new_calls)))

COUM3-9
    DEL-2810080-1165
    MOB-1293196-IS5-4
    SNP-1903497-C
    SNP-3922483-A
    SNP-3966727-T
    SNP-4183802-G
    SNP-663746-T

      DEL-3815810-1

      DEL-1299499-1199
      DEL-1978503-776
      DEL-2173361-2
      DEL-257908-776
      DEL-3423705-895
      DEL-3815809-1
      DEL-780632-982
      INS-3560455-G
      INS-4296380-CG
COUM5-5
    MOB-2784452-IS5-4
    SNP-1352163-A
    SNP-3966751-T
    SNP-4185540-T
    SNP-667158-T

      DEL-4628214-1

      DEL-1978503-776
      DEL-2173361-2
      DEL-257908-776
      DEL-3423495-3270
      DEL-4628212-1
      INS-3560455-G
      INS-4296380-CG
COUM4-2
    SNP-2141832-T
    SNP-3440924-G
    SNP-3473615-T
    SNP-4375431-C
    SNP-4627567-T

      DUP-3316547-
      DUP-3361691-
      DUP-3367122-

      DEL-1978503-776
      DEL-2173361-2
      DEL-257908-776
      DEL-3423726-828
      INS-3560455-G
      INS-4296380-CG
COUM5-8
    MOB-2784452-IS5-4
    SNP-1352163-A
    SNP-1768309-T
    SNP-3966751-T
    SNP-418

In [278]:
df["mut_id"]

0         SNP-7618-G
1        SNP-11664-C
2         DUP-19817-
3         DUP-19821-
4         DUP-19836-
5         DUP-19838-
6         DUP-19839-
7        SNP-52332-C
8        SNP-67483-T
9        SNP-82372-C
10       SNP-83066-C
11      SNP-111300-C
12      SNP-112052-G
13      SNP-146566-A
14      SNP-156869-G
15      SNP-206330-G
16      SNP-257250-C
17      SNP-301521-C
18      SNP-358399-G
19      DEL-367852-2
20      INS-380022-G
21      SNP-403627-G
22      SNP-430137-G
23      SNP-436419-C
24      SNP-457616-G
25      SNP-473636-G
26      SNP-511572-C
27      DEL-569541-1
28      SNP-587023-C
29      SNP-622725-C
           ...      
232    SNP-4128549-C
233    SNP-4133696-C
234    SNP-4143217-C
235    SNP-4181706-T
236    SNP-4181786-T
237    SNP-4185708-C
238    SNP-4188767-T
239    SNP-4256722-C
240    SNP-4257602-T
241    SNP-4272717-C
242    SNP-4290993-C
243    SNP-4304847-G
244    SNP-4321045-C
245    SNP-4369961-C
246    SNP-4372620-C
247    SNP-4378331-G
248    DEL-44

In [64]:
df.columns

Index(['Position', 'Mutation Type', 'Sequence Change', 'Gene', 'Function',
       'Product', 'GO Process', 'GO Component', 'Protein change',
       'TOL 20C nbutanol A0 F0 I1 R1', 'TOL 20C nbutanol A1 F50 I1 R1',
       'TOL 20C nbutanol A1 F50 I2 R1', 'TOL 20C nbutanol A1 F50 I3 R1',
       'TOL 20C nbutanol A2 F50 I1 R1', 'TOL 20C nbutanol A3 F50 I1 R1',
       'TOL 20C nbutanol A3 F50 I2 R1', 'TOL 20C nbutanol A3 F50 I3 R1',
       'TOL 20C nbutanol A4 F50 I1 R1', 'TOL 20C nbutanol A4 F50 I2 R1',
       'TOL 20C nbutanol A4 F50 I3 R1', 'TOL 20C nbutanol A5 F50 I1 R1',
       'TOL 20C nbutanol A5 F50 I2 R1', 'TOL 20C nbutanol A6 F50 I1 R1',
       'TOL 20C nbutanol A6 F50 I2 R1', 'TOL 20C nbutanol A6 F50 I3 R1',
       'TOL 20C nbutanol A7 F50 I1 R1', 'TOL 20C nbutanol A7 F50 I2 R1',
       'TOL 20C nbutanol A7 F50 I3 R1', 'TOL 20C nbutanol A8 F50 I1 R1',
       'TOL 20C nbutanol A8 F50 I2 R1'],
      dtype='object')

In [63]:
mapping[mapping[0] == 1]

Unnamed: 0,0,1,2,3,4
0,1,1,1,1,BUT1-2
1,1,1,2,1,BUT1-3
2,1,1,3,1,BUT1-5
3,1,2,1,1,BUT2-9
4,1,3,1,1,BUT3-3
5,1,3,2,1,BUT3-6
6,1,3,3,1,BUT3-7
7,1,4,1,1,BUT4-4
8,1,4,2,1,BUT4-7
9,1,4,3,1,BUT4-9


In [46]:
len(mapping[4])

233

In [207]:
breseq_file = "../Data/Mutation_data/Strain_mutations_v1.txt"

# Parse the breseq file into a {strain: set(mutations)} dictionary. Every line
# is tab separated: the strain name first, followed by its mutation ids.
with open(breseq_file) as infile:
    records = (line.strip().split("\t") for line in infile)
    strains = {fields[0]: set(fields[1:]) for fields in records}

all_mutations = set.union(*strains.values())
print(len(strains), "strains with a total of", len(all_mutations), "mutations")

242 strains with a total of 5991 mutations


In [208]:
# Number of strains carrying each mutation, most frequent first.
presence = pd.DataFrame({s: dict.fromkeys(muts, 1) for s, muts in strains.items()})
mutation_counts = presence.fillna(0).sum(axis=1).astype("int").sort_values(ascending=False)

# Mutations found in more than 230 strains are regarded as common.
common_mutations = {mut for mut, count in mutation_counts.items() if count > 230}
print(len(common_mutations), "mutations are present in more than 230 strains")
remaining_counts = mutation_counts[~mutation_counts.index.isin(common_mutations)]
print(
    "The most common of the remaining mutations is present in",
    remaining_counts.max()
)

# Remove the common mutations from the strain genotypes, together with
# "DEL-1299499-1199", which seemed like an erroneous call.
excluded_mutations = common_mutations | {"DEL-1299499-1199"}
strains = {strain: muts - excluded_mutations for strain, muts in strains.items()}

7 mutations are present in more than 230 strains
The most common of the remaining mutations is present in 67


In [210]:
# The strain 12PD6-9 had many low-coverage regions, most likely due to
# sequencing issues, so its missing-coverage (MCDEL) calls are removed.
# A plain set is stored, like for every other strain: with a frozenset here,
# `set.union(*strains.values())` raises TypeError whenever this strain happens
# to be the first value.
strains["12PD6-9"] = {mut for mut in strains["12PD6-9"] if not mut.startswith("MCDEL")}

In [221]:
# Write the strain -> mutations table as JSON; each set is converted to a list
# because json cannot serialise sets.
with open("../Data/Mutation_data/All_strains_to_mutations.json", "w") as outfile:
    serialisable = {}
    for strain_name, strain_mutations in strains.items():
        serialisable[strain_name] = list(strain_mutations)
    json.dump(serialisable, outfile)

In [211]:
# Load the mutation -> affected genes mapping.
# Mutations are mapped to any genes whose CDS contain the mutation.
# Intergenic mutations are mapped to the closest neighbouring gene that the
# mutation is upstream of.
with open("../Data/Mutation_data/Mutations_to_gene_names.json") as infile:
    raw_gene_mapping = json.load(infile)

# The JSON file stores the genes as lists; turn them into sets.
mutations_to_gene_names = {}
for mut, genes in raw_gene_mapping.items():
    mutations_to_gene_names[mut] = set(genes)

In [213]:
# Build a {strain: mutated genes} dictionary.
# set().union(...) is used instead of set.union(...) so that a strain without
# any remaining mutation gets an empty gene set instead of raising TypeError.
strain_to_gene_names = {
    strain: set().union(*(mutations_to_gene_names[mut] for mut in muts))
    for strain, muts in strains.items()
}

# Count how many strains have mutations in each gene
gene_counts = pd.DataFrame(
    {s: {gene: 1 for gene in genes} for s, genes in strain_to_gene_names.items()}
).fillna(0).sum(1).astype("int").sort_values(ascending=False)

# Genes mutated in (almost) all strains, here more than 240 of them, were most
# likely already mutated in the background strain.
common_gene_mutations = set(gene_counts[gene_counts > 240].index)

# Remove those genes from every strain
strain_to_gene_names = {strain: genes - common_gene_mutations for strain, genes in strain_to_gene_names.items()}
print(len(set().union(*strain_to_gene_names.values())), "total genes are mutated")
print("Most commonly mutated gene is mutated in", gene_counts[~gene_counts.index.isin(common_gene_mutations)].max(), "strains")

# Save the table; sets are converted to lists because json cannot serialise sets.
with open("../Data/Mutation_data/All_strains_to_gene_names.json", "w") as outfile:
    json.dump({strain: list(genes) for strain, genes in strain_to_gene_names.items()}, outfile)

2811 total genes are mutated
Most commonly mutated gene is mutated in 88 strains


In [218]:
# A strain is flagged as a hypermutator when at least one of its mutated genes
# has a name starting with "mut".
hypermutators = {
    strain
    for strain, gens in strain_to_gene_names.items()
    if any(gen.startswith("mut") for gen in gens)
}
non_hyper_mutators = set(strains) - hypermutators
print(len(non_hyper_mutators), "strains are not hypermutators.")

190 strains are not hypermutators.


In [219]:
# Write the mutated genes of the non-hypermutator strains to a JSON file.
with open("../Data/Mutation_data/Strain_to_genes.json", "w") as outfile:
    genes_by_strain = {}
    for strain in non_hyper_mutators:
        genes_by_strain[strain] = list(strain_to_gene_names[strain])
    json.dump(genes_by_strain, outfile)

In [None]:
# This should be moved to the mutation histogram plot notebook

def overlaps(start1, end1, start2, end2):
    """Tell whether the intervals [start1, end1] and [start2, end2] overlap.

    Both end points are inclusive, so intervals that merely touch
    (e.g. 1-5 and 5-10) count as overlapping.
    """
    return bool(start1 <= end2 and start2 <= end1)
    
# Drop every MCDEL call that overlaps a DEL call of the same strain, as the two
# are treated as duplicate calls.
# "DEL-<start>-<length>" spans start .. start + length;
# "MCDEL-<start>-<end>" spans start .. end.
# NOTE(review): sanitize_mut computes a length as end - start + 1, which would
# put the last deleted base at start + length - 1 - confirm the extra base
# used here is intended.
mcdels_to_remove = {}
for strain, muts in strains.items():
    del_spans = []
    mcdel_spans = {}
    for mut in muts:
        if mut.startswith("DEL"):
            fields = mut.split("-")
            start = int(fields[1])
            del_spans.append((start, start + int(fields[2])))
        elif mut.startswith("MCDEL"):
            fields = mut.split("-")
            mcdel_spans[mut] = (int(fields[1]), int(fields[2]))

    mcdels_to_remove[strain] = {
        mcdel
        for mcdel, (start1, end1) in mcdel_spans.items()
        if any(overlaps(start1, end1, start2, end2) for start2, end2 in del_spans)
    }

strains = {strain: muts - mcdels_to_remove[strain] for strain, muts in strains.items()}
removed_mcdels = set.union(*mcdels_to_remove.values())
print("Removing", len(removed_mcdels), "MCDEL mutations")
print(len(set.union(*strains.values())), "unique mutations")