# Parse growth rates

This notebook reads the output from croissance, extracts the relevant growth rates, and uses the plate layouts to associate them with the right strains.

In [32]:
from croissance_parsing import parse_plate_data # Local module in this directory
import pandas as pd
import xlrd
import json
import re

## Initial screens (Biolector)

In [21]:
# Root directory of the initial tolerance screen; the cells below read
# "raw Biolector data/" and "output/" underneath it.
data_dir = "../Data/Growth_data/initial_tolerance_tests/"
# Compound -> Biolector workbook that holds its wells. Some workbooks cover two
# compounds and therefore appear under two keys.
screen_exps = {
    'butanol': 'tolerance_012714_butanol_coumarate.xlsx',
    'glutarate': 'tolerance_021014_coumarate_glutarate.xlsx',
    'coumarate': 'tolerance_021014_coumarate_glutarate.xlsx',
    'putrescine': 'tolerance_021114_adipic-acid_putrescine.xlsx',
    'HMDA': 'tolerance_012814_HDMA_6-aminohexanoic.xlsx',  # the filename spells it "HDMA"
    'adipate': 'tolerance_031914_adipate_propionate.xlsx',
    'isobutyrate': 'tolerance_040214_2,5-furandicarboxylate_isobutyrate.xlsx',
    'hexanoate': 'tolerance_021214_2,3-butanediol_hexanoic-acid.xlsx',
    '2,3-butanediol': 'tolerance_021214_2,3-butanediol_hexanoic-acid.xlsx',
    '1,2-propanediol': 'tolerance_032914_1,2-pentanediol_1,2-propanediol.xlsx',
    'octanoate': 'tolerance_081914_octanoate_caffeate.xlsx',
}

In [22]:
def parse_content(string):
    """Split a well's media description into (concentration, unit, compound).

    A space is inserted before every "%" so that "2% x" tokenises like "2 % x".
    The first three whitespace-separated tokens are used; any further tokens
    are ignored.

    Returns:
        (float concentration, unit as written, third token as written)

    Raises:
        ValueError: fewer than three tokens (carrying the "%"-spaced text), or
            the first token is not a number.
        AssertionError: the unit is not one of g/l, v/v or % (case-insensitive).
    """
    spaced = string.replace("%", " %")
    tokens = spaced.split()
    if len(tokens) < 3:
        raise ValueError(spaced)
    amount, unit, comp = tokens[:3]
    # The unit is validated before the number is converted, as callers may rely on that order.
    assert unit.lower() in ("g/l", "v/v", "%"), unit
    return float(amount), unit, comp

# Build {compound: {well: (conc, unit, medium)}} from the "subtracted" sheet of
# each workbook: column 0 holds the well name, column 2 its media description.
# NOTE(review): xlrd >= 2.0 no longer reads .xlsx files, so this presumably
# needs xlrd < 2.0 -- confirm the pinned version.
layouts = {}
for comp, filename in screen_exps.items():
    wells = layouts[comp] = {}
    workbook = xlrd.open_workbook(data_dir + "raw Biolector data/" + filename)
    sheet = workbook.sheet_by_name("subtracted")
    # A well belongs to this compound when its description contains the first
    # four characters of the compound name.
    # NOTE(review): the HMDA workbook's filename is spelled "HDMA"; if the well
    # descriptions use that spelling too, nothing will match -- confirm.
    prefix = comp[:4]
    for well_cell, content_cell in zip(sheet.col_slice(0), sheet.col_slice(2)):
        well = well_cell.value
        content = content_cell.value
        if well == "":
            continue
        # Skip any cells containing "pentane" so that 1,2-propanediol is not
        # confused with 1,2-pentanediol (both start with "1,2-").
        if prefix not in content or "pentane" in content:
            continue
        wells[well] = parse_content(content)

In [23]:
# Collect one growth rate ("slope") per laid-out well from the croissance
# output that belongs to each screened compound.
growth_rates = {}
for comp, workbook_name in screen_exps.items():
    rates = growth_rates[comp] = {}
    # The output file shares the workbook's name up to its first ".".
    output_name = workbook_name.split(".")[0] + ".OD.v2.output.json"
    with open(data_dir + "output/" + output_name) as infile:
        curves = json.load(infile)["curves"]
    data = parse_plate_data(
        curves,
        phase_length_cutoff=3,
        max_abs_baseline=4,
        max_baseline_dev=4,
        time_cutoff=20,
        verbose=False
    )
    # Only wells present in the layout are kept; a well missing from the
    # parsed data raises KeyError.
    for well in layouts[comp]:
        rates[well] = data[well]['slope']

In [24]:
# Flatten layouts and growth rates into one record per well of the initial
# screen. The key order is kept as-is because it sets the TSV column order.
data = [
    {"compound": comp, "conc": conc,
     "growth_rate": growth_rates[comp][well],
     "medium": med, "unit": unit, "well": well}
    for comp, wells in layouts.items()
    for well, (conc, unit, med) in wells.items()
]
df = pd.DataFrame(data)
# `index` is documented as a bool: pass False explicitly rather than relying
# on None happening to be falsy. The row index is not written either way.
df.to_csv("../Data/Growth_data/initial_tolerance_tests/Initial_tolerance_data_frame.tsv", sep="\t", index=False)

## Evolved isolate screening (Biolector)

In [26]:
# Root directory of the evolved-isolate screen; rebinds the `data_dir` used by
# the initial-screen cells above.
data_dir = "../Data/Growth_data/evolved-isolate-growth-data/"
# Experiment name -> compound. The experiment name is also the base name of the
# layout workbook (+ ".xlsx") and of the croissance output (+ ".v2.output.json").
exp_list = {
    'ALE_1,2-propanediol_duplicates_121514': "1,2-propanediol",
    'ALE_2,3-butanediol_duplicates_122414': "2,3-butanediol",
    'ALE_adipate_duplicates_091014': "adipate",
    'ALE_butanol_duplicates_060514': "butanol",
    'ALE_coumarate_duplicates_061914': "coumarate",
    'ALE_glutarate_duplicates_060214': "glutarate",
    'ALE_HDMA_duplicates_080114': "HMDA",  # the experiment name spells it "HDMA"
    'ALE_hexanoate_duplicates_101314': "hexanoate",
    'ALE_isobutyrate_duplicates_091114': "isobutyrate",
    'ALE_octanoate_duplicates_122214': "octanoate",
    'ALE_putrescine_duplicates_073014': "putrescine",
}

In [27]:
# Build {experiment: {well: strain label}} from the "subtracted" sheet of each
# evolved-isolate workbook: column 0 holds the well name, column 2 the strain.
# This rebinds `layouts`, which the initial-screen cells keyed by compound.
layouts = {}
for exp in exp_list:
    layouts[exp] = {}
    workbook = xlrd.open_workbook(data_dir + "raw Biolector data/" + exp + ".xlsx")
    sheet = workbook.sheet_by_name("subtracted")
    for well_cell, strain_cell in zip(sheet.col_slice(0), sheet.col_slice(2)):
        well = well_cell.value
        # Rows without a well name carry no layout information.
        if well == "":
            continue
        layouts[exp][well] = strain_cell.value

In [28]:
# Growth rate ("slope") of every laid-out well, per evolved-isolate experiment.
# NOTE(review): the baseline thresholds here are 3 (the initial screen uses 4)
# and no time_cutoff is passed -- confirm the difference is intentional.
growth_rates = {}
for exp in exp_list:
    rates = growth_rates[exp] = {}
    with open(data_dir + "output/" + exp + ".v2.output.json") as f:
        curves = json.load(f)["curves"]
    data = parse_plate_data(
        curves,
        phase_length_cutoff=3,
        max_abs_baseline=3,
        max_baseline_dev=3,
        verbose=False
    )
    # A laid-out well that is missing from the parsed data raises KeyError.
    for well in layouts[exp]:
        rates[well] = data[well]["slope"]

In [29]:
def parse_strain(strain):
    """Split a layout label of the form "<strain>_<replicate>" into its parts.

    Labels starting with "MG1655" use "-" as the separator, so their hyphens
    are turned into underscores before splitting. Only the first underscore
    separates strain from replicate.

    Returns:
        (strain name, replicate number as int)

    Raises:
        ValueError: the label has no separator, or the replicate is not an integer.
    """
    label = strain.replace("-", "_") if strain.startswith("MG1655") else strain
    name, replicate = label.split("_", 1)
    return name, int(replicate)

# One record per well of the evolved-isolate screen. The key order is kept
# as-is because it sets the TSV column order.
data = []
for exp, comp in exp_list.items():
    for well, label in layouts[exp].items():
        strain, repl = parse_strain(label)
        # Normalise the "HDMA" spelling; replace() is a no-op when it is
        # absent, so no membership check is needed first.
        strain = strain.replace("HDMA", "HMDA")
        gr = growth_rates[exp][well]
        data.append(
            {"compound": comp, "strain": strain, "growth_rate": gr,
             "experiment": exp, "well": well, "repl": repl}
        )
df = pd.DataFrame(data)
# `index` is documented as a bool: pass False explicitly rather than relying
# on None happening to be falsy. The row index is not written either way.
df.to_csv("../Data/Growth_data/evolved-isolate-growth-data/Evolved_isolates_data_frame.tsv", sep="\t", index=False)

## Cross tolerance screening (Growth profiler)

In [30]:
# Root directory of the cross-tolerance screen; rebinds `data_dir` once more.
data_dir = "../Data/Growth_data/Cross_tolerance/"
# Presumably Growth Profiler tray number -> compound tested on that tray
# (TODO confirm); not referenced by any of the cells visible below.
tray_2_substrate = {1: "butanol",
    2: "glutarate",
    3: "coumarate",
    4: "2,3-butanediol",
    5: "putrescine",
    6: "HMDA",
    7: "adipate",
    8: "isobutyrate",
    9: "hexanoate",
    10: "octanoate",
    11: "1,2-propanediol",
    12: "NaCl"
}

In [34]:
# Build {plate name: {well: strain}} from the tab-separated layout file, whose
# lines hold three fields: "<date>_<experiment>", well, strain.
layout_filename = "../Data/Growth_data/Cross_tolerance/plate_layout.txt"
strain_layout = {}
with open(layout_filename) as infile:
    for line in infile:
        date_exp, well, strain = line.strip("\n").split("\t")
        # Drop "," and "-" (e.g. from 1,2-propanediol), then reverse the
        # "_"-separated parts so the name reads "<experiment>_<date>".
        parts = date_exp.replace(",", "").replace("-", "").split("_")
        date = "_".join(reversed(parts))
        # Use consistent naming
        date = date.replace("hexanoate_041715", "hexanoate_plate1_041715")
        date = date.replace("oddsnends", "odds-n-ends")
        strain = strain.replace("HDMA", "HMDA")  # Fix a typo
        strain_layout.setdefault(date, {})[well] = strain

In [60]:
# Load the current strain -> genes table, turning each gene list into a set.
with open("../Data/Mutation_data/Strain_to_genes.json") as infile:
    strain_to_genes = json.load(infile)
new_genotypes = {strain: set(genes) for strain, genes in strain_to_genes.items()}

In [61]:
# Load the earlier strain -> genes table for comparison with `new_genotypes`.
# NOTE(review): this path climbs two directories up, unlike the "../Data/..."
# paths used elsewhere in this notebook -- confirm the file is available.
with open("../../Data/Mutations/Strain_to_genes.json") as infile:
    strain_to_genes = json.load(infile)
old_genotypes = {strain: set(genes) for strain, genes in strain_to_genes.items()}

In [62]:
# Number of distinct genes across all strains in the new table.
len(set.union(*new_genotypes.values()))

401

In [63]:
# Number of distinct genes across all strains in the old table.
len(set.union(*old_genotypes.values()))

399

In [65]:
# Genes that occur somewhere in the new table but nowhere in the old one.
set.union(*new_genotypes.values()) - set.union(*old_genotypes.values())

{'rrfD', 'rrlD'}

In [39]:
# True only if both tables have the same strains with identical gene sets.
new_genotypes == old_genotypes

False

In [40]:
# Number of strains (keys) in the new table.
len(new_genotypes)

190

In [41]:
# Number of strains (keys) in the old table.
len(old_genotypes)

189

In [42]:
# Strains present in the new table but missing from the old one.
set(new_genotypes) - set(old_genotypes)

{'IBUA8-3'}

In [43]:
# Compare only the strains of the old table; raises KeyError if the new table
# lacks any of them.
{k: new_genotypes[k] for k in old_genotypes} == old_genotypes

False

In [47]:
# For every strain of the old table whose gene set changed, print the strain
# followed by its sorted new and sorted old genes, then a blank line.
for strain_id, old_genes in old_genotypes.items():
    new_genes = new_genotypes[strain_id]
    if set(new_genes) == set(old_genes):
        continue
    print(strain_id, sorted(new_genes), sorted(old_genes))
    print()

COUM3-10 ['atpI', 'dacA', 'hns', 'manY', 'pyrE', 'rho', 'rph', 'rpoB', 'tdk'] ['atpI', 'dacA', 'hns', 'manY', 'mprA', 'pyrE', 'rho', 'rph', 'rpoB', 'tdk', 'ygaH', 'ygaZ']

BUT7-7 ['manY', 'oppA', 'pyrE', 'rob', 'rph', 'ychE', 'yebO', 'yobF'] ['insH1', 'manY', 'mppA', 'pgrR', 'pyrE', 'rob', 'rph', 'yebO', 'ynaI', 'yobF']

OCTA5-8 ['gtrS', 'hns', 'oppA', 'recE', 'rpoC', 'tdk', 'ychE', 'ydcI', 'yihQ', 'yihR'] ['gtrS', 'hns', 'recE', 'rpoC', 'tdk', 'ydcI', 'yihQ', 'yihR']

GLUT1-10 ['gltW', 'greA', 'hofM', 'kgtP', 'oppA', 'proV', 'rrfG', 'rrlG', 'rrsG', 'spoT', 'ychE', 'yiaT', 'yiaU'] ['gltW', 'greA', 'hofM', 'kgtP', 'proV', 'rrfG', 'rrlG', 'rrsG', 'spoT', 'yiaT', 'yiaU']

GLUT4-10 ['csiD', 'kgtP', 'oppA', 'rpoC', 'spoT', 'ychE', 'ygaQ'] ['csiD', 'kgtP', 'rpoC', 'spoT', 'ygaQ']

23BD1-6 ['elfA', 'gabP', 'metJ', 'nanK', 'purT', 'rnb', 'rpoB'] ['barA', 'elfA', 'gabP', 'gudD', 'metJ', 'nanK', 'purT', 'relA', 'rlmD', 'rnb', 'rpoB']

HEXA8-2 ['cydA', 'mngB', 'murG', 'ompC', 'oppA', 'rcsD', 'rpo

In [None]:
# Unfinished scratch cell: the original `len()` call had no argument and would
# raise TypeError on Restart & Run All. Fill in the intended expression or delete the cell.