# Parse growth rates

This notebook reads the output from croissance and extracts the relevant growth rates and associates them with the right strains using the plate layouts.

In [1]:
from croissance_parsing import parse_plate_data # Local module in this directory
import pandas as pd
import xlrd
import json
import re
import numpy as np
import os

In [2]:
cols = ["1", "2", "3", "4", "5", "6", "7", "8", "9", "10", "11", "12"]
rows = ["A", "B", "C", "D", "E", "F", "G", "H"]
wells_96 = [str(row)+col for row in rows for col in cols]

## Initial screens (Biolector)

In [3]:
data_dir = "../Data/Growth_data/initial_tolerance_tests/"
screen_exps = {
    'butanol': 'tolerance_012714_butanol_coumarate.xlsx',
    'glutarate': 'tolerance_021014_coumarate_glutarate.xlsx',
    'coumarate': 'tolerance_021014_coumarate_glutarate.xlsx',
    'putrescine': 'tolerance_021114_adipic-acid_putrescine.xlsx',
    'HMDA': 'tolerance_012814_HDMA_6-aminohexanoic.xlsx',
    'adipate': 'tolerance_031914_adipate_propionate.xlsx',
    'isobutyrate': 'tolerance_040214_2,5-furandicarboxylate_isobutyrate.xlsx',
    'hexanoate': 'tolerance_021214_2,3-butanediol_hexanoic-acid.xlsx',
    '2,3-butanediol': 'tolerance_021214_2,3-butanediol_hexanoic-acid.xlsx',
    '1,2-propanediol': 'tolerance_032914_1,2-pentanediol_1,2-propanediol.xlsx',
    'octanoate': 'tolerance_081914_octanoate_caffeate.xlsx',
}

In [4]:
def parse_content(string):
    """Parse the media content of a well in the excel layout file"""
    string = string.replace("%", " %")
    try:
        conc, unit, comp, *junk = string.split()
    except ValueError:
        raise ValueError(string)
    assert unit.lower() in ("g/l", "v/v", "%"), unit
    conc = float(conc)
    return conc, unit, comp

# Read the excel files
layouts = {}
for comp, filename in screen_exps.items():
    layouts[comp] = {}
    workbook = xlrd.open_workbook(data_dir+"raw Biolector data/"+filename)
    sheet = workbook.sheet_by_name("subtracted")
    for well, content in zip(sheet.col_slice(0), sheet.col_slice(2)):
        well, content = well.value, content.value
        if well == "":
            continue
        # Skip any cells containing "pentane" to not confuse 12-propanediol with 12-pentanediol
        if comp[:4] in content and "pentane" not in content: 
            layouts[comp][well] = parse_content(content)

In [5]:
" ".join(map(str, layouts["glutarate"]["D02"]))

'10.0 g/L glutarate'

In [6]:
# Compile growth curves
for comp, exp in screen_exps.items():
    exp = exp.split(".")[0]
    growth_data = pd.read_csv(data_dir + "output/" + exp + ".OD.v2.tsv", sep="\t", index_col=0)
    for well in list(growth_data):
        ser = growth_data[well]
        if well in layouts[comp]:
            strain = " ".join(map(str, layouts[comp][well]))
            growth_data[strain] = ser
        del growth_data[well]
    growth_data.to_csv(data_dir + "/curves/" + exp + ".tsv", sep="\t")

In [7]:
# Parse the growth phases from the croissance output
growth_rates = {}
for comp, name in screen_exps.items():
    growth_rates[comp] = {}
    name = name.split(".")[0] + ".OD.v2.output.json"
    with open(data_dir + "output/" + name) as infile:
        json_data = json.load(infile)
        curves = json_data["curves"]
    data = parse_plate_data(
        curves,
        phase_length_cutoff=3,
        max_abs_baseline=4,
        max_baseline_dev=4,
        time_cutoff=20,
        verbose=False
    )
    for well in layouts[comp]:
        growth_rates[comp][well] = data[well]['slope']

In [8]:
data = []
for comp in layouts:
    for well, (conc, unit, med) in layouts[comp].items():
        data.append(
            {"compound": comp, "conc": conc,
             "growth_rate": growth_rates[comp][well],
             "medium": med, "unit": unit, "well": well}
        )
df = pd.DataFrame(data)
df.to_csv("../Data/Growth_data/initial_tolerance_tests/Initial_tolerance_data_frame.tsv", sep="\t", index=None)

## Evolved isolate screening (Biolector)

In [9]:
data_dir = "../Data/Growth_data/evolved-isolate-growth-data/"
exp_list = {
    'ALE_1,2-propanediol_duplicates_121514': "1,2-propanediol",
    'ALE_2,3-butanediol_duplicates_122414': "2,3-butanediol",
    'ALE_adipate_duplicates_091014': "adipate",
    'ALE_butanol_duplicates_060514': "butanol",
    'ALE_coumarate_duplicates_061914': "coumarate",
    'ALE_glutarate_duplicates_060214': "glutarate",
    'ALE_HDMA_duplicates_080114': "HMDA",
    'ALE_hexanoate_duplicates_101314': "hexanoate",
    'ALE_isobutyrate_duplicates_091114': "isobutyrate",
    'ALE_octanoate_duplicates_122214': "octanoate",
    'ALE_putrescine_duplicates_073014': "putrescine",
}

In [10]:
layouts = {}
for exp, comp in exp_list.items():
    date = exp.split("_")[-1]
    excel_name = exp + ".xlsx"
    layouts[exp] = {}
    workbook = xlrd.open_workbook(data_dir + "raw Biolector data/" + excel_name)
    sheet = workbook.sheet_by_name("subtracted")
    for well, strain in zip(sheet.col_slice(0), sheet.col_slice(2)):
        well, strain = well.value, strain.value
        if well == "":
            continue
        layouts[exp][well] = strain

In [11]:
# Compile growth curves
for exp in exp_list:
    growth_data = pd.read_csv(data_dir + "/output/" + exp + ".v2.tsv", sep="\t", index_col=0)
    for well in list(growth_data):
        ser = growth_data[well]
        if well in layouts[exp]:
            strain = layouts[exp][well]
            growth_data[strain] = ser
        del growth_data[well]
    growth_data.to_csv(data_dir + "/curves/" + exp + ".tsv", sep="\t")

In [12]:
# Parse growth rates
growth_rates = {}
for exp in exp_list:
    growth_rates[exp] = {}
    with open(data_dir + "output/" + exp + ".v2.output.json") as f:
        json_data = json.load(f)
    curves = json_data["curves"]
    data = parse_plate_data(
        curves,
        phase_length_cutoff=3,
        max_abs_baseline=3,
        max_baseline_dev=3,
        verbose=False
    )
    for well in layouts[exp]:
        growth_rates[exp][well] = data[well]["slope"]

In [13]:
def parse_strain(strain):
    if strain.startswith("MG1655"):
        strain = strain.replace("-", "_")
    strain, repl = strain.split("_", 1)
    repl = int(repl)
    return strain, repl

data = []
for exp, comp in exp_list.items():
    for well, strain in layouts[exp].items():
        strain, repl = parse_strain(strain)
        if "HDMA" in strain:
            strain = strain.replace("HDMA", "HMDA")
        gr = growth_rates[exp][well]
        data.append(
            {"compound": comp, "strain": strain, "growth_rate": gr,
             "experiment": exp, "well": well, "repl": repl}
        )
df = pd.DataFrame(data)
df.to_csv("../Data/Growth_data/evolved-isolate-growth-data/Evolved_isolates_data_frame.tsv", sep="\t", index=None)

## Cross tolerance screening (Growth profiler)

In [40]:
data_dir = "../Data/Growth_data/Cross_tolerance/"
tray_2_substrate = {1: "butanol",
    2: "glutarate",
    3: "coumarate",
    4: "2,3-butanediol",
    5: "putrescine",
    6: "HMDA",
    7: "adipate",
    8: "isobutyrate",
    9: "hexanoate",
    10: "octanoate",
    11: "1,2-propanediol",
    12: "NaCl"
}

In [41]:
layout_filename = "../Data/Growth_data/Cross_tolerance/plate_layout.txt"
strain_layout = {}
with open(layout_filename) as infile:
    for line in infile:
        date_exp, well, strain = line.strip("\n").split("\t")
        date_exp = "".join(date_exp.split(",")) # remove , from e.g. 1,2-propanediol
        date_exp = "".join(date_exp.split("-")) # remove - from e.g. 1,2-propanediol
        date = "_".join(date_exp.split("_")[::-1])
        if "hexanoate_041715" in date:
            date = date.replace("hexanoate_041715", "hexanoate_plate1_041715") # Use consistent naming
        if "oddsnends" in date:
            date = date.replace("oddsnends", "odds-n-ends") # Use consistent naming
        strain = re.sub(r"HDMA", "HMDA", strain) # Fix a typo
        strain_layout.setdefault(date, {})[well] = strain

In [42]:
exp_list = list(strain_layout.keys())
exp_list.remove("hexanoate_022815")

tray_numbers = list(range(1,13))

In [43]:
strain_2_compound = {
    "HDMA": "HMDA",
    "HMDA": "HMDA",
    "ADIP": "adipate",
    "HEXA": "hexanoate",
    "IBUA": "isobutyrate",
    "OCTA": "octanoate",
    "23BD": "2,3-butanediol",
    "12PD": "1,2-propanediol",
    "COUM": "coumarate",
    "PUTR": "putrescine",
    "BUT": "butanol",
    "GLUT": "glutarate"
}

def parse_strain(strain_string):
    strain, *repl = strain_string.split("_")
    if len(repl) == 1:
        repl = repl[0]
    elif len(repl) == 0:
        repl = 0
    else:
        raise ValueError("_".join([strain]+repl))
    for key, compound in strain_2_compound.items():
        if strain.startswith(key):
            evolved_substrate = compound
            break
    else:
        evolved_substrate = "NA"
        if strain.startswith("MG1655"):
            strain = "MG1655"
    return strain, repl, evolved_substrate

In [44]:
# Parse layouts for plates that had another layout than the rest
def read_plate_layout(sheet):
    layout = {}
    for i, row in enumerate(("A", "B", "C", "D", "E", "F", "G", "H")):
        for j in range(1, 13):
            well = row+str(j)
            cell = sheet.cell_value(i+1, j)
            if cell:
                layout[well] = cell
    return layout

odds_tray2_layout = read_plate_layout(xlrd.open_workbook("../Data/Growth_data/Cross_tolerance/odds_glutarate_non_standard.xlsx").sheet_by_index(0))
odds_tray2_media = {well: "butanol" for well in odds_tray2_layout if well[1:] in ("8", "9")}

odds_tray12_layout = read_plate_layout(xlrd.open_workbook("../Data/Growth_data/Cross_tolerance/odds_layout_nacl_non_standard.xlsx").sheet_by_index(0))
odds_tray12_media = {well: "NaCl" for well in odds_tray12_layout if int(well[1:]) <= 7}
odds_tray12_media.update(
    {well: "butanol" for well in odds_tray12_layout if int(well[1:]) >= 8}
)

In [45]:
# Output cleaned strain layout
with open("../Data/Growth_data/Cross_tolerance/Cleaned_layouts.txt", "w") as outfile:
    print("Experiment", "Tray", "Well", "Strain", "Medium_addition", sep="\t", file=outfile)
    for exp, layout in strain_layout.items():
        for tray_number in range(1, 13):
            grown_substrate = tray_2_substrate[tray_number]
            media_dict = {}
            if exp == "odds-n-ends_042415":
                if tray_number == 1:
                    layout = strain_layout["glutarate_110314"]
                elif tray_number == 2:
                    layout = odds_tray2_layout
                    media_dict = odds_tray2_media
                elif tray_number == 12:
                    layout = odds_tray12_layout
                    media_dict = odds_tray12_media
            for well, strain in layout.items():
                med = media_dict.get(well, grown_substrate)
                print(exp, tray_number, well, strain, med, sep="\t", file=outfile)

In [46]:
exp_list

['hexanoate_plate1_041715',
 'coumarate_111114',
 'odds-n-ends_042415',
 'adipate_012415',
 '23butanediol_041415',
 'putrescine_111914',
 'octanoate_030415',
 'glutarate_110314',
 '12propanediol_042215',
 'butanol_110714',
 'hmda_012215',
 'isobutyrate_030215']

In [49]:
cross_tolerance_time_cutoff = 40

output_datafile = "../Data/Growth_data/Cross_tolerance/Extracted_growth_rates_NO_CUT.csv"
all_data = []
for exp_name in exp_list:
    print(exp_name)
    for tray_number in tray_numbers:
        if exp_name == "glutarate_110314" and tray_number == 6:
            # This plate was bad and was rerun in odds-n-ends instead
            print("Skipping GLUT on", tray_2_substrate[tray_number])
            continue
        if exp_name == "hexanoate_plate1_041715" and tray_number in [3, 6, 9, 12]:
            # These plates were bad and have been rerun
            print("Skipping HEXA on", tray_2_substrate[tray_number])
            continue
        grown_substrate = tray_2_substrate[tray_number]
        path = data_dir+"cross-compound_"+exp_name+"_tray"+str(tray_number)+".OD.v2.output.json"
        with open(path) as infile:
            plate_data = json.load(infile)["curves"]
        plate_res = parse_plate_data(plate_data, time_cutoff=cross_tolerance_time_cutoff)
        
        # Handle special cases of plate layouts
        media_dict = {}
        if exp_name == "odds-n-ends_042415":
            if tray_number == 1:
                layout = strain_layout["glutarate_110314"]
                grown_substrate = "HMDA"
            elif tray_number == 2:
                layout = odds_tray2_layout
                media_dict = odds_tray2_media
            elif tray_number == 12:
                layout = odds_tray12_layout
                media_dict = odds_tray12_media
            else:
                layout = strain_layout[exp_name]
        else:
            layout = strain_layout[exp_name]
        
        for well in wells_96:
            well_res = plate_res[well]
            slope, intercept, baseline = well_res["slope"], well_res["intercept"], well_res["baseline"]
            tod1 = well_res["tod1"]
            if pd.isnull(tod1):
                slope = 0
            strain = layout.get(well)
            if strain is None:
                continue
            strain, repl, evolved_substrate = parse_strain(strain)
            
            all_data.append( (evolved_substrate, media_dict.get(well, grown_substrate), tray_number, strain, repl, slope, tod1, exp_name, well) )


# Hexanoate rerun plates
rerun_dir = "../Data/Growth_data/HEXA_rerun_cross_tolerance/"
for tray_number, grown_substrate in [(2, "coumarate"), (5, "HMDA"), (8, "hexanoate"), (11, "NaCl")]:
    exp_name = "hexanoate_redoplates"
    path = rerun_dir+"HEXA_rerun_tray"+str(tray_number)+".OD.v2.trim.output.json"
    with open(path) as infile:
        plate_data = json.load(infile)["curves"]
    plate_res = parse_plate_data(plate_data, time_cutoff=cross_tolerance_time_cutoff)
        
    for well in wells_96:
        well_res = plate_res[well]
        slope, intercept, baseline = well_res["slope"], well_res["intercept"], well_res["baseline"]
        tod1 = well_res["tod1"]
        if pd.isnull(tod1):
            slope = 0
        strain = strain_layout["hexanoate_plate1_041715"].get(well)

        if strain is None:
            continue
        strain, repl, evolved_substrate = parse_strain(strain)

        all_data.append( (evolved_substrate, media_dict.get(well, grown_substrate), tray_number, strain, repl, slope, tod1, exp_name, well) )    

df = pd.DataFrame(
    all_data, columns=[
        "evolved_compound",
        "grown_compound",
        "tray",
        "strain",
        "repl",
        "growth_rate",
        "tOD1",
        "exp_name",
        "well"
    ]
)
df.to_csv(output_datafile, index=None)

hexanoate_plate1_041715
Skipping HEXA on coumarate
Skipping HEXA on HMDA
Skipping HEXA on hexanoate
Skipping HEXA on NaCl
coumarate_111114
odds-n-ends_042415
adipate_012415
23butanediol_041415
putrescine_111914
octanoate_030415
glutarate_110314
Skipping GLUT on HMDA
12propanediol_042215
butanol_110714
hmda_012215
isobutyrate_030215


In [50]:
processed_df = df[~df["strain"].isin(["M9 blank", "don't use"])]
background_growth = dict(
    processed_df[(processed_df["strain"] == "MG1655") & (processed_df["growth_rate"] != 0)].groupby(["exp_name", "tray", "grown_compound"]).mean()["growth_rate"]
)

processed_df["background"] = processed_df.apply(lambda x: background_growth.get((x["exp_name"], x["tray"], x["grown_compound"]), np.nan), axis=1)
processed_df["rel_growth_rate"] = processed_df["growth_rate"] - processed_df["background"]
processed_df["growth_ratio"] = processed_df["growth_rate"] / processed_df["background"]
del processed_df["background"]
processed_df.to_csv("../Data/Growth_data/Cross_tolerance/Processed_growth_rates_NO_CUT.tsv", sep="\t")

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


## M9 screening

In [23]:
m9_layout = {"_".join(key.split("_")[:-1]): val for key, val in strain_layout.items()} # Layout from cross-tolerance screen
m9_layout["1,2-propanediol"] = m9_layout["12propanediol"]
m9_layout["2,3-butanediol"] = m9_layout["23butanediol"]

In [24]:
m9_outfile_name = "../Data/Growth_data/M9/M9_normalised_growth.csv"
m9_data_dir = "../Data/Growth_data/M9/"
m9_files = [f for f in os.listdir(m9_data_dir) if f.endswith("output.json")]

In [25]:
m9_data = []
for filename in m9_files:
        exp_name = filename.split(".")[0]
        grown_substrate = "M9"
        path = m9_data_dir+filename
        print(path)
        with open(path) as infile:
            plate_data = json.load(infile)["curves"]
        plate_res = parse_plate_data(plate_data, time_cutoff=30, phase_length_cutoff=2)
        
        wild_type = []
        for well in ["A1", "B1", "C1", "D1", "E1"]:  # wild types
            well_res = plate_res[well]
            slope, intercept, baseline, has_growth = well_res["slope"], well_res["intercept"], well_res["baseline"], well_res["growth"]
            if has_growth:
                wild_type.append(slope)
        if len(wild_type) < 1:
            raise Exception("Something is wrong")
        wild_type = sum(wild_type) / len(wild_type)
        
        for well in wells_96:
            well_res = plate_res[well]
            slope, intercept, baseline, has_growth = well_res["slope"], well_res["intercept"], well_res["baseline"], well_res["growth"]
            rel_slope = slope - wild_type
            slope_ratio = slope / wild_type
            #print(rel_slope)
            compound_name = strain_2_compound[filename.split(".")[0].split("_")[0]].lower()
            if compound_name == "hexanoate":
                compound_name = "hexanoate_plate1"
            strain = m9_layout[compound_name].get(well)

            if strain is None:
                continue
            strain, repl, evolved_substrate = parse_strain(strain)

            m9_data.append( (evolved_substrate, grown_substrate, strain, repl, slope, rel_slope, slope_ratio, exp_name) )

m9_df = pd.DataFrame(m9_data, columns=["evolved_compound", "grown_compound", "strain", "repl", "growth_rate", "rel_growth_rate", "growth_ratio", "exp_name"])
m9_df.to_csv(m9_outfile_name, index=None)

../Data/Growth_data/M9/12PD.OD.v2.output.json
../Data/Growth_data/M9/23BD.OD.v2.output.json
../Data/Growth_data/M9/ADIP.OD.v2.output.json
../Data/Growth_data/M9/BUT.OD.v2.output.json
../Data/Growth_data/M9/COUM.OD.v2.output.json
../Data/Growth_data/M9/GLUT.OD.v2.output.json
../Data/Growth_data/M9/HEXA_group1.OD.v2.output.json
../Data/Growth_data/M9/HMDA.OD.v2.output.json
../Data/Growth_data/M9/IBUA.OD.v2.output.json
../Data/Growth_data/M9/OCTA.OD.v2.output.json
../Data/Growth_data/M9/PUTR.OD.v2.output.json


## Keio screenings

In [26]:
keio_dir = "../Data/Growth_data/KEIO_primary/"
files = [f for f in os.listdir(keio_dir) if f.endswith(".json")]

In [27]:
def read_plate_layout(sheet, start_row=0, start_col=0):
    layout = {}
    for i, row in enumerate(("A", "B", "C", "D", "E", "F", "G", "H")):
        for j in range(1, 13):
            well = row+str(j)
            cell = sheet.cell_value(start_row+i+1, start_col+j)
            if cell:
                layout[well] = cell
    return layout

In [28]:
primary_sheets = [
    "plate1-BUT GLUT COUM PUTR",
    "plate2-HMDA ADIP IBUA HEXA",
    "plate3-23BD 12PD OCTA",
    "plate4 - multi-chem"
]

primary_strain_layout = {}
primary_media_layout = {}

worksheet = xlrd.open_workbook(
    "../Data/Growth_data/KEIO_primary/Keio_plates_KO_screening-modifications_fixed.xlsx"
)
for sheet_name in primary_sheets:
    sheet = worksheet.sheet_by_name(sheet_name)
    plate_name = sheet_name.split("-")[0].strip()
    if plate_name == "plate4":
        plate_name = "plate4-1"
    strain_layout = read_plate_layout(sheet, 0, 0)
    media_layout = read_plate_layout(sheet, 20, 0)
    primary_strain_layout[plate_name] = strain_layout
    primary_media_layout[plate_name] = media_layout
    
    if plate_name == "plate4-1":
        plate_name = "plate4-2"
        
        main_strain_layout = strain_layout
        strain_layout = read_plate_layout(sheet, 0, 0)
        media_layout = read_plate_layout(sheet, 30, 0)
        
        for well, med in list(media_layout.items()):
            if "(" in med:
                med, strain_well = med.strip().split()
                assert strain_well.startswith("(")
                assert strain_well.endswith(")")
                strain_well = strain_well[1:-1]
                strain_layout[well] = main_strain_layout[strain_well]
                media_layout[well] = med
                # print(well, strain_layout[well])
        primary_strain_layout[plate_name] = strain_layout
        primary_media_layout[plate_name] = media_layout           

In [29]:
tray_2_contents = {
    "tray1": ("plate1", "low"),
    "tray2": ("plate2", "low"),
    "tray3": ("plate3", "low"),
    "tray4": ("plate1", "high"),
    "tray5": ("plate2", "high"),
    "tray6": ("plate3", "high"),
    "tray7": ("plate4-1", "low"),
    "tray8": ("plate4-2", "low"),
    "tray10": ("plate4-1", "high"),
    "tray11": ("plate4-2", "high")
}

compound_concentrations = {
    "12PD": {"low": 6, "high": 8, "unit": "% v/v"},
    "23BD": {"low": 6, "high": 7, "unit": "% v/v"},
    "HMDA": {"low": 32, "high": 38, "unit": "g/L"},
    "PUTR": {"low": 32, "high": 38, "unit": "g/L"},
    "GLUT": {"low": 40, "high": 47.5, "unit": "g/L"},
    "ADIP": {"low": 45, "high": 50, "unit": "g/L"},
    "HEXA": {"low": 3, "high": 5, "unit": "g/L"},
    "OCTA": {"low": 8, "high": 10, "unit": "g/L"},
    "COUM": {"low": 7.5, "high": 10, "unit": "g/L"},
    "IBUA": {"low": 7.5, "high": 12.5, "unit": "g/L"},
    "BUT": {"low": 1.4, "high": 1, "unit": "% v/v"},
}

keio_data = []

for file in files:
    with open(keio_dir+file) as infile:
        curves = json.load(infile)["curves"]
    plate_result = parse_plate_data(
        curves,
        max_slope=0.7
    )
    tray_name = file.split(".")[0].split("_")[-1]
    plate_name, conc = tray_2_contents[tray_name]
    
    strain_layout = primary_strain_layout[plate_name]
    media_layout = primary_media_layout[plate_name]
    
    for well, dat in plate_result.items():
        slope = dat["slope"]
        try:
            strain = strain_layout[well].strip()
            media = media_layout[well].strip()
        except KeyError:
            continue
        
        keio_data.append({"strain": strain, "compound": media, "concentration": conc, "plate": plate_name, "well": well, "growth_rate": slope})
        
    
primary_df = pd.DataFrame(keio_data)[["strain", "compound", "concentration", "plate", "well", "growth_rate"]]
primary_df = primary_df[primary_df["plate"] != "plate4-2"]

def relative_growth_rate(idx):
    row = primary_df.loc[idx]
    normalise_df = primary_df[
        (primary_df["strain"] == "BW25113") &
        (primary_df["compound"] == row["compound"]) &
        (primary_df["concentration"] == row["concentration"]) &
        (primary_df["plate"] == row["plate"]) &
        (primary_df["growth_rate"] != 0)
    ]
    if len(normalise_df) == 0:
        return np.nan
    else:
        return row["growth_rate"] - normalise_df["growth_rate"].mean()
    
primary_df["rel_growth_rate"] = primary_df.index.map(relative_growth_rate)
primary_df["concentration_level"] = primary_df["concentration"].copy()
primary_df["concentration"] = primary_df.apply(
    lambda x: "%.1f %s" % (
        compound_concentrations[x["compound"]][x["concentration_level"]],
        compound_concentrations[x["compound"]]["unit"]),
    axis=1
)
primary_df = primary_df.sort_values(["compound", "concentration_level", "strain"])
primary_df.to_csv("../Data/Growth_data/KEIO_primary/Keio_growth_rates.tsv", sep="\t", index=None)

## Reconstructions

In [30]:
ko_data_dir = "../Data/Growth_data/Reconstructions/knockout growth data/raw Biolector data/"
mage_data_dir = "../Data/Growth_data/Reconstructions/MAGE mutant growth data/raw Biolector data/"

In [31]:
ko_xls_files = [f for f in os.listdir(ko_data_dir) if f.endswith(".xlsx") and not f.startswith("~")]
mage_xls_files = [f for f in os.listdir(mage_data_dir) if f.endswith(".xlsx") and not f.startswith("~")]

In [32]:
def strip_strain(strain):
    strain = strain.split(" ")[0]
    m1 = re.search(r"-\d+$", strain)
    m2 = re.search(r"_\d+$", strain)
    if m1:
        strain = re.split(r"-\d+$", strain)[0]
    elif m2:
        strain = re.split(r"_\d+$", strain)[0]
    if strain.lower() in ("nothing", "medium", "blank", "m9"):
        return None
    return strain

def strip_medium(med):
    comp_str = med.split("+")
    if len(comp_str) > 1:
        comp_str = comp_str[1].strip()
    else:
        return "M9"
    comp = comp_str.split()[-1]
    return comp
    

ko_layouts = {}
for f in ko_xls_files:
    workbook = xlrd.open_workbook(ko_data_dir + f)
    sheet = workbook.sheet_by_index(1)
    exp_name = f.split(".")[0]
    ko_layouts[exp_name] = {}
    for i, (well, strain) in enumerate(zip(sheet.col_slice(0), sheet.col_slice(2))):
        if i == 0:
            continue
        elif not well.value:
            break
        ko_layouts[exp_name][well.value] = strip_strain(strain.value)
        
with open("../Data/Growth_data/Reconstructions/knockout growth data/knockout_layouts.json", "w") as outfile:
    json.dump(ko_layouts, outfile)
    
mage_layouts = {}
mage_media_layouts = {}
for f in mage_xls_files:
    workbook = xlrd.open_workbook(mage_data_dir + f)
    sheet = workbook.sheet_by_index(1)
    exp_name = f.split(".")[0]
    mage_layouts[exp_name] = {}
    mage_media_layouts[exp_name] = {}
    for i, (well, strain, medium) in enumerate(zip(sheet.col_slice(0), sheet.col_slice(2), sheet.col_slice(3))):
        if i == 0:
            continue
        elif not well.value:
            break
        mage_layouts[exp_name][well.value] = strip_strain(strain.value)
        mage_media_layouts[exp_name][well.value] = strip_medium(medium.value)
        
with open("../Data/Growth_data/Reconstructions/MAGE mutant growth data/mage_layouts.json", "w") as outfile:
    json.dump(mage_layouts, outfile)

In [33]:
json_ko_data_dir = "../Data/Growth_data/Reconstructions/knockout growth data/output/"
ko_experiments = [f for f in os.listdir(json_ko_data_dir) if f.endswith(".json")]

json_mage_data_dir = "../Data/Growth_data/Reconstructions/MAGE mutant growth data/output/"
mage_experiments = [f for f in os.listdir(json_mage_data_dir) if f.endswith(".json")]

In [35]:
data = []

for exp in ko_experiments:
    if "glutarate" not in exp:
        pass
    compound = exp.split("_")[1]
    exp_name = exp.split(os.path.extsep)[0]
    with open(json_ko_data_dir + exp) as infile:
        dat = json.load(infile)
    res = parse_plate_data(
        dat["curves"], time_cutoff=24, phase_length_cutoff=5, max_abs_baseline=3, max_baseline_dev=3
    )
    for well, dat in res.items():
        rate, growth = dat["slope"], dat["growth"]
        strain = ko_layouts[exp_name][well]
        if strain is None or strain == "":
            continue
        if strain.endswith("-sm") or strain.endswith("-lg"):
            strain = strain[:-3]
        if strain == "M9":
            raise ValueError(exp, well)
        data.append({"strain": strain, "compound": compound, "growth_rate": rate, "experiment": exp_name, "well": well})
        
ko_df = pd.DataFrame(data)[["strain", "compound", "growth_rate", "experiment", "well"]]
wt_grs = dict(ko_df[ko_df["strain"] == "MG1655"].groupby("experiment").mean()["growth_rate"])
ko_df["rel_growth_rate"] = ko_df.apply(lambda x: x["growth_rate"] - wt_grs[x["experiment"]], axis=1)

In [36]:
data = []
for exp in mage_experiments:
    if "glutarate" not in exp:
        pass
    
    exp_name = exp.split(os.path.extsep)[0]
    with open(json_mage_data_dir + exp) as infile:
        dat = json.load(infile)
    res = parse_plate_data(dat["curves"])
    for well, dat in res.items():
        rate, growth = dat["slope"], dat["growth"]
        strain = mage_layouts[exp_name][well]
        if strain is None or strain == "":
            continue
        if strain.endswith("-sm") or strain.endswith("-lg"):
            strain = strain[:-3]
        compound = mage_media_layouts[exp_name][well]
        data.append({"strain": strain, "compound": compound, "growth_rate": rate, "experiment": exp_name, "well": well})
        
mage_df = pd.DataFrame(data)[["strain", "compound", "growth_rate", "experiment", "well"]]
mage_wt_grs = dict(mage_df[mage_df["strain"] == "MG1655"].groupby("experiment").mean()["growth_rate"])
mage_df["rel_growth_rate"] = mage_df.apply(lambda x: x["growth_rate"] - mage_wt_grs[x["experiment"]], axis=1)

In [37]:
recon_df = pd.concat([ko_df, mage_df])

In [38]:
recon_df.to_csv("../Data/Growth_data/Reconstructions/Reconstruction_dataframe.tsv", sep="\t")