In [None]:
import os
import sys
# Add the parent of the current working directory to the import path so that
# the project-local `src` package (imported below) can be found.
module_path = os.path.abspath(os.path.join('..'))
if module_path not in sys.path:
    sys.path.append(module_path)
    
%reload_ext autoreload
%autoreload 2
%matplotlib inline

Import dependencies

In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import pickle
import seaborn as sns
import cobra

from sklearn.decomposition import PCA

from src.gem.yeast8model import Yeast8Model

# Initialise model

In [None]:
# Glucose exchange rate limit applied to the wild-type model below.
glc_exch_rate = 16.89

wt = Yeast8Model("../data/gemfiles/ecYeastGEM_batch_8-6-0.xml")
# Default: lots of glucose
# r_1714 is the glucose exchange reaction; r_1714_REV is treated as its
# reverse-direction counterpart, so both are limited by the same rate.
wt.model.reactions.get_by_id("r_1714").bounds = (-glc_exch_rate, 0)
wt.model.reactions.get_by_id("r_1714_REV").bounds = (0, glc_exch_rate)

# Generate & dump or load data

## Grid

In [None]:
# Exchange rates to scan: 4 evenly spaced values per axis, from 0.5x to 2x a
# reference rate. 8.6869 and 1.4848 are the values named saturation_glc and
# saturation_amm later in this notebook.
exch_rate_dict = {
    "r_1714": np.linspace(0.5*8.6869, 2*8.6869, 4), # glucose
    "r_1654": np.linspace(0.5*1.4848, 2*1.4848, 4), # ammonium
}

# Enzyme usage fluxes over the whole grid of exchange rates.
# NOTE(review): presumably slow -- the next cell reloads the dumped result, so
# this cell only needs to run when the pickle has to be regenerated.
ablation_result_array = wt.usgfluxes_grid(exch_rate_dict)

# Dump data
with open('../data/interim/ec_usg_glc_amm.pkl', 'wb') as handle:
    pickle.dump(ablation_result_array, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [None]:
# Load if saved
# NOTE: unpickling can execute arbitrary code; only load files that were
# produced by this project.
with open('../data/interim/ec_usg_glc_amm.pkl', 'rb') as handle:
    ablation_result_array = pickle.load(handle)

In [None]:
# Exchange rates to scan: 4 evenly spaced values per axis, from 0.5x to 2x a
# reference rate (4.4444 for pyruvate, 1.0 for ammonium).
exch_rate_dict = {
    "r_2033": np.linspace(0.5*4.4444, 2*4.4444, 4), # pyruvate
    "r_1654": np.linspace(0.5*1.0, 2*1.0, 4), # ammonium
}

# NOTE: this overwrites the ablation_result_array produced for the
# glucose/ammonium grid earlier in the notebook.
ablation_result_array = wt.usgfluxes_grid(exch_rate_dict)

# Dump data
with open('../data/interim/ec_usg_pyr_amm.pkl', 'wb') as handle:
    pickle.dump(ablation_result_array, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [None]:
# Load if saved
# NOTE: unpickling can execute arbitrary code; only load files that were
# produced by this project.
with open('../data/interim/ec_usg_pyr_amm.pkl', 'rb') as handle:
    ablation_result_array = pickle.load(handle)

## Specific points

Sandbox: get enzyme usage fluxes at specific points of exchange rates, rather than over a grid.

In [None]:
# Points at which to evaluate the model: "exch_ids" lists the exchange
# reactions, and each row of "exch_points" holds one exchange rate per
# reaction, in the same order as "exch_ids".
exch_rate_points = {
    "exch_ids": ["r_1714", "r_1654"],
    "exch_points": np.array([[16.89, 2.96], [1.69, 1.05]])
}

In [None]:
def usgfluxes_list(ymodel, exch_rate_points):
    """Get enzyme usage fluxes from ablation at specific exchange-rate points.

    For each point, glucose exchange is blocked on a working copy of the
    model, the listed exchange reactions are bounded by the point's rates,
    ablation is run, and the resulting enzyme usage fluxes are stacked into
    one array.

    Parameters
    ----------
    ymodel : object
        Model wrapper providing ``model`` (with ``copy()`` and
        ``reactions.get_by_id``), ``ablate(input_model=...)`` and, after
        ablation, ``ablation_enzyme_fluxes`` (a dict whose values have
        ``to_numpy()``). The working copy is taken from this object, not from
        any notebook-level global.
    exch_rate_points : dict
        ``"exch_ids"``: list of exchange reaction ids.
        ``"exch_points"``: sequence of points; each point holds one exchange
        rate per id, in the same order as ``"exch_ids"``.

    Returns
    -------
    numpy.ndarray
        1-D array of dtype ``object`` with one element per point. Each
        element is the ``np.stack`` of the arrays from
        ``ymodel.ablation_enzyme_fluxes`` after ablating at that point.

    Raises
    ------
    KeyError
        If ``r_1714`` or any id in ``"exch_ids"`` is not in the model.
        Missing ``*_REV`` reactions are reported with a message and ignored.
    """
    model_working = ymodel.model.copy()
    exch_ids = exch_rate_points["exch_ids"]
    exch_points = exch_rate_points["exch_points"]
    ablation_result_list = np.zeros(shape=(len(exch_points)), dtype="object")

    for point_idx, point in enumerate(exch_points):
        # block glucose
        model_working.reactions.get_by_id("r_1714").bounds = (0, 0)
        try:
            model_working.reactions.get_by_id("r_1714_REV").bounds = (0, 0)
        except KeyError:
            print("r_1714_REV not found, ignoring in glucose-blocking step")
        # set bounds
        for exch_id, exch_rate in zip(exch_ids, point):
            model_working.reactions.get_by_id(exch_id).bounds = (-exch_rate, 0)
            # deal with reversible exchange reactions
            exch_id_rev = exch_id + "_REV"
            try:
                model_working.reactions.get_by_id(exch_id_rev).bounds = (0, exch_rate)
            except KeyError:
                print(
                    f"Error-- reversible exchange reaction {exch_id_rev} not found. Ignoring."
                )
        # Only the side effect is needed: the enzyme usage fluxes are read
        # from ymodel.ablation_enzyme_fluxes right after this call.
        ymodel.ablate(input_model=model_working)
        ablation_result_list[point_idx] = np.stack(
            [df.to_numpy() for df in ymodel.ablation_enzyme_fluxes.values()]
        )

    return ablation_result_list

In [None]:
# Enzyme usage fluxes at the points listed in exch_rate_points.
ablation_result_list = usgfluxes_list(wt, exch_rate_points)

In [None]:
# Inspect the per-point results.
ablation_result_list

Sandbox: Pick random points from a grid based on a mask

In [None]:
# Load a previously saved grid of ablation results.
# NOTE: this file (ec_grid_glc_amm.pkl) is not written by any cell shown in
# this notebook (the dump cells write ec_usg_*.pkl), so it must already exist.
# NOTE: unpickling can execute arbitrary code; only load files that were
# produced by this project.
grid_filename = "ec_grid_" + "glc" + "_amm"
grid_filepath = "../data/interim/" + grid_filename + ".pkl"
with open(grid_filepath, "rb") as handle:
    ablation_result_array = pickle.load(handle)

In [None]:
from src.calc.matrix import ArrayCollection
from src.calc.ablation import vget_ablation_ratio, vget_custom_ablation_ratio

# Reference exchange rates for glucose and ammonium.
saturation_glc = 8.6869
saturation_amm = 1.4848
# Grid axes: 32 evenly spaced values from 0 to twice each reference rate.
# NOTE(review): assumes the loaded grid was generated on exactly these axes --
# confirm against the code that produced the pickle.
x_axis = np.linspace(0, 2 * saturation_glc, 32)
y_axis = np.linspace(0, 2 * saturation_amm, 32)

# Presumably the ablation ratio at every grid point, bundled with its axes.
ratio = ArrayCollection(vget_ablation_ratio(ablation_result_array), x_axis, y_axis)

In [None]:
# Boolean mask: True where the ratio is greater than 1.
ratio_array_mask = ratio.array > 1

In [None]:
# Coordinates of every grid point.
# NOTE(review): np.meshgrid defaults to indexing="xy", giving arrays of shape
# (len(y_axis), len(x_axis)); this assumes ratio.array uses the same axis
# order -- confirm (both axes have 32 points, so a mismatch would not raise).
x_coords, y_coords = np.meshgrid(x_axis, y_axis)

# One (x, y) row per grid point, split by the mask: ratio > 1 vs. the rest.
big_ratio_coords = np.column_stack((x_coords[ratio_array_mask], y_coords[ratio_array_mask]))
small_ratio_coords = np.column_stack((x_coords[~ratio_array_mask], y_coords[~ratio_array_mask]))

In [None]:
def get_random_coords(coords, num_samples, rng=None):
    """Pick distinct random rows from an array of coordinates.

    Parameters
    ----------
    coords : numpy.ndarray
        2-D array with one coordinate per row.
    num_samples : int
        Number of rows to draw, without replacement.
    rng : numpy.random.Generator, optional
        Random number generator to draw from, e.g.
        ``np.random.default_rng(seed)`` for reproducible sampling. If omitted,
        NumPy's global random state is used.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(num_samples, coords.shape[1])`` holding the
        selected rows.

    Raises
    ------
    ValueError
        If ``num_samples`` exceeds the number of rows in ``coords``.
    """
    num_rows = coords.shape[0]
    if num_samples > num_rows:
        raise ValueError(
            f"Cannot draw {num_samples} distinct rows from only {num_rows} coordinates"
        )
    sampler = np.random if rng is None else rng
    row_idx = sampler.choice(num_rows, num_samples, replace=False)
    return coords[row_idx, :]

In [None]:
# Draw the same number of points from each region of the mask.
# NOTE: no random seed is set in this notebook, so the selected points differ
# between runs.
num_samples = 100
big_ratio_coords_random = get_random_coords(big_ratio_coords, num_samples)
small_ratio_coords_random = get_random_coords(small_ratio_coords, num_samples)

In [None]:
# Show the sampled coordinates for each region.
print(big_ratio_coords_random)
print(small_ratio_coords_random)

Sandbox: combine the two steps above, i.e. get enzyme usage fluxes at the randomly sampled points.

In [None]:
def coords_to_dict(coords, exch_ids=None):
    """Wrap an array of points in the dictionary layout used by usgfluxes_list.

    Parameters
    ----------
    coords : array-like
        Points to evaluate; each row holds one exchange rate per reaction, in
        the same order as ``exch_ids``.
    exch_ids : sequence of str, optional
        Exchange reaction ids. Defaults to ``["r_1714", "r_1654"]``
        (glucose and ammonium).

    Returns
    -------
    dict
        ``{"exch_ids": <list of ids>, "exch_points": coords}``.
    """
    if exch_ids is None:
        exch_ids = ["r_1714", "r_1654"]
    return {
        "exch_ids": list(exch_ids),
        "exch_points": coords,
    }

In [None]:
# Check the dictionary layout for one set of sampled points.
coords_to_dict(big_ratio_coords_random)

In [None]:
# Enzyme usage fluxes at the sampled points of each region.
big_ablation_result_list = usgfluxes_list(wt, coords_to_dict(big_ratio_coords_random))
small_ablation_result_list = usgfluxes_list(wt, coords_to_dict(small_ratio_coords_random))

In [None]:
# Join both regions: the "ratio > 1" points come first, then the remaining
# points. The per-region plot further down relies on this order.
all_ablation_result_list = np.concatenate((big_ablation_result_list, small_ablation_result_list))

In [None]:
# One entry per sampled point.
all_ablation_result_list.shape

Adjust data variable dimensions

In [None]:
# Join the per-point arrays end to end along their first axis, giving a single
# array in which the rows of consecutive points follow one another.
multicond_enz_use_array = np.concatenate(all_ablation_result_list)
multicond_enz_use_array.shape

In [None]:
# Alternative (disabled): build the same kind of array from the grid result
# in ablation_result_array instead of from the sampled points.
#ablation_result_1d = ablation_result_array.ravel()
#multicond_enz_use_array = np.concatenate(ablation_result_1d)
#multicond_enz_use_array.shape

PCA

In [None]:
from sklearn.preprocessing import scale

# Standardise the data before PCA (with its default arguments, scale centres
# each column to zero mean and scales it to unit variance).
scaled_array = scale(multicond_enz_use_array)

In [None]:
# Fit a PCA with default settings and project the scaled data onto the
# principal components; keep the first two for plotting.
pca = PCA()
Xt = pca.fit_transform(scaled_array)
pca1 = Xt[:, 0]
pca2 = Xt[:, 1]

In [None]:
# Cumulative fraction of variance explained by the first k components.
print(np.cumsum(pca.explained_variance_ratio_))

In [None]:
# Same cumulative explained variance, plotted for the first 20 components.
plt.plot(np.cumsum(pca.explained_variance_ratio_)[:20])

In [None]:
# Colour each dot by biomass component, following matplotlib's default colour
# cycle: original = C0, lipid = C1, and so on.
num_components = 8
color_dict = {idx: f"C{idx}" for idx in range(num_components)}
# Rows come in repeating blocks of num_components, so the component of a row
# is its row number modulo num_components.
color_list = [color_dict[row % num_components] for row in range(len(pca1))]

fig, ax = plt.subplots(figsize=(6, 6))
ax.scatter(pca1, pca2, color=color_list)
ax.set_xlabel("PC1")
ax.set_ylabel("PC2")

In [None]:
# Plot each region of the ratio mask in its own panel.
# Rows of pca1/pca2 are assumed to come in blocks of num_components (one row
# per biomass component, in the order of the colour key), with the
# "ratio > 1" points first and the remaining points second -- the order in
# which the per-point results were concatenated earlier in the notebook.
num_conds = int(len(pca1) / num_components)

title_dict = {
    0: "ratio > 1",
    1: "ratio < 1",
}
num_regions = len(title_dict)
region_size = len(pca1) // num_regions

# Positions, within each block of num_components rows, of the biomass
# components to show: 0 = original, 2 = protein, 3 = carbohydrate.
# Use list(range(num_components)) to show all of them.
components_to_plot = [0, 2, 3]

fig, ax = plt.subplots(ncols=num_regions, nrows=1, figsize=(10,5))
for cond in range(num_regions):
    start_idx = cond * region_size
    region_range = list(range(start_idx, start_idx + region_size))
    to_plot = [el for el in region_range if el % num_components in components_to_plot]
    # Colour by position within components_to_plot (C0, C1, C2, ...), derived
    # from each row's own component rather than from its place in to_plot, so
    # the colours stay correct whichever components are selected.
    color_list = [
        color_dict[components_to_plot.index(el % num_components)] for el in to_plot
    ]
    ax[cond].scatter(
        pca1[to_plot],
        pca2[to_plot],
        color=color_list,
        s=30,
        alpha=0.2,
    )
    # Same axis limits in every panel so the regions can be compared directly.
    ax[cond].set_xlim(np.min(pca1), np.max(pca1))
    ax[cond].set_ylim(np.min(pca2), np.max(pca2))
    ax[cond].tick_params(
        axis='both', bottom=False, left=False, labelbottom=False, labelleft=False
    )
    ax[cond].set_xlabel("PC1")
    ax[cond].set_ylabel("PC2")

    ax[cond].set_title(f"{title_dict[cond]}")

Colour key

If all 8 biomass components are plotted:
- original: blue
- lipid: orange
- protein: green
- carbohydrate: red
- DNA: purple
- RNA: brown
- cofactor: pink
- ion: grey

If just 3 components are plotted (as in the per-region figure above):
- original: blue
- protein: orange
- carbohydrate: green

Feature importance

See https://stackoverflow.com/questions/50796024/feature-variable-importance-after-a-pca-analysis

In [None]:
# Absolute loading of every feature on every principal component.
imp = np.abs(pca.components_)
# Feature indices for PC1 and PC2, ordered from largest to smallest loading.
imp0 = np.flip(np.argsort(imp[0]))
imp1 = np.flip(np.argsort(imp[1]))
# Labels for the features, taken from the index of the 'original' entry.
list_enz_usg = wt.ablation_enzyme_fluxes["original"].index.to_numpy()

In [None]:
# The 30 entries with the largest absolute loading on the first component.
list_enz_usg[imp0[:30]]

In [None]:
# The 10 entries with the largest absolute loading on the second component.
list_enz_usg[imp1[:10]]