# 1. Import stuff

In [1]:
import numpy, datetime, os, pickle, itertools, importlib, dotenv, pandas, json

In [2]:
import multiprocessing, multiprocessing.pool
from multiprocessing import Process, Queue

In [3]:
import matplotlib, matplotlib.pyplot
matplotlib.rcParams.update({'font.size':20, 'font.family':'FreeSans', 'xtick.labelsize':30, 'ytick.labelsize':30, 'axes.labelsize':40, 'figure.figsize':(12, 8)})

In [4]:
def growth_coupled_analysis(task):
    
    """
    This function performs the growth-coupled production.
    It takes as input a list as [first_gene_pair_index, second_gene_pair_index, reaction_of_interest, biomass_reaction_label, model]
    It gives as output a list as [first_gene_pair_index, second_gene_pair_index, growth, min_production, max_production]
    """
    
    i = task[0]
    j = task[1]
    reaction_of_interest = task[2]
    biomass_reaction_label = task[3]
    model = task[4]
    
    with model as model:
                
        # KO
        model.genes[i].knock_out()
        model.genes[j].knock_out()
        solution = model.optimize()
        if solution.status == 'optimal':
            ko_growth = solution.objective_value

            # growth-coupled production
            model.objective = reaction_of_interest
            model.reactions.get_by_id(biomass_reaction_label).lower_bound = ko_growth
            max_production = model.optimize(objective_sense='maximize').objective_value
            min_production = model.optimize(objective_sense='minimize').objective_value
            
            result = [i, j, ko_growth, min_production, max_production]
        else:
            result = [i, j, 0, 0, 0]

    return result

In [5]:
def printt(message):

    print(datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S \t {}".format(message)))   #here date & time, to see the runtime? 

    return None

# 1. Load model

In [6]:
io_path = '/home/adrian/hub/LVF601M/yeast/yeast-GEM/code/io.py'

In [7]:
os.chdir('yeast-GEM')
#! touch .env

# find .env + define paths:
dotenv_path = dotenv.find_dotenv()
REPO_PATH = os.path.dirname(dotenv_path)
MODEL_PATH = f"{REPO_PATH}/model/yeast-GEM.xml"

In [8]:
spec = importlib.util.spec_from_file_location("i_dont_know_what_is_this", io_path)
foo = importlib.util.module_from_spec(spec)
spec.loader.exec_module(foo)
model = foo.read_yeast_model()

Restricted license - for non-production use only - expires 2023-10-25


In [9]:
working_solver = "cplex" # much faster version of glpk
model.solver = working_solver
model.solver

<optlang.cplex_interface.Model at 0x7f925b18b370>

# 2. explore the model

In [10]:
model.summary()

Metabolite,Reaction,Flux,C-Number,C-Flux
s_0420[e],r_1654,0.5988,0,0.00%
s_0565[e],r_1714,1.0,6,100.00%
s_0796[e],r_1832,0.04236,0,0.00%
s_0925[e],r_1861,2.63e-06,0,0.00%
s_1277[e],r_1992,2.25,0,0.00%
s_1324[e],r_2005,0.545,0,0.00%
s_1374[e],r_2020,0.000304,0,0.00%
s_1438[e],r_2049,0.0003325,0,0.00%
s_1468[e],r_2060,0.007203,0,0.00%
s_4200[e],r_4593,0.000108,0,0.00%

Metabolite,Reaction,Flux,C-Number,C-Flux
s_0458[e],r_1672,-2.39,1,100.00%
s_0776[e],r_1814,-5.41e-06,2,0.00%
s_0805[e],r_2100,-4.302,0,0.00%
s_0450[c],r_2111,-0.08375,0,0.00%
s_4157[e],r_4527,-0.2515,0,0.00%


In [11]:
wt_solution = model.optimize()
print(wt_solution.objective_value)

0.08374770604149129


In [12]:
number_of_genes = len(model.genes)
print(number_of_genes)

1150


# 3. adjust the model

## 3.1. Helper functions

In [13]:
import math
def find_probe_list(probe,control_list,condition_list):
  up_probe = []
  for i in range(len(control_list)-1):
    FC = condition_list[i]/control_list[i]
    log_FC = math.log(FC,2)
    if(log_FC >= 1):
      up_probe.append(probe[i])
  return up_probe

In [14]:
def find_gene_ids(up_probe,probe_map,gene_map):
  gene_id_up = []
  for i in range(len(up_probe)-1):
    for j in range(len(probe_map)-1):
      if(up_probe[i] == probe_map[j]):
        gene_id_up.append(gene_map[j])
  return gene_id_up

In [15]:
def adjust_model(model,gene_id_up):
  reaction_ko = []
  model_adjust = model.copy()
  for gene in gene_id_up:
    new_gene = fix_gene_id(gene)
    if new_gene in model.genes:
      model_adjust.genes.get_by_id(new_gene).knock_out()
  return model_adjust

In [16]:
def fix_gene_id(gene):
  gene_new_char = []
  for char in gene:
    if char != ".":
      gene_new_char.append(char)
    else:
      break
  str = ""
  for ele in gene_new_char:
    str += ele
  return str

In [17]:
def count_ko_reactions(model_adj):
  rxn_list = []
  for rxn in model_adj.reactions:
    if rxn.bounds == (0,0):
      rxn_list.append(rxn)
  return rxn_list

## 3.2. Load data to adjust model

In [18]:
! wget https://ftp.ncbi.nlm.nih.gov/geo/series/GSE20nnn/GSE20108/matrix/GSE20108_series_matrix.txt.gz   #the expression data for 
df = pandas.read_csv("GSE20108_series_matrix.txt.gz", compression="gzip", sep="\t", skiprows=59)
df.head()

--2022-04-22 17:19:21--  https://ftp.ncbi.nlm.nih.gov/geo/series/GSE20nnn/GSE20108/matrix/GSE20108_series_matrix.txt.gz
Resolving ftp.ncbi.nlm.nih.gov (ftp.ncbi.nlm.nih.gov)... 130.14.250.10, 130.14.250.7, 2607:f220:41e:250::12, ...
Connecting to ftp.ncbi.nlm.nih.gov (ftp.ncbi.nlm.nih.gov)|130.14.250.10|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 798370 (780K) [application/x-gzip]
Saving to: ‘GSE20108_series_matrix.txt.gz.5’


2022-04-22 17:19:23 (952 KB/s) - ‘GSE20108_series_matrix.txt.gz.5’ saved [798370/798370]



Unnamed: 0,ID_REF,GSM502520,GSM502521,GSM502522,GSM502523,GSM502524,GSM502525,GSM502526,GSM502527,GSM502528,GSM502529,GSM502530,GSM502531,GSM502532,GSM502533,GSM502534,GSM502535,GSM502536,GSM502537
0,1769308_at,66.0219,64.5088,64.0777,50.8112,33.2731,27.1011,82.1433,155.595,74.0396,89.4596,100.333,112.943,67.264,88.6567,84.8422,80.0021,39.4134,49.6411
1,1769309_at,7.47442,4.5704,1.90656,0.701782,4.08954,1.27988,1.97934,2.83222,5.85934,5.06648,2.61963,2.28516,4.07976,2.3442,3.28361,6.44708,1.86492,0.74411
2,1769310_at,3.95121,3.85858,3.26991,1.96126,4.74444,3.79033,1.3491,0.915949,3.20209,4.695,1.21438,3.21346,4.33432,1.22967,3.63288,1.78898,3.25453,4.20871
3,1769311_at,571.151,609.048,552.192,544.056,470.936,485.519,617.768,881.655,806.093,700.81,793.898,802.2,616.106,639.426,615.873,577.367,541.2,555.366
4,1769312_at,169.127,176.107,153.344,152.357,149.77,121.778,141.535,159.424,154.06,153.674,149.798,147.949,186.583,189.349,187.622,137.465,167.472,151.405


In [19]:
df_probe = df.loc[:,"ID_REF"]
df_probe.head()
df_probe_list = df_probe.to_numpy()

In [20]:
df_control = []
df_control_list_1 = df["GSM502532"].to_numpy()
df_control_list_2 = df["GSM502533"].to_numpy()
for i in range(len(df_control_list_1)-1):
  control_i = (df_control_list_1[i]+df_control_list_2[i])/2
  df_control.append(control_i)

df_alc3 = []
df_alc3_1 = df["GSM502534"].to_numpy()
df_alc3_2 = df["GSM502535"].to_numpy()
for i in range(len(df_alc3_1)-1):
  alc3_i = (df_alc3_1[i]+df_alc3_2[i])/2
  df_alc3.append(alc3_i)


In [21]:
! wget https://www.ebi.ac.uk/arrayexpress/files/A-AFFY-47/A-AFFY-47.adf.txt
df2 = pandas.read_csv("A-AFFY-47.adf.txt",sep="\t", skiprows = 231)
df2.head()

--2022-04-22 17:19:23--  https://www.ebi.ac.uk/arrayexpress/files/A-AFFY-47/A-AFFY-47.adf.txt
Resolving www.ebi.ac.uk (www.ebi.ac.uk)... 193.62.193.80
Connecting to www.ebi.ac.uk (www.ebi.ac.uk)|193.62.193.80|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 748713 (731K) [text/plain]
Saving to: ‘A-AFFY-47.adf.txt.5’


2022-04-22 17:19:24 (1.13 MB/s) - ‘A-AFFY-47.adf.txt.5’ saved [748713/748713]



Unnamed: 0,RPTR-Sc-M57289-1_s_at,AFFX-Sc-M57289-1,Unnamed: 2,RPTR-Sc-M57289-1_s_at.1,Unnamed: 4,Unnamed: 5,Unnamed: 6,Unnamed: 7,Unnamed: 8,Unnamed: 9,Unnamed: 10
0,1773641_x_at,YAL068C.S1,,1773641_x_at,,,,,,,
1,1771192_at,YAL067W-A.S1,,1771192_at,,,,,,,
2,1769997_at,YAL067C.S1,,1769997_at,,,GO:0006810,EAK92569,S0000062,,
3,1775769_at,YAL065C.S1,,1775769_at,,,,BAA19915.1,S0001817,,
4,1771550_at,YAL064W-B.S1,,1771550_at,,,,NP_009336,S0002141,,


In [22]:
probe_map = df2.loc[:, "RPTR-Sc-M57289-1_s_at"].to_numpy()
gene_map = df2.loc[:, "AFFX-Sc-M57289-1"].to_numpy()

In [23]:
probe_alc3 = find_probe_list(df_probe_list,df_control,df_alc3)
up_gene_alc3 = find_gene_ids(probe_alc3, probe_map,gene_map)
model_alc3 = adjust_model(model,up_gene_alc3)
print(len(count_ko_reactions(model_alc3)))

38


# 4.  find best pairs of genes for to couple with growth

In [24]:
reaction_of_interest = 'r_1761' # ethanol export
biomass_reaction_label = 'r_2111' # biomass function

In [25]:
number_of_threads = 20

In [26]:
printt('working with {} genes'.format(number_of_genes))   

tasks = []
for i in range(len(model_alc3.genes)):
    for j in range(len(model_alc3.genes)):
        if i < j:
            task = [i, j, reaction_of_interest, biomass_reaction_label, model]
            tasks.append(task)
printt('working with {} gene pairs'.format(len(tasks)))

2022-04-22 17:19:26 	 working with 1150 genes
2022-04-22 17:19:27 	 working with 660675 gene pairs


In [27]:
%%time
printt('entering a parallel world of {} threads'.format(number_of_threads))
hydra = multiprocessing.pool.Pool(number_of_threads)
hydra_output = hydra.map(growth_coupled_analysis, tasks)
hydra.close()
printt('completed {} tasks'.format(len(hydra_output)))

2022-04-22 17:19:27 	 entering a parallel world of 20 threads








2022-04-22 20:38:40 	 completed 660675 tasks
CPU times: user 10.5 s, sys: 927 ms, total: 11.4 s
Wall time: 3h 19min 13s


In [28]:
df = pandas.DataFrame(hydra_output, columns=['i', 'j', 'KO growth', 'min production', 'max production'])
df.sort_values(by=['min production'], ascending=False, inplace=True)

In [29]:
printt('store double KO information as a dataframe')

with open('doubleKO_up.json', 'w') as f:
    json.dump(df.to_json(), f)

2022-04-22 20:38:41 	 store double KO information as a dataframe


In [30]:
# how to read the JSON file
with open('doubleKO_up.json', 'r') as f:
    new_str = json.load(f)
new = pandas.read_json(new_str)
print(new.shape)
new.head()

(660675, 5)


Unnamed: 0,i,j,KO growth,min production,max production
6096,5,367,0.011881,1.817701,1.817701
7239,6,367,0.011881,1.817701,1.817701
6542,5,813,0.011881,1.817701,1.817701
7685,6,813,0.011881,1.817701,1.817701
366,0,367,0.011881,1.8177,1.8177


In [31]:
new[new['min production'] > 1]

Unnamed: 0,i,j,KO growth,min production,max production
6096,5,367,0.011881,1.817701,1.817701
7239,6,367,0.011881,1.817701,1.817701
6542,5,813,0.011881,1.817701,1.817701
7685,6,813,0.011881,1.817701,1.817701
366,0,367,0.011881,1.817700,1.817700
...,...,...,...,...,...
84231,75,907,0.009049,1.526161,1.526161
6947,6,75,0.009019,1.496410,1.526623
83455,75,131,0.009019,1.496410,1.526623
5804,5,75,0.009019,1.496410,1.526623
