# Notebook 2: Generate multi-strain models

This notebook follows Notebook 1 in the tutorial, and continues to work on generating a draft model for Gc. This notebook utilizes the output of notebook 1 (presence/absence matrix (aka ortho_matrix) and geneID matrix) to generate a draft Gc model from the reference model. There are two major steps involved: 

1. Deletion of missing genes/reactions from the reference model to generate draft models
2. Update of the gene-protein-reaction (GPR) rules in each model

In [2]:
#import package needed
import cobra
import pandas as pd
from cobra.io import load_json_model
from glob import glob
from cobra.manipulation.delete import delete_model_genes, remove_genes
import seaborn as sns
from cobra.flux_analysis import single_gene_deletion
import os

#Simplify reading/writing files
# NOTE: despite its name, `cwd` is not the current working directory. It is the
# absolute path three levels above it (the parent of os.getcwd(), followed by two
# more ".." components). Every data/model path in this notebook is built from it.
cwd=os.path.realpath(os.path.join(os.path.dirname(os.getcwd()),"..",".."))

In [3]:
#From Jenior 2021 C.diff notebooks - benchmarking C. difficile GENRES. https://github.com/csbl/Jenior_CdifficileGENRE_2021
def basicCheck(model):
    """Print a quick sanity report for a genome-scale reconstruction (GENRE).

    The report covers, in order: the reactions-to-metabolites ratio, the number
    of compartments, the number of genes (and how many have no reactions), and
    whether the current objective can carry flux.

    :param model: model object exposing ``reactions``, ``metabolites``,
        ``compartments``, ``genes``, ``boundary`` and ``slim_optimize``.
    :return: None; everything is reported through ``print``.

    Side effect: if the objective flux is below 1e-6 in the current medium, the
    bounds of every reaction in ``model.boundary`` are set to (-1000, 1000) to
    test growth in complete media, and they are NOT restored afterwards.
    """

    # Determination
    n_reactions = len(model.reactions)
    n_metabolites = len(model.metabolites)
    determination = round(float(n_reactions) / float(n_metabolites), 3)
    if n_reactions < n_metabolites:
        statement = ' (overdetermined)'
    elif n_reactions > n_metabolites:
        statement = ' (underdetermined)'
    else:
        # Equal counts: previously `statement` was left unassigned here and the
        # print below raised UnboundLocalError.
        statement = ' (determined)'
    print('Reactions to metabolites ratio: ' + str(determination) + statement)

    # Compartments
    print('GENRE has ' + str(len(model.compartments.keys())) + ' compartment(s)')

    # Genes
    if len(model.genes) == 0:
        print('GENRE has no gene data')
    else:
        print('GENRE has ' + str(len(model.genes)) + ' genes')
    # Genes that are present in the model but attached to no reaction
    no_rxns = [gene.id for gene in model.genes if len(gene.reactions) == 0]
    if len(no_rxns) > 0:
        print('\t' + str(len(no_rxns)) + ' are not associated with reactions')

    # Growth
    ov = model.slim_optimize(error_value=0.)
    if ov < 1e-6:
        # No flux in the current medium: open every boundary reaction and retry
        for rxn in model.boundary: rxn.bounds = (-1000., 1000.)
        ov = model.slim_optimize(error_value=0.)
        if ov < 1e-6:
            print('GENRE cannot acheive objective flux')
        else:
            ov = round(ov, 3)
            print(str(ov) + ' objective flux, only in complete media')
    else:
        ov = round(ov, 3)
        print(str(ov) + ' objective flux in current media')

# Quicker way to read in models
import pickle
def read_model(fileName, obj='none'):
    """Load a model from disk, choosing the reader from the file extension.

    Supported extensions (matched case-insensitively): ``sbml``/``xml``,
    ``json``, ``yaml``/``yml``, ``mat`` and ``pkl``.

    :param fileName: path of the model file.
    :param obj: objective to assign to the loaded model; the default string
        ``'none'`` leaves the objective stored in the file untouched.
    :return: the loaded model, with every reaction in ``model.boundary`` reset
        to bounds (-1000, 1000).
    :raises TypeError: if the extension is not one of the supported ones.
    """

    fileType = fileName.split('.')[-1].lower()

    if fileType in ('sbml', 'xml'):
        model = cobra.io.read_sbml_model(fileName)
    elif fileType == 'json':
        model = cobra.io.load_json_model(fileName)
    elif fileType in ('yaml', 'yml'):
        model = cobra.io.load_yaml_model(fileName)
    elif fileType == 'mat':
        model = cobra.io.load_matlab_model(fileName)
    elif fileType == 'pkl':
        # SECURITY: unpickling executes arbitrary code; only load .pkl files
        # that you created yourself or otherwise trust.
        # The `with` block closes the file handle, which was previously leaked.
        with open(fileName, 'rb') as handle:
            model = pickle.load(handle)
    else:
        raise TypeError('Unrecognized file extension')

    if obj != 'none': model.objective = obj
    # Open every boundary reaction in both directions
    for rxn in model.boundary: rxn.bounds = (-1000., 1000.)

    return model


In [4]:
#Define model medias

def complete(model):
    """Apply the "complete" medium: allow uptake of up to 20 flux units through
    every reaction whose id contains 'EX_'. The model is modified in place."""
    exchange_like = (rxn for rxn in model.reactions if 'EX_' in rxn.id)
    for rxn in exchange_like:
        rxn.lower_bound = -20.



# Exchange reactions that are opened for uptake in the MDM medium
mdm_req = {
    # gases, water, protons, ions and trace metals
    'EX_co2_e_', 'EX_cobalt2_e_', 'EX_cu2_e_', 'EX_h_e_', 'EX_h2o_e_',
    'EX_mn2_e_', 'EX_mobd_e_', 'EX_tungs_e_', 'EX_zn2_e_', 'EX_na1_e_',
    'EX_cl_e_', 'EX_k_e_', 'EX_so4_e_', 'EX_nh4_e_', 'EX_mg2_e_',
    'EX_pi_e_', 'EX_ca2_e_', 'EX_fe3_e_', 'EX_no3_e_',
    # amino acids
    'EX_asp_L_e_', 'EX_glu_L_e_', 'EX_arg_L_e_', 'EX_gly_e_', 'EX_ser_L_e_',
    'EX_leu_L_e_', 'EX_ile_L_e_', 'EX_val_L_e_', 'EX_tyr_L_e_', 'EX_cys_L_e_',
    'EX_pro_L_e_', 'EX_trp_L_e_', 'EX_thr_L_e_', 'EX_phe_L_e_', 'EX_asn_L_e_',
    'EX_gln_L_e_', 'EX_his_L_e_', 'EX_met_L_e_', 'EX_ala_L_e_', 'EX_lys_L_e_',
    # remaining components
    'EX_gthrd_e_', 'EX_thm_e_', 'EX_pnto_R_e_', 'EX_glc_D_e_', 'EX_hxan_e_',
    'EX_ura_e_',
}


def MDM(model):
    """Apply the MDM medium to ``model`` in place.

    For each reaction, in this order: close uptake (lower bound 0) if its id
    contains 'EX_'; allow uptake of 10 if its id is listed in ``mdm_req``;
    allow uptake of 20 for 'EX_o2_e_'; allow uptake of 10 for 'EX_glc_D_e_'.
    """
    for rxn in model.reactions:
        rxn_id = rxn.id
        if 'EX_' in rxn_id:
            rxn.lower_bound = 0
        if rxn_id in mdm_req:
            rxn.lower_bound = -10.
        if rxn_id == 'EX_o2_e_':
            rxn.lower_bound = -20.
        if rxn_id == 'EX_glc_D_e_':
            rxn.lower_bound = -10.

            


## 1. Deletion of missing genes/reaction from reference model 


In [5]:
## Load the previously generated homology matrix for N. gonorrhoeae 
# The first CSV column (read in as 'Unnamed: 0') holds the reference gene IDs and becomes the index
hom_matrix = pd.read_csv(
    cwd + '/Gc_GENRE_2022/Generate_Gc_Model/Matrices/ortho_matrix.csv'
).set_index('Unnamed: 0')

In [6]:
#Load the base N. meningitidis reconstruction Nmb_iTM560
# read_model() also resets the bounds of every reaction in model.boundary to (-1000, 1000)
base = read_model(cwd+'/Gc_GENRE_2022/Models/Nmb_iTM560.json')

In [7]:
#set the model objective
base.objective = "Nm_Ess_biomass"
#report back model info
# (basicCheck prints its report; if the objective flux is below 1e-6 it also
#  opens every boundary reaction of `base` and leaves it that way)
basicCheck(base)
# last expression of the cell: shows the model summary table
base



Reactions to metabolites ratio: 1.171 (underdetermined)
GENRE has 1 compartment(s)
GENRE has 562 genes
74.215 objective flux in current media


0,1
Name,Nmb_iTM560
Memory address,0x01a30d7dd190
Number of metabolites,1297
Number of reactions,1519
Number of groups,0
Objective expression,1.0*Nm_Ess_biomass - 1.0*Nm_Ess_biomass_reverse_46af5
Compartments,Cytosol


In [8]:
# Allow uptake (lower bound -20) through every reaction whose id contains 'EX_', then solve.
# NOTE: complete() edits `base` in place, so these bounds stay set for the cells below.
complete(base)            
base.optimize()

Unnamed: 0,fluxes,reduced_costs
12DGR120tipp,0.000000,0.000000
12DGR140tipp,0.000000,0.000000
12DGR141tipp,0.000000,0.000000
12DGR160tipp,0.000000,0.000000
12DGR161tipp,0.000000,0.000000
...,...,...
XYLUtex,0.000000,0.000000
ZN2abcpp,0.000000,-0.024189
ZN2t3pp,0.000000,-0.006047
Zn2tex,0.021687,0.000000


In [9]:
# Show the homology matrix: rows are reference gene IDs, one column per target strain;
# 0.0 marks a reference gene without a homolog in that strain
hom_matrix

Unnamed: 0_level_0,AE004969.1
Unnamed: 0,Unnamed: 1_level_1
Sgene,0.0
NMB0319,1.0
NMB0182,1.0
NMB0453,1.0
NMB2039,1.0
...,...
NMB0930,1.0
NMB1315,1.0
NMB0544,1.0
NMB1823,1.0


### Delete missing genes from copies of the Nmb_iTM560 model
Start with the Nmb_iTM560 model, identify the missing genes from the matrix, and remove them and their associated reactions from a copy of the reference model.

In [11]:
#create strain-specific draft models and save them
for strain in hom_matrix.columns:

    # Reference gene IDs scored 0.0 for this strain, i.e. genes without a homolog
    currentStrain = hom_matrix[strain]
    nonHomologous = currentStrain[currentStrain == 0.0].index.tolist()

    # Look up the matching Gene objects in the base reconstruction
    toDelete = [base.genes.get_by_id(geneID) for geneID in nonHomologous]

    # Work on a copy of the base model: drop the genes (and the reactions that
    # depend on them), name the copy after the strain and write it to DraftModels/
    baseCopy = base.copy()
    remove_genes(baseCopy, toDelete, remove_reactions=True)
    baseCopy.id = str(strain)
    cobra.io.json.save_json_model(
        baseCopy,
        cwd + '/Gc_GENRE_2022/Generate_Gc_Model/DraftModels/' + strain + '.json',
        pretty=False,
    )
    
    

## 2. Update Model Gene Product Rules

In [18]:
#load the geneID matrix from notebook 1a 
# Collect the draft model files. The pattern is built by plain concatenation:
# the previous form, '%s/*.json'%cwd+'/.../DraftModels', applied % before +,
# which produced '<cwd>/*.json/.../DraftModels' and never searched the DraftModels folder.
models=glob(cwd+'/Gc_GENRE_2022/Generate_Gc_Model/DraftModels/*.json')
geneIDs_matrix=pd.read_csv(cwd+'/Gc_GENRE_2022/Generate_Gc_Model/Matrices/geneIDs_matrix.csv')
geneIDs_matrix=geneIDs_matrix.set_index('Unnamed: 0')
geneIDs_matrix

Unnamed: 0_level_0,AE004969.1
Unnamed: 0,Unnamed: 1_level_1
Sgene,
NMB0319,NGO_1682
NMB0182,NGO_1801
NMB0453,NGO_1506
NMB2039,NMB2039_ortholog
...,...
NMB0930,NMB0930_ortholog
NMB1315,NGO_0589
NMB0544,NGO_1448
NMB1823,NGO_0082


In [19]:
#Utilize the geneIDs matrix to update the GPRs in each of the strain-specific models with the proper gene ID

from cobra.manipulation.modify import rename_genes

for mod in models:
    model=cobra.io.load_json_model(mod)

    # Identify the strain from the file name (<strain>.json) rather than by a
    # substring search over the whole path. Previously, when no column matched,
    # `currentStrain` was either undefined or silently reused from an earlier
    # iteration/cell, so a model could be renamed with another strain's mapping.
    modelStem = os.path.splitext(os.path.basename(mod))[0]
    if modelStem not in geneIDs_matrix.columns:
        print('Skipping %s: no matching column in geneIDs_matrix' % mod)
        continue
    currentStrain = modelStem

    # reference gene ID -> strain gene ID; drop entries without a usable ID
    # (missing values are read from the CSV as NaN, which `!= 'None'` let through)
    IDMapping=geneIDs_matrix[currentStrain].to_dict()
    IDMappingParsed = {k:v for k,v in IDMapping.items() if pd.notna(v) and v != 'None'}

    # Rename the genes and overwrite the draft model file in place
    rename_genes(model,IDMappingParsed)
    cobra.io.json.save_json_model(model,mod, pretty=False)

### Examine the draft strain specific model contents

In [20]:
# gather the general information on the draft models
# For every column (strain) of hom_matrix, reload DraftModels/<strain>.json and print
# the model id together with its gene and reaction counts.
# NOTE: `strain` and `model` stay bound to the last strain after the loop ends.
for strain in hom_matrix.columns:
    model=cobra.io.load_json_model(str(cwd+'/Gc_GENRE_2022/Generate_Gc_Model/DraftModels/'+strain+'.json'))
    print (model.id,'Number of Model Genes:',len(model.genes),'Number of Model Reactions:',len(model.reactions))

AE004969.1 Number of Model Genes: 540 Number of Model Reactions: 1417


Test model for growth compared to the base model

In [24]:
#load the draft model created earlier in this notebook
# NOTE: `strain` is not assigned in this cell; it is whatever value an earlier
# `for strain in hom_matrix.columns:` loop left behind (the last column).
model=cobra.io.load_json_model(str(cwd+'/Gc_GENRE_2022/Generate_Gc_Model/DraftModels/'+strain+'.json'))
model

0,1
Name,AE004969.1
Memory address,0x01a317050850
Number of metabolites,1297
Number of reactions,1417
Number of groups,0
Objective expression,1.0*Nm_Ess_biomass - 1.0*Nm_Ess_biomass_reverse_46af5
Compartments,Cytosol


In [26]:
# Open uptake through every 'EX_' reaction with complete() and check whether the
# draft model can carry flux through its objective (biomass production).
# NOTE: complete() applies the permissive "complete" medium, not a Gc minimal medium
# (MDM() is the minimal-medium function).
# The changes are made inside `with model:`, cobra's model context, which is
# documented to revert modifications when the block exits.
with model:
    complete(model)
    print (model.slim_optimize())

-1.0425707535287215e-17


In [27]:
# In contrast, the base model can synthesize biomass in this condition.
# complete() is applied to `base` directly (no `with` block), so the bounds persist.
complete(base)
base.objective ="Nm_Ess_biomass"
base.slim_optimize()

6.86768839817138

Because the draft Gc model cannot grow, it must be curated and gapfilled: reactions that are essential for the model to run must be added, and gene presence/absence calls must be manually curated. See the FA1090 Model Annotation Decision Notes (in the Generate Gc Model folder) for notes on curation choices.

In [28]:
#Gather the list of base strain genes that have no homolog in strain of interest, an input to the below function
hom_matrix = pd.read_csv(
    cwd + '/Gc_GENRE_2022/Generate_Gc_Model/Matrices/ortho_matrix.csv'
).set_index('Unnamed: 0')
# NOTE: from here on `strain` is the column of homology scores, no longer a strain name
strain = hom_matrix['AE004969.1']
# reference gene IDs scored 0.0, i.e. without a homolog
missingGenes = strain.loc[strain.eq(0.0)].index.tolist()

In [29]:
def gapfill_multi(model, missing_genes, **kwargs):
    """
    Generate a list of gapfilling reactions from a list of missing genes for a strain-specific model.

    Every reaction returned by ``cobra.manipulation.find_gene_knockout_reactions(model, missing_genes)``
    gets a binary indicator variable and may only carry flux when that indicator is 1. The number of
    active indicators is then minimized while the objective reaction is forced to carry at least
    ``lower_bound`` flux.

    :param model: COBRA model for the base strain with the objective coefficient for the reaction of interest (e.g. biomass reaction) set to 1.

    :param missing_genes: list of genes with no homologs in the strain of interest.

    :param lower_bound: (keyword, optional) minimum allowable yield of gapfilled model. Defaults to 90% of the model's current optimal objective value.

    :param biomass: (keyword, optional) override the current model settings and temporarily assign the objective coefficient for a function of interest to 1.

    :return: a list of gapfilling reactions (reaction IDs whose indicator is non-zero in the solution).

    :raises Exception: when ``biomass`` is not given and the model has either no reaction or more than one reaction with an objective coefficient of 1.

    Side effects: the solver feasibility tolerance is set to 1e-9 and left at that value. The added
    constraints and indicator variables are removed, the reactions that had an objective coefficient
    of 1 get it back, and the lower bound of the objective reaction is restored before returning.
    """

    if 'lower_bound' in kwargs:
        lower_bound = kwargs['lower_bound']
    else:
        lower_bound = model.optimize().objective_value*0.9

    biomass_reactions = [rx.id for rx in model.reactions if rx.objective_coefficient == 1]
    if 'biomass' in kwargs:
        biomass = kwargs['biomass']
        if len(biomass_reactions) > 1:
            for rx in set(biomass_reactions) - {biomass}:
                model.reactions.get_by_id(rx).objective_coefficient = 0

    else:
        if len(biomass_reactions) > 1:
            raise Exception("This model has more than one objective. \n Please adjust the objective coefficient to 1 for the chosen objective reaction (e.g. biomass or ATP) and 0 for the rest of the reactions, \n or specify the reaction ID to use as an objective.")
        # This guard used to repeat `> 1`, so a model without any objective fell
        # through to biomass_reactions[0] and raised an uninformative IndexError.
        if len(biomass_reactions) == 0:
            raise Exception("The model doesn't have an objective function. Please set the appropriate objective coefficient to 1, or specify the reaction ID to use as an objective.")
        biomass = biomass_reactions[0]

    biomass_reaction = model.reactions.get_by_id(biomass)
    original_lower_bound = biomass_reaction.lower_bound

    model.solver.configuration.tolerances.feasibility = 1e-9
    constraints = []
    indicators = []

    try:
        for rx in cobra.manipulation.find_gene_knockout_reactions(model, missing_genes):

            indicator = model.problem.Variable('%s_i'%rx.id , type = 'binary')
            indicators.append(indicator)

            # flux <= ub*indicator and flux >= lb*indicator: no flux unless indicator == 1
            new_cstr1 = model.problem.Constraint( rx.flux_expression - rx.upper_bound*indicator ,ub = 0)
            new_cstr2 = model.problem.Constraint(-rx.flux_expression + rx.lower_bound*indicator ,ub = 0)
            constraints += [new_cstr1, new_cstr2]
            model.add_cons_vars([new_cstr1, new_cstr2, indicator])

        # Require the target yield, then keep as few candidate reactions active as possible
        biomass_reaction.lower_bound = lower_bound
        model.objective = model.problem.Objective(-sum(indicators))
        model.optimize()
        # strip the '_i' suffix to recover the reaction IDs
        indicator_results = [ind.name[:-2] for ind in indicators if ind.primal != 0.0]
    finally:
        # removing changes to model
        model.remove_cons_vars(constraints+indicators)
        for rx in set(biomass_reactions):
            model.reactions.get_by_id(rx).objective_coefficient = 1
        # previously the forced minimum yield was left on the objective reaction
        biomass_reaction.lower_bound = original_lower_bound

    return indicator_results

In [30]:
# We see that in this condition the model cannot synthesize biomass  as per 
# above analysis
model = cobra.io.load_json_model(
    cwd + '/Gc_GENRE_2022/Generate_Gc_Model/DraftModels/AE004969.1.json'
)

# Reset every lower bound below -4 to -1000, then every upper bound above 4 to 1000
for reaction in (r for r in model.reactions if r.lower_bound < -4):
    reaction.lower_bound = -1000.
for reaction in (r for r in model.reactions if r.upper_bound > 4):
    reaction.upper_bound = 1000.

# Apply the MDM medium and test for flux through the biomass objective
MDM(model)
model.objective = "Nm_Ess_biomass"
print(model.slim_optimize())

-3.95399142022917e-18


In [32]:
# We see that however the base model can synthesize biomass in this condition
base = cobra.io.load_json_model(cwd + '/Gc_GENRE_2022/Models/Nmb_iTM560.json')

# Reset every lower bound below -4 to -1000, then every upper bound above 4 to 1000
for reaction in (r for r in base.reactions if r.lower_bound < -4):
    reaction.lower_bound = -1000.
for reaction in (r for r in base.reactions if r.upper_bound > 4):
    reaction.upper_bound = 1000.

# Apply the MDM medium and test for flux through the biomass objective
MDM(base)
base.objective = "Nm_Ess_biomass"
print(base.slim_optimize())


1.0557843488117042


In [35]:
#By running the above function we obtain the list of candidate reactions
# NOTE: this bare `base` is not the last expression of the cell, so it displays nothing.
base
# With no keyword arguments the required yield defaults to 90% of base's current optimum.
# The result lists the reactions, among those disabled by knocking out missingGenes,
# whose indicator is non-zero when the number of active indicators is minimized.
gapfill_multi(base, missingGenes)

['DHPTDCs',
 'MAc2',
 'H2Otpp',
 'G3PAT181',
 'GTPCI',
 'PPPGO3',
 'CO2tpp',
 'DPR',
 'FE3Ri',
 'COBALT2t1pp',
 'CU2tpp',
 'PERD',
 'G3PAT120',
 'T2DECAI',
 'PMDPHT',
 'G3PAT180',
 'G3PAT140',
 'MOBDabcpp',
 'ZN2tpp',
 'G3PAT141',
 'CAt1pp',
 'ETOHt2rpp',
 'DNMPPA',
 'G3PAT161ab',
 'THZPSN',
 'E4PD',
 'FCLT',
 'MAc1',
 'G3PAT160',
 'O2tpp']

Output 0.1 MDM media
['G3PAT141',
 'MOBDabcpp',
 'G3PAT140',
 'MAc1',
 'G3PAT160',
 'G3PAT161ab',
 'H2Otex',
 'G3PAT180',
 'MAc2',
 'G3PAT181',
 'G3PAT120',
 'T2DECAI'

Output from 0.1 complete media

['G3PAT141',
 'MOBDabcpp',
 'COBALT2t1pp',
 'ZN2tpp',
 'MAc1',
 'G3PAT160',
 'O2tpp',
 'G3PAT161ab',
 'G3PAT180',
 'MAc2',
 'G3PAT181',
 'CAt1pp',
 'MALTpp',
 'G3PAT140',
 'DPR',
 'FE3Ri',
 'Cuabcpp',
 'T2DECAI',
 'G3PAT120']
 
 
Output from 0.9 complete media
#['G3PAT141',
 'MOBDabcpp',
 'COBALT2t1pp',
 'ZN2tpp',
 'DHPTDCs',
 'MAc1',
 'G3PAT160',
 'O2tpp',
 'G3PAT161ab',
 'H2Otex',
 'DNMPPA',
 'G3PAT180',
 'H2Otpp',
 'MAc2',
 'G3PAT181',
 'CAt1pp',
 'MALTpp',
 'GTPCI',
 'CO2tpp',
 'PMDPHT',
 'G3PAT140',
 'DPR',
 'FE3Ri',
 'PHEMEtpp',
 'CU2tpp',
 'T2DECAI',
 'G3PAT120',
 'ETOHt2rpp']
#ZN2tpp is ZupT in N. meningitidis. This gene is absent in Gc. 
#All other reactions are orphan or spontaneous reactions.


Use this list to identify the reactions that must be added to, or tailored for, the Gc model to allow growth.