In [1]:
# Import libraries - REQUIRES pip version 9.0.3
import pandas
import os
from os.path import join
import sys
import scipy.stats
import numpy
import math
import pickle
import copy
import time
import warnings
import gc

# Using Cobrapy 0.13.0
import cobra
import cobra.test
import cobra.flux_analysis.gapfilling
from cobra.io import load_json_model, save_json_model, load_matlab_model, save_matlab_model, read_sbml_model, write_sbml_model
from cobra.flux_analysis import  flux_variability_analysis
from cobra.flux_analysis.reaction import assess_component
from cobra.manipulation.delete import *
from cobra.flux_analysis.parsimonious import add_pfba
from cobra.medium import find_boundary_types
from cobra.util import solver as sutil


# Confidence levels in each annotation
#import probanno

# Estabish handler for logger
import logging
logging.basicConfig()
logger = logging.getLogger('logger')

# Verbose exception printing
%xmode


Exception reporting mode: Verbose


In [2]:
# Quicker way to read in models
import pickle
def read_model(fileName, obj='none'):
    
    fileType = fileName.split('.')[-1]
    
    if fileType == 'sbml' or fileType == 'xml':
        model = cobra.io.read_sbml_model(fileName)
    elif fileType == 'json':
        model = cobra.io.load_json_model(fileName)
    elif fileType == 'yaml':
        model = cobra.io.load_yaml_model(fileName)
    elif fileType == 'mat':
        model = cobra.io.load_matlab_model(fileName)
    elif fileType == 'pkl':
        model = pickle.load(open(fileName, 'rb'))
    else:
        raise TypeError('Unrecognized file extension')
    
    if obj != 'none': model.objective = obj
    for rxn in model.boundary: rxn.bounds = (-1000., 1000.)
        
    return model

In [3]:
model= read_model("C://Users/Aimee/Documents/UVA/Metabolic_Modeling/organized/models/annotatedGCmodel2.sbml")
NM_model = read_model('C://Users/Aimee/Documents/UVA/Metabolic_Modeling/organized/models/archive/Nmb_iTM560.json')
carveme_model= cobra.io.read_sbml_model('C:/Users/Aimee/Documents/UVA/Metabolic_Modeling/organized/models/Neisseria_gonorrhoeae_FA_1090.xml.gz')



In [4]:
save_json_model(carveme_model, "C:/Users/Aimee/Documents/UVA/Metabolic_Modeling/organized/models/Neisseria_gonorrhoeae_FA_1090.json")

In [5]:
#For Figure1A - description of models
#Print Model Gene number
#Subtract 2 from each for final value to remove Spontaneous and orphan identifiers
print("Ngo_558 - Genes:", len(model.genes))
print("Nmb_iTM560 - Genes:",len(NM_model.genes))
print("CARVEME FA1090 - Genes:",len(carveme_model.genes))

Ngo_558 - Genes: 560
Nmb_iTM560 - Genes: 562
CARVEME FA1090 - Genes: 510


In [6]:
#Print Model Reaction number
print("Ngo_558-reactions :", len(model.reactions))
print("Nmb_iTM560-reactions :",len(NM_model.reactions))
print("CARVEME FA1090 -reactions:",len(carveme_model.reactions))

Ngo_558-reactions : 1424
Nmb_iTM560-reactions : 1519
CARVEME FA1090 -reactions: 1327


In [7]:
#Print Model Metabolites number
print("Ngo_558- metabolites :", len(model.metabolites))
print("Nmb_iTM560- metabolites  :",len(NM_model.metabolites))
print("CARVEME FA1090- metabolites  :",len(carveme_model.metabolites))

Ngo_558- metabolites : 1265
Nmb_iTM560- metabolites  : 1297
CARVEME FA1090- metabolites  : 991


In [8]:
#Print Model GPR - Exchange Reactions gene number
model_exchanges =[]
for r in model.reactions:
    if 'EX_' in r.id:
        model_exchanges.append(r.id)
print("Ngo_558 Exchange reactions:", len(model_exchanges))

model_exchanges =[]
for r in NM_model.reactions:
    if 'EX_' in r.id:
        model_exchanges.append(r.id)
print("Nmb_itm560 Exchange reactions:", len(model_exchanges))

model_exchanges =[]
for r in carveme_model.reactions:
    if 'EX_' in r.id:
        model_exchanges.append(r.id)
print("carveme_FA1090 Exchange reactions:", len(model_exchanges))

Ngo_558 Exchange reactions: 264
Nmb_itm560 Exchange reactions: 262
carveme_FA1090 Exchange reactions: 105


In [9]:
#For GPR Rules -- run memote to identify single gene, multigene, and none
#And to define metabolic vs transport reactions
model_exchanges =[]
model_transports =[]
model_metabolic =[]
for rxn in model.reactions:
    substrates = list(rxn.metabolites)
    compartments = set([x.compartment for x in substrates])

    if 'EX_' in rxn.id:
        model_exchanges.append(rxn.id) # exchange
    elif len(compartments) > 1:
        model_transports.append(rxn.id) # transport
    else:
        model_metabolic.append(rxn.id) # metabolic

print("Ngo_558 Exchange reactions:", len(model_exchanges))
print("Ngo_558 Transport reactions:",len(model_transports))
print("Ngo_558 purely metabolic reactions:",len(model_metabolic))

#Set compartments for Nmb_iTM560 so reaction types can be counted
for metabolite in NM_model.metabolites:
    if '_c_c' in metabolite.id:
        metabolite.compartment= 'cytosol'
for metabolite in NM_model.metabolites:
    if '_p_c' in metabolite.id:
        metabolite.compartment= 'periplasm'
for metabolite in NM_model.metabolites:
    if '_e_c' in metabolite.id:
        metabolite.compartment= 'extracellular'
        
model_exchanges =[]
model_transports =[]
model_metabolic =[]
for rxn in NM_model.reactions:
    substrates = list(rxn.metabolites)
    compartments = set([x.compartment for x in substrates])

    if 'EX_' in rxn.id:
        model_exchanges.append(rxn.id) # exchange
    elif len(compartments) > 1:
        model_transports.append(rxn.id) # transport
    else:
        model_metabolic.append(rxn.id) # metabolic
print("\n")
print("Nmb_itm560 Exchange reactions:", len(model_exchanges))
print("Nmb_itm560 Transport reactions:",len(model_transports))
print("Nmb_itm560 purely metabolic reactions:",len(model_metabolic))

model_exchanges =[]
model_transports =[]
model_metabolic =[]
for rxn in carveme_model.reactions:
    substrates = list(rxn.metabolites)
    compartments = set([x.compartment for x in substrates])

    if 'EX_' in rxn.id:
        model_exchanges.append(rxn.id) # exchange
    elif len(compartments) > 1:
        model_transports.append(rxn.id) # transport
    else:
        model_metabolic.append(rxn.id) # metabolic
print("\n")
print("carveme_FA1090 Exchange reactions:", len(model_exchanges))
print("carveme_FA1090 Transport reactions:",len(model_transports))
print("carveme_FA1090 purely metabolic reactions:",len(model_metabolic))

Ngo_558 Exchange reactions: 264
Ngo_558 Transport reactions: 474
Ngo_558 purely metabolic reactions: 686


Nmb_itm560 Exchange reactions: 262
Nmb_itm560 Transport reactions: 555
Nmb_itm560 purely metabolic reactions: 702


carveme_FA1090 Exchange reactions: 105
carveme_FA1090 Transport reactions: 279
carveme_FA1090 purely metabolic reactions: 943


In [11]:
#From https://www.biostars.org/p/492324/

import pandas as pd
from collections import * 

database = list()
for _, v in pd.read_json("C://Users/Aimee/Documents/UVA/Metabolic_Modeling/organized/Figures/Fig_1/ngo00001.json").iterrows():
    d = v["children"]
    cat_1 = d["name"]
    for child_1 in d["children"]:
        cat_2 = child_1["name"] # Module?
        for child_2 in child_1["children"]:
            cat_3 = child_2["name"]
            if "children" in child_2:
                for child_3 in child_2["children"]:
                    cat_4 = child_3["name"]
                    fields = [cat_1, cat_2, cat_3, cat_4]
                    database.append(fields)
df_kegg = pd.DataFrame(database, columns=["Level_A", "Level_B", "Level_C", "Level_D"])

kos = list()
for tag in df_kegg["Level_D"]:
        full_tag=tag.split()
        geneid=full_tag[0]
        kos.append(geneid) 

df_kegg["ids"]= kos

#Include only kegg metabolism related annotations
kegg_annotations=df_kegg[df_kegg["Level_A"].str.contains("09100 Metabolism")==True]

#Match id format between kegg and model
kegg_annotations['ids'] = kegg_annotations['ids'] .str.replace('_', '')

#Filter kegg annotations by genes found in model
model_genes= list()
for gene in model.genes:
    model_genes.append(gene.id)
model_kegg_orthologies=kegg_annotations[kegg_annotations.ids.isin(model_genes)]


model_kegg_orthologies.to_csv('KEGG_orthologies_Ngo.csv')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  kegg_annotations['ids'] = kegg_annotations['ids'] .str.replace('_', '')


In [13]:
#From https://www.biostars.org/p/492324/
#Download Nme database of orthologies from KEGG


import pandas as pd
from collections import * 

database = list()
for _, v in pd.read_json("C://Users/Aimee/Documents/UVA/Metabolic_Modeling/organized/Figures/Fig_1/nme00001.json").iterrows():
    d = v["children"]
    cat_1 = d["name"]
    for child_1 in d["children"]:
        cat_2 = child_1["name"] # Module?
        for child_2 in child_1["children"]:
            cat_3 = child_2["name"]
            if "children" in child_2:
                for child_3 in child_2["children"]:
                    cat_4 = child_3["name"]
                    fields = [cat_1, cat_2, cat_3, cat_4]
                    database.append(fields)
df_kegg = pd.DataFrame(database, columns=["Level_A", "Level_B", "Level_C", "Level_D"])

kos = list()
for tag in df_kegg["Level_D"]:
        full_tag=tag.split()
        geneid=full_tag[0]
        kos.append(geneid) 

df_kegg["ids"]= kos

#Include only kegg metabolism related annotations
kegg_annotations=df_kegg[df_kegg["Level_A"].str.contains("09100 Metabolism")==True]

#Match id format between kegg and model
kegg_annotations['ids'] = kegg_annotations['ids'] .str.replace('_', '')

#Filter kegg annotations by genes found in model
NM_model_genes= list()
for gene in NM_model.genes:
    NM_model_genes.append(gene.id)
NM_model_kegg_orthologies=kegg_annotations[kegg_annotations.ids.isin(NM_model_genes)]


NM_model_kegg_orthologies.to_csv('KEGG_orthologies_Nme.csv')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  kegg_annotations['ids'] = kegg_annotations['ids'] .str.replace('_', '')


In [14]:
#Orthologies for Carveme
#From https://www.biostars.org/p/492324/
#Download Nme database of orthologies from KEGG

#Get list of genes from Carveme model for FA1090 in NGO format
carveme_model_genes= list()
for gene in carveme_model.genes:
    carveme_model_genes.append(gene.id)
carveme_model_genes

#Use look_back table of annotations from https://www.ncbi.nlm.nih.gov/pmc/articles/PMC5399841/
annotation_table = pd.read_csv ('C://Users/Aimee/Documents/UVA/Metabolic_Modeling/organized/annotation_resources/look_back_table.csv', usecols= ['Locus Tag ID','Protein.ID'])
annotation_table=annotation_table.rename(columns={'Protein.ID': 'ProteinID', 'Locus Tag ID': "NGO_ID"})
annotation_table
                        
#Match id format between kegg and model
annotation_table['ProteinID'] = annotation_table['ProteinID'].str.replace('.', '_')

annotatedgenes= annotation_table[annotation_table.ProteinID.isin(carveme_model_genes)]
converted_model_genes= list()
for tag in annotatedgenes['NGO_ID']:
        converted_model_genes.append(tag) 


#Get KEGG orthologies for carveme genes
database = list()
for _, v in pd.read_json("C://Users/Aimee/Documents/UVA/Metabolic_Modeling/organized/Figures/Fig_1/ngo00001.json").iterrows():
    d = v["children"]
    cat_1 = d["name"]
    for child_1 in d["children"]:
        cat_2 = child_1["name"] # Module?
        for child_2 in child_1["children"]:
            cat_3 = child_2["name"]
            if "children" in child_2:
                for child_3 in child_2["children"]:
                    cat_4 = child_3["name"]
                    fields = [cat_1, cat_2, cat_3, cat_4]
                    database.append(fields)
df_kegg = pd.DataFrame(database, columns=["Level_A", "Level_B", "Level_C", "Level_D"])

kos = list()
for tag in df_kegg["Level_D"]:
        full_tag=tag.split()
        geneid=full_tag[0]
        kos.append(geneid) 

df_kegg["ids"]= kos

#Include only kegg metabolism related annotations
kegg_annotations=df_kegg[df_kegg["Level_A"].str.contains("09100 Metabolism")==True]

#Match id format between kegg and model
kegg_annotations['ids'] = kegg_annotations['ids'] .str.replace('_', '')

#Filter kegg annotations by genes found in model

carveme_model_kegg_orthologies=kegg_annotations[kegg_annotations.ids.isin(converted_model_genes)]


carveme_model_kegg_orthologies.to_csv('KEGG_orthologies_carveme_FA1090.csv')

  annotation_table['ProteinID'] = annotation_table['ProteinID'].str.replace('.', '_')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  kegg_annotations['ids'] = kegg_annotations['ids'] .str.replace('_', '')


In [15]:
#Trim out sublevels of metabolism categories not being used for all three models
trimmed_model_kegg_orthologies=model_kegg_orthologies.drop(["Level_C", "Level_D"], axis=1)
trimmed_NM_model_kegg_orthologies=NM_model_kegg_orthologies.drop(["Level_C", "Level_D"], axis=1)
trimmed_carveme_model_kegg_orthologies=carveme_model_kegg_orthologies.drop(["Level_C", "Level_D"], axis=1)

#drop duplicate genes that are listed twice within a single kegg category
pruned_and_trimmed_model_kegg_orthologies=trimmed_model_kegg_orthologies.drop_duplicates()
pruned_and_trimmed_NM_model_kegg_orthologies=trimmed_NM_model_kegg_orthologies.drop_duplicates()
pruned_and_trimmed_carveme_model_kegg_orthologies=trimmed_carveme_model_kegg_orthologies.drop_duplicates()

#Report counts of unique genes present in each category (genes may appear in more than one category)
print("Ngo_558:")
print(pruned_and_trimmed_model_kegg_orthologies['Level_B'].value_counts())
print("\n")
print("Nmb_iTM560:")
print(pruned_and_trimmed_NM_model_kegg_orthologies['Level_B'].value_counts())
print("\n")
print("carveme_FA1090:")
print(pruned_and_trimmed_carveme_model_kegg_orthologies['Level_B'].value_counts())

Ngo_558:
09105 Amino acid metabolism                          115
09108 Metabolism of cofactors and vitamins            96
09101 Carbohydrate metabolism                         91
09102 Energy metabolism                               59
09104 Nucleotide metabolism                           45
09107 Glycan biosynthesis and metabolism              43
09103 Lipid metabolism                                27
09110 Biosynthesis of other secondary metabolites     26
09106 Metabolism of other amino acids                 26
09109 Metabolism of terpenoids and polyketides        16
09111 Xenobiotics biodegradation and metabolism        2
Name: Level_B, dtype: int64


Nmb_iTM560:
09105 Amino acid metabolism                          114
09101 Carbohydrate metabolism                         98
09108 Metabolism of cofactors and vitamins            94
09102 Energy metabolism                               72
09107 Glycan biosynthesis and metabolism              50
09104 Nucleotide metabolism          

In [16]:
#Number of Genes with unidentified metabolic functions in KEGG
print("Ngo_558:")
#Total Number of genes in the model
print("total genes in model:", len(model_genes))
#total number of unique genes with Kegg ides
print("unique genes with kegg ids:", model_kegg_orthologies.ids.nunique())
#Number of genes in the model without metabolism linked Kegg IDs
print("unannotated genes:",len(model_genes) - model_kegg_orthologies.ids.nunique())

print("\n")
print("Nmb_iTM560:")
#Total Number of genes in the model
print("total genes in model:", len(NM_model_genes))
#total number of unique genes with Kegg ides
print("unique genes with kegg ids:", NM_model_kegg_orthologies.ids.nunique())
#Number of genes in the model without metabolism linked Kegg IDs
print("unannotated genes:",len(NM_model_genes) - NM_model_kegg_orthologies.ids.nunique())

print("\n")
print("carveme_FA1090:")
#Total Number of genes in the model
print("total genes in model:", len(carveme_model_genes))
#total number of unique genes with Kegg ides
print("unique genes with kegg ids:", carveme_model_kegg_orthologies.ids.nunique())
#Number of genes in the model without metabolism linked Kegg IDs
print("unannotated genes:",len(carveme_model_genes) - carveme_model_kegg_orthologies.ids.nunique())

Ngo_558:
total genes in model: 560
unique genes with kegg ids: 399
unannotated genes: 161


Nmb_iTM560:
total genes in model: 562
unique genes with kegg ids: 413
unannotated genes: 149


carveme_FA1090:
total genes in model: 510
unique genes with kegg ids: 347
unannotated genes: 163


In [17]:
#Possible Alternative?
#https://widdowquinn.github.io/2018-03-06-ibioic/02-sequence_databases/09-KEGG_programming.html


In [None]:
model.reactions
NM_model.reactions
carvememodel.reactions