In [None]:
# ---------------------------------------------------------------------------
# File name configuration for this notebook
# ---------------------------------------------------------------------------

# Inputs: files of the previous model
fileComps = "Compartments.txt"
fileSpecies = "Species.txt"
fileRatelaws = "RatelawsNoSM.txt"
fileOmicsdata = "OmicsData.txt"
fileGeneReg = "GeneReg.txt"

# Inputs: new network and data files
fileMOBILE = "IRF1_PDL1sub.txt"  # MOBILE subnetwork
fileSPARCEDomics = "Supplementary_Data_22.xlsx"  # 10A mRNA & protein levels
fileLINCSRNAs = "RNAseqDataLINCS.csv"  # LINCS mRNA levels
fileLitkvals = "Schwanhausser2011.txt"  # Schwanhausser et al. Nature 2011: mRNA/protein half-lives

# Outputs: updated versions of the previous-model files
fileComps_Upd = "Compartments_MM.txt"
fileSpecies_Upd = "Species_MM.txt"
fileRatelaws_Upd = "RatelawsNoSM_MM.txt"
fileOmicsdata_Upd = "OmicsData_MM.txt"
fileGeneReg_Upd = "GeneReg_MM.txt"

In [None]:
# Write file names into 'FileNames' file to inform 'createModel' notebook.
# Layout: line 1 lists the original input files, line 2 the updated output files.
# Every name is followed by " \t" (space + tab); line 2 has no trailing newline.
inputfiles = [fileComps,fileSpecies,fileRatelaws,fileOmicsdata,fileGeneReg]
outfiles = [fileComps_Upd,fileSpecies_Upd,fileRatelaws_Upd,fileOmicsdata_Upd,fileGeneReg_Upd]

# The context manager closes (and flushes) the handle even if a write fails part-way.
with open('FileNames.txt','w') as fileNames:
    for name in inputfiles:
        fileNames.write(name+" \t")
    fileNames.write("\n")
    for name in outfiles:
        fileNames.write(name+" \t")

### Import required packages and scripts

In [None]:
import sys
import os
sys.path.append(os.getcwd()[0:os.getcwd().rfind('/')]+'/bin')

import libsbml
import importlib
import amici
import numpy as np
import re
import pandas as pd
from antimony import *
from modules.copyDir import copyDirectory

# Optional packages to import
import amici.plotting
import matplotlib.pyplot as plt

In [None]:
# Copy the 'input_files' folder that sits next to the current working directory
# into the current working directory (via the project helper copyDirectory).
current_dir = os.getcwd()
# Everything up to and including the last '/' is the parent directory prefix.
input_data_folder = '{}input_files'.format(current_dir[:current_dir.rfind('/') + 1])
copyDirectory(input_data_folder, os.getcwd() + "/")

In [None]:
def _lasso_weight(text):
    """Parse one MOBILE table cell as a float so that two cells can be ranked.

    Cells are read from the text file as strings; comparing them directly would
    be lexicographic (e.g. '10' < '9'). Blank, non-numeric and NaN cells map to
    -inf so they never win a ``>`` comparison against a real number.
    """
    try:
        value = float(text)
    except ValueError:
        return -np.inf
    return -np.inf if np.isnan(value) else value


# Read-in the MOBILE subnetwork: tab-separated text, every cell kept as a string.
# The context manager guarantees the file handle is closed after parsing.
with open(fileMOBILE, encoding='latin-1') as mobile_file:
    network_sheet = np.array([np.array(line.strip().split("\t")) for line in mobile_file])
network_sheet = network_sheet[1:]  # drop the first (header) row
ns = pd.DataFrame(network_sheet)

# Read-in TARs structure
TARsRead = pd.read_csv(fileGeneReg,header=0,index_col=0,sep="\t")
TARs0 = (TARsRead.values)
numberofTARs = len(TARsRead.columns)
numberofgenes = len(TARsRead)

# Gene names come from columns 3 and 6 of the MOBILE table
netGeneNames = np.unique(np.append(network_sheet[:,3],network_sheet[:,6]))
netmRNAs = ['m_' + x for x in netGeneNames] # naming convention from SPARCED

# Collected in plain lists (linear time) and converted to arrays once, below.
netProteins = []
netRxns_phos = []
netGenes = []
netTARs = []
netSigns = []
for idx,val in enumerate(network_sheet[:,8]):
    prot_col3 = 'prot_' + network_sheet[idx,3]
    prot_col6 = 'prot_' + network_sheet[idx,6]
    netProteins.extend([prot_col3, prot_col6])
    if val=='2': # flag for protein or chromatin region (2=protein, 4=chromatin)
        if "_" in network_sheet[idx,7]: # '_' means the RPPA antibody target was a phosphoprotein
            pp = network_sheet[idx,7].split("_") # there can be multiple phospho sites
            ppprot = prot_col6 # phospho forms are chained onto the column-6 protein
            for j in pp[1:]:
                phospho_form = ppprot + '_' + j
                netRxns_phos.append(ppprot + ' ; ' + phospho_form) # Add the rxn "X-->pX" or "pX-->ppX"
                netRxns_phos.append(phospho_form + ' ; ' + ppprot) # Add the rxn "pX-->X" or "ppX-->pX"
                ppprot = phospho_form
                netProteins.append(ppprot) # Add the phosphoprotein form in the list of species
    # Add the gene-gene connections as TARs for both genes
    netGenes.append(network_sheet[idx,6]) # Add to TARed-gene list
    netTARs.append(prot_col3) # Add to TARer-protein list
    netGenes.append(network_sheet[idx,3])
    netTARs.append(prot_col6)
    # Also assign signs (Activator=1, or repressor=-1) to the TARs.
    # Columns 20 (FULL model) and 30 (IFNG-LOGO model) from MOBILE decide which
    # model's value (column 19 or 29, respectively) provides the sign. They are
    # compared numerically, not as text.
    tempS1 = _lasso_weight(network_sheet[idx,20])
    tempS2 = _lasso_weight(network_sheet[idx,30])
    if tempS1>tempS2:
        temps = np.sign(np.double(network_sheet[idx,19]))
    else:
        temps = np.sign(np.double(network_sheet[idx,29]))
    # One sign per direction of the connection (two TAR entries were added above)
    netSigns.extend([temps, temps])
netProteins = np.unique(netProteins)
netRxns_phos = np.array(netRxns_phos)
netGenes = np.array(netGenes)
netTARs = np.array(netTARs)
netSigns = np.array(netSigns)

TARsReadE = TARsRead.copy()
### Translation and nascent protein degradation rxns
netRxnsTLs = []
netRLawsTLs = []
netRxnsTLds = []
for idx,gene in enumerate(netGeneNames):
    if gene not in TARsReadE.index:
        netRxnsTLs.append(' ; ' + 'prot_'+ gene) # translation rxn for each new gene
        netRLawsTLs.append('kTLn' + str(idx+1) + '*m_'+ gene) # the ratelaw for above
        netRxnsTLds.append('prot_'+ gene + ' ; ') # degradation rate for each new nascent protein
    else:
        print('Warning: Gene "' + gene + '" is already in the model. Check and harmonize protein name and rxns!')
        netProteins = netProteins[netProteins != 'prot_' + gene] # remove the protein of this gene
netRxnsTLs = np.array(netRxnsTLs)
netRLawsTLs = np.array(netRLawsTLs)
netRxnsTLds = np.array(netRxnsTLds)

In [None]:
# Genes already present in the model act as TARs through their active protein
# form, so swap the generic protein name for that species name (result is a list).
netTARs = [tar.replace('prot_STAT1', 'STAT1snuc_STAT1snuc') for tar in netTARs]

In [None]:
# Load the 10A omics workbook (RNAseq + proteomics sheets) and strip stray
# whitespace from the column headers.
Data10A = pd.ExcelFile(fileSPARCEDomics)
RNAdata = pd.read_excel(Data10A, 'RNAseq')
RNAdata.columns = RNAdata.columns.str.strip()
PROTdata = pd.read_excel(Data10A, 'ProteomicsMS')
PROTdata.columns = PROTdata.columns.str.strip()
OmicsData_old = pd.read_csv(fileOmicsdata,header=0,index_col=0,sep="\t")

Literature_kvals = pd.read_csv(fileLitkvals,header=0,sep="\t")

species_sheet = pd.read_csv(fileSpecies,header=0,index_col=0,sep="\t",encoding='latin-1')
# Separate mRNA species ('m_' prefix) from protein/complexes. Explicit copies are
# taken because both frames are extended below and must not be views of species_sheet.
is_mrna = species_sheet.index.str.startswith('m_')
mRNAdf = species_sheet[is_mrna].copy()
spsdf = species_sheet[~is_mrna].copy()

comps_sheet = pd.read_csv(fileComps,header=0,index_col=0,sep="\t",encoding='latin-1')
# Find the conversion factor from mpc to nM in nucleus
# (.iloc[0]: first compartment whose name starts with 'Nucleus', by position)
Vn = comps_sheet[comps_sheet.index.str.startswith('Nucleus')]['volume'].iloc[0]
mpc2nmcf_Vn = 1.0E9/(Vn*6.023E+23)

OmicsData = OmicsData_old.copy()
# First, set protein levels from 10A data, if present
for sps in netProteins:
    if sps not in spsdf.index:
        sps_gene = sps.split("_")[1] # 'prot_<GENE>[_<site>...]' -> '<GENE>'
        tempDF = PROTdata.loc[PROTdata['Gene names'] == sps_gene] # check if the gene has data
        if not tempDF.empty: # if yes, use the first matching row
            bb = tempDF['average'].iloc[0]*mpc2nmcf_Vn # Here we define all new species are in nucleus (they are TARs)
        else:
            bb = 0.0
        spsdf.at[sps,'Compartment'] = 'Nucleus'
        spsdf.at[sps,'IC'] = str(bb)
        spsdf.at[sps,'HGNC'] = sps_gene
    else:
        print('Species is already in the model, no changes made: ' + sps)

# Then, set mRNA levels (and OmicsData rows) for the new genes
for mrna in netmRNAs:
    if mrna not in mRNAdf.index:
        gene = mrna.split("_")[1] # 'm_<GENE>' -> '<GENE>'
        tempDF = RNAdata.loc[RNAdata['Gene'] == gene] # check if the gene has data
        if not tempDF.empty: # if yes, use the first matching row
            # Below we define all new mRNAs are in nucleus
            # mpc = nearest integer to the mean of the two experiments
            bbmpc = int(np.rint(np.mean([tempDF['P21-mpc'].iloc[0],tempDF['P22-mpc'].iloc[0]])))
            bb = bbmpc*mpc2nmcf_Vn
        else:
            # No measurement: zero copies. bbmpc must be reset here, otherwise the
            # 'Exp RNA' assignment below would reuse the previous gene's value
            # (or fail with NameError on the first gene).
            bbmpc = 0
            bb = 0.0
        mRNAdf.at[mrna,'Compartment'] = 'Nucleus'
        mRNAdf.at[mrna,'IC'] = str(bb)
        mRNAdf.at[mrna,'HGNC'] = gene
        if gene not in OmicsData.index:
            OmicsData.loc[gene] = OmicsData.loc['IGF1R'] # Insert a new row, values the same as for your favorite gene (eg.IGF1R)
            OmicsData.at[gene,'Exp RNA'] = str(bbmpc) # set to nearest integer from the mean of two experiments
            OmicsData.at[gene,'comp'] = 'Nucleus'
    else:
        print('mRNA is already in the model, no changes made: ' + mrna)
species_updated = pd.concat([spsdf, mRNAdf], axis=0) # create new species dataframe

In [None]:
### Get LINCS datasets
RNADataLINCS = pd.read_csv(fileLINCSRNAs,header=0,index_col=0)
RNADataLINCS = RNADataLINCS[~RNADataLINCS.index.isnull()] # remove rows without an HGNC identifier

# One row per matched gene: [gene, ctrl_0, IFNG_24, IFNG_48], all linearized (2^x).
mrnaData_IFNG = []
for dgene in netGeneNames:
    # Substring pre-filter (literal match), then require an exact space-separated token
    datagenes = RNADataLINCS[RNADataLINCS.index.str.contains(dgene, regex=False)] #RNAseqData
    for label, lincs_row in datagenes.iterrows():
        if dgene in label.split(' '):
            # Values are read from the matched row itself, so labels holding
            # several space-separated names are handled as well.
            mrnaData_IFNG.append([dgene, 2**lincs_row['ctrl_0'],  # take 2^ to linearize the data
                                         2**lincs_row['IFNG_24'],
                                         2**lincs_row['IFNG_48']])

# Normalize to m_IRF1 at time=0. The IRF1 row is located by name rather than by
# a fixed position, so the result does not depend on the ordering of the gene list.
irf1_t0 = next((float(row[1]) for row in mrnaData_IFNG if row[0] == 'IRF1'), None)
if irf1_t0 is None:
    raise ValueError('IRF1 was not found in the LINCS RNAseq data; cannot normalize mRNA levels to IRF1')
mrnaData_t0r2IRF1 = [float(row[1])/irf1_t0 for row in mrnaData_IFNG]
# Gene -> ratio lookup (first match wins) so each gene gets its own ratio even if
# some gene had no match, or more than one match, in the LINCS table.
t0_ratio_by_gene = {}
for row, ratio in zip(mrnaData_IFNG, mrnaData_t0r2IRF1):
    t0_ratio_by_gene.setdefault(row[0], ratio)

# get protein/mRNA ratio for IRF1 and use the ratio to calculate new protein levels for MOBILE genes
# (IC entries may be stored as text, hence the explicit float conversion)
prot_IRF1_10A = float(species_updated.loc['prot_IRF1']["IC"])
mrna_IRF1_10A = float(species_updated.loc['m_IRF1']["IC"])
pr2mr = prot_IRF1_10A/mrna_IRF1_10A # protein/mRNA ratio of IRF1 (the gene that had "data" points in both 10A & LINCS datasets)

for idx,gene in enumerate(netGeneNames):
    if gene not in t0_ratio_by_gene:
        raise KeyError('No LINCS RNAseq data found for gene "' + gene + '"')
    # STAT1 was already present in the model (learned above!) and its protein is
    # named "STAT1". To leave STAT1 levels unaltered, skip that gene here instead.
    sps_prot = gene if gene=='STAT1' else 'prot_'+gene
    sps_mrna = 'm_'+gene

    # Update mRNA and protein initial levels with LINCS data (note that IRF1 is kept at 10A data levels and
    # the rest are calculated based on IRF1 ratios)
    mrna_ic = float(species_updated.loc['m_IRF1']["IC"])*float(t0_ratio_by_gene[gene])
    species_updated.at[sps_mrna,'IC'] = str(mrna_ic)
    species_updated.at[sps_prot,'IC'] = str(mrna_ic*pr2mr)

    # Update the Omics Data input with new mRNA mpc and other parameter values
    OmicsData.at[gene,'Exp RNA'] = str(int(np.rint(mrna_ic/mpc2nmcf_Vn)))
    tau_m = 0 # mRNA half-life in hours; 0 means "no literature value"
    if gene!='IRF1': # IRF1 gene name had unrelated genes show up in the dataset, so it is filtered here.
        tempGdf = Literature_kvals.loc[Literature_kvals['Gene Names'].str.contains(gene, case=False, na=False)]
        # Check for isoform data and use if the gene is not present
        tempGdfzz = Literature_kvals.loc[Literature_kvals['Gene Names'].str.contains(gene[0:-1]+'.*', case=False, na=False, regex = True)]
        if not tempGdf.empty:
            tau_m = float(np.mean(tempGdf['mRNA half-life average [h]']))
        elif not tempGdfzz.empty: # Update with mean isoform data if present
            tau_m = float(np.mean(tempGdfzz['mRNA half-life average [h]']))
    if not np.isnan(tau_m) and tau_m!=0:
        OmicsData.at[gene,'kTCd'] = str(np.log(2)/tau_m/3600.0) # half-life [h] -> first-order rate [1/s]
    # kTCleak = kTCd * ExpRNA * (kGin + kGac) / (kGac * ExpGCN)
    kTCd = float(OmicsData.at[gene,'kTCd'])
    exp_rna = float(OmicsData.at[gene,'Exp RNA'])
    kGin = float(OmicsData.at[gene,'kGin'])
    kGac = float(OmicsData.at[gene,'kGac'])
    exp_gcn = float(OmicsData.at[gene,'Exp GCN'])
    OmicsData.at[gene,'kTCleak'] = str((kTCd*exp_rna)*(kGin+kGac)/(kGac*exp_gcn))

### Prepare the new GeneReg structure
def_nA = 4.0 # Default value, can change with parameter fitting
for gene in netGeneNames: # Add rows for GeneReg
    if gene not in TARsReadE.index:
        TARsReadE.loc[gene] = str(0)
for tempP in netTARs: # Add columns to GeneReg
    if tempP not in TARsReadE.columns:
        TARsReadE[tempP] = str(0)
for idx,gene in enumerate(netGenes): # Set the parameter value pairs for new TARs
    if netTARs[idx]=='STAT1snuc_STAT1snuc':
        tempProt = 'STAT1' # STAT1snuc=0 at time=0 so use STAT1 total protein level to set KA
    else:
        tempProt = netTARs[idx]
    tempKA = float(species_updated.loc[tempProt]["IC"])
    # Entry format: "<sign*nA>; <ceil(IC)/2>"
    TARsReadE.at[gene,netTARs[idx]] = str(netSigns[idx]*def_nA)+'; '+str(np.ceil(tempKA)/2.0)

In [None]:
### Now convert new rxns as row inserts to the model Ratelaws input file

def _literature_half_life(gene, column):
    """Return the mean literature half-life for `gene` from `Literature_kvals`.

    `column` is the name of the half-life column to average (values in hours).
    Rows whose 'Gene Names' contain `gene` (case-insensitive) are used first; if
    there are none, rows matching the name without its last character (isoforms)
    are used. Returns 0 when nothing matches and for IRF1, whose name matches
    unrelated genes in the dataset. The result may be NaN if the matched rows
    hold no value.
    """
    if gene == 'IRF1':
        return 0
    gene_names = Literature_kvals['Gene Names']
    exact = Literature_kvals.loc[gene_names.str.contains(gene, case=False, na=False)]
    if not exact.empty:
        # Mean over all matches, so several matching rows do not raise
        return float(np.mean(exact[column]))
    # Check for isoform data and use if the gene is not present
    isoforms = Literature_kvals.loc[gene_names.str.contains(gene[0:-1]+'.*', case=False, na=False, regex = True)]
    if not isoforms.empty:
        return float(np.mean(isoforms[column]))
    return 0


# Read-in Ratelaws list
ratelaw_sheetE = pd.read_csv(fileRatelaws,header=0,index_col=0,sep="\t")
# Phosphorylation / dephosphorylation rxns -> row insert
for idx,val in enumerate(netRxns_phos):
    rxn_name = 'vMM'+str(idx+1)
    ratelaw_sheetE.at[rxn_name,'Comp_correction'] = 'Nucleus'
    ratelaw_sheetE.at[rxn_name,'Species'] = val
    ratelaw_sheetE.at[rxn_name,'Ratelaw'] = str(1.0) # This is user defined.
# Translation rxns -> row insert
for idx,val in enumerate(netRxnsTLs):
    rxn_name = 'vMMTL'+str(idx+1)
    ratelaw_sheetE.at[rxn_name,'Comp_correction'] = 'Nucleus'
    ratelaw_sheetE.at[rxn_name,'Species'] = val
    ratelaw_sheetE.at[rxn_name,'Ratelaw'] = netRLawsTLs[idx]
    # If literature data exist for protein half-life, set the kTL accordingly
    gene = val.split("_")[1] # ' ; prot_<GENE>' -> '<GENE>'
    sps_prot = 'prot_'+gene
    sps_mrna = 'm_'+gene
    tau_p = _literature_half_life(gene, 'Protein half-life average [h]')
    if not np.isnan(tau_p) and tau_p != 0:
        # kTL = protein IC * kTLd / mRNA IC, with kTLd = ln(2)/half-life converted from hours to seconds
        ratelaw_sheetE.at[rxn_name,'Unnamed: 4'] = str(float(species_updated.loc[sps_prot]['IC'])*(np.log(2)/tau_p/3600.0)/float(species_updated.loc[sps_mrna]['IC']))
    else: # if no literature data exist, set to median for SPARCED genes
        ratelaw_sheetE.at[rxn_name,'Unnamed: 4'] = str(7.122388e-02) # Median value, used for SPARCED-I-SOCS1 model

# Degradation rxns -> row insert
for idx,val in enumerate(netRxnsTLds):
    rxn_name = 'vMMTLd'+str(idx+1)
    ratelaw_sheetE.at[rxn_name,'Comp_correction'] = 'Nucleus'
    ratelaw_sheetE.at[rxn_name,'Species'] = val
    # If literature data exist for protein half-life, set the kTLd accordingly
    gene = val.split("_")[1][0:-3] # to disregard the " ; "
    sps_prot = 'prot_'+gene
    sps_mrna = 'm_'+gene
    tau_p = _literature_half_life(gene, 'Protein half-life average [h]')
    if not np.isnan(tau_p) and tau_p != 0:
        ratelaw_sheetE.at[rxn_name,'Ratelaw'] = str((np.log(2)/tau_p/3600.0))
    else: # if no literature data exist, calculate or set to median value (4.599338e-06) for SPARCED genes
        # kTLd = kTL * mRNA IC / protein IC, using the kTL of the translation rxn with the same number
        ratelaw_sheetE.at[rxn_name,'Ratelaw'] = str(np.double(ratelaw_sheetE.loc['vMMTL'+str(idx+1),'Unnamed: 4'])*np.double(species_updated.loc[sps_mrna,'IC'])/np.double(species_updated.loc[sps_prot,'IC']))
# USER CHECK: Here, also set the STAT1 degradation rate so that it is constant without IFNg stimulation
gene = 'STAT1'
sps_prot = gene
sps_mrna = 'm_'+gene
ratelaw_sheetE.at['vI61','Ratelaw'] = str(np.double(ratelaw_sheetE.loc['vI50','Unnamed: 4'])*np.double(species_updated.loc[sps_mrna,'IC'])/np.double(species_updated.loc[sps_prot,'IC']))

In [None]:
# Save every updated table as tab-separated text (header and index included)
# under the output file names defined at the top of the notebook.
for table, out_path in (
    (comps_sheet, fileComps_Upd),
    (species_updated, fileSpecies_Upd),
    (ratelaw_sheetE, fileRatelaws_Upd),
    (OmicsData, fileOmicsdata_Upd),
    (TARsReadE, fileGeneReg_Upd),
):
    table.to_csv(out_path, sep='\t', header=True, index=True)

In [None]:
# Store the network gene names as a tab-separated table.
# NOTE: netGeneNames is rebound to a DataFrame here, so the cells above expect a
# fresh kernel (they iterate over it as an array of gene names).
netGeneNames = pd.DataFrame(data=netGeneNames)
netGeneNames.to_csv(path_or_buf='NetGeneNames.txt', sep='\t', header=True, index=True)