## Set the environment

In [1]:
import cobra
from cobra import Model, Reaction, Metabolite
from cobra.flux_analysis import flux_variability_analysis
import math
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import glob
import escher
from escher import Builder
from utils import show_map
from utils.check_precursor_problem import check_precursor_problem

## Read the excel that have all gene reaction rule information

In [2]:
df_gene_reaction_rule =pd.read_excel('../data/gene reaction rule.xlsx')
# set the reaction id column as the index in order to map it to the model 
df_gene_reaction_rule = df_gene_reaction_rule.set_index(["reaction_id"],)

## Read the raw transcriptomics of temperature treatment  70 and 80 ℃

In [3]:
xl = pd.ExcelFile('../data/80vs70 ArrayData.xlsx') # read an excel file
names1 = xl.sheet_names  # write all the sheet names in one list

### Format all the data

In [4]:
def get_overall_data (names1):
    df_overall_data = xl.parse("Array_Annotation") # write sheet named "Array_Annotation" in excel into a dataframe
    df_overall_data = df_overall_data.drop(labels=[0,1,2,3,4],axis = 0) # drop the first 5 lines
    df_overall_data.columns = df_overall_data.iloc[0] # set the first line as the column names
    df_overall_data = df_overall_data.drop(labels=[5],axis = 0) # delete the line with label 5
    df_overall_data = df_overall_data.set_index(["UID"],) # set the index
    df_overall_data["SSO number"] = "" # add a new blank column with name 'SSO number'
    for  UID in df_overall_data.index:
        if "SSO" in df_overall_data["ID"].loc[UID]:
            df_overall_data["SSO number"].at[UID] = str(df_overall_data["ID"].loc[UID]).split("_")[0]+"_" # the reason for adding "_" is because some SSO numbers 
            #only has 4 numbers and others has 5 numbers, to avoide the conditions like this (2133 and 21338)
    for name in names1:
        if name not in ["Array_Annotation", "Metadata"]:# the sheet name in the list cannot be processed by the following codes
            df = xl.parse(name)
            df  = df.drop(labels=[0,1,2,3,4,5],axis = 0)
            df.columns = df.iloc[0]
            df = df.drop(labels=[6],axis = 0)
            df = df.set_index(["UID"],)
            column_name_70 = name + " 70" #set the column name to sheet name add temperature
            column_name_80 = name + " 80"
            df_overall_data[column_name_70] = ""
            df_overall_data[column_name_80] = "" # set empty column

            for i in set(df_overall_data.index).intersection(set(df.index)): # find the same items in two set

                df_overall_data[column_name_70].at[i] = df["70°C"].loc[i]
                df_overall_data[column_name_80].at[i] = df["80°C"].loc[i]
        
    return (df_overall_data)
df_overall_data = get_overall_data(names1)
df_overall_data = df_overall_data.set_index(["SSO number"],)
df_overall_data.head()

5,R,C,Name,ID,Array_67 70,Array_67 80,Array_68 70,Array_68 80,Array_86 70,Array_86 80,Array_85 70,Array_85 80
SSO number,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
,1,1,,gi|16206:1-933_A.thaliana_Cab_mRNA_oligo1_553-608,116020.0,85320.0,118800.0,124647.0,86190.0,145044.0,163856.0,89737.0
,1,2,,gi|16206:1-933_A.thaliana_Cab_mRNA_oligo1_553-608,119282.0,96720.0,129168.0,127391.0,99104.0,157584.0,119132.0,87409.0
,1,3,,gi|16470|emb|X14212.1|_A.thaliana_RCA_mRNA_oligo2_1232-1288,,,,,65521.0,149152.0,,
,1,4,,gi|16470|emb|X14212.1|_A.thaliana_RCA_mRNA_oligo2_1232-1288,,,,,,,,
,1,5,,gi|1928871|gb|U91966.1|ATU91966_A.thaliana_rbcL_mRNA_oligo3_1088-1155,,,,,69295.0,155472.0,162288.0,59350.0


### Match transcriptome to the reaction id

In [5]:
def match_data_to_reaction_id(columns): 
    
    for column_name in columns:
        if "Array" in column_name: # select the column with "Array" in its name
            
            df_gene_reaction_rule[column_name] = ""
            # the following list is special because its items' reaction rule is special with both "and" and "or" in it
            list_id_with_special_gene_reaction_rule = ["carb_acetylcoa_1.2.7.1_PYRUFLAVREDUCT__RXN","oxp_redox_1.6.5.3_RXN0__5330",'carb_degra__gly_1.8.1.4_1.4.4.2_2.1.2.10_GCVMULTI__RXN']
            for rxn in df_gene_reaction_rule.index:
                
                if rxn not in list_id_with_special_gene_reaction_rule:
                    if 'spontaneous' not in str(df_gene_reaction_rule['genes'].loc[rxn]):
                        c = str(df_gene_reaction_rule["genes"].loc[rxn]).split(",")
                       
                        if len(c) > 1:# select the reactions catalyzed by enzymes with more subunits or isoenzymes

                            gene_numbers1 = []
                            
                            if "or" in str(df_gene_reaction_rule['gene_reaction_rule'].loc[rxn]):

                                for i in range(len(c)):
                                    e = "SSO" + c[i]+"_"

                                    if e in df_overall_data.index:
                                        
                                        if df_overall_data[column_name].loc[e] !='':
                                            gene_numbers1.append(df_overall_data[column_name].loc[e])
     

                            df_gene_reaction_rule[column_name].loc[rxn] = sum(gene_numbers1)
                    

                            if "and" in str(df_gene_reaction_rule['gene_reaction_rule'].loc[rxn]):               

                                for i in range(len(c)):
                                    e = "SSO" + c[i]+"_"

                                    if e in df_overall_data.index:

                                        gene_numbers1.append(df_overall_data[column_name].loc[e])

                                       
                            if '' in gene_numbers1:
                                df_gene_reaction_rule[column_name].loc[rxn] = ""
                             
                            else:
                                df_gene_reaction_rule[column_name].loc[rxn] = min(gene_numbers1)
                             

                        else:
                            for i in range(len(c)):
                                e = "SSO" + c[i]+"_"

                                if e in df_overall_data.index:
                                    df_gene_reaction_rule[column_name].loc[rxn] = df_overall_data[column_name].loc[e]  
                                    df_gene_reaction_rule[column_name].loc[rxn] = df_overall_data[column_name].loc[e]
                    else:
                        df_gene_reaction_rule[column_name].loc[rxn] = 1000000 # if spontaneous in it, set the number to 1000000.
                        df_gene_reaction_rule[column_name].loc[rxn] = 1000000
                elif rxn in ["oxp_redox_1.6.5.3_RXN0__5330","carb_acetylcoa_1.2.7.1_PYRUFLAVREDUCT__RXN"]:
                    g = df_gene_reaction_rule["genes"].loc[rxn].split("or")
                   
                    gene_numbers1 = []
                    
                    c1 = str(g[0]).split(",")
                    for i in range(len(c1)):

                        e = "SSO" + c1[i]+ "_"

                        if e in df_overall_data.index:
                            
                            gene_numbers1.append(df_overall_data[column_name].loc[e])
                                


                    gene_numbers3 = []
                   
                    c2 = str(g[1]).split(",")
                    for i in range(len(c2)):

                        e = "SSO" + c2[i]+"_"
                        if e in df_overall_data.index:
                        
                            gene_numbers3.append(df_overall_data[column_name].loc[e])


                    if '' not in gene_numbers3:
                        if '' in gene_numbers1:
                            df_gene_reaction_rule[column_name].loc[rxn] = min(gene_numbers3)
                        else:
                            df_gene_reaction_rule[column_name].loc[rxn] = min(gene_numbers3) + min(gene_numbers3)
                    else:
                        
                        if '' in gene_numbers1:
                            df_gene_reaction_rule[column_name].loc[rxn] = ''
                        else:
                            df_gene_reaction_rule[column_name].loc[rxn] = min(gene_numbers1)
                    
                    
                elif rxn == "carb_degra__gly_1.8.1.4_1.4.4.2_2.1.2.10_GCVMULTI__RXN":
                    g = str(df_gene_reaction_rule["genes"].loc[rxn]).split(")")
                    
                    gene_numbers1 = []
                    
                    c1 = str(g[0]).split(",")
                    c1[0] = c1[0][1:]

                    for i in range(len(c1)):

                        e = "SSO" + c1[i]+ "_"

                        if e in df_overall_data.index:

                            if df_overall_data[column_name].loc[e] !='':
                                gene_numbers1.append(df_overall_data[column_name].loc[e])
                            
    
                    gene_numbers3 = []
                    
                    c2 = str(g[1]).split(",")
                    c2[0] = c2[0][1:]
                    for i in range(len(c2)):

                        e = "SSO" + c2[i]+"_"
                        if e in df_overall_data.index:

                            gene_numbers3.append(df_overall_data[column_name].loc[e])
                          

                    
                    
                    if '' in gene_numbers3:
                         df_gene_reaction_rule.loc[rxn,column_name] = ''
                    else:
                        score = min(gene_numbers3)
                        df_gene_reaction_rule.loc[rxn,column_name] = min(sum(gene_numbers1),score)

    return (df_gene_reaction_rule)
list_columns = df_overall_data.columns
df_match_data_to_reaction_id = match_data_to_reaction_id(list_columns)
df_match_data_to_reaction_id.head()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


Unnamed: 0_level_0,Reaction name,pathway,genes,gene_reaction_rule,Unnamed: 5,Unnamed: 6,Array_67 70,Array_67 80,Array_68 70,Array_68 80,Array_86 70,Array_86 80,Array_85 70,Array_85 80
reaction_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
T_ABC__transporter_beta__D__glucose,ABC transport,,28473066284830672849306828503069,and,,,142229,63180,,,196101,99840,215040,141324
carb_entner_1.1.1.359_GLUCOSE__1__DEHYDROGENASE__NAD+__RXN,GDH,,300330423204,or,,,105265,111072,134460.0,107596.0,134495,154112,150784,92612
carb_entner_1.1.1.47_GLUCOSE__1__DEHYDROGENASE__NAD+__RXN,GDH,,300330423204,or,,,105265,111072,134460.0,107596.0,134495,154112,150784,92612
carb_entner_1.1.1.360_GLUCOSE__1__DEHYDROGENASE__NADP+__RXN,GDH,,300330423204,or,,,105265,111072,134460.0,107596.0,134495,154112,150784,92612
carb_entner_3.1.1.17_GLUCONOLACT__RXN,GL,,spontaneous,,,,1000000,1000000,1000000.0,1000000.0,1000000,1000000,1000000,1000000


### Calculate the average

In [6]:
df_match_data_to_reaction_id.to_csv("../data/matched_transcriptone_data_to_reaction_id.csv")

In [7]:
df_match_data_to_reaction_id =pd.read_csv('../data/matched_transcriptone_data_to_reaction_id.csv')
list_columns_70 = [i for i in df_match_data_to_reaction_id.columns if " 70" in i]
list_columns_80 = [i for i in df_match_data_to_reaction_id.columns if " 80" in i]
df_match_data_to_reaction_id["average_list_column_70"] = df_match_data_to_reaction_id[list_columns_70].mean(axis= 1) # calculate the average values of different columns
df_match_data_to_reaction_id["average_list_column_80"] = df_match_data_to_reaction_id[list_columns_80].mean(axis= 1)
df_match_data_to_reaction_id.head()

Unnamed: 0,reaction_id,Reaction name,pathway,genes,gene_reaction_rule,Unnamed: 5,Unnamed: 6,Array_67 70,Array_67 80,Array_68 70,Array_68 80,Array_86 70,Array_86 80,Array_85 70,Array_85 80,average_list_column_70,average_list_column_80
0,T_ABC__transporter_beta__D__glucose,ABC transport,,28473066284830672849306828503069,and,,,142229.0,63180.0,,,196101.0,99840.0,215040.0,141324.0,184456.666667,101448.0
1,carb_entner_1.1.1.359_GLUCOSE__1__DEHYDROGENASE__NAD+__RXN,GDH,,300330423204,or,,,105265.0,111072.0,134460.0,107596.0,134495.0,154112.0,150784.0,92612.0,131251.0,116348.0
2,carb_entner_1.1.1.47_GLUCOSE__1__DEHYDROGENASE__NAD+__RXN,GDH,,300330423204,or,,,105265.0,111072.0,134460.0,107596.0,134495.0,154112.0,150784.0,92612.0,131251.0,116348.0
3,carb_entner_1.1.1.360_GLUCOSE__1__DEHYDROGENASE__NADP+__RXN,GDH,,300330423204,or,,,105265.0,111072.0,134460.0,107596.0,134495.0,154112.0,150784.0,92612.0,131251.0,116348.0
4,carb_entner_3.1.1.17_GLUCONOLACT__RXN,GL,,spontaneous,,,,1000000.0,1000000.0,1000000.0,1000000.0,1000000.0,1000000.0,1000000.0,1000000.0,1000000.0,1000000.0


In [8]:
df_match_data_to_reaction_id.to_csv("../data/reaction_id_RAS.csv")