In [11]:
import pandas as pd
import numpy as np
from scipy import stats

In [12]:
# Read the gene expression table. Column names are taken from the third line of the
# file (header=2), and the "Gene" column becomes the row index.
exprData = pd.read_csv("microtomGeneExpression.csv", header=2).set_index("Gene")

# Columns from position 5 onward hold the values that get normalized; the first five
# columns are carried through untouched (presumably annotation columns -- TODO confirm).
onlyExprData = exprData.iloc[:, 5:]

# Z-score along axis=1, i.e. each row (gene) is standardized across its own columns.
# The result is re-wrapped so the row and column labels are preserved.
onlyExprDataTF = pd.DataFrame(
    stats.zscore(onlyExprData, axis=1),
    columns=onlyExprData.columns,
    index=onlyExprData.index,
)

# Re-attach the five leading columns in front of the normalized values
exprDataTF = pd.concat([exprData.iloc[:, :5], onlyExprDataTF], axis=1)

In [13]:
# Pearson correlation of every gene's normalized expression profile against one target
# gene. The result is a single-column DataFrame (one row per gene), not a full matrix.
onlyExprDataTF = exprDataTF.iloc[:, 5:]

# Look the target profile up once instead of on every loop iteration
targetProfile = onlyExprDataTF.loc["Solyc05g005150.1"]

# np.corrcoef on two vectors returns a 2x2 matrix; [1, 0] is the cross-correlation term.
# Kept as one np.corrcoef call per row so the computed values are unchanged.
targetPCC = np.array(
    [np.corrcoef(targetProfile, geneProfile)[1, 0] for _, geneProfile in onlyExprDataTF.iterrows()],
    dtype=float,
)

exprPCC = pd.DataFrame(data=targetPCC.reshape(-1, 1), index=exprDataTF.index, columns=["Solyc05g005150.1"])
exprPCC.head()

Unnamed: 0_level_0,Solyc05g005150.1
Gene,Unnamed: 1_level_1
Solyc00g005000.3,0.684711
Solyc00g005040.3,-0.295151
Solyc00g005050.3,-0.52559
Solyc00g005080.2,0.450393
Solyc00g005084.1,-0.21343


In [14]:
# Correlation thresholds: above highCutoff is a positive edge, below lowCutoff a negative edge
highCutoff = 0.85
lowCutoff = -0.85

# Gene that every edge in this cell points to (the single column of exprPCC)
targetGene = "Solyc05g005150.1"

# Collect the edges as plain records and build the DataFrame once at the end.
# DataFrame.append was removed in pandas 2.0 and was quadratic when called in a loop.
edgeRecords = []
for gene, pcc in zip(exprPCC.index, exprPCC.iloc[:, 0]):
    # Skip the target's correlation with itself. This is checked by name because a
    # floating-point self-correlation is not guaranteed to be exactly 1.0.
    if gene == targetGene:
        continue
    if pcc > highCutoff:
        direction = "+"
    elif pcc < lowCutoff:
        direction = "-"
    else:
        # Below both thresholds (or NaN, for which both comparisons are False): no edge
        continue
    edgeRecords.append({"Gene1": gene, "Gene2": targetGene, "Direction": direction})

# Passing columns explicitly keeps the three expected columns even when no edge was found
exprEdges = pd.DataFrame(edgeRecords, columns=["Gene1", "Gene2", "Direction"])
exprEdges = exprEdges.drop_duplicates() # If run is split up, need to remove duplicates

In [15]:
# Determine if there are any edges between genes included in exprEdges:
# build the pairwise Pearson correlation matrix of the genes listed in exprEdges.Gene1.
edgeGenes = exprEdges.Gene1

if len(edgeGenes) == 0:
    # No genes passed the cutoffs: keep an empty 0x0 matrix rather than calling corrcoef on nothing
    pairwisePCC = np.zeros(shape=(0, 0))
else:
    # np.corrcoef treats each row as one variable, so a single call on the stacked profiles
    # gives the whole gene x gene matrix instead of one call per pair.
    edgeProfiles = onlyExprDataTF.loc[list(edgeGenes)].to_numpy(dtype=float)
    # With a single gene np.corrcoef returns a scalar; atleast_2d restores the 1x1 shape
    pairwisePCC = np.atleast_2d(np.corrcoef(edgeProfiles))

exprPCC = pd.DataFrame(pairwisePCC, index=edgeGenes, columns=edgeGenes)

In [16]:
# Keep only the strict upper triangle (k=1): everything on or below the diagonal is zeroed,
# so each unordered gene pair is considered once and self-pairs are ignored.
upperExprPCC = pd.DataFrame(np.triu(exprPCC.to_numpy(), k=1), index=exprPCC.index, columns=exprPCC.columns)

upperValues = upperExprPCC.to_numpy()
isPositive = upperValues > highCutoff
isNegative = upperValues < lowCutoff

# np.nonzero reports positions in row-major order, the same order a nested
# row/column loop would visit them.
rowPos, colPos = np.nonzero(isPositive | isNegative)
geneGeneEdges = pd.DataFrame({
    "Gene1": upperExprPCC.index[rowPos].to_numpy(),
    "Gene2": upperExprPCC.columns[colPos].to_numpy(),
    # The positive test takes priority, matching an if/elif on the two cutoffs
    "Direction": np.where(isPositive[rowPos, colPos], "+", "-"),
})

# Add all new edges in one concat (DataFrame.append was removed in pandas 2.0).
# exprEdges is left untouched when no gene-gene edge passed the cutoffs.
if not geneGeneEdges.empty:
    exprEdges = pd.concat([exprEdges, geneGeneEdges], ignore_index=True)
exprEdges = exprEdges.drop_duplicates() # If run is split up, need to remove duplicates

In [17]:
# Load the metabolite-gene edge dataset and filter it to the edges that involve the
# target gene's metabolites and the genes present in exprEdges.
targetGene = "Solyc05g005150.1"
metExprEdges = pd.read_csv(filepath_or_buffer="metExprEdges.csv", header=0, index_col=0)

# Edges whose Gene is the target gene itself
metExprEdgesSubset = metExprEdges[metExprEdges.Gene == targetGene]
# Every edge of any metabolite that appears in the target gene's edges
metExprEdgesFiltered = metExprEdges[metExprEdges.Metabolite.isin(metExprEdgesSubset.Metabolite)]

# All genes that occur on either side of a gene-gene edge
uniqueGenes = list(set(exprEdges.Gene1) | set(exprEdges.Gene2))

# Edges between those metabolites and the other genes in exprEdges. Rows for the target gene
# are left out here: they are already in metExprEdgesSubset, and uniqueGenes contains the
# target (via Gene2), so including them would write each of those edges twice.
otherGeneEdges = metExprEdgesFiltered[
    metExprEdgesFiltered.Gene.isin(uniqueGenes) & (metExprEdgesFiltered.Gene != targetGene)
]

# pd.concat replaces DataFrame.append, which was removed in pandas 2.0
metExprEdgesSubset = pd.concat([metExprEdgesSubset, otherGeneEdges])

In [18]:
# Write both edge tables to CSV files in the f-boxNetwork/ folder for network drawing
for edgeTable, csvPath in (
    (exprEdges, "f-boxNetwork/exprEdges.csv"),
    (metExprEdgesSubset, "f-boxNetwork/metExprEdges.csv"),
):
    edgeTable.to_csv(csvPath)