In [2]:
import numpy as np
import pandas as pd
from pathlib import Path
import pickle
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns
import os
from pylab import savefig

In [3]:
# Load the clean data into variables
DataPath = "Processed Data/"

dicts = ['EFI_ID_List', 'metabolite_dict', 'Protein_seq_dict']

with open(DataPath + 'EFI_ID_List.p', 'rb') as EFI_ID:
    EFI_ID_List = pickle.load(EFI_ID)

with open(DataPath + 'metabolite_dict.p', 'rb') as metabolite:
    metabolite_dict = pickle.load(metabolite)

with open(DataPath + 'Protein_seq_dict.p', 'rb') as Protein_seq:
    Protein_seq_dict = pickle.load(Protein_seq)

activations = pd.read_csv(DataPath + 'activations.csv', index_col=0)

In [4]:
def Heatmap(df=activations, xlabel="Protein Index", ylabel="Substrate Index", dpi=700, xtick=False):
    # Generate heatmap of raw data and activations
    plt.figure(dpi=dpi)
    heatmap = plt.imshow(df) # Creates heatmap using input dataframe/array
    plt.xlabel(xlabel)
    plt.ylabel(ylabel)
    plt.jet() # Specifies the "jet" colormap
    clb = clb = plt.colorbar() # Displayes col
    clb.ax.set_ylabel('Vmax values of activity',rotation=270, labelpad=13)

    if xtick == True:
        loc, labels = plt.xticks()
        plt.xticks(labels=df.columns)

    # Need to choose to show heat map or savefig, cannot do both in one operation
    # It turns out that savefig also displays the figure, who knew?

    # plt.show(heatmap)
    # plt.close()
    plt.savefig('HeatMap.png')

def Heatmap_Vector(df=activations.iloc[0,:], ylabel="Protein Index", xlabel="Substrate Index", my_dpi=400, show_values=True, showfig=False, savefig=False):
    # Generate heatmap from data contained in a vector using seaborn
    fig, ax = plt.subplots(figsize=40, dpi=my_dpi)

    sns.heatmap(data=df,   
                cmap='jet',
                linewidths=0,
                annot=show_values,
                yticklabels=True,
                cbar_kws={"shrink":10.0, "anchor":(0.0, 1.0)},
                ax=ax)
    
    
    ax.set(title='Ranked Protein Activity',xlabel=xlabel, ylabel=ylabel)
    
    if showfig == True:
        plt.show(ax)

    if savefig == True:
        path = "Heatmap_Vector_Plots/"
        plt.savefig(path + str(xlabel) + ".png", dpi=my_dpi, bbox_inches='tight')

    plt.close(fig)

def SortNormalize(df, SID):
    # Sort values in dataframe, normalize data, retain labels on protein ID
    df = df.sort_values(by=SID, ascending=False) # Sort values in descending order
    # Still need to add data normalization feature
    print(df[df[SID] > 0].count()) # Print count of non-zero datapoints
    return df

# Heatmap()

In [5]:
def SaveSubstrate(df=activations, SID=0, Subs_list=list(metabolite_dict.values())):
    df = pd.DataFrame(df.iloc[SID,:]) # Obtains vector from dataframe based on substrate needed for analysis

    df = SortNormalize(df=df,SID=SID)
    substrate = Subs_list[SID]

    Heatmap_Vector(df=df, show_values=False, xlabel=substrate, savefig=True)

'''
SID = np.random.randint(0,168)
print(SID)
AnalyzeSubstrate(df=activations, SID=SID)
'''

'\nSID = np.random.randint(0,168)\nprint(SID)\nAnalyzeSubstrate(df=activations, SID=SID)\n'

In [6]:
# This section is pretty dangerous to run, will take a long time, also needs a restart after
'''
for i in range(0,len(activations.index)):
    SaveSubstrate(df=activations, SID=i)
'''

'\nfor i in range(0,len(activations.index)):\n    SaveSubstrate(df=activations, SID=i)\n'

In [7]:
def RankAll(df=activations):
    # Reorder dataframe based on protein with the highest activity score
    # Activity score is calculated by total activity across substrates
    score = df.sum(axis=0)
    # print(score)
    score_sort = score.sort_values(ascending=False)
    # print(score_sort)
    # Reorder dataframe from left to right
    df = pd.concat([df, score_sort.to_frame().T], ignore_index=True)
    sorted_df = df.sort_values(df.last_valid_index(), axis=1, ascending=False)
    sorted_df.drop(df.tail(1).index,inplace=True)
    return sorted_df, score_sort

sorted_df, _ = RankAll()
# Show heatmap of ranked activities
# Heatmap_Vector(sorted_df)

# Obtain list of ranked proteins, including activities

In [8]:
def Heatmap_Ranked(df=activations.iloc[0,:], xlabel="Protein Index", ylabel="Substrate Index", my_dpi=400, show_values=False, showfig=False, savefig=False):
    # Generate heatmap from data contained in a vector using seaborn
    fig, ax = plt.subplots(dpi=my_dpi)

    sns.heatmap(data=df,   
                cmap='jet',
                linewidths=0,
                annot=show_values,
                yticklabels=False,
                # cbar_kws={"shrink":10.0, "anchor":(0.0, 1.0)},
                ax=ax)
    
    
    ax.set(title='Ranked Protein Activity',xlabel=xlabel, ylabel=ylabel)
    
    if showfig == True:
        plt.show(ax)

    if savefig == True:
        # path
        plt.savefig("Ranked Protein Activity.png", dpi=my_dpi, bbox_inches='tight')

    plt.close(fig)

# Heatmap_Ranked(sorted_df, my_dpi=700, showfig=False, savefig=True)

In [31]:
def RankProteinBySubstrate(df=activations, percentile=25, substrates=list(metabolite_dict.values())):

        # Note: Percentile is from the bottom
    percentile = 100 - percentile
    
    my_dict = {}

    for i in range(0,len(df.index)):
        # Rank proteins on a per substrate basis
        table = df.iloc[i,:]
        table = table.where(table > 0).dropna() # remove 0 values from threshold calculation
        table = table.sort_values(ascending=False)
        
        # Identify the top 25 percentile, or whatever is set
        threshold = table.quantile(percentile/100)
        table = table.where(table > threshold).dropna()
        # print(threshold)
        # print(table)

        # We can change the scoring method later, use this for now
        score = table.values.sum()
        substrate = substrates[i]

        table = table.to_frame()
        df2 = pd.DataFrame({0:[score]})
        table = table.append(df2, ignore_index=False)
        table = table.rename(columns={0:str(substrate)+'Activity'})

        my_dict[str(substrate)] = [table, score]
        
    return my_dict

Dict = RankProteinBySubstrate(percentile=10)

{'Blank': [             0
  501392  0.9825
  900327  0.6505
  508016  0.4290
  900334  0.4165
  900159  0.3190
  900176  0.3020
  508252  0.1815
  508069  0.1720
  508513  0.1540
  508525  0.1525
  502329  0.1436
  508485  0.1380
  508537  0.1280
  508563  0.1255
  502358  0.1217
  900099  0.1180
  900122  0.1090
  501188  0.1050
  508465  0.1045
  508464  0.0970
  900094  0.0960
  0       5.0458,
  5.045800000000001],
 'phosphoenolpyruvate': [             0       1
  508473     NaN  0.5625
  501348     NaN  0.5480
  501280     NaN  0.4125
  501309     NaN  0.3360
  508537     NaN  0.3175
  508485     NaN  0.3080
  900129     NaN  0.3005
  508534     NaN  0.2655
  900135     NaN  0.2345
  501188     NaN  0.2220
  508525     NaN  0.2110
  501272     NaN  0.2110
  900128     NaN  0.1835
  508513     NaN  0.1705
  501193     NaN  0.1660
  508533     NaN  0.1555
  508539     NaN  0.1520
  0       4.7565     NaN,
  4.7565],
 'Glycerol-2-phospate': [            0       2
  508537    NaN  1.2

In [30]:
x = activations.iloc[0,:].where(activations.iloc[3,:] > 0.5).dropna().to_frame()
df2 = pd.DataFrame({0:[3]})
x = x.append(df2, ignore_index=False)
x.rename(columns={0:"Guacamole"})

# x.append(df2, ignore_index=False)

Unnamed: 0,Guacamole
501150,0.0055
501206,0.0625
501394,0.0415
502345,0.0335
508464,0.097
508485,0.138
900146,0.0195
900153,0.0415
900166,0.0355
900199,0.022
