In [1]:
import numpy as np
import pandas as pd
from pathlib import Path
import pickle
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns
import os
from pylab import savefig

In [2]:
# Load the clean data into variables
DataPath = "Processed Data/"

dicts = ['EFI_ID_List', 'metabolite_dict', 'Protein_seq_dict']

with open(DataPath + 'EFI_ID_List.p', 'rb') as EFI_ID:
    EFI_ID_List = pickle.load(EFI_ID)

with open(DataPath + 'metabolite_dict.p', 'rb') as metabolite:
    metabolite_dict = pickle.load(metabolite)

with open(DataPath + 'Protein_seq_dict.p', 'rb') as Protein_seq:
    Protein_seq_dict = pickle.load(Protein_seq)

activations = pd.read_csv(DataPath + 'activations.csv', index_col=0)

In [3]:
def Heatmap(df=activations, xlabel="Protein Index", ylabel="Substrate Index", dpi=700, xtick=False):
    # Generate heatmap of raw data and activations
    plt.figure(dpi=dpi)
    heatmap = plt.imshow(df) # Creates heatmap using input dataframe/array
    plt.xlabel(xlabel)
    plt.ylabel(ylabel)
    plt.jet() # Specifies the "jet" colormap
    clb = clb = plt.colorbar() # Displayes col
    clb.ax.set_ylabel('Vmax values of activity',rotation=270, labelpad=13)

    if xtick == True:
        loc, labels = plt.xticks()
        plt.xticks(labels=df.columns)

    # Need to choose to show heat map or savefig, cannot do both in one operation
    # It turns out that savefig also displays the figure, who knew?

    # plt.show(heatmap)
    # plt.close()
    plt.savefig('HeatMap.png')

def Heatmap_Vector(df=activations.iloc[0,:], ylabel="Protein Index", xlabel="Substrate Index", my_dpi=400, show_values=True, showfig=False, savefig=False):
    # Generate heatmap from data contained in a vector using seaborn
    fig, ax = plt.subplots(figsize=40, dpi=my_dpi)

    sns.heatmap(data=df,   
                cmap='jet',
                linewidths=0,
                annot=show_values,
                yticklabels=True,
                cbar_kws={"shrink":10.0, "anchor":(0.0, 1.0)},
                ax=ax)
    
    
    ax.set(title='Ranked Protein Activity',xlabel=xlabel, ylabel=ylabel)
    
    if showfig == True:
        plt.show(ax)

    if savefig == True:
        path = "Heatmap_Vector_Plots/"
        plt.savefig(path + str(xlabel) + ".png", dpi=my_dpi, bbox_inches='tight')

    plt.close(fig)

def SortNormalize(df, SID):
    # Sort values in dataframe, normalize data, retain labels on protein ID
    df = df.sort_values(by=SID, ascending=False) # Sort values in descending order
    # Still need to add data normalization feature
    print(df[df[SID] > 0].count()) # Print count of non-zero datapoints
    return df

# Heatmap()

In [4]:
def SaveSubstrate(df=activations, SID=0, Subs_list=list(metabolite_dict.values())):
    df = pd.DataFrame(df.iloc[SID,:]) # Obtains vector from dataframe based on substrate needed for analysis

    df = SortNormalize(df=df,SID=SID)
    substrate = Subs_list[SID]

    Heatmap_Vector(df=df, show_values=False, xlabel=substrate, savefig=True)

'''
SID = np.random.randint(0,168)
print(SID)
AnalyzeSubstrate(df=activations, SID=SID)
'''

'\nSID = np.random.randint(0,168)\nprint(SID)\nAnalyzeSubstrate(df=activations, SID=SID)\n'

In [5]:
# This section is pretty dangerous to run, will take a long time, also needs a restart after
'''
for i in range(0,len(activations.index)):
    SaveSubstrate(df=activations, SID=i)
'''

'\nfor i in range(0,len(activations.index)):\n    SaveSubstrate(df=activations, SID=i)\n'

In [6]:
def RankAll(df=activations):
    # Reorder dataframe based on protein with the highest activity score
    # Activity score is calculated by total activity across substrates
    score = df.sum(axis=0)
    # print(score)
    score_sort = score.sort_values(ascending=False)
    # print(score_sort)
    # Reorder dataframe from left to right
    df = pd.concat([df, score_sort.to_frame().T], ignore_index=True)
    sorted_df = df.sort_values(df.last_valid_index(), axis=1, ascending=False)
    sorted_df.drop(df.tail(1).index,inplace=True)
    return sorted_df, score_sort

sorted_df, _ = RankAll()
# Show heatmap of ranked activities
# Heatmap_Vector(sorted_df)

# Obtain list of ranked proteins, including activities

In [7]:
def Heatmap_Ranked(df=activations.iloc[0,:], xlabel="Protein Index", ylabel="Substrate Index", my_dpi=400, show_values=False, showfig=False, savefig=False):
    # Generate heatmap from data contained in a vector using seaborn
    fig, ax = plt.subplots(dpi=my_dpi)

    sns.heatmap(data=df,   
                cmap='jet',
                linewidths=0,
                annot=show_values,
                yticklabels=False,
                # cbar_kws={"shrink":10.0, "anchor":(0.0, 1.0)},
                ax=ax)
    
    
    ax.set(title='Ranked Protein Activity',xlabel=xlabel, ylabel=ylabel)
    
    if showfig == True:
        plt.show(ax)

    if savefig == True:
        # path
        plt.savefig("Ranked Protein Activity.png", dpi=my_dpi, bbox_inches='tight')

    plt.close(fig)

# Heatmap_Ranked(sorted_df, my_dpi=700, showfig=False, savefig=True)

In [27]:
def RankProteinBySubstrate(df=activations, percentile=25, substrates=list(metabolite_dict.values())):

        # Note: Percentile is from the bottom
    percentile = 100 - percentile
    
    my_dict = {}

    for i in range(0,len(df.index)):
        # Rank proteins on a per substrate basis
        table = df.iloc[i,:]
        table = table.where(table > 0).dropna() # remove 0 values from threshold calculation
        table = table.sort_values(ascending=False)
        
        # Identify the top 25 percentile, or whatever is set
        threshold = table.quantile(percentile/100)
        table = table.where(table > threshold).dropna()
        # print(threshold)
        # print(table)

        # We can change the scoring method later, use this for now
        score = table.values.sum()
        substrate = substrates[i]

        table = table.to_frame()
        df2 = pd.DataFrame({0:[score]})
        table = table.append(df2, ignore_index=False)
        table = table.rename(columns={0:str(substrate)+'Activity'})

        my_dict[str(substrate)] = [table, score]

    '''This is the score table generator'''
    data = pd.DataFrame()
    for i in Dict:
        score = Dict[i][1]
        substrate = i
        data = data.append([[substrate,score]],ignore_index=True)
    data = data.rename(columns={0:'Substrate', 1:'Score'})
    score_table = data


    return my_dict, score_table

# Dict = RankProteinBySubstrate(percentile=10)

In [34]:
Dict, Score_Table = RankProteinBySubstrate(percentile=10)
'''
data = pd.DataFrame(columns='Substrate','Score')
for i in Dict:
    score = Dict[i][1]
    substrate = i
    data = data.append([[substrate,score]],ignore_index=True)
data'''
Score_Table.sort_values(by='Score',ascending=False).iloc[0:10,:]

Unnamed: 0,Substrate,Score
167,PNPP,11.929
80,5'-AMP,11.4835
94,UMP,11.3126
70,IMP,11.291
82,dTMP,11.2744
81,dUMP,11.1376
62,2'-deoxyribose-5-phosphate,11.1334
137,2'-deoxy-D-glucose-6-phosphate,11.105
13,Glycerol-phosphate,11.0587
51,L-arabitol-1-phosphate,10.963


In [36]:
df2 = Dict['PNPP'][0]
Heatmap_Vector(df=df2)

TypeError: from_bounds() argument after * must be an iterable, not int