In [None]:
import os
import pickle
import random
import sys
import warnings

import matplotlib
import matplotlib.pyplot as plt
import networkx as nx
import numpy as np
import pandas as pd
import seaborn as sns
warnings.simplefilter(action='ignore', category=FutureWarning)

sns.set_theme()
np.random.seed(0)

In [None]:
"""
Helper Functions
"""

def check_directory_exists(dir_name):
    """Create the directory ``dir_name`` unless something already exists at that path.

    Missing parent directories are created as well (``os.makedirs``).
    Nothing is returned.
    """
    if os.path.exists(dir_name):
        return
    os.makedirs(dir_name)

def save_to_text_file(output_folder_dest, input_data, text_file_name):
    """Write ``input_data`` to ``<output_folder_dest><text_file_name>.txt``.

    A pandas DataFrame is rendered with ``DataFrame.to_string()``; any other
    object is rendered with ``str()``.

    ``output_folder_dest`` is concatenated directly with the file name, so it
    must already end with a path separator.
    """
    text_file_output = output_folder_dest + text_file_name + ".txt"

    # Render first so a failure while formatting does not leave an empty file behind.
    if isinstance(input_data, pd.DataFrame):
        rendered_text = input_data.to_string()
    else:
        rendered_text = str(input_data)

    # The context manager closes the handle even if the write raises.
    with open(text_file_output, 'w') as text_file:
        text_file.write(rendered_text)

    print("Constructed and saved", text_file_output)

def read_pickle_file(file_path):
    """Load and return the pickled object stored at ``file_path``.

    The file is read with ``pandas.read_pickle``. If ``file_path`` does not
    exist, the process exits through ``sys.exit`` with an explanatory message.
    """
    if os.path.exists(file_path):
        return pd.read_pickle(file_path)
    sys.exit("Can't locate input file %s" % file_path)
    
def save_to_pickle_file(output_folder_dest, dict_data, dict_file_name):
    """Pickle ``dict_data`` to ``<output_folder_dest><dict_file_name>.pkl``.

    The object is serialised with ``pickle.HIGHEST_PROTOCOL``.
    ``output_folder_dest`` is concatenated directly with the file name, so it
    must already end with a path separator.

    Relies on the module-level ``import pickle`` in the notebook's import cell.
    """
    output_dict_filename = output_folder_dest + dict_file_name + '.pkl'
    with open(output_dict_filename, 'wb') as pickle_handle:
        pickle.dump(dict_data, pickle_handle, protocol=pickle.HIGHEST_PROTOCOL)
    print("Constructed and saved", output_dict_filename)

def read_csv_file(file_path, input_sep=',', input_delimiter=None, input_index_col=None, input_dtype=None, input_delim_whitespace=False, input_low_memory=True):
    """Read a delimited text file into a DataFrame.

    Parameters
    ----------
    file_path : path of the file to read. If it does not exist the process
        exits through ``sys.exit`` with an explanatory message.
    input_sep : column separator, used when neither ``input_delimiter`` nor
        ``input_delim_whitespace`` is given.
    input_delimiter : alternative name for the separator; takes precedence
        over ``input_sep`` when it is not ``None``.
    input_index_col, input_dtype, input_low_memory : forwarded to
        ``pandas.read_csv`` as ``index_col``, ``dtype`` and ``low_memory``.
    input_delim_whitespace : when true, columns are split on runs of
        whitespace and the two separator arguments are ignored.

    Returns
    -------
    pandas.DataFrame
    """
    if not os.path.exists(file_path):
        sys.exit("Can't locate input file %s" % file_path)

    # pandas rejects `sep` combined with `delimiter` or `delim_whitespace`, so the
    # three inputs are resolved into the single `sep` argument here. r"\s+" is the
    # documented replacement for the deprecated `delim_whitespace=True`.
    if input_delim_whitespace:
        separator = r"\s+"
    elif input_delimiter is not None:
        separator = input_delimiter
    else:
        separator = input_sep

    return pd.read_csv(filepath_or_buffer=file_path, sep=separator, index_col=input_index_col, dtype=input_dtype, low_memory=input_low_memory)


def save_to_csv_file(output_folder_dest, df, csv_file_name, input_index=False):
    """Write ``df`` to ``<output_folder_dest><csv_file_name>.csv`` via ``df.to_csv``.

    ``input_index`` is forwarded as ``index`` (whether the index is written).
    ``output_folder_dest`` is concatenated directly with the file name, so it
    must already end with a path separator.
    """
    output_filename = output_folder_dest + csv_file_name + ".csv"
    df.to_csv(output_filename, index=input_index)
    print("Constructed and saved", output_filename)

def save_to_gpickle_file(output_folder_dest, data, file_name):
    """Pickle ``data`` (typically a NetworkX graph) to ``<output_folder_dest><file_name>.gpickle``.

    ``nx.write_gpickle`` was removed in NetworkX 3.0. Its documented
    replacement is a plain ``pickle.dump`` with ``pickle.HIGHEST_PROTOCOL``,
    which is what is used here and which works on every NetworkX version.

    ``output_folder_dest`` is concatenated directly with the file name, so it
    must already end with a path separator.
    """
    output_filename = output_folder_dest + file_name + ".gpickle"
    with open(output_filename, 'wb') as gpickle_handle:
        pickle.dump(data, gpickle_handle, protocol=pickle.HIGHEST_PROTOCOL)
    print("Constructed and saved", output_filename)
    
def read_networkx_gpickle_file(input_file_name):
    """Load and return the pickled object (typically a NetworkX graph) at ``input_file_name``.

    ``nx.read_gpickle`` was removed in NetworkX 3.0. Its documented
    replacement is a plain ``pickle.load``, which is what is used here.

    If ``input_file_name`` does not exist, the process exits through
    ``sys.exit`` with an explanatory message.
    """
    if not os.path.exists(input_file_name):
        sys.exit("Can't locate input file %s" % input_file_name)
    # NOTE: unpickling can execute arbitrary code; only load files from a trusted source.
    with open(input_file_name, 'rb') as gpickle_handle:
        return pickle.load(gpickle_handle)

In [None]:
def plot_histplot(output_folder_dest, fig_size, data_df, x_col, plot_series=False, x_label="", input_kde=True, input_color="black", set_axis=False, axis_array=[], input_label="", graph_title=""):
    """Draw a seaborn histogram and save it as ``<output_folder_dest><graph_title>_histplot.png``.

    When ``plot_series`` is false, ``x_col`` is passed to ``sns.histplot`` as
    ``x``; when it is true, ``data_df`` is passed on its own and ``x_col`` is
    not used. If ``set_axis`` is true, the first four entries of
    ``axis_array`` are applied as ``xmin, xmax, ymin, ymax``. The y axis is
    always labelled "Count". The figure is closed after saving.
    """
    plt.figure(figsize=fig_size)

    hist_options = {"data": data_df, "kde": input_kde, "color": input_color, "label": input_label}
    if not plot_series:
        hist_options["x"] = x_col
    sns.histplot(**hist_options)
    plt.legend()

    if set_axis:
        x_low = axis_array[0]
        x_high = axis_array[1]
        y_low = axis_array[2]
        y_high = axis_array[3]
        plt.axis(xmin=x_low, xmax=x_high, ymin=y_low, ymax=y_high)

    plt.xlabel(x_label, fontsize=18)
    plt.ylabel("Count", fontsize=18)
    plt.title(graph_title, fontsize=25)

    output_filename = output_folder_dest + graph_title + "_histplot.png"
    plt.savefig(output_filename, facecolor='w', edgecolor='w', transparent=False)
    plt.close()
    print("Constructed and saved", output_filename)
    
def plot_pathfx_vs_string_histplot(output_folder_dest, data_df_one, data_df_two, x_col_one, x_col_two, x_label="", y_label="", fig_size=(15, 10), input_kde=True, input_colors=["blue","red"], set_axis=False, axis_array=[], input_labels=["PathFX","STRING"], graph_title=""):
    """Overlay two histograms on one figure and save it as ``<output_folder_dest><graph_title>_histplot.png``.

    The first histogram is drawn from ``data_df_one[x_col_one]`` and the second
    from ``data_df_two[x_col_two]``. Each uses the matching entry of
    ``input_colors`` and ``input_labels``, so both need at least two entries.
    The legend labels were previously hard-coded to "PathFX"/"STRING"; those
    remain the defaults, so existing calls produce the same figure.

    If ``set_axis`` is true, the first four entries of ``axis_array`` are
    applied as ``xmin, xmax, ymin, ymax``. ``graph_title`` is used only for the
    output file name; no title is drawn on the figure. The figure is closed
    after saving.
    """
    plt.figure(figsize=fig_size)
    sns.histplot(data=data_df_one[x_col_one], kde=input_kde, color=input_colors[0], label=input_labels[0])
    sns.histplot(data=data_df_two[x_col_two], kde=input_kde, color=input_colors[1], label=input_labels[1])
    plt.legend()
    if set_axis:
        plt.axis(xmin=axis_array[0], xmax=axis_array[1], ymin=axis_array[2], ymax=axis_array[3])
    plt.xlabel(x_label, fontsize=18)
    plt.ylabel(y_label, fontsize=18)
    output_filename = output_folder_dest + graph_title + "_histplot.png"
    plt.savefig(output_filename, facecolor='w', edgecolor='w', transparent=False)
    plt.close()
    print("Constructed and saved", output_filename)
    
def plot_vertical_barplot(output_folder_dest, fig_size, data_df, x_col, y_col, hue_col, graph_title):
    """Draw a seaborn bar plot and save it as ``<output_folder_dest><graph_title>_vbarplot.png``.

    Bars are drawn from ``data_df`` with ``x_col`` on the x axis, ``y_col`` on
    the y axis and ``hue_col`` as hue, with ``ci=None``. The x tick labels are
    rotated by 90 degrees, and both axes are labelled with their column names.
    A legend is placed in the upper right only when ``hue_col`` is truthy.
    The figure is closed after saving.
    """
    plt.figure(figsize=fig_size)

    bar_axes = sns.barplot(data=data_df, x=x_col, y=y_col, hue=hue_col, ci=None)
    tick_labels = bar_axes.get_xticklabels()
    bar_axes.set_xticklabels(tick_labels, rotation=90, ha="right")

    plt.xlabel(x_col, fontsize=18)
    plt.ylabel(y_col, fontsize=18)
    plt.title(graph_title, fontsize=25)

    if hue_col:
        plt.legend(loc='upper right')

    output_filename = output_folder_dest + graph_title + "_vbarplot.png"
    plt.savefig(output_filename, facecolor='w', edgecolor='w', transparent=False)
    plt.close()
    print("Constructed and saved", output_filename)

In [None]:
# Bind this notebook's working inputs from the `transfer_*` variables.
# NOTE(review): none of the `transfer_*` names are defined in the cells shown here, so they must
# already exist in the kernel namespace before this cell runs (presumably set by a driver
# notebook or script that runs this one — TODO confirm). On a fresh kernel this cell raises NameError.
input_reduced_string_score_df = transfer_string_score_df  # frame filtered and binned on "Avg Physical Combined Score" below
input_bin_list = transfer_bin_list  # bin edges; consecutive pairs become [lower, upper) intervals
input_parent_folder = transfer_output_folder  # concatenated directly with the batch folder name below
output_batch_folder_name = transfer_output_batch_folder_name  # also selects which degree boxplots are drawn

In [None]:
# Every output of this notebook is written under <parent folder><batch folder name>/ .
output_binned_score_analysis_folder = "".join((input_parent_folder, output_batch_folder_name, "/"))
check_directory_exists(output_binned_score_analysis_folder)

In [None]:
def bin_edge_score(x, input_bin_list):
    """Return the label of the half-open bin that contains the row's score.

    ``x`` is indexed with "Avg Physical Combined Score". Consecutive entries of
    ``input_bin_list`` form the candidate bins ``[lower, upper)``, and the
    first bin containing the score is returned as the string
    ``"[lower, upper)"``. If no bin contains the score, ``None`` is returned.
    """
    for lower, upper in zip(input_bin_list[:-1], input_bin_list[1:]):
        score = x["Avg Physical Combined Score"]
        if lower <= score < upper:
            return "[{}, {})".format(str(lower), str(upper))
    return None

In [None]:
# Keep only rows whose score lies inside the overall binning range [first edge, last edge).
input_reduced_string_score_df = input_reduced_string_score_df[
    (input_reduced_string_score_df["Avg Physical Combined Score"] >= input_bin_list[0])
    & (input_reduced_string_score_df["Avg Physical Combined Score"] < input_bin_list[-1])
]

In [None]:
# The frame at this point is the result of boolean filtering. Adding a column to such a slice
# triggers pandas' SettingWithCopyWarning, so take an explicit copy before writing to it.
input_reduced_string_score_df = input_reduced_string_score_df.copy()

# Label every row with the "[lower, upper)" bin that contains its score.
input_reduced_string_score_df["Binned Scored Value"] = input_reduced_string_score_df.apply(lambda row: bin_edge_score(row, input_bin_list), axis=1)

# Number of distinct bin labels actually present; used later to choose the figure size.
num_bins = len(list(input_reduced_string_score_df["Binned Scored Value"].unique()))

input_reduced_string_score_df = input_reduced_string_score_df.sort_values(by="Binned Scored Value")

display("input_reduced_string_score_df", input_reduced_string_score_df)

In [None]:
# Keep a single row per protein pair: only rows where 'Protein 1' sorts before 'Protein 2' survive.
# (Assumes each interaction is listed in both directions — TODO confirm against the input table.)
input_reduced_string_score_df_removed_cross_pair_df = (
    input_reduced_string_score_df
    .loc[lambda frame: frame['Protein 1'] < frame['Protein 2']]
    .sort_values(by="Binned Scored Value")
)
display("input_reduced_string_score_df_removed_cross_pair_df", input_reduced_string_score_df_removed_cross_pair_df)

In [None]:
# Per-bin summary tables.
#   binned_score_stats_df  : one row per bin with interaction/protein counts plus edge-score and
#                            degree statistics.
#   binned_degree_dist_df  : one row per bin holding the list of per-protein degrees.
# Rows are collected in plain lists and each frame is built once at the end. Growing a DataFrame
# row by row with `.loc` is slow and coerces the integer counts to float.
binned_score_stats_columns = ['Binned Scored Value', 'Number of Protein Interactions', 'Number Unique Proteins',
                              'mean edge score', 'min edge score', 'max edge score', 'std edge score',
                              "mean degree", "min degree", "max degree", "std degree"]
binned_degree_dist_columns = ['Binned Scored Value', 'Distribution']

binned_score_lst = list(input_reduced_string_score_df["Binned Scored Value"].unique())
binned_score_lst.sort()

binned_score_stats_records = []
binned_degree_dist_records = []

for binned_score in binned_score_lst:

    # All rows of this bin (both directions of every pair) and the de-duplicated pairs of this bin.
    bin_all_rows_df = input_reduced_string_score_df[input_reduced_string_score_df["Binned Scored Value"] == binned_score]
    bin_unique_pairs_df = input_reduced_string_score_df_removed_cross_pair_df[input_reduced_string_score_df_removed_cross_pair_df["Binned Scored Value"] == binned_score]

    # Edge-score statistics are taken over the de-duplicated pairs.
    edge_score_stats = bin_unique_pairs_df["Avg Physical Combined Score"].agg(['mean', 'std', 'max', 'min'])

    # Degree distribution: number of rows per 'Protein 1' within this bin.
    degree_dist = bin_all_rows_df.groupby("Protein 1").count().reset_index()["Binned Scored Value"]
    degree_stats = degree_dist.agg(['mean', 'std', 'max', 'min'])

    binned_degree_dist_records.append({
        'Binned Scored Value': binned_score,
        'Distribution': list(degree_dist),
    })

    binned_score_stats_records.append({
        'Binned Scored Value': binned_score,
        'Number of Protein Interactions': len(bin_unique_pairs_df),
        'Number Unique Proteins': int(bin_all_rows_df["Protein 1"].nunique(dropna=False)),
        'mean edge score': edge_score_stats["mean"],
        'min edge score': edge_score_stats["min"],
        'max edge score': edge_score_stats["max"],
        'std edge score': edge_score_stats["std"],
        "mean degree": degree_stats["mean"],
        "min degree": degree_stats["min"],
        "max degree": degree_stats["max"],
        "std degree": degree_stats["std"],
    })

# Passing `columns=` keeps the expected columns even when there are no bins at all.
binned_score_stats_df = pd.DataFrame(binned_score_stats_records, columns=binned_score_stats_columns)
binned_degree_dist_df = pd.DataFrame(binned_degree_dist_records, columns=binned_degree_dist_columns)

print()
display("binned_score_stats_df:", binned_score_stats_df)
print()
display("binned_degree_dist_df:", binned_degree_dist_df)
print()

In [None]:
# Use a larger canvas once there are more than ten bins, so the x tick labels stay readable.
fig_size = (15, 10) if num_bins <= 10 else (30, 20)

In [None]:
save_to_csv_file(output_binned_score_analysis_folder, binned_score_stats_df, "binned_score_stats_table")

# One bar chart per summary statistic: (column plotted on the y axis, figure title).
for stat_column, stat_title in [
    ("Number of Protein Interactions", "Total Number of Protein Interactions per Binned Scored Value"),
    ("Number Unique Proteins", "Number Unique Proteins per Binned Scored Value"),
    ("mean edge score", "Mean Edge Score per Binned Scored Value"),
    ("mean degree", "Mean Number of Interactions (Degree) per Binned Scored Value"),
]:
    plot_vertical_barplot(output_binned_score_analysis_folder, fig_size, binned_score_stats_df, "Binned Scored Value", stat_column, None, stat_title)


# ---------------------

# Spread of the edge scores inside each bin, drawn from the de-duplicated protein pairs.
graph_title = "Avg Physical Combined Score Distribution Spread per Binned Scored Value"
plt.figure(figsize=fig_size)
sns.boxplot(data=input_reduced_string_score_df_removed_cross_pair_df, x="Binned Scored Value", y="Avg Physical Combined Score")
plt.xlabel("Binned Scored Value")
plt.ylabel("Avg Physical Combined Score")
plt.title(graph_title)
output_filename = output_binned_score_analysis_folder + graph_title + ".png"
plt.savefig(output_filename, facecolor='w', edgecolor='w', transparent=False)
plt.close()
print("Constructed and saved", output_filename)

In [None]:
def plot_degree_boxplot(output_folder_dest, melted_degree_df, graph_title, fig_size=(15, 10), y_limits=None, y_ticks=None):
    """Draw one degree-distribution boxplot and save it as ``<output_folder_dest><graph_title>.png``.

    ``melted_degree_df`` must have the columns "Binned Scored Value" (x axis)
    and "value" (y axis). ``y_limits`` is an optional ``(ymin, ymax)`` pair
    used to zoom the y axis, and ``y_ticks`` is an optional sequence of y tick
    positions. The figure is closed after saving.
    """
    plt.figure(figsize=fig_size)
    sns.boxplot(x="Binned Scored Value", y="value", data=melted_degree_df)
    if y_limits is not None:
        plt.axis(ymin=y_limits[0], ymax=y_limits[1])
    if y_ticks is not None:
        plt.yticks(y_ticks)
    plt.xlabel("Binned Scored Value")
    plt.ylabel("Number of Interactions (Degree)")
    plt.title(graph_title)
    output_filename = output_folder_dest + graph_title + ".png"
    plt.savefig(output_filename, facecolor='w', edgecolor='w', transparent=False)
    plt.close()
    print("Constructed and saved", output_filename)


degree_boxplot_title = "Distribution of Number of Interactions (Degree) per Binned Scored Value"

# Plots to draw for each known batch: a full-range view followed by zoomed views whose y limits
# and ticks were chosen by hand for that batch.
# NOTE(review): the original if/elif chain tested "from_0.9_to_1.0_step_0.01" twice, so its
# second branch (a single un-zoomed plot) could never run. It may have been meant for another
# batch name; if so, add that batch here.
degree_boxplot_specs_by_batch = {
    "from_0.7_to_1.0_step_0.05": [
        {"graph_title": degree_boxplot_title, "y_ticks": range(0, 1210, 100)},
        {"graph_title": degree_boxplot_title + " (ZOOMED)", "y_limits": (0, 160), "y_ticks": range(0, 161, 10)},
        {"graph_title": degree_boxplot_title + " (ZOOMED 2X)", "y_limits": (0, 20), "y_ticks": range(0, 21, 1)},
    ],
    "from_0.9_to_1.0_step_0.01": [
        {"graph_title": degree_boxplot_title},
        {"graph_title": degree_boxplot_title + " (ZOOMED)", "y_limits": (0, 100), "y_ticks": range(0, 101, 5)},
    ],
}

# Any other batch gets one un-zoomed plot on a larger canvas.
default_degree_boxplot_specs = [{"graph_title": degree_boxplot_title, "fig_size": (30, 20)}]

# Expand each bin's degree list into its own column (one column per bin), then melt to long form
# so seaborn can group by bin. The melted variable column is named "Binned Scored Value" because
# that is the name of the index set here.
df = binned_degree_dist_df["Distribution"].apply(pd.Series)
df = df.set_index(binned_degree_dist_df["Binned Scored Value"])
df = df.T
melted_degree_df = pd.melt(df)

for degree_boxplot_spec in degree_boxplot_specs_by_batch.get(output_batch_folder_name, default_degree_boxplot_specs):
    plot_degree_boxplot(output_binned_score_analysis_folder, melted_degree_df, **degree_boxplot_spec)