In [None]:
import os
import pickle
import random
import sys
import warnings

import matplotlib
import matplotlib.pyplot as plt
import networkx as nx
import numpy as np
import pandas as pd
import seaborn as sns
# Silence every FutureWarning for the rest of the session. Note that this also
# hides FutureWarning-based deprecation notices emitted by the libraries used below.
warnings.simplefilter(action='ignore', category=FutureWarning)

# Apply seaborn's default theme to the figures created later in the notebook.
sns.set_theme()
# Seed NumPy's global random generator. The standard-library `random` module
# imported above is not seeded here.
np.random.seed(0)

In [None]:
"""
Helper Functions
"""

def check_directory_exists(dir_name):
    """Ensure that the directory ``dir_name`` exists, creating it if necessary.

    Intermediate directories are created as needed.

    Parameters
    ----------
    dir_name : str
        Path of the directory that must exist after the call.

    Raises
    ------
    OSError
        If the directory cannot be created (for example, ``dir_name`` already
        exists as a regular file).
    """
    # exist_ok=True replaces the separate "exists?" check, so there is no window
    # in which another process can create the directory between check and creation.
    os.makedirs(dir_name, exist_ok=True)

def save_to_text_file(output_folder_dest, input_data, text_file_name):
    """Write ``input_data`` to ``<output_folder_dest><text_file_name>.txt``.

    A pandas DataFrame is written using ``DataFrame.to_string()``; any other
    object (list, set, ...) is written using ``str()``.

    Parameters
    ----------
    output_folder_dest : str
        Path prefix of the output file. It is concatenated as-is, so it must
        already end with a path separator.
    input_data : pandas.DataFrame or object
        Data to serialise as text.
    text_file_name : str
        File name without the ``.txt`` extension.
    """
    text_file_ouput = output_folder_dest + text_file_name + ".txt"
    if isinstance(input_data, pd.DataFrame):
        text_content = input_data.to_string()
    else:
        text_content = str(input_data)
    # The context manager closes the handle even if the write fails.
    with open(text_file_ouput, 'w') as drug_output_info_file:
        drug_output_info_file.write(text_content)
    print("Constructed and saved", text_file_ouput)

def read_pickle_file(file_path):
    """Load and return the object stored in the pickle file at ``file_path``.

    The file is read with ``pandas.read_pickle``. If ``file_path`` does not
    exist, the interpreter is asked to exit via ``sys.exit`` with an
    explanatory message.

    NOTE(review): unpickling can execute arbitrary code, so this should only be
    pointed at trusted files.
    """
    if os.path.exists(file_path):
        return pd.read_pickle(file_path)
    sys.exit("Can't locate input file %s" % file_path)
    
def save_to_pickle_file(output_folder_dest, dict_data, dict_file_name):
    """Pickle ``dict_data`` to ``<output_folder_dest><dict_file_name>.pkl``.

    Requires the standard-library ``pickle`` module to be imported at the top
    of the notebook.

    Parameters
    ----------
    output_folder_dest : str
        Path prefix of the output file. It is concatenated as-is, so it must
        already end with a path separator.
    dict_data : object
        Any picklable object (despite the name, not restricted to dicts).
    dict_file_name : str
        File name without the ``.pkl`` extension.
    """
    output_dict_filename = output_folder_dest + dict_file_name + '.pkl'
    with open(output_dict_filename, 'wb') as handle:
        pickle.dump(dict_data, handle, protocol=pickle.HIGHEST_PROTOCOL)
    print("Constructed and saved", output_dict_filename)

def read_csv_file(file_path, input_sep=',', input_delimiter=None, input_index_col=None, input_dtype=None, input_delim_whitespace=False, input_low_memory=True):
    """Read a delimited text file into a pandas DataFrame.

    This is the single definition of ``read_csv_file``; like the other readers
    in this notebook it exits via ``sys.exit`` when the input file is missing.

    Parameters
    ----------
    file_path : str
        Path of the file to read.
    input_sep : str
        Field separator, used when neither ``input_delimiter`` nor
        ``input_delim_whitespace`` is given.
    input_delimiter : str or None
        Alias for ``input_sep``; takes precedence over it when not None.
    input_index_col, input_dtype, input_low_memory
        Forwarded to ``pandas.read_csv`` as ``index_col``, ``dtype`` and
        ``low_memory``.
    input_delim_whitespace : bool
        When True, fields are split on runs of whitespace.

    Raises
    ------
    ValueError
        If both ``input_delimiter`` and ``input_delim_whitespace`` are given.
    """
    if not os.path.exists(file_path):
        sys.exit("Can't locate input file %s" % file_path)

    if input_delim_whitespace:
        if input_delimiter is not None:
            raise ValueError("Specify either input_delimiter or input_delim_whitespace=True, not both")
        # sep=r"\s+" is the documented equivalent of delim_whitespace=True and
        # avoids passing the deprecated keyword to pandas.
        separator = r"\s+"
    elif input_delimiter is not None:
        separator = input_delimiter
    else:
        separator = input_sep

    return pd.read_csv(filepath_or_buffer=file_path, sep=separator, index_col=input_index_col, dtype=input_dtype, low_memory=input_low_memory)


def save_to_csv_file(output_folder_dest, df, csv_file_name, input_index=False):
    """Write ``df`` to ``<output_folder_dest><csv_file_name>.csv``.

    Parameters
    ----------
    output_folder_dest : str
        Path prefix of the output file. It is concatenated as-is, so it must
        already end with a path separator.
    df : pandas.DataFrame
        Object providing ``to_csv`` (a DataFrame or Series).
    csv_file_name : str
        File name without the ``.csv`` extension.
    input_index : bool
        Forwarded to ``to_csv`` as ``index``; when True the index is written.
    """
    output_filename = output_folder_dest + csv_file_name + ".csv"
    df.to_csv(output_filename, index=input_index)
    print("Constructed and saved", output_filename)

def save_to_gpickle_file(output_folder_dest, data, file_name):
    """Pickle ``data`` (typically a NetworkX graph) to ``<output_folder_dest><file_name>.gpickle``.

    The object is written with the standard-library ``pickle`` module (which
    must be imported at the top of the notebook) instead of
    ``nx.write_gpickle``, which is no longer available in NetworkX 3.x.

    Parameters
    ----------
    output_folder_dest : str
        Path prefix of the output file. It is concatenated as-is, so it must
        already end with a path separator.
    data : object
        Any picklable object.
    file_name : str
        File name without the ``.gpickle`` extension.
    """
    output_filename = output_folder_dest + file_name + ".gpickle"
    with open(output_filename, 'wb') as handle:
        pickle.dump(data, handle, protocol=pickle.HIGHEST_PROTOCOL)
    print("Constructed and saved", output_filename)
    
def read_networkx_gpickle_file(input_file_name):
    """Load and return the object stored in a ``.gpickle`` file.

    The file is read with the standard-library ``pickle`` module (which must be
    imported at the top of the notebook) instead of ``nx.read_gpickle``, which
    is no longer available in NetworkX 3.x. If ``input_file_name`` does not
    exist, the interpreter is asked to exit via ``sys.exit``.

    NOTE(review): this reads plain (uncompressed) pickle files, which is what
    ``save_to_gpickle_file`` writes. If compressed ``.gz``/``.bz2`` gpickles are
    in use elsewhere, they need explicit handling here -- confirm.
    """
    if not os.path.exists(input_file_name):
        sys.exit("Can't locate input file %s" % input_file_name)
    # SECURITY: unpickling can execute arbitrary code; only load trusted files.
    with open(input_file_name, 'rb') as handle:
        return pickle.load(handle)

In [None]:
def plot_histplot(output_folder_dest, fig_size, data_df, x_col, plot_series=False, x_label="", input_kde=True, input_color="black", set_axis=False, axis_array=[], input_label="", graph_title=""):
    """Draw one seaborn histogram and save it as a PNG.

    The image is written to ``output_folder_dest + graph_title + "_histplot.png"``
    and the figure is closed afterwards, so nothing is rendered inline.

    Parameters
    ----------
    output_folder_dest : str
        Path prefix of the output file (concatenated as-is).
    fig_size : tuple
        Forwarded to ``plt.figure(figsize=...)``.
    data_df
        Forwarded to ``sns.histplot(data=...)``.
    x_col
        Forwarded as ``x`` to ``sns.histplot``; ignored when ``plot_series`` is True.
    plot_series : bool
        When True, ``data_df`` is plotted without an ``x`` argument.
    x_label : str
        Label of the x axis (the y axis is always labelled "Count").
    input_kde, input_color, input_label
        Forwarded to ``sns.histplot`` as ``kde``, ``color`` and ``label``.
    set_axis : bool
        When True, axis limits are taken from ``axis_array``.
    axis_array : sequence
        ``[xmin, xmax, ymin, ymax]``; only read when ``set_axis`` is True.
    graph_title : str
        Figure title; also the stem of the output file name.
    """
    plt.figure(figsize=fig_size)

    histplot_kwargs = {"data": data_df, "kde": input_kde, "color": input_color, "label": input_label}
    if not plot_series:
        # Only frame-style input selects a column for the x axis.
        histplot_kwargs["x"] = x_col
    sns.histplot(**histplot_kwargs)
    plt.legend()

    if set_axis:
        x_min, x_max = axis_array[0], axis_array[1]
        y_min, y_max = axis_array[2], axis_array[3]
        plt.axis(xmin=x_min, xmax=x_max, ymin=y_min, ymax=y_max)

    plt.title(graph_title, fontsize=25)
    plt.xlabel(x_label, fontsize=18)
    plt.ylabel("Count", fontsize=18)

    output_filename = output_folder_dest + graph_title + "_histplot.png"
    plt.savefig(output_filename, facecolor='w', edgecolor='w', transparent=False)
    plt.close()
    print("Constructed and saved", output_filename)
    
def plot_pathfx_vs_string_histplot(output_folder_dest, data_df_one, data_df_two, x_col_one, x_col_two, x_label="", y_label="", fig_size=(15, 10), input_kde=True, input_colors=["blue","red"], set_axis=False, axis_array=[], input_labels=["PathFX","STRING"], graph_title=""):
    """Overlay two seaborn histograms on one figure and save it as a PNG.

    The image is written to ``output_folder_dest + graph_title + "_histplot.png"``
    and the figure is closed afterwards.

    Parameters
    ----------
    output_folder_dest : str
        Path prefix of the output file (concatenated as-is).
    data_df_one, data_df_two
        Data sources; the columns ``x_col_one`` / ``x_col_two`` are plotted.
    x_col_one, x_col_two
        Column selected from the first / second data source.
    x_label, y_label : str
        Axis labels.
    fig_size : tuple
        Forwarded to ``plt.figure(figsize=...)``.
    input_kde
        Forwarded to ``sns.histplot`` as ``kde`` for both histograms.
    input_colors : sequence
        Colours of the first and second histogram.
    set_axis : bool
        When True, axis limits are taken from ``axis_array``.
    axis_array : sequence
        ``[xmin, xmax, ymin, ymax]``; only read when ``set_axis`` is True.
    input_labels : sequence
        Legend labels of the first and second histogram. The defaults match
        the labels that were previously hard-coded.
    graph_title : str
        Stem of the output file name.
        NOTE(review): unlike ``plot_histplot``, no ``plt.title`` is set here --
        confirm whether that is intentional.
    """
    plt.figure(figsize=fig_size)
    # Use the caller-supplied labels instead of hard-coded "PathFX"/"STRING".
    sns.histplot(data=data_df_one[x_col_one], kde=input_kde, color=input_colors[0], label=input_labels[0])
    sns.histplot(data=data_df_two[x_col_two], kde=input_kde, color=input_colors[1], label=input_labels[1])
    plt.legend()
    if set_axis:
        plt.axis(xmin=axis_array[0], xmax=axis_array[1], ymin=axis_array[2], ymax=axis_array[3])
    plt.xlabel(x_label, fontsize=18)
    plt.ylabel(y_label, fontsize=18)
    output_filename = output_folder_dest + graph_title + "_histplot.png"
    plt.savefig(output_filename, facecolor='w', edgecolor='w', transparent=False)
    plt.close()
    print("Constructed and saved", output_filename)
    
def plot_vertical_barplot(output_folder_dest, fig_size, data_df, x_col, y_col, hue_col, graph_title):
    """Draw a vertical seaborn bar plot and save it as a PNG.

    The image is written to ``output_folder_dest + graph_title + "_vbarplot.png"``
    and the figure is closed afterwards.

    Parameters
    ----------
    output_folder_dest : str
        Path prefix of the output file (concatenated as-is).
    fig_size : tuple
        Forwarded to ``plt.figure(figsize=...)``.
    data_df
        Forwarded to ``sns.barplot(data=...)``.
    x_col, y_col
        Columns for the x and y axes; also used as the axis labels.
    hue_col
        Column used for colour grouping; a legend is drawn only when truthy.
    graph_title : str
        Figure title; also the stem of the output file name.
    """
    plt.figure(figsize=fig_size)
    bar_axes = sns.barplot(data=data_df, x=x_col, y=y_col, hue=hue_col, ci=None)

    # Rotate the tick labels so long category names stay readable.
    tick_labels = bar_axes.get_xticklabels()
    bar_axes.set_xticklabels(tick_labels, rotation=90, ha="right")

    plt.title(graph_title, fontsize=25)
    plt.xlabel(x_col, fontsize=18)
    plt.ylabel(y_col, fontsize=18)
    if hue_col:
        plt.legend(loc='upper right')

    output_filename = output_folder_dest + graph_title + "_vbarplot.png"
    plt.savefig(output_filename, facecolor='w', edgecolor='w', transparent=False)
    plt.close()
    print("Constructed and saved", output_filename)

In [None]:
# Bind the inputs of this analysis. The transfer_* names are not defined in any
# visible cell of this notebook -- presumably a driver notebook injects them
# before this cell runs (TODO confirm). On a fresh kernel without them this
# cell raises NameError.
# Score table; filtered below on its "Avg Physical Combined Score" column.
input_string_score_df = transfer_string_score_df
# Threshold values; iterated below, with the first and last entries used as range bounds.
input_filter_val_lst = transfer_filter_val_lst
# Path prefix under which the batch output folder is created.
input_parent_folder = transfer_output_folder
# Name of the batch folder appended to the parent folder.
output_batch_folder_name = transfer_output_batch_folder_name

In [None]:
# Root folder for every output of this threshold analysis: <parent folder><batch folder name>/.
# Paths are built by plain string concatenation, so input_parent_folder is
# assumed to already end with a path separator -- TODO confirm.
output_threshold_analysis_folder = input_parent_folder + output_batch_folder_name + "/"
# Create the folder if it is missing.
check_directory_exists(output_threshold_analysis_folder)

In [None]:
# Keep only the rows whose score lies in [first threshold, last threshold).
avg_physical_combined_score = input_string_score_df["Avg Physical Combined Score"]
score_in_threshold_range = (
    (avg_physical_combined_score >= input_filter_val_lst[0])
    & (avg_physical_combined_score < input_filter_val_lst[-1])
)
input_reduced_string_score_df = input_string_score_df[score_in_threshold_range]
input_reduced_string_score_df

In [None]:
# Empty summary table indexed by threshold; one row per threshold is added later.
threshold_summary_columns = [
    'threshold', 'num_proteins', 'num_interactions',
    'max_degree', 'min_degree', 'mean_degree', 'std_degree',
    'max_edge_score', 'min_edge_score', 'mean_edge_score', 'std_edge_score',
]
filter_val_dict = {column_name: [] for column_name in threshold_summary_columns}
filter_val_df = pd.DataFrame(data=filter_val_dict).set_index("threshold")
filter_val_df

In [None]:
# For every threshold: filter the score table, save the filtered tables and
# their summary statistics as CSV, save the histograms, and append one row of
# summary statistics to filter_val_df.
score_col = "Avg Physical Combined Score"
summary_stat_names = ['count', 'mean', 'std', 'max', 'min', 'sum']

for filter_val in input_filter_val_lst:

    output_filter_folder = output_threshold_analysis_folder + "threshold_val_" + str(filter_val) + "/"
    check_directory_exists(output_filter_folder)

    filtered_string_score_df = input_reduced_string_score_df[input_reduced_string_score_df[score_col] >= filter_val]

    # Number of rows per 'Protein 1' value. Computed once and reused for the
    # statistics table and for both degree histograms below.
    interactions_per_protein = filtered_string_score_df.groupby('Protein 1')[score_col].count()

    # Constructing DataFrame Statistics for protein interactions
    filtered_string_num_protein_per_protein_stats_df = pd.DataFrame(interactions_per_protein.agg(summary_stat_names))
    filtered_string_num_protein_per_protein_stats_df = filtered_string_num_protein_per_protein_stats_df.rename(columns={score_col: "Number Protein Interaction Per Protein"})

    # Keep only rows with 'Protein 1' < 'Protein 2'.
    # NOTE(review): presumably each pair is listed in both directions, so this
    # keeps one row per pair -- confirm against the input table.
    filtered_string_removed_cross_pair_df = filtered_string_score_df[filtered_string_score_df['Protein 1'] < filtered_string_score_df['Protein 2']]

    # Constructing DataFrame Statistics for Avg Physical Combined Scores
    filtered_string_all_avg_physical_combined_scores_stats_df = pd.DataFrame(filtered_string_removed_cross_pair_df[score_col].agg(summary_stat_names))

    # Saving pandas Dataframe to .csv files
    save_to_csv_file(output_filter_folder, filtered_string_score_df, "filtered_string_score_table_for_threshold_val_" + str(filter_val))
    save_to_csv_file(output_filter_folder, filtered_string_num_protein_per_protein_stats_df, "filtered_string_num_protein_per_protein_stats_table_for_threshold_val_" + str(filter_val), input_index=True)
    save_to_csv_file(output_filter_folder, filtered_string_removed_cross_pair_df, "filtered_string_removed_cross_pair_table_for_threshold_val_" + str(filter_val))
    save_to_csv_file(output_filter_folder, filtered_string_all_avg_physical_combined_scores_stats_df, "filtered_string_all_avg_physical_combined_scores_stats_table_for_threshold_val_" + str(filter_val), input_index=True)

    # Visual for Avg Physical Combined Score Distribution Graph
    # (the title text is also the output file name, so it is left unchanged)
    plot_histplot(output_filter_folder, (15, 10), filtered_string_removed_cross_pair_df[score_col], x_col="", plot_series=True, x_label="", input_kde=True, input_color="red", input_label="STRING", graph_title = "Disribution of Avg Physical Combined Score in STRING | Threshold Value - " + str(filter_val))

    # Visual for Number of Protein Interactions per Protein in STRING
    plot_histplot(output_filter_folder, (15, 10), interactions_per_protein, x_col="", plot_series=True, x_label="", input_kde=True, input_color="red", input_label="STRING", graph_title = "Distribution of the Number of Protein Interactions per Protein in STRING | Threshold Value - " + str(filter_val))

    # Visual for Number of Protein Interactions per Protein in STRING (ZOOMED LEFT)
    plt.figure(figsize = (15, 10))
    sns.histplot(data=interactions_per_protein, kde=True, color="red", label="STRING")
    plt.xlabel("Number of Interactions")
    plt.axis(xmin=0, xmax=300)
    graph_title = "Distribution of the Number of Protein Interactions per Protein in STRING | Threshold Value - " + str(filter_val) + " (ZOOMED LEFT)"
    plt.title(graph_title)
    output_filename = output_filter_folder + graph_title +".png"
    plt.savefig(output_filename, facecolor='w', edgecolor='w',transparent=False)
    plt.close()
    print("Constructed and saved", output_filename)

    # Both statistics tables have a single column; take it as a Series indexed
    # by statistic name and look values up by label. This avoids the deprecated
    # `.loc[name][0]` pattern, where an integer key on a label-indexed Series
    # is treated as a position.
    degree_stats = filtered_string_num_protein_per_protein_stats_df.iloc[:, 0]
    edge_score_stats = filtered_string_all_avg_physical_combined_scores_stats_df.iloc[:, 0]

    num_proteins = degree_stats["count"]
    num_interactions = edge_score_stats["count"]
    max_degree = degree_stats["max"]
    min_degree = degree_stats["min"]
    mean_degree = degree_stats["mean"]
    std_degree = degree_stats["std"]
    max_edge_score = edge_score_stats["max"]
    min_edge_score = edge_score_stats["min"]
    mean_edge_score = edge_score_stats["mean"]
    std_edge_score = edge_score_stats["std"]

    # Order must match the columns of filter_val_df.
    lst = [num_proteins, num_interactions, max_degree, min_degree, mean_degree, std_degree, max_edge_score, min_edge_score, mean_edge_score, std_edge_score]
    filter_val_df.loc[filter_val] = lst

In [None]:
# Move "threshold" from the index back into a regular column so it can be used
# as the x axis of the bar plots. The guard keeps this cell idempotent:
# re-running it would otherwise reset the new default index as well and add a
# spurious "index" column.
if "threshold" not in filter_val_df.columns:
    filter_val_df = filter_val_df.reset_index()
display("filter_val_df", filter_val_df)

In [None]:
# Pick a figure size for the summary bar plots from the number of thresholds:
# up to 10 -> (15, 10), up to 40 -> (30, 20), otherwise (60, 40).
num_threshold = len(input_filter_val_lst)

for max_threshold_count, candidate_fig_size in ((10, (15, 10)), (40, (30, 20))):
    if num_threshold <= max_threshold_count:
        fig_size = candidate_fig_size
        break
else:
    fig_size = (60, 40)

In [None]:
# Save the per-threshold summary table and one bar plot per summary statistic.
output_theshold_analysis_folder = output_threshold_analysis_folder + "threshold_analysis/"
check_directory_exists(output_theshold_analysis_folder)

save_to_csv_file(output_theshold_analysis_folder, filter_val_df, "threshold_score_stats")

# (summary column plotted on the y axis, graph title)
threshold_barplot_specs = [
    ("num_proteins", "Number of Unique Proteins per Threshold"),
    ("num_interactions", "Number of Total Protein Interactions per Threshold"),
    ("mean_degree", "Mean Interaction (Degree) per Protein per Threshold"),
    ("mean_edge_score", "Mean Edge Score per Threshold"),
]
for summary_column, barplot_title in threshold_barplot_specs:
    plot_vertical_barplot(output_theshold_analysis_folder, fig_size, filter_val_df, "threshold", summary_column, None, barplot_title)