In [1]:
import networkx as nx
import random
import matplotlib
import matplotlib.pyplot as plt
from matplotlib import pyplot
import seaborn as sns
import numpy as np
import pandas as pd
import pickle 
import os
import sys
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

from matplotlib.ticker import (MultipleLocator, AutoMinorLocator)
sns.set_theme()
np.random.seed(0)

In [2]:
"""
Helper Functions
"""

# function checks if directory exists, if not it constructs it
def check_directory_exists(dir_name):
    """Create ``dir_name`` (and any missing parent folders) unless the path already exists.

    Args:
        dir_name: Path of the directory that should be present afterwards.
    """
    if os.path.exists(dir_name):
        # Something is already at this path; leave it untouched.
        return
    os.makedirs(dir_name)

# function saves DataFrame, list, or set as a textfile in a specific folder
def save_to_text_file(output_folder_dest, input_data, text_file_name):
    """Write ``input_data`` to ``<output_folder_dest><text_file_name>.txt``.

    A DataFrame is rendered with ``DataFrame.to_string()``; any other object
    (list, set, ...) is rendered with ``str()``.

    Args:
        output_folder_dest: Destination folder prefix. It is concatenated as-is,
            so it must already end with a path separator.
        input_data: Object to dump.
        text_file_name: File name without the ``.txt`` extension.
    """
    text_file_output = output_folder_dest + text_file_name + ".txt"
    # Render before opening so a failing conversion cannot leave a truncated file behind.
    if isinstance(input_data, pd.DataFrame):
        rendered_text = input_data.to_string()
    else:
        rendered_text = str(input_data)
    # The context manager closes the handle even when the write raises.
    with open(text_file_output, 'w') as text_file:
        text_file.write(rendered_text)
    print("Constructed and saved", text_file_output)

# Read in Pickle File
def read_pickle_file(file_path):
    """Load and return the object stored in the pickle file at ``file_path``.

    Terminates via ``sys.exit`` with an explanatory message when the path does
    not exist.
    """
    if os.path.exists(file_path):
        # NOTE: unpickling can execute arbitrary code; only load trusted files.
        return pd.read_pickle(file_path)
    sys.exit("Can't locate input file %s" % file_path)
    
# Save data into a pickle file
def save_to_pickle_file(output_folder_dest, dict_data, dict_file_name):
    """Pickle ``dict_data`` into ``<output_folder_dest><dict_file_name>.pkl``.

    Args:
        output_folder_dest: Destination folder prefix (concatenated as-is).
        dict_data: Object to serialise; written with the highest pickle protocol.
        dict_file_name: File name without the ``.pkl`` extension.
    """
    pickle_path = "".join((output_folder_dest, dict_file_name, '.pkl'))
    with open(pickle_path, 'wb') as pickle_file:
        pickle.dump(dict_data, pickle_file, protocol=pickle.HIGHEST_PROTOCOL)
    print("Constructed and saved", pickle_path)

# Read in a CSV file
def read_csv_file(file_path, input_sep=',', input_delimiter=None, input_index_col=None, input_dtype=None, input_delim_whitespace=False, input_low_memory=True):
    """Read a delimited text file into a DataFrame.

    This is the single definition of ``read_csv_file``; a second, later
    definition used to shadow it and silently dropped the missing-file check.

    Args:
        file_path: Path of the file to read.
        input_sep: Field separator used when no other option overrides it.
        input_delimiter: Alias for the separator; takes precedence over
            ``input_sep`` when given (it was previously accepted but ignored).
        input_index_col: Forwarded to ``pandas.read_csv`` as ``index_col``.
        input_dtype: Forwarded to ``pandas.read_csv`` as ``dtype``.
        input_delim_whitespace: When True, fields are split on runs of
            whitespace and the separator arguments are ignored.
        input_low_memory: Forwarded to ``pandas.read_csv`` as ``low_memory``.

    Returns:
        The parsed ``pandas.DataFrame``.

    Raises:
        SystemExit: If ``file_path`` does not exist.
    """
    if not os.path.exists(file_path):
        sys.exit("Can't locate input file %s" % file_path)
    # pandas rejects an explicit ``sep`` combined with ``delimiter`` or with
    # ``delim_whitespace=True`` (and the latter keyword is deprecated), so the
    # three options are collapsed into one effective separator here.
    if input_delim_whitespace:
        separator = r"\s+"
    elif input_delimiter is not None:
        separator = input_delimiter
    else:
        separator = input_sep
    return pd.read_csv(filepath_or_buffer=file_path, sep=separator, index_col=input_index_col, dtype=input_dtype, low_memory=input_low_memory)

# Save a DataFrame as a CSV file in a specific folder
def save_to_csv_file(output_folder_dest, df, csv_file_name, input_index=False):
    """Write ``df`` to ``<output_folder_dest><csv_file_name>.csv``.

    Args:
        output_folder_dest: Destination folder prefix (concatenated as-is).
        df: Object exposing ``to_csv`` (a DataFrame in this notebook).
        csv_file_name: File name without the ``.csv`` extension.
        input_index: Whether the index is written; forwarded as ``index``.
    """
    output_filename = output_folder_dest + csv_file_name + ".csv"
    df.to_csv(output_filename, index=input_index)
    print("Constructed and saved", output_filename)

def save_to_gpickle_file(output_folder_dest, data, file_name):
    """Pickle ``data`` (a NetworkX graph in this notebook) to ``<output_folder_dest><file_name>.gpickle``.

    Args:
        output_folder_dest: Destination folder prefix (concatenated as-is).
        data: Object to serialise.
        file_name: File name without the ``.gpickle`` extension.
    """
    output_filename = output_folder_dest + file_name + ".gpickle"
    # nx.write_gpickle was removed in NetworkX 3.0. A .gpickle file is a plain
    # pickle, so dumping with the standard library keeps the same on-disk format
    # while working on both old and new NetworkX releases.
    with open(output_filename, 'wb') as handle:
        pickle.dump(data, handle, protocol=pickle.HIGHEST_PROTOCOL)
    print("Constructed and saved", output_filename)
    
def read_networkx_gpickle_file(input_file_name):
    """Load and return the object stored in a ``.gpickle`` file.

    Args:
        input_file_name: Path of an uncompressed pickle file.

    Returns:
        The unpickled object (a NetworkX graph for the files this notebook loads).

    Raises:
        SystemExit: If ``input_file_name`` does not exist.
    """
    if not os.path.exists(input_file_name):
        sys.exit("Can't locate input file %s" % input_file_name)
    # nx.read_gpickle was removed in NetworkX 3.0; a .gpickle file is a plain
    # pickle, so the standard library reads it on any NetworkX version.
    # NOTE: unpickling can execute arbitrary code; only load trusted files.
    with open(input_file_name, 'rb') as handle:
        return pickle.load(handle)

In [3]:
def plot_histplot(output_folder_dest, fig_size, data_df, x_col, plot_series=False, x_label="", input_kde=True, input_color="black", set_axis=False, axis_array=[], input_label="", graph_title=""):
    """Draw one seaborn histogram and save it as ``<output_folder_dest><graph_title>_histplot.png``.

    Args:
        output_folder_dest: Destination folder prefix (concatenated as-is).
        fig_size: Figure size passed to ``plt.figure``.
        data_df: Data handed to ``sns.histplot``.
        x_col: Column plotted on the x axis; only used when ``plot_series`` is False.
        plot_series: When True, ``data_df`` is plotted without selecting a column.
        x_label: Label of the x axis (the y axis is always labelled "Count").
        input_kde: Forwarded to ``sns.histplot`` as ``kde``.
        input_color: Forwarded to ``sns.histplot`` as ``color``.
        set_axis: When True, the axis limits are taken from ``axis_array``.
        axis_array: ``[xmin, xmax, ymin, ymax]``; only read when ``set_axis`` is True.
        input_label: Legend label of the histogram.
        graph_title: Figure title, also used as the stem of the output file name.
    """
    plt.figure(figsize=fig_size)

    # Both variants share every option except the x-column selection.
    histplot_options = dict(data=data_df, kde=input_kde, color=input_color, label=input_label)
    if not plot_series:
        histplot_options["x"] = x_col
    sns.histplot(**histplot_options)
    plt.legend()

    if set_axis:
        x_low, x_high = axis_array[0], axis_array[1]
        y_low, y_high = axis_array[2], axis_array[3]
        plt.axis(xmin=x_low, xmax=x_high, ymin=y_low, ymax=y_high)

    plt.xlabel(x_label, fontsize=18)
    plt.ylabel("Count", fontsize=18)
    plt.title(graph_title, fontsize=25)

    figure_path = output_folder_dest + graph_title + "_histplot.png"
    plt.savefig(figure_path, facecolor='w', edgecolor='w',transparent=False)
    # Close the figure so repeated calls do not accumulate open figures.
    plt.close()
    print("Constructed and saved", figure_path)
    
def plot_pathfx_vs_string_histplot(output_folder_dest, data_df_one, data_df_two, x_col_one, x_col_two, x_label="", y_label="", fig_size=(15, 10), input_kde=True, input_colors=["blue","red"], set_axis=False, axis_array=[], input_labels=["PathFX","STRING"], graph_title=""):
    """Overlay two histograms on one figure and save it as ``<output_folder_dest><graph_title>_histplot.png``.

    Args:
        output_folder_dest: Destination folder prefix (concatenated as-is).
        data_df_one: DataFrame providing the first histogram.
        data_df_two: DataFrame providing the second histogram.
        x_col_one: Column of ``data_df_one`` to plot.
        x_col_two: Column of ``data_df_two`` to plot.
        x_label: Label of the x axis.
        y_label: Label of the y axis.
        fig_size: Figure size passed to ``plt.figure``.
        input_kde: Forwarded to ``sns.histplot`` as ``kde``.
        input_colors: Two colours, one per histogram, in plotting order.
        set_axis: When True, the axis limits are taken from ``axis_array``.
        axis_array: ``[xmin, xmax, ymin, ymax]``; only read when ``set_axis`` is True.
        input_labels: Two legend labels, one per histogram, in plotting order.
        graph_title: Stem of the output file name. NOTE(review): unlike
            ``plot_histplot`` it is not drawn as a figure title — confirm
            whether that is intentional.
    """
    plt.figure(figsize=fig_size)
    # Honour the caller-supplied legend labels. They used to be ignored in favour
    # of hard-coded "PathFX"/"STRING", which remain the parameter defaults.
    sns.histplot(data=data_df_one[x_col_one], kde=input_kde, color=input_colors[0], label=input_labels[0])
    sns.histplot(data=data_df_two[x_col_two], kde=input_kde, color=input_colors[1], label=input_labels[1])
    plt.legend()
    if set_axis:
        plt.axis(xmin=axis_array[0], xmax=axis_array[1], ymin=axis_array[2], ymax=axis_array[3])
    plt.xlabel(x_label, fontsize=18)
    plt.ylabel(y_label, fontsize=18)
    output_filename = output_folder_dest + graph_title + "_histplot.png"
    plt.savefig(output_filename, facecolor='w', edgecolor='w',transparent=False)
    # Close the figure so repeated calls do not accumulate open figures.
    plt.close()
    print("Constructed and saved", output_filename)

In [4]:
# Defaults are only assigned when `filter_data` is not already defined
# (presumably so an outer runner can predefine the flags — TODO confirm).
if 'filter_data' not in globals():
    filter_data = True
    if filter_data:
        # Exactly one of the three filter modes must be enabled.
        filter_for_same_nodes = False
        filter_for_close_nodes = True
        filter_for_same_edges = False
        # Booleans sum as 0/1, so this counts the enabled modes. The previous
        # equality chain only fired when all three flags were equal and therefore
        # let two simultaneously enabled modes through.
        num_filters_selected = sum([filter_for_same_nodes, filter_for_close_nodes, filter_for_same_edges])
        if num_filters_selected != 1:
            sys.exit("ERROR!!! User must filter for SAME EDGES, SAME NODES, or for CLOSE NODES only!")

In [5]:
# Save DataFrame Table to CSV and Visual Graphs as .png

# Turn on saving - True, Turn off saving - False
# Output toggles: True enables the corresponding kind of output, False disables it
save_visual_figures = True
save_table_figures = True
save_text_outputs = True

# Destination folders; every path keeps its trailing "/" because file names are appended directly
output_folder = "outputs/output_comparison_network_analysis/"
output_csv_folder = f"{output_folder}csv_files/"
output_graph_folder = f"{output_folder}visual_graphs/"

# Create any folder that does not exist yet (parent first, then the sub-folders)
for _required_folder in (output_folder, output_csv_folder, output_graph_folder):
    check_directory_exists(_required_folder)

In [6]:
input_string_folder = "inputs/input_comparison_network_analysis/string/"

# NetworkX graph of the revised physical STRING protein interactions
string_G = read_networkx_gpickle_file(f"{input_string_folder}string_G.gpickle")

# STRING tables; the *_stats_table files use their first (unnamed) column as the index
string_score_df = read_csv_file(f"{input_string_folder}string_score_table.csv")
string_num_protein_per_protein_stats_df = read_csv_file(
    f"{input_string_folder}string_num_protein_per_protein_stats_table.csv",
    input_index_col="Unnamed: 0",
)
string_removed_cross_pair_df = read_csv_file(f"{input_string_folder}string_removed_cross_pair_table.csv")
string_all_avg_physical_combined_scores_stats_df = read_csv_file(
    f"{input_string_folder}string_all_avg_physical_combined_scores_stats_table.csv",
    input_index_col="Unnamed: 0",
)
string_summary_score_stats_for_avg_physical_combined_score_df = read_csv_file(
    f"{input_string_folder}string_summary_score_stats_for_Avg Physical Combined Score_per_protein.csv"
)

In [7]:
# Show every STRING table under its variable name, followed by a blank line
_string_tables_to_show = (
    ("string_score_df", string_score_df),
    ("string_num_protein_per_protein_stats_df", string_num_protein_per_protein_stats_df),
    ("string_removed_cross_pair_df", string_removed_cross_pair_df),
    ("string_all_avg_physical_combined_scores_stats_df", string_all_avg_physical_combined_scores_stats_df),
    ("string_summary_score_stats_for_avg_physical_combined_score_df", string_summary_score_stats_for_avg_physical_combined_score_df),
)
for _table_name, _table in _string_tables_to_show:
    display(_table_name, _table)
    print()

'string_score_df'

Unnamed: 0,Protein 1,Protein 2,experiments,database,textmining,Physical Combined Score,Max Physical Combined Score,Avg Physical Combined Score
0,ARF5,CALM2,0.000,0.0,0.000,0.041,0.041,0.0410
1,FKBP4,CALM2,0.000,0.0,0.000,0.041,0.041,0.0410
2,CYP51A1,CALM2,0.000,0.0,0.000,0.041,0.041,0.0410
3,PDK4,CALM2,0.000,0.0,0.104,0.104,0.104,0.1040
4,RALA,CALM2,0.313,0.0,0.000,0.313,0.313,0.3130
...,...,...,...,...,...,...,...,...
11385399,ENSG00000258947,ZNF445,,,,,0.138,0.0895
11385400,ENSG00000258947,ZNF516,,,,,0.093,0.0875
11385401,ENSG00000258947,ZNF607,,,,,0.169,0.1595
11385402,ENSG00000258947,ZNRF1,,,,,0.041,0.0410





'string_num_protein_per_protein_stats_df'

Unnamed: 0,Number Protein Interaction Per Protein
count,19338.0
mean,588.7581
std,530.8547
max,7644.0
min,1.0
sum,11385400.0





'string_removed_cross_pair_df'

Unnamed: 0,Protein 1,Protein 2,experiments,database,textmining,Physical Combined Score,Max Physical Combined Score,Avg Physical Combined Score
0,ARF5,CALM2,0.0,0.0,0.000,0.041,0.041,0.0410
1,ACTL6B,CALM2,0.0,0.0,0.000,0.041,0.041,0.0410
2,AQP2,CALM2,0.0,0.0,0.051,0.050,0.050,0.0500
3,APOH,CALM2,0.0,0.0,0.000,0.041,0.041,0.0410
4,ANKRD54,CALM2,0.0,0.0,0.210,0.210,0.210,0.2100
...,...,...,...,...,...,...,...,...
5692697,ENSG00000258947,ZNF445,,,,,0.138,0.0895
5692698,ENSG00000258947,ZNF516,,,,,0.093,0.0875
5692699,ENSG00000258947,ZNF607,,,,,0.169,0.1595
5692700,ENSG00000258947,ZNRF1,,,,,0.041,0.0410





'string_all_avg_physical_combined_scores_stats_df'

Unnamed: 0,Avg Physical Combined Score
count,5692702.0
mean,0.1836059
std,0.2223218
max,0.999
min,0.041
sum,1045214.0





'string_summary_score_stats_for_avg_physical_combined_score_df'

Unnamed: 0,Protein ID,count,mean,std,max,min,sum
0,GAPDH,7644,0.241637,0.150712,0.994,0.041,1847.0700
1,AKT1,6507,0.300910,0.222489,0.999,0.041,1958.0225
2,TP53,6193,0.354423,0.240752,0.999,0.041,2194.9425
3,INS,6053,0.320572,0.201611,0.998,0.041,1940.4200
4,MYC,5786,0.294266,0.186558,0.999,0.041,1702.6225
...,...,...,...,...,...,...,...
19333,C17orf47,1,0.212000,,0.212,0.212,0.2120
19334,CCDC142,1,0.041000,,0.041,0.041,0.0410
19335,C5orf55,1,0.309000,,0.309,0.309,0.3090
19336,CLPSL2,1,0.556000,,0.556,0.556,0.5560





In [8]:
input_pathfx_folder = "inputs/input_comparison_network_analysis/pathfx/"

# NetworkX graph of the PathFX interactome (stored as a gpickle file)
pathfx_G = read_networkx_gpickle_file(f"{input_pathfx_folder}pathfx_G.gpickle")

# PathFX tables; the *_stats_table files use their first (unnamed) column as the index
pathfx_score_df = read_csv_file(f"{input_pathfx_folder}pathfx_score_table.csv")
pathfx_num_target_per_source_stats_df = read_csv_file(
    f"{input_pathfx_folder}pathfx_num_target_per_source_stats_table.csv",
    input_index_col="Unnamed: 0",
)
pathfx_removed_cross_pair_df = read_csv_file(f"{input_pathfx_folder}pathfx_removed_cross_pair_pathfx_table.csv")
pathfx_all_weight_stats_df = read_csv_file(
    f"{input_pathfx_folder}pathfx_all_weight_stats_table.csv",
    input_index_col="Unnamed: 0",
)
pathfx_source_to_target_weight_stats_df = read_csv_file(
    f"{input_pathfx_folder}pathfx_source_to_target_weight_stats_table.csv"
)

In [9]:
# Show every PathFX table under its variable name, followed by a blank line
_pathfx_tables_to_show = (
    ("pathfx_score_df", pathfx_score_df),
    ("pathfx_num_target_per_source_stats_df", pathfx_num_target_per_source_stats_df),
    ("pathfx_removed_cross_pair_df", pathfx_removed_cross_pair_df),
    ("pathfx_all_weight_stats_df", pathfx_all_weight_stats_df),
    ("pathfx_source_to_target_weight_stats_df", pathfx_source_to_target_weight_stats_df),
)
for _table_name, _table in _pathfx_tables_to_show:
    display(_table_name, _table)
    print()

'pathfx_score_df'

Unnamed: 0,source,target,weight
0,CHMP1B,KNSTRN,0.279930
1,CHMP1B,USP8,0.562588
2,CHMP1B,STAMBP,0.676545
3,CHMP1B,SPAST,0.416595
4,CHMP1B,SNRNP200,0.279930
...,...,...,...
262235,tat,GPX2,0.246744
262236,tat,GPX5,0.246744
262237,tat,GPX6,0.246744
262238,tat,BRIX1,0.279930





'pathfx_num_target_per_source_stats_df'

Unnamed: 0,Number Target Interacton Per Source
count,22832.0
mean,11.485634
std,52.692758
max,6753.0
min,1.0
sum,262240.0





'pathfx_removed_cross_pair_df'

Unnamed: 0,source,target,weight
0,CHMP1B,KNSTRN,0.279930
1,CHMP1B,USP8,0.562588
2,CHMP1B,STAMBP,0.676545
3,CHMP1B,SPAST,0.416595
4,CHMP1B,SNRNP200,0.279930
...,...,...,...
131115,Ppif,VDAC1,0.279930
131116,ADGRG1,ADRB2,0.490794
131117,ORF26,TRIM37,0.279930
131118,ALL3_AEDAE,COL3A1,0.416595





'pathfx_all_weight_stats_df'

Unnamed: 0,weight
count,131120.0
mean,0.387541
std,0.131891
max,0.99
min,0.244958
sum,50814.314438





'pathfx_source_to_target_weight_stats_df'

Unnamed: 0,source,count,mean,std,max,min,sum
0,UBC,6753,0.528777,0.132470,0.935012,0.279930,3570.832522
1,APP,1984,0.287942,0.040666,0.705007,0.246744,571.276135
2,SUMO2,713,0.429295,0.086484,0.818355,0.246936,306.087256
3,TP53,627,0.417612,0.124118,0.990000,0.246936,261.842982
4,tat,436,0.321098,0.085697,0.770889,0.246744,139.998813
...,...,...,...,...,...,...,...
22827,RTTN,1,0.605763,,0.605763,0.605763,0.605763
22828,RTT109,1,0.416595,,0.416595,0.416595,0.416595
22829,RTP2,1,0.279930,,0.279930,0.279930,0.279930
22830,RTFDC1,1,0.279930,,0.279930,0.279930,0.279930





In [10]:
# ------------------------------------------------

In [11]:
# ------------ Comparison ------------------------

In [12]:
# ------------------------------------------------

In [13]:
# STRING: unique proteins per column and across both columns of the score table

string_node_one_list = string_score_df["Protein 1"].unique()
string_node_two_list = string_score_df["Protein 2"].unique()

string_num_unique_node_one = len(string_node_one_list)
string_num_unique_node_two = len(string_node_two_list)

# Concatenate both per-column lists, then de-duplicate across them
string_node_list = [*string_node_one_list, *string_node_two_list]
string_unique_nodes = pd.Series(string_node_list).unique()

string_num_unique_nodes = len(string_unique_nodes)

print(
    "\nSTRING Database",
    "-------------------------------------------",
    f"Number of Unique Protein 1s: {string_num_unique_node_one}",
    f"Number of Unique Protein 2s: {string_num_unique_node_two}",
    f"Number of Unique Protein: {string_num_unique_nodes}",
    "",
    sep="\n",
)

# PathFX: unique sources, targets, and nodes overall

source_list = pathfx_score_df["source"].unique()
target_list = pathfx_score_df["target"].unique()

num_sources = len(source_list)
num_targets = len(target_list)

pathfx_node_list = [*source_list, *target_list]
pathfx_unique_nodes = pd.Series(pathfx_node_list).unique()

print(
    "\nPathFX Interactome",
    "-------------------------------------------",
    f"Number of Unique sources: {num_sources}",
    f"Number of Unique targets: {num_targets}",
    f"Number of Unique sources and targets: {len(pathfx_unique_nodes)}",
    "",
    sep="\n",
)

# Overlap between the two node sets

string_nodes = set(string_unique_nodes)
pathfx_nodes = set(pathfx_unique_nodes)

intersecting_nodes = pathfx_nodes.intersection(string_nodes)
string_node_disjoint = string_nodes.difference(intersecting_nodes)
pathfx_node_disjoint = pathfx_nodes.difference(intersecting_nodes)

num_shared_nodes = len(intersecting_nodes)
num_distinct_string_nodes = len(string_node_disjoint)
num_distinct_pathfx_nodes = len(pathfx_node_disjoint)

print(
    "\nBoth STRING and PathFX",
    "----------------------------------------------------------------",
    f"Number of Shared Common Nodes from both graph networks: {num_shared_nodes}",
    f"Number of distinct STRING nodes: {num_distinct_string_nodes}",
    f"Number of distinct PathFX nodes: {num_distinct_pathfx_nodes}",
    "",
    sep="\n",
)


STRING Database
-------------------------------------------
Number of Unique Protein 1s: 19338
Number of Unique Protein 2s: 19338
Number of Unique Protein: 19338


PathFX Interactome
-------------------------------------------
Number of Unique sources: 22832
Number of Unique targets: 22832
Number of Unique sources and targets: 22832


Both STRING and PathFX
----------------------------------------------------------------
Number of Shared Common Nodes from both graph networks: 13948
Number of distinct STRING nodes: 5390
Number of distinct PathFX nodes: 8884



In [14]:
# Node comparison, this time taken from the NetworkX graphs
string_nodes = set(string_G.nodes())
pathfx_nodes = set(pathfx_G.nodes())

intersecting_nodes = string_nodes & pathfx_nodes
string_node_disjoint = string_nodes - intersecting_nodes
pathfx_node_disjoint = pathfx_nodes - intersecting_nodes

num_shared_nodes = len(intersecting_nodes)
num_distinct_string_nodes = len(string_node_disjoint)
num_distinct_pathfx_nodes = len(pathfx_node_disjoint)

print("\nComparison of NODES in STRING and PathFX NetworkX Graphs")
print("----------------------------------------------------------------")
print("Number of Shared Common Nodes from both graph networks:", num_shared_nodes)
print("Number of distinct STRING nodes:", num_distinct_string_nodes)
print("Number of distinct PathFX nodes:", num_distinct_pathfx_nodes)
print()

string_edges = list(string_G.edges())
pathfx_edges = list(pathfx_G.edges())

# Canonicalise every edge to a sorted tuple so (u, v) and (v, u) compare equal.
# Each set is built exactly once; the previous code rebuilt them for every set
# operation and re-sorted the already-canonical intersection.
string_edge_set = {tuple(sorted(edge)) for edge in string_edges}
pathfx_edge_set = {tuple(sorted(edge)) for edge in pathfx_edges}

intersecting_edges = string_edge_set & pathfx_edge_set
string_edge_disjoint = string_edge_set - intersecting_edges
pathfx_edge_disjoint = pathfx_edge_set - intersecting_edges

num_shared_edges = len(intersecting_edges)
num_distinct_string_edges = len(string_edge_disjoint)
num_distinct_pathfx_edges = len(pathfx_edge_disjoint)

print("\nComparison of EDGES in STRING and PathFX NetworkX Graphs")
print("----------------------------------------------------------------")
print("Number of Shared Common Edges from both graph networks:", num_shared_edges)
print("Number of distinct STRING edges:", num_distinct_string_edges)
print("Number of distinct PathFX edges:", num_distinct_pathfx_edges)
print()


Comparison of NODES in STRING and PathFX NetworkX Graphs
----------------------------------------------------------------
Number of Shared Common Nodes from both graph networks: 13948
Number of distinct STRING nodes: 5390
Number of distinct PathFX nodes: 8884


Comparison of EDGES in STRING and PathFX NetworkX Graphs
----------------------------------------------------------------
Number of Shared Common Edges from both graph networks: 72278
Number of distinct STRING edges: 5620424
Number of distinct PathFX edges: 58842



In [15]:
# Every figure in this cell is one of two kinds (edge scores or interactions per
# protein); the filter modes only differ in axis limits and title suffixes.
# Each view below is an (axis_array, title suffix) pair, listed in plotting order.

def _plot_edge_score_histograms(views):
    """Save one PathFX-vs-STRING edge-score histogram per (axis_array, suffix) view."""
    for axis_limits, title_suffix in views:
        plot_pathfx_vs_string_histplot(output_graph_folder, pathfx_removed_cross_pair_df, string_removed_cross_pair_df, "weight", "Avg Physical Combined Score", x_label="Edge Score", y_label="Count", input_kde=True, input_colors=["blue","red"], set_axis=True, axis_array=axis_limits, input_labels=["PathFX","STRING"], graph_title="Distribution of edge scores in STRING and PathFX" + title_suffix)


def _plot_interaction_count_histograms(views):
    """Save one interactions-per-protein histogram per (axis_array, suffix) view."""
    for axis_limits, title_suffix in views:
        plot_pathfx_vs_string_histplot(output_graph_folder, pathfx_source_to_target_weight_stats_df, string_summary_score_stats_for_avg_physical_combined_score_df, "count", "count", x_label="Number of Protein Interaction", y_label="Count", input_kde=True, input_colors=["blue","red"], set_axis=True, axis_array=axis_limits, input_labels=["PathFX","STRING"], graph_title="Distribution of Number of Interactions per Protein in STRING and PathFX" + title_suffix)


def _plot_comparison_histograms(edge_score_views, interaction_count_views):
    """Plot all edge-score views first, then all interaction-count views."""
    _plot_edge_score_histograms(edge_score_views)
    _plot_interaction_count_histograms(interaction_count_views)


# SAME NODES and CLOSE NODES use identical views
_node_filter_edge_views = [
    ([0, 1.1, 0, 2000000], ""),
    ([0.8, 1.1, 0, 150000], " (ZOOMED RIGHT)"),
    ([0, 0.7, 0, 80000], " (ZOOMED LEFT)"),
]
_node_filter_interaction_views = [
    ([-100, 7000, 0, 3000], ""),
    ([-10, 2000, 0, 2000], " (ZOOMED)"),
    ([-10, 500, 0, 1250], " (ZOOMED 2x)"),
]

# SAME EDGES
_edge_filter_edge_views = [
    ([0, 1.1, 0, 10000], ""),
    ([0.2, 0.8, 0, 5000], " (ZOOMED)"),
]
_edge_filter_interaction_views = [
    ([-100, 2000, 0, 3000], ""),
    ([-10, 250, 0, 1000], " (ZOOMED)"),
    ([0, 50, 0, 400], " (ZOOMED 2x)"),
]

# NON-filter
_unfiltered_edge_views = [
    ([0, 1.1, 0, 2000000], ""),
    ([0, 1.1, 0, 150000], " (ZOOMED)"),
    ([0, 1.1, 0, 40000], " (ZOOMED 2x)"),
]
_unfiltered_interaction_views = [
    ([-100, 8000, 0, 1100], ""),
    ([-100, 2000, 0, 6000], " (ZOOMED)"),
    ([-10, 50, 0, 6000], " (ZOOMED 2x)"),
]

if save_visual_figures:
    if filter_data:
        # The per-mode flags are only consulted when filtering is enabled
        # filter for SAME NODES
        if filter_for_same_nodes:
            _plot_comparison_histograms(_node_filter_edge_views, _node_filter_interaction_views)
        # filter for CLOSE NODES
        if filter_for_close_nodes:
            _plot_comparison_histograms(_node_filter_edge_views, _node_filter_interaction_views)
        # filter for SAME EDGES
        if filter_for_same_edges:
            _plot_comparison_histograms(_edge_filter_edge_views, _edge_filter_interaction_views)
    else:
        # NON-filter
        _plot_comparison_histograms(_unfiltered_edge_views, _unfiltered_interaction_views)


Constructed and saved outputs/output_comparison_network_analysis/visual_graphs/Distribution of edge scores in STRING and PathFX_histplot.png
Constructed and saved outputs/output_comparison_network_analysis/visual_graphs/Distribution of edge scores in STRING and PathFX (ZOOMED RIGHT)_histplot.png
Constructed and saved outputs/output_comparison_network_analysis/visual_graphs/Distribution of edge scores in STRING and PathFX (ZOOMED LEFT)_histplot.png
Constructed and saved outputs/output_comparison_network_analysis/visual_graphs/Distribution of Number of Interactions per Protein in STRING and PathFX_histplot.png
Constructed and saved outputs/output_comparison_network_analysis/visual_graphs/Distribution of Number of Interactions per Protein in STRING and PathFX (ZOOMED)_histplot.png
Constructed and saved outputs/output_comparison_network_analysis/visual_graphs/Distribution of Number of Interactions per Protein in STRING and PathFX (ZOOMED 2x)_histplot.png


In [16]:
 # Outputting information about shared and distinct Nodes and Edges for both network graphs

if save_text_outputs:

    # (file name, collection) pairs; every collection is written one item per line.
    # Node and edge sets come from the comparison cells above.
    _network_item_outputs = [
        ("intersecting_network_nodes_output.txt", intersecting_nodes),
        ("string_only_network_nodes_output.txt", string_node_disjoint),
        ("pathfx_only_network_nodes_output.txt", pathfx_node_disjoint),
        ("intersecting_network_edges_output.txt", intersecting_edges),
        ("string_only_network_edges_output.txt", string_edge_disjoint),
        ("pathfx_only_network_edges_output.txt", pathfx_edge_disjoint),
    ]

    for _output_name, _items in _network_item_outputs:
        # output_folder is "outputs/output_comparison_network_analysis/", the same
        # location the paths were previously hard-coded to. The context manager
        # closes the handle even if a write fails part-way through.
        with open(output_folder + _output_name, "w") as _item_file:
            # str() keeps edge tuples printable with %-formatting
            _item_file.writelines("%s\n" % str(_item) for _item in _items)

In [17]:
 # Outputting Information of Revised Max Physical STRING Network Graph (Physical Combined Score) to a text file

if save_text_outputs:

    # Write a plain-text report to network_comparison_info_output.txt with three parts:
    #   1. shared / distinct node and edge counts between the two networks,
    #   2. summary statistics of the PathFX graph,
    #   3. summary statistics of the STRING graph.
    # Every value written here is computed in other cells; this cell only formats it.
    #
    # The file is opened in a `with` block so the handle is closed even when one of
    # the lookups or graph queries below raises half-way through the report.
    #
    # NOTE(review): the `.loc[<stat>][0]` lookups take the first entry of the selected
    # statistics row. If the stats frames have non-integer column labels this is
    # positional indexing on a Series, which pandas has deprecated in favour of
    # `.iloc[0]` -- confirm the column labels before switching.
    with open(r"outputs/output_comparison_network_analysis/network_comparison_info_output.txt","w+") as comparison_output_info_file:

        # ---- Part 1: overlap between the two networks ----
        comparison_output_info_file.write("\n")
        comparison_output_info_file.write("PathFX Network Analysis vs. String Network Analysis \n")
        comparison_output_info_file.write("============================================================================================= \n\n")
        comparison_output_info_file.write("Number of Shared Common Nodes from both graph networks: " + str(num_shared_nodes) + "\n")
        comparison_output_info_file.write("Number of distinct STRING nodes: " + str(num_distinct_string_nodes) + "\n")
        comparison_output_info_file.write("Number of distinct PathFX nodes: " + str(num_distinct_pathfx_nodes) + "\n\n")

        comparison_output_info_file.write("Number of Shared Common Edges from both graph networks: " + str(num_shared_edges) + "\n")
        comparison_output_info_file.write("Number of distinct STRING edges: " + str(num_distinct_string_edges) + "\n")
        comparison_output_info_file.write("Number of distinct PathFX edges: " + str(num_distinct_pathfx_edges) + "\n\n\n")

        # ---- Part 2: PathFX network ----
        comparison_output_info_file.write("PathFX Network Analysis \n")
        comparison_output_info_file.write("============================================================================================= \n \n")

        comparison_output_info_file.write("PathFX NetworkX Interactome \n")
        comparison_output_info_file.write("------------------------------------------- \n")
        comparison_output_info_file.write("Number of Unique sources: " + str(num_sources) + "\n")
        comparison_output_info_file.write("Number of Unique targets: " + str(num_targets) + "\n")
        comparison_output_info_file.write("Number of Unique sources and targets: " + str(len(pathfx_unique_nodes)) + "\n \n")

        comparison_output_info_file.write("PathFX NetworkX Graph \n")
        comparison_output_info_file.write("------------------------------------- \n")
        comparison_output_info_file.write("Number of Nodes: " + str(pathfx_G.number_of_nodes()) + "\n")
        comparison_output_info_file.write("Number of Edges: " + str(pathfx_G.number_of_edges()) + "\n \n")

        comparison_output_info_file.write("Average node degree: " + str(pathfx_num_target_per_source_stats_df.loc["mean"][0]) + "\n")
        comparison_output_info_file.write("Standard Deviation node degree: " + str(pathfx_num_target_per_source_stats_df.loc["std"][0]) + "\n")
        comparison_output_info_file.write("Max node degree: " + str(pathfx_num_target_per_source_stats_df.loc["max"][0]) + "\n")
        comparison_output_info_file.write("Min node degree: " + str(pathfx_num_target_per_source_stats_df.loc["min"][0]) + "\n \n")

        comparison_output_info_file.write("Average edge weight: " + str(pathfx_all_weight_stats_df.loc["mean"][0]) + "\n")
        # Label fixed: this value comes from the edge-weight stats, but it was
        # previously printed as "Standard Deviation node degree".
        comparison_output_info_file.write("Standard Deviation edge weight: " + str(pathfx_all_weight_stats_df.loc["std"][0]) + "\n")
        comparison_output_info_file.write("Max edge weight: " + str(pathfx_all_weight_stats_df.loc["max"][0]) + "\n")
        comparison_output_info_file.write("Min edge weight: " + str(pathfx_all_weight_stats_df.loc["min"][0]) + "\n \n")

        comparison_output_info_file.write("nx Graph is connected - " + str(nx.is_connected(pathfx_G)) + "\n")
        comparison_output_info_file.write("Number of Connected Components: " + str(nx.number_connected_components(pathfx_G)) + "\n \n \n \n")

        # Only the first 25 rows of the frame are written; the ordering is whatever
        # the frame already has when this cell runs.
        comparison_output_info_file.write("Top 25 Proteins with Most Neighbors - Weighted Edge Scores \n")
        comparison_output_info_file.write("--------------------------------------------------------------------------------------------- \n")
        comparison_output_info_file.write(pathfx_source_to_target_weight_stats_df.head(25).to_string())
        comparison_output_info_file.write("\n \n")

        # ---- Part 3: STRING network ----
        comparison_output_info_file.write("STRING Network Analysis \n")
        comparison_output_info_file.write("============================================================================================= \n \n")

        comparison_output_info_file.write("STRING Database \n")
        comparison_output_info_file.write("------------------------------------------- \n")
        comparison_output_info_file.write("Number of Unique Protein 1s: " + str(string_num_unique_node_one) + "\n")
        comparison_output_info_file.write("Number of Unique Protein 2s: " + str(string_num_unique_node_two) + "\n")
        comparison_output_info_file.write("Number of Unique Protein: " + str(string_num_unique_nodes) + "\n \n")

        comparison_output_info_file.write("STRING NetworkX Graph \n")
        comparison_output_info_file.write("------------------------------------- \n")
        comparison_output_info_file.write("Number of Nodes: " + str(string_G.number_of_nodes()) + "\n")
        comparison_output_info_file.write("Number of Edges: " + str(string_G.number_of_edges()) + "\n \n")

        comparison_output_info_file.write("Average node degree: " + str(string_num_protein_per_protein_stats_df.loc["mean"][0]) + "\n")
        comparison_output_info_file.write("Standard Deviation node degree: " + str(string_num_protein_per_protein_stats_df.loc["std"][0]) + "\n")
        comparison_output_info_file.write("Max node degree: " + str(string_num_protein_per_protein_stats_df.loc["max"][0]) + "\n")
        comparison_output_info_file.write("Min node degree: " + str(string_num_protein_per_protein_stats_df.loc["min"][0]) + "\n \n")

        comparison_output_info_file.write("Average edge weight: " + str(string_all_avg_physical_combined_scores_stats_df.loc["mean"][0]) + "\n")
        # Label fixed here as well: the value is the std of the edge weights.
        comparison_output_info_file.write("Standard Deviation edge weight: " + str(string_all_avg_physical_combined_scores_stats_df.loc["std"][0]) + "\n")
        comparison_output_info_file.write("Max edge weight: " + str(string_all_avg_physical_combined_scores_stats_df.loc["max"][0]) + "\n")
        comparison_output_info_file.write("Min edge weight: " + str(string_all_avg_physical_combined_scores_stats_df.loc["min"][0]) + "\n \n")

        comparison_output_info_file.write("nx Graph is connected - " + str(nx.is_connected(string_G)) + "\n")
        comparison_output_info_file.write("Number of Connected Components: " + str(nx.number_connected_components(string_G)) + "\n \n \n \n")

        comparison_output_info_file.write("Top 25 Proteins with Most Neighbors - Physical Avg Combined Scored \n")
        comparison_output_info_file.write("--------------------------------------------------------------------------------------------- \n")
        comparison_output_info_file.write(string_summary_score_stats_for_avg_physical_combined_score_df.head(25).to_string())
        comparison_output_info_file.write(" ")