In [1]:
import os
import pickle
import sys
import warnings

import networkx as nx
import numpy as np
import pandas as pd

warnings.simplefilter(action='ignore', category=FutureWarning)

In [2]:
"""
Helper Functions
"""

# Make sure a directory exists, creating it (and any missing parents) if needed
def check_directory_exists(dir_name):
    """Create ``dir_name`` when nothing exists at that path yet.

    Parameters
    ----------
    dir_name : str or path-like
        Directory to create. Missing parent directories are created as well.

    Notes
    -----
    If something already exists at ``dir_name`` the function does nothing.
    ``exist_ok=True`` covers the case where the directory appears between the
    existence check and the creation call, so that race no longer raises.
    """
    if not os.path.exists(dir_name):
        os.makedirs(dir_name, exist_ok=True)

# Save a DataFrame, list, or set as a text file in a specific folder
def save_to_text_file(output_folder_dest, input_data, text_file_name):
    """Write ``input_data`` to ``<output_folder_dest><text_file_name>.txt``.

    Parameters
    ----------
    output_folder_dest : str
        Destination folder. It is concatenated directly with the file name,
        so it must already end with a path separator.
    input_data : pandas.DataFrame or any object
        A DataFrame is written via ``DataFrame.to_string()``; anything else is
        written via ``str()``.
    text_file_name : str
        File name without the ".txt" extension.

    The file is opened in a ``with`` block so the handle is closed even when
    rendering or writing the data raises.
    """
    text_file_path = output_folder_dest + text_file_name + ".txt"
    if isinstance(input_data, pd.DataFrame):
        rendered = input_data.to_string()
    else:
        rendered = str(input_data)
    with open(text_file_path, 'w') as text_file:
        text_file.write(rendered)
    print("Constructed and saved", text_file_path)

# Load a pickled object from disk
def read_pickle_file(file_path):
    """Return the object stored in the pickle file at ``file_path``.

    The file is read with ``pandas.read_pickle``. When ``file_path`` does not
    exist the function calls ``sys.exit`` with an explanatory message, which
    raises ``SystemExit``.

    NOTE(review): unpickling can execute arbitrary code; only use this on
    files from a trusted source.
    """
    if os.path.exists(file_path):
        return pd.read_pickle(file_path)
    sys.exit("Can't locate input file %s" % file_path)
    
# Save data into a pickle file
def save_to_pickle_file(output_folder_dest, dict_data, dict_file_name):
    """Pickle ``dict_data`` to ``<output_folder_dest><dict_file_name>.pkl``.

    Parameters
    ----------
    output_folder_dest : str
        Destination folder. It is concatenated directly with the file name,
        so it must already end with a path separator.
    dict_data : object
        Any picklable object; it is written with ``pickle.HIGHEST_PROTOCOL``.
    dict_file_name : str
        File name without the ".pkl" extension.

    Requires the ``pickle`` module to be imported at the top of the notebook;
    without that import the call fails with ``NameError``.
    """
    output_dict_filename = output_folder_dest + dict_file_name + '.pkl'
    with open(output_dict_filename, 'wb') as handle:
        pickle.dump(dict_data, handle, protocol=pickle.HIGHEST_PROTOCOL)
    print("Constructed and saved", output_dict_filename)

# Read in a CSV file
def read_csv_file(file_path, input_sep=',', input_delimiter=None, input_index_col=None, input_dtype=None, input_delim_whitespace=False, input_low_memory=True):
    """Read a delimited text file into a DataFrame.

    Parameters
    ----------
    file_path : str
        Path of the file to read. If it does not exist the function calls
        ``sys.exit`` with an explanatory message (raises ``SystemExit``).
    input_sep : str, default ','
        Field separator, used when neither ``input_delimiter`` nor
        ``input_delim_whitespace`` is given.
    input_delimiter : str or None
        Alternative name for the separator; when given it replaces
        ``input_sep``. It was previously accepted but never used.
    input_index_col, input_dtype, input_low_memory
        Forwarded to ``pandas.read_csv`` as ``index_col``, ``dtype`` and
        ``low_memory``.
    input_delim_whitespace : bool, default False
        Split fields on runs of whitespace.

    Raises
    ------
    ValueError
        If both ``input_delimiter`` and ``input_delim_whitespace`` are given.
    """
    if not os.path.exists(file_path):
        sys.exit("Can't locate input file %s" % file_path)

    if input_delim_whitespace and input_delimiter is not None:
        raise ValueError("Specify either input_delimiter or input_delim_whitespace, not both")

    # pandas treats `delimiter` as an alias of `sep` and rejects receiving an
    # explicit `sep` together with either `delimiter` or `delim_whitespace`,
    # so everything is resolved to a single `sep` value here.
    if input_delim_whitespace:
        separator = r"\s+"  # documented equivalent of delim_whitespace=True
    elif input_delimiter is not None:
        separator = input_delimiter
    else:
        separator = input_sep

    return pd.read_csv(filepath_or_buffer=file_path, sep=separator, index_col=input_index_col, dtype=input_dtype, low_memory=input_low_memory)

# Save a DataFrame as a CSV file
def save_to_csv_file(output_folder_dest, df, csv_file_name, input_index=False):
    """Write ``df`` to ``<output_folder_dest><csv_file_name>.csv``.

    The ".csv" extension is always appended, so ``csv_file_name`` should be
    given without it. ``output_folder_dest`` is concatenated directly with
    the file name and therefore must already end with a path separator.
    ``input_index`` is forwarded to ``DataFrame.to_csv`` as ``index``.
    """
    destination = "".join((output_folder_dest, csv_file_name, ".csv"))
    df.to_csv(destination, index=input_index)
    print("Constructed and saved", destination)

In [3]:
# Default configuration. It is applied only when `filter_data` has not been
# defined already (for example by an earlier cell or a parameterised run).
if 'filter_data' not in globals():
    filter_data = True
    filter_for_same_nodes = False
    filter_for_close_nodes = True
    filter_for_same_edges = False

# Validate the configuration whether it came from the defaults above or from
# the caller. A mode flag that was never defined is treated as disabled so the
# filtering cell does not fail with a NameError.
if filter_data:
    filter_for_same_nodes = globals().get('filter_for_same_nodes', False)
    filter_for_close_nodes = globals().get('filter_for_close_nodes', False)
    filter_for_same_edges = globals().get('filter_for_same_edges', False)

    # Exactly one mode must be enabled. The previous equality test only
    # rejected "all three on" and "all three off", so two enabled modes
    # slipped through and the first matching branch silently won.
    num_enabled_filters = sum(
        bool(flag)
        for flag in (filter_for_same_nodes, filter_for_close_nodes, filter_for_same_edges)
    )
    if num_enabled_filters != 1:
        sys.exit("ERROR!!! User must filter for SAME EDGES, SAME NODES, or for CLOSE NODES only!")

In [4]:
# Load the STRING and PathFX edge-score tables that the notebook compares.
string_score_df = read_csv_file(
    "inputs/input_filter_string_and_pathfx_score_tables/string_score_table.csv"
)
pathfx_score_df = read_csv_file(
    "inputs/input_filter_string_and_pathfx_score_tables/pathfx_score_table.csv"
)

# Show each table under its variable name, separated by blank lines.
for _label, _frame in (
    ("string_score_df", string_score_df),
    ("pathfx_score_df", pathfx_score_df),
):
    print()
    display(_label, _frame)
print()




'string_score_df'

Unnamed: 0,Protein 1,Protein 2,experiments,database,textmining,Physical Combined Score,Max Physical Combined Score,Avg Physical Combined Score
0,ARF5,CALM2,0.000,0.0,0.000,0.041,0.041,0.0410
1,FKBP4,CALM2,0.000,0.0,0.000,0.041,0.041,0.0410
2,CYP51A1,CALM2,0.000,0.0,0.000,0.041,0.041,0.0410
3,PDK4,CALM2,0.000,0.0,0.104,0.104,0.104,0.1040
4,RALA,CALM2,0.313,0.0,0.000,0.313,0.313,0.3130
...,...,...,...,...,...,...,...,...
11754221,ZSCAN5A,ENSG00000239810,,,,,0.268,0.2670
11754222,ZSWIM3,ENSG00000197054,,,,,0.214,0.2140
11754223,ZSWIM3,ENSG00000239810,,,,,0.215,0.2135
11754224,ZSWIM7,ENSG00000166160,,,,,0.041,0.0410





'pathfx_score_df'

Unnamed: 0,source,target,weight
0,rs1042713,isoproterenol,0.233675
1,rs1042713,Ace Inhibitors,0.285181
2,rs1042713,Angiotensin,0.285181
3,rs1042713,Plain,0.285181
4,rs1042713,risperidone,0.285181
...,...,...,...
273057,SFTPA1,SFTPA2,0.359348
273058,VCAN,SELL,0.279930
273059,SELP,VCAN,0.279930
273060,NFIB,NFIC,0.279930





In [5]:
"""
Constructing Network graphs
"""
# STRING: every column other than the two endpoint columns becomes an edge
# attribute (edge_attr=True).
string_G = nx.from_pandas_edgelist(
    string_score_df,
    source="Protein 1",
    target='Protein 2',
    edge_attr=True,
)
# PathFX: only the `weight` column is carried over as an edge attribute.
pathfx_G = nx.from_pandas_edgelist(
    pathfx_score_df,
    source="source",
    target='target',
    edge_attr='weight',
)

In [6]:
# STRING: unique names per endpoint column, then across both columns

string_node_one_list = string_score_df["Protein 1"].unique()
string_node_two_list = string_score_df["Protein 2"].unique()

string_num_unique_node_one = len(string_node_one_list)
string_num_unique_node_two = len(string_node_two_list)

string_node_list = [*string_node_one_list, *string_node_two_list]
string_unique_nodes = pd.Series(string_node_list).unique()

string_num_unique_nodes = len(string_unique_nodes)

print()
print("STRING Database")
print("-------------------------------------------")
for _caption, _count in (
    ("Number of Unique Protein 1s:", string_num_unique_node_one),
    ("Number of Unique Protein 2s:", string_num_unique_node_two),
    ("Number of Unique Protein:", string_num_unique_nodes),
):
    print(_caption, _count)
print()

# PathFX: same summary for the source / target columns

source_list = pathfx_score_df["source"].unique()
target_list = pathfx_score_df["target"].unique()

num_sources = len(source_list)
num_targets = len(target_list)

pathfx_node_list = [*source_list, *target_list]
pathfx_unique_nodes = pd.Series(pathfx_node_list).unique()

print("PathFX Interactome")
print("-------------------------------------------")
for _caption, _count in (
    ("Number of Unique sources:", num_sources),
    ("Number of Unique targets:", num_targets),
    ("Number of Unique sources and targets:", len(pathfx_unique_nodes)),
):
    print(_caption, _count)
print()

# Overlap between the two node sets

string_nodes = set(string_unique_nodes)
pathfx_nodes = set(pathfx_unique_nodes)

intersecting_nodes = pathfx_nodes.intersection(string_nodes)
string_node_disjoint = string_nodes.difference(intersecting_nodes)
pathfx_node_disjoint = pathfx_nodes.difference(intersecting_nodes)

num_shared_nodes = len(intersecting_nodes)
num_distinct_string_nodes = len(string_node_disjoint)
num_distinct_pathfx_nodes = len(pathfx_node_disjoint)

print("Both STRING and PathFX")
print("----------------------------------------------------------------")
for _caption, _count in (
    ("Number of Shared Common Nodes from both graph networks:", num_shared_nodes),
    ("Number of distinct STRING nodes:", num_distinct_string_nodes),
    ("Number of distinct PathFX nodes:", num_distinct_pathfx_nodes),
):
    print(_caption, _count)
print()


STRING Database
-------------------------------------------
Number of Unique Protein 1s: 19344
Number of Unique Protein 2s: 19344
Number of Unique Protein: 19344

PathFX Interactome
-------------------------------------------
Number of Unique sources: 23970
Number of Unique targets: 23970
Number of Unique sources and targets: 23970

Both STRING and PathFX
----------------------------------------------------------------
Number of Shared Common Nodes from both graph networks: 13948
Number of distinct STRING nodes: 5396
Number of distinct PathFX nodes: 10022



In [7]:
# Compare the NODES of the two NetworkX graphs.
string_nodes = set(string_G.nodes())
pathfx_nodes = set(pathfx_G.nodes())

intersecting_nodes = string_nodes & pathfx_nodes
string_node_disjoint = string_nodes - pathfx_nodes
pathfx_node_disjoint = pathfx_nodes - string_nodes

num_shared_nodes = len(intersecting_nodes)
num_distinct_string_nodes = len(string_node_disjoint)
num_distinct_pathfx_nodes = len(pathfx_node_disjoint)

print()
print("Comparison of NODES in STRING and PathFX NetworkX Graphs")
print("----------------------------------------------------------------")
print("Number of Shared Common Nodes from both graph networks:", num_shared_nodes)
print("Number of distinct STRING nodes:", num_distinct_string_nodes)
print("Number of distinct PathFX nodes:", num_distinct_pathfx_nodes)
print()

# Compare the EDGES of the two graphs.
string_edges = list(string_G.edges())
pathfx_edges = list(pathfx_G.edges())

# Sort each endpoint pair so (a, b) and (b, a) compare equal. Each normalised
# set is built once and reused; building it is the expensive step on
# multi-million-edge graphs and it was previously repeated for every set
# operation below.
string_edge_keys = {tuple(sorted(edge)) for edge in string_edges}
pathfx_edge_keys = {tuple(sorted(edge)) for edge in pathfx_edges}

intersecting_edges = string_edge_keys & pathfx_edge_keys
string_edge_disjoint = string_edge_keys - intersecting_edges
pathfx_edge_disjoint = pathfx_edge_keys - intersecting_edges

# The helper sets are no longer needed; release them to limit kernel memory.
del string_edge_keys, pathfx_edge_keys

num_shared_edges = len(intersecting_edges)
num_distinct_string_edges = len(string_edge_disjoint)
num_distinct_pathfx_edges = len(pathfx_edge_disjoint)

print("Comparison of EDGES in STRING and PathFX NetworkX Graphs")
print("----------------------------------------------------------------")
print("Number of Shared Common Edges from both graph networks:", num_shared_edges)
print("Number of distinct STRING edges:", num_distinct_string_edges)
print("Number of distinct PathFX edges:", num_distinct_pathfx_edges)
print()


Comparison of NODES in STRING and PathFX NetworkX Graphs
----------------------------------------------------------------
Number of Shared Common Nodes from both graph networks: 13948
Number of distinct STRING nodes: 5396
Number of distinct PathFX nodes: 10022

Comparison of EDGES in STRING and PathFX NetworkX Graphs
----------------------------------------------------------------
Number of Shared Common Edges from both graph networks: 72278
Number of distinct STRING edges: 5804835
Number of distinct PathFX edges: 64253



In [8]:
def _edges_touching_nodes(edge_df, first_col, second_col, nodes):
    """Return the rows of ``edge_df`` with at least one endpoint in ``nodes``.

    Rows matched through ``first_col`` come first, followed by rows matched
    through ``second_col``; fully duplicated rows are then dropped, keeping
    the first occurrence.
    """
    matched_first = edge_df[edge_df[first_col].isin(nodes)]
    matched_second = edge_df[edge_df[second_col].isin(nodes)]
    return pd.concat([matched_first, matched_second]).drop_duplicates()


if not filter_data:

    # Filtering disabled: pass the tables through unchanged so the following
    # cells still find `filtered_*` defined. No copy is made; the later cells
    # rebind these names rather than modifying the frames in place.
    # NOTE(review): pass-through is assumed to be the intent of filter_data=False.
    filtered_string_score_df = string_score_df
    filtered_pathfx_score_df = pathfx_score_df

elif filter_for_same_nodes:

    # Keep only nodes present in both graphs. This modifies the graphs in
    # place, so re-running the comparison cell afterwards gives other counts.
    string_G.remove_nodes_from(list(string_node_disjoint))
    pathfx_G.remove_nodes_from(list(pathfx_node_disjoint))

    filtered_string_score_df = nx.to_pandas_edgelist(string_G)
    filtered_pathfx_score_df = nx.to_pandas_edgelist(pathfx_G)

elif filter_for_close_nodes:

    # Keep every edge that has at least one endpoint among the shared nodes.
    filtered_string_score_df = _edges_touching_nodes(
        string_score_df, "Protein 1", "Protein 2", intersecting_nodes
    )
    filtered_pathfx_score_df = _edges_touching_nodes(
        pathfx_score_df, "source", "target", intersecting_nodes
    )

elif filter_for_same_edges:

    # Keep only edges present in both graphs (also modifies the graphs in place).
    string_G.remove_edges_from(list(string_edge_disjoint))
    pathfx_G.remove_edges_from(list(pathfx_edge_disjoint))

    filtered_string_score_df = nx.to_pandas_edgelist(string_G)
    filtered_pathfx_score_df = nx.to_pandas_edgelist(pathfx_G)

In [9]:
# Show both filtered tables under their variable names, separated by blank lines.
for _label, _frame in (
    ("filtered_string_score_df", filtered_string_score_df),
    ("filtered_pathfx_score_df", filtered_pathfx_score_df),
):
    print()
    display(_label, _frame)
print()




'filtered_string_score_df'

Unnamed: 0,Protein 1,Protein 2,experiments,database,textmining,Physical Combined Score,Max Physical Combined Score,Avg Physical Combined Score
0,ARF5,CALM2,0.000,0.0,0.000,0.041,0.041,0.0410
1,FKBP4,CALM2,0.000,0.0,0.000,0.041,0.041,0.0410
2,CYP51A1,CALM2,0.000,0.0,0.000,0.041,0.041,0.0410
3,PDK4,CALM2,0.000,0.0,0.104,0.104,0.104,0.1040
4,RALA,CALM2,0.313,0.0,0.000,0.313,0.313,0.3130
...,...,...,...,...,...,...,...,...
11750292,ENSG00000258947,ZNF445,,,,,0.138,0.0895
11750293,ENSG00000258947,ZNF516,,,,,0.093,0.0875
11750294,ENSG00000258947,ZNF607,,,,,0.169,0.1595
11750295,ENSG00000258947,ZNRF1,,,,,0.041,0.0410





'filtered_pathfx_score_df'

Unnamed: 0,source,target,weight
26,CHMP1B,KNSTRN,0.279930
27,CHMP1B,USP8,0.562588
28,CHMP1B,STAMBP,0.676545
29,CHMP1B,SPAST,0.416595
30,CHMP1B,SNRNP200,0.279930
...,...,...,...
273016,tat,GPX2,0.246744
273017,tat,GPX5,0.246744
273018,tat,GPX6,0.246744
273020,tat,BRIX1,0.279930





In [10]:
column_names = ["Protein 1", "Protein 2", "experiments", "database", "textmining", "Physical Combined Score", "Max Physical Combined Score", "Avg Physical Combined Score"]

# The graph-based filters return endpoints as source/target; map them back to
# the STRING column names and fix the column order.
filtered_string_score_df = (
    filtered_string_score_df
    .rename(columns={"source": "Protein 1", "target": "Protein 2"})
    .reindex(columns=column_names)
)

# Mirror image of every edge: same scores with the two endpoints swapped.
df = filtered_string_score_df.rename(
    columns={"Protein 1": "Protein 2", "Protein 2": "Protein 1"}
)[column_names]

# Append the mirrored edges so each interaction is listed in both directions,
# then drop rows that were already present.
filtered_string_score_df = pd.concat([filtered_string_score_df, df]).drop_duplicates()
display("filtered_string_score_df", filtered_string_score_df)

'filtered_string_score_df'

Unnamed: 0,Protein 1,Protein 2,experiments,database,textmining,Physical Combined Score,Max Physical Combined Score,Avg Physical Combined Score
0,ARF5,CALM2,0.000,0.0,0.000,0.041,0.041,0.0410
1,FKBP4,CALM2,0.000,0.0,0.000,0.041,0.041,0.0410
2,CYP51A1,CALM2,0.000,0.0,0.000,0.041,0.041,0.0410
3,PDK4,CALM2,0.000,0.0,0.104,0.104,0.104,0.1040
4,RALA,CALM2,0.313,0.0,0.000,0.313,0.313,0.3130
...,...,...,...,...,...,...,...,...
11750292,ENSG00000258947,ZNF445,,,,,0.138,0.0895
11750293,ENSG00000258947,ZNF516,,,,,0.093,0.0875
11750294,ENSG00000258947,ZNF607,,,,,0.169,0.1595
11750295,ENSG00000258947,ZNRF1,,,,,0.041,0.0410


In [11]:
# Mirror image of every PathFX edge: same weight with source and target swapped.
df = filtered_pathfx_score_df.rename(
    columns={"source": "target", "target": "source"}
)[["source", "target", "weight"]]

# Append the mirrored edges so each interaction is listed in both directions,
# then drop rows that were already present.
filtered_pathfx_score_df = pd.concat([filtered_pathfx_score_df, df]).drop_duplicates()
display("filtered_pathfx_score_df", filtered_pathfx_score_df)

'filtered_pathfx_score_df'

Unnamed: 0,source,target,weight
26,CHMP1B,KNSTRN,0.279930
27,CHMP1B,USP8,0.562588
28,CHMP1B,STAMBP,0.676545
29,CHMP1B,SPAST,0.416595
30,CHMP1B,SNRNP200,0.279930
...,...,...,...
273016,tat,GPX2,0.246744
273017,tat,GPX5,0.246744
273018,tat,GPX6,0.246744
273020,tat,BRIX1,0.279930


In [12]:
# CHECK Filtered Calc
# Recompute the node summary on the filtered tables.

filtered_string_node_one_list = filtered_string_score_df["Protein 1"].unique()
filtered_string_node_two_list = filtered_string_score_df["Protein 2"].unique()

filtered_string_num_unique_node_one = len(filtered_string_node_one_list)
filtered_string_num_unique_node_two = len(filtered_string_node_two_list)

filtered_string_node_list = [*filtered_string_node_one_list, *filtered_string_node_two_list]
filtered_string_unique_nodes = pd.Series(filtered_string_node_list).unique()

filtered_string_num_unique_nodes = len(filtered_string_unique_nodes)

print()
print("Filtered STRING Database")
print("-------------------------------------------")
for _caption, _count in (
    ("Number of Unique Protein 1s:", filtered_string_num_unique_node_one),
    ("Number of Unique Protein 2s:", filtered_string_num_unique_node_two),
    ("Number of Unique Protein:", filtered_string_num_unique_nodes),
):
    print(_caption, _count)
print()

# PathFX

filtered_source_list = filtered_pathfx_score_df["source"].unique()
filtered_target_list = filtered_pathfx_score_df["target"].unique()

filtered_num_sources = len(filtered_source_list)
filtered_num_targets = len(filtered_target_list)

filtered_pathfx_node_list = [*filtered_source_list, *filtered_target_list]
filtered_pathfx_unique_nodes = pd.Series(filtered_pathfx_node_list).unique()

print("Filtered PathFX Interactome")
print("-------------------------------------------")
for _caption, _count in (
    ("Number of Unique sources:", filtered_num_sources),
    ("Number of Unique targets:", filtered_num_targets),
    ("Number of Unique sources and targets:", len(filtered_pathfx_unique_nodes)),
):
    print(_caption, _count)
print()

# Overlap between the two filtered node sets

filtered_string_nodes = set(filtered_string_unique_nodes)
filtered_pathfx_nodes = set(filtered_pathfx_unique_nodes)

filtered_intersecting_nodes = filtered_string_nodes.intersection(filtered_pathfx_nodes)
filtered_string_node_disjoint = filtered_string_nodes.difference(filtered_intersecting_nodes)
filtered_pathfx_node_disjoint = filtered_pathfx_nodes.difference(filtered_intersecting_nodes)

filtered_num_shared_nodes = len(filtered_intersecting_nodes)
filtered_num_distinct_string_nodes = len(filtered_string_node_disjoint)
filtered_num_distinct_pathfx_nodes = len(filtered_pathfx_node_disjoint)

print("Both STRING and PathFX")
print("----------------------------------------------------------------")
for _caption, _count in (
    ("Number of Shared Common Nodes from both graph networks:", filtered_num_shared_nodes),
    ("Number of distinct STRING nodes:", filtered_num_distinct_string_nodes),
    ("Number of distinct PathFX nodes:", filtered_num_distinct_pathfx_nodes),
):
    print(_caption, _count)
print()


Filtered STRING Database
-------------------------------------------
Number of Unique Protein 1s: 19338
Number of Unique Protein 2s: 19338
Number of Unique Protein: 19338

Filtered PathFX Interactome
-------------------------------------------
Number of Unique sources: 22832
Number of Unique targets: 22832
Number of Unique sources and targets: 22832

Both STRING and PathFX
----------------------------------------------------------------
Number of Shared Common Nodes from both graph networks: 13948
Number of distinct STRING nodes: 5390
Number of distinct PathFX nodes: 8884



In [13]:
# Save the filtered DataFrame tables to this notebook's output folder.
# save_to_csv_file appends ".csv" itself, so base names are passed without
# the extension; passing "name.csv" produced "name.csv.csv" on disk.
# NOTE(review): if anything reads the old "*.csv.csv" files, update it to the corrected names.
output_filter_string_and_pathfx_score_tables_folder = "outputs/output_filter_string_and_pathfx_score_tables/"
check_directory_exists(output_filter_string_and_pathfx_score_tables_folder)
save_to_csv_file(output_filter_string_and_pathfx_score_tables_folder, filtered_pathfx_score_df, "pathfx_score_table", input_index=False)
save_to_csv_file(output_filter_string_and_pathfx_score_tables_folder, filtered_string_score_df, "string_score_table", input_index=False)

# Save the final PathFX interaction table into the input folder of the
# pathfx_network_analysis notebook.
input_pathfx_network_analysis_folder = "inputs/input_pathfx_network_analysis/"
check_directory_exists(input_pathfx_network_analysis_folder)
save_to_csv_file(input_pathfx_network_analysis_folder, filtered_pathfx_score_df, "pathfx_score_table", input_index=False)

# Save the final STRING interaction table into the input folder of the
# string_network_analysis notebook.
input_string_network_analysis_folder = "inputs/input_string_network_analysis/"
check_directory_exists(input_string_network_analysis_folder)
save_to_csv_file(input_string_network_analysis_folder, filtered_string_score_df, "string_score_table", input_index=False)

Constructed and saved outputs/output_filter_string_and_pathfx_score_tables/pathfx_score_table.csv.csv
Constructed and saved outputs/output_filter_string_and_pathfx_score_tables/string_score_table.csv.csv
Constructed and saved inputs/input_pathfx_network_analysis/pathfx_score_table.csv
Constructed and saved inputs/input_string_network_analysis/string_score_table.csv
