In [1]:
import networkx as nx
import numpy as np
import pandas as pd
import os
import sys
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

In [2]:
"""
Helper Functions
"""

# function checks if directory exists, if not it constructs it
def check_directory_exists(dir_name):
    """Make sure ``dir_name`` exists, creating it and any missing parents.

    Parameters
    ----------
    dir_name : str
        Path of the directory that should exist after the call.

    Notes
    -----
    If something already exists at ``dir_name`` the function does nothing.
    """
    if not os.path.exists(dir_name):
        # exist_ok=True closes the check-then-create race: if the directory
        # appears between the test above and this call, makedirs no longer
        # raises FileExistsError.
        os.makedirs(dir_name, exist_ok=True)

# function saves DataFrame, list, or set as a textfile in a specific folder
def save_to_text_file(output_folder_dest, input_data, text_file_name):
    """Write a text rendering of ``input_data`` to a ``.txt`` file.

    Parameters
    ----------
    output_folder_dest : str
        Prefix for the output path. It is concatenated directly with the file
        name, so it must already end with a path separator.
    input_data : object
        A ``pd.DataFrame`` is rendered with ``DataFrame.to_string()``; any
        other object is rendered with ``str()``.
    text_file_name : str
        File name without the ``.txt`` extension.
    """
    text_file_ouput = output_folder_dest + text_file_name + ".txt"

    # Render before opening the file so a failure while formatting does not
    # leave a truncated, empty file behind.
    if isinstance(input_data, pd.DataFrame):
        rendered_text = input_data.to_string()
    else:
        rendered_text = str(input_data)

    # The context manager closes the handle even if the write raises.
    with open(text_file_ouput, 'w') as drug_output_info_file:
        drug_output_info_file.write(rendered_text)

    print("Constructed and saved", text_file_ouput)

# Load a pickled object from disk
def read_pickle_file(file_path):
    """Return the object stored in the pickle file at ``file_path``.

    Parameters
    ----------
    file_path : str
        Location of the pickle file.

    Returns
    -------
    object
        Whatever ``pd.read_pickle`` yields for the file.

    Raises
    ------
    SystemExit
        If nothing exists at ``file_path``.

    Notes
    -----
    Unpickling can execute arbitrary code, so only load files from a trusted
    source.
    """
    if os.path.exists(file_path):
        return pd.read_pickle(file_path)
    sys.exit("Can't locate input file %s" % file_path)
    
# Save data into a pickle file
def save_to_pickle_file(output_folder_dest, dict_data, dict_file_name):
    """Serialize ``dict_data`` to a ``.pkl`` file with the highest pickle protocol.

    Parameters
    ----------
    output_folder_dest : str
        Prefix for the output path. It is concatenated directly with the file
        name, so it must already end with a path separator.
    dict_data : object
        Any picklable object to store.
    dict_file_name : str
        File name without the ``.pkl`` extension.
    """
    # Imported here because the notebook's import cell never imports pickle;
    # without this the function fails with NameError on first use.
    import pickle

    output_dict_filename = output_folder_dest + dict_file_name + '.pkl'
    with open(output_dict_filename, 'wb') as handle:
        pickle.dump(dict_data, handle, protocol=pickle.HIGHEST_PROTOCOL)
    print("Constructed and saved", output_dict_filename)

# Read in a CSV file
def read_csv_file(file_path, input_sep=',', input_delimiter=None, input_index_col=None, input_dtype=None, input_delim_whitespace=False, input_low_memory=True):
    """Load a delimited text file into a DataFrame with ``pd.read_csv``.

    Parameters
    ----------
    file_path : str
        Location of the file to read.
    input_sep : str, default ','
        Field separator, used when neither ``input_delimiter`` nor
        ``input_delim_whitespace`` is given.
    input_delimiter : str, optional
        Alias for ``input_sep``; takes precedence over it when not ``None``.
    input_index_col, input_dtype, input_low_memory
        Forwarded to ``pd.read_csv`` as ``index_col``, ``dtype`` and
        ``low_memory``.
    input_delim_whitespace : bool, default False
        When true, fields are split on runs of whitespace; this takes
        precedence over both separator arguments.

    Returns
    -------
    pd.DataFrame

    Raises
    ------
    SystemExit
        If nothing exists at ``file_path``.
    """
    if not os.path.exists(file_path):
        sys.exit("Can't locate input file %s" % file_path)

    # pandas rejects an explicit ``sep`` combined with ``delimiter`` or
    # ``delim_whitespace``, and ``delim_whitespace`` is deprecated in favour of
    # sep=r"\s+". Collapse the three options into one separator so each
    # argument is honoured and only ``sep`` is ever passed on.
    if input_delim_whitespace:
        separator = r"\s+"
    elif input_delimiter is not None:
        separator = input_delimiter
    else:
        separator = input_sep

    return pd.read_csv(filepath_or_buffer=file_path, sep=separator, index_col=input_index_col, dtype=input_dtype, low_memory=input_low_memory)

# function saves a DataFrame as a CSV file in a specific folder
def save_to_csv_file(output_folder_dest, df, csv_file_name, input_index=False):
    """Write ``df`` to ``<output_folder_dest><csv_file_name>.csv``.

    Parameters
    ----------
    output_folder_dest : str
        Prefix for the output path. It is concatenated directly with the file
        name, so it must already end with a path separator.
    df : pd.DataFrame
        Table to write; its ``to_csv`` method is called.
    csv_file_name : str
        File name without the ``.csv`` extension.
    input_index : bool, default False
        Forwarded to ``to_csv`` as ``index``; controls whether row labels are
        written.
    """
    output_filename = "".join((output_folder_dest, csv_file_name, ".csv"))
    df.to_csv(path_or_buf=output_filename, index=input_index)
    print("Constructed and saved", output_filename)

In [3]:
"""
Constructing Network graph for Protein-Protein Combined score interaction from PathFX-Github
"""
# Load the pickled interactome object. The next cell passes it to
# nx.to_pandas_edgelist, so it is presumably a networkx graph; confirm.
# NOTE(review): unpickling can execute arbitrary code, so this input file must
# come from a trusted source.
pathfx_G = read_pickle_file(
    file_path="inputs/input_clean_pathfx_score_table/pathfx_mp_interactome.pkl"
)

In [4]:
# Flatten the graph into an edge table: one row per edge, with the endpoint
# columns named explicitly (these are also the function's default names).
pre_pathfx_score_df = nx.to_pandas_edgelist(
    pathfx_G, source="source", target="target"
)
print()
display("pre_pathfx_score_df", pre_pathfx_score_df)
print()




'pre_pathfx_score_df'

Unnamed: 0,source,target,weight
0,rs1042713,isoproterenol,0.233675
1,rs1042713,Ace Inhibitors,0.285181
2,rs1042713,Angiotensin,0.285181
3,rs1042713,Plain,0.285181
4,rs1042713,risperidone,0.285181
...,...,...,...
139082,VCAN,SELP,0.279930
139083,NFIC,NFIB,0.279930
139084,NFIB,NFIX,0.279930
139085,SELK,SELK,0.416595





In [5]:
# The filter keeps rows whose source equals its target, i.e. edges from a node
# to itself, so the label names self-interactions (it previously said
# "Nonself", the opposite of what is displayed).
print("Self-interaction Records: ")
pre_pathfx_score_df[pre_pathfx_score_df["source"] == pre_pathfx_score_df["target"]]

Nonself Records: 


Unnamed: 0,source,target,weight
34,CHMP1B,CHMP1B,0.279930
116,ITGA9,ITGA9,0.291073
156,ITGA2,ITGA2,0.380279
199,ITGA4,ITGA4,0.380279
297,TRHR,TRHR,0.359348
...,...,...,...
139078,refseq:NP_003461,refseq:NP_003461,0.291073
139079,uniprotkb:V9HW89,uniprotkb:V9HW89,0.437568
139081,PSPH,PSPH,0.380956
139085,SELK,SELK,0.416595


In [6]:
# Keep only the edges whose two endpoints differ, discarding every row where
# source and target are the same node.
pre_pathfx_score_df = pre_pathfx_score_df.loc[
    pre_pathfx_score_df["source"].ne(pre_pathfx_score_df["target"])
]
display("pre_pathfx_score_df", pre_pathfx_score_df)

'pre_pathfx_score_df'

Unnamed: 0,source,target,weight
0,rs1042713,isoproterenol,0.233675
1,rs1042713,Ace Inhibitors,0.285181
2,rs1042713,Angiotensin,0.285181
3,rs1042713,Plain,0.285181
4,rs1042713,risperidone,0.285181
...,...,...,...
139075,SFTPA2,SFTPA1,0.359348
139080,SELL,VCAN,0.279930
139082,VCAN,SELP,0.279930
139083,NFIC,NFIB,0.279930


In [7]:
"""
Outputting Number of Distinct Proteins
"""
# Distinct labels appearing in each endpoint column of the edge table
source_list = pre_pathfx_score_df["source"].unique()
target_list = pre_pathfx_score_df["target"].unique()
num_sources, num_targets = len(source_list), len(target_list)

# Pool both columns, then de-duplicate to get every node that occurs on
# either side of an edge.
node_list = [*source_list, *target_list]
pathfx_unique_nodes = pd.Series(node_list).unique()

print("Outputting Number of Distinct Proteins")
print("------------------------------------------------")
print("Number of Unique sources:", num_sources)
print("Number of Unique targets:", num_targets)
print("Number of Unique sources and targets:", len(pathfx_unique_nodes))

Outputting Number of Distinct Proteins
------------------------------------------------
Number of Unique sources: 17602
Number of Unique targets: 18089
Number of Unique sources and targets: 23970


In [8]:
# Build the reversed copy of every edge: swap the two endpoint column names,
# keep the weight, and put the columns back in source/target/weight order.
mirror_copy_pre_pathfx_score_df = pre_pathfx_score_df.rename(
    columns={"source": "target", "target": "source"}
)[["source", "target", "weight"]]
print()
display("mirror_copy_pre_pathfx_score_df", mirror_copy_pre_pathfx_score_df)
print()




'mirror_copy_pre_pathfx_score_df'

Unnamed: 0,source,target,weight
0,isoproterenol,rs1042713,0.233675
1,Ace Inhibitors,rs1042713,0.285181
2,Angiotensin,rs1042713,0.285181
3,Plain,rs1042713,0.285181
4,risperidone,rs1042713,0.285181
...,...,...,...
139075,SFTPA1,SFTPA2,0.359348
139080,VCAN,SELL,0.279930
139082,SELP,VCAN,0.279930
139083,NFIB,NFIC,0.279930





In [9]:
# Stack the original edges on top of their reversed copies and drop rows that
# are exact duplicates, so each pair appears in both directions once.
# Row labels are carried over from both halves, so index values can repeat.
pathfx_score_df = pd.concat(
    [pre_pathfx_score_df, mirror_copy_pre_pathfx_score_df]
).drop_duplicates()
print()
display("pathfx_score_df", pathfx_score_df)
print()




'pathfx_score_df'

Unnamed: 0,source,target,weight
0,rs1042713,isoproterenol,0.233675
1,rs1042713,Ace Inhibitors,0.285181
2,rs1042713,Angiotensin,0.285181
3,rs1042713,Plain,0.285181
4,rs1042713,risperidone,0.285181
...,...,...,...
139075,SFTPA1,SFTPA2,0.359348
139080,VCAN,SELL,0.279930
139082,SELP,VCAN,0.279930
139083,NFIB,NFIC,0.279930





In [10]:
"""
Outputting Number of Distinct Proteins
"""
# Same tally as before, now on the table that holds both edge directions
source_list = pathfx_score_df["source"].unique()
target_list = pathfx_score_df["target"].unique()
num_sources, num_targets = len(source_list), len(target_list)

# Pool both columns, then de-duplicate to get every node that occurs on
# either side of an edge.
node_list = [*source_list, *target_list]
pathfx_unique_nodes = pd.Series(node_list).unique()

print("\nOutputting Number of Distinct Proteins")
print("------------------------------------------------")
print("Number of Unique sources:", num_sources)
print("Number of Unique targets:", num_targets)
print("Number of Unique sources and targets:", len(pathfx_unique_nodes))
print()


Outputting Number of Distinct Proteins
------------------------------------------------
Number of Unique sources: 23970
Number of Unique targets: 23970
Number of Unique sources and targets: 23970



In [11]:
# Write the final interaction table to this notebook's output folder,
# creating the folder first if needed.
output_clean_pathfx_folder = "outputs/output_clean_pathfx_score_table/"
check_directory_exists(dir_name=output_clean_pathfx_folder)
save_to_csv_file(
    output_folder_dest=output_clean_pathfx_folder,
    df=pathfx_score_df,
    csv_file_name="pathfx_score_table",
)

Constructed and saved outputs/output_clean_pathfx_score_table/pathfx_score_table.csv


In [12]:
# Save a copy of the final interaction table into
# inputs/input_pathfx_network_analysis/ for a later notebook to read.
# NOTE(review): the earlier comment named string_network_analysis as the
# consumer, but the folder name points at a PathFX network analysis step;
# confirm which notebook reads this file.
input_pathfx_network_analysis_folder = "inputs/input_pathfx_network_analysis/"
check_directory_exists(dir_name=input_pathfx_network_analysis_folder)
save_to_csv_file(
    output_folder_dest=input_pathfx_network_analysis_folder,
    df=pathfx_score_df,
    csv_file_name="pathfx_score_table",
)

Constructed and saved inputs/input_pathfx_network_analysis/pathfx_score_table.csv


In [13]:
# Save a copy of the final interaction table into
# inputs/input_filter_string_and_pathfx_score_tables/ for a later notebook.
# NOTE(review): the earlier comment named string_network_analysis as the
# consumer, but the folder name suggests a step that filters the STRING and
# PathFX score tables; confirm which notebook reads this file.
input_filter_string_and_pathfx_score_table_folder = "inputs/input_filter_string_and_pathfx_score_tables/"
check_directory_exists(dir_name=input_filter_string_and_pathfx_score_table_folder)
save_to_csv_file(
    output_folder_dest=input_filter_string_and_pathfx_score_table_folder,
    df=pathfx_score_df,
    csv_file_name="pathfx_score_table",
)

Constructed and saved inputs/input_filter_string_and_pathfx_score_tables/pathfx_score_table.csv
