In [1]:
import os
import pickle
import sys
import warnings

import networkx as nx
import numpy as np
import pandas as pd
warnings.simplefilter(action='ignore', category=FutureWarning)

In [2]:
"""
Helper Functions
"""

def check_directory_exists(dir_name):
    """Make sure something exists at ``dir_name``, creating the directory if needed.

    When nothing exists at the path, the directory is created together with
    any missing parent directories. When the path already exists, nothing is
    done.
    """
    if os.path.exists(dir_name):
        return
    os.makedirs(dir_name)

def save_to_text_file(output_folder_dest, input_data, text_file_name):
    """Write ``input_data`` to ``<output_folder_dest><text_file_name>.txt``.

    Args:
        output_folder_dest: Folder prefix, concatenated as-is with the file
            name (so it must already end with a path separator).
        input_data: A DataFrame is written via ``to_string()``; any other
            object is written via ``str()``.
        text_file_name: File name without the ``.txt`` extension.
    """
    text_file_output = output_folder_dest + text_file_name + ".txt"
    # Render the text before touching the file so that a failure while
    # rendering does not leave an empty/truncated file behind.
    if isinstance(input_data, pd.DataFrame):
        contents = input_data.to_string()
    else:
        contents = str(input_data)
    # The context manager closes the handle even if the write fails.
    with open(text_file_output, 'w') as text_file:
        text_file.write(contents)
    print("Constructed and saved", text_file_output)

def read_pickle_file(file):
    """Load a pickled pandas object from the path ``file``.

    Exits the interpreter via ``sys.exit`` with a message when the path does
    not exist; otherwise returns whatever ``pd.read_pickle`` loads.
    """
    if not os.path.exists(file):
        sys.exit("Can't locate input file %s" % file)
    # NOTE: unpickling can execute arbitrary code; only read trusted files.
    return pd.read_pickle(file)
    
def save_to_pickle_file(output_folder_dest, dict_data, dict_file_name):
    """Pickle ``dict_data`` to ``<output_folder_dest><dict_file_name>.pkl``.

    The folder prefix and file name are concatenated as-is, so the folder
    must already end with a path separator. The object is serialised with the
    highest pickle protocol available. Relies on the ``pickle`` module being
    imported in the notebook's import cell.
    """
    output_dict_filename = output_folder_dest + dict_file_name + '.pkl'
    with open(output_dict_filename, 'wb') as handle:
        pickle.dump(dict_data, handle, protocol=pickle.HIGHEST_PROTOCOL)
    print("Constructed and saved", output_dict_filename)

def read_csv_file(file_path, input_sep=',', input_delimiter=None, input_index_col=None, input_dtype=None, input_delim_whitespace=False, input_low_memory=True):
    """Read a delimited text file into a DataFrame.

    Args:
        file_path: Path of the file to read.
        input_sep: Field separator (default ``','``).
        input_delimiter: Alias for ``input_sep``; when given it is the
            separator that is used.
        input_index_col: Forwarded to ``pd.read_csv`` as ``index_col``.
        input_dtype: Forwarded to ``pd.read_csv`` as ``dtype``.
        input_delim_whitespace: When True, fields are split on runs of
            whitespace (``sep=r'\\s+'``).
        input_low_memory: Forwarded to ``pd.read_csv`` as ``low_memory``.

    Returns:
        The DataFrame produced by ``pd.read_csv``.

    Raises:
        ValueError: If whitespace splitting is combined with an explicit
            separator/delimiter, or if ``input_sep`` and ``input_delimiter``
            name two different separators.

    Exits the interpreter via ``sys.exit`` when ``file_path`` does not exist.
    """
    if not os.path.exists(file_path):
        sys.exit("Can't locate input file %s" % file_path)

    if input_delim_whitespace:
        if input_delimiter is not None or input_sep != ',':
            raise ValueError("Specify either a separator/delimiter or input_delim_whitespace=True, not both")
        # r'\s+' is the documented equivalent of the deprecated
        # ``delim_whitespace=True`` keyword.
        separator = r'\s+'
    elif input_delimiter is not None:
        if input_sep != ',' and input_sep != input_delimiter:
            raise ValueError("input_sep and input_delimiter specify different separators")
        separator = input_delimiter
    else:
        separator = input_sep

    return pd.read_csv(filepath_or_buffer=file_path, sep=separator, index_col=input_index_col, dtype=input_dtype, low_memory=input_low_memory)

def save_to_csv_file(output_folder_dest, df, csv_file_name, input_index=False):
    """Write ``df`` to ``<output_folder_dest><csv_file_name>.csv``.

    The folder prefix and file name are concatenated as-is, so the folder
    must already end with a path separator. ``input_index`` controls whether
    the index is written as the first column.
    """
    destination = "".join([output_folder_dest, csv_file_name, ".csv"])
    df.to_csv(destination, index=input_index)
    print("Constructed and saved", destination)

In [3]:
# Load the score table from combine_subscores_physical and switch its columns
# to the display names used in the rest of this notebook.
string_pre_score_df = read_csv_file(
    "inputs/input_string_clean_score/string_pre_score_table.csv",
    input_delim_whitespace=False,
).rename(
    columns={
        "protein1": "Protein 1 ID",
        "protein2": "Protein 2 ID",
        "physical_combined_score": "Physical Combined Score",
    }
)

# Load the tab-separated STRING protein info table (ID -> preferred name).
string_info_df = read_csv_file(
    "inputs/input_string_clean_score/9606.protein.info.v11.0.txt",
    input_sep='\t',
).rename(
    columns={
        "protein_external_id": "Protein ID",
        "preferred_name": "Protein",
    }
)

print()
display("string_pre_score_df", string_pre_score_df)
display("string_info_df", string_info_df)
print()




'string_pre_score_df'

Unnamed: 0,Protein 1 ID,Protein 2 ID,experiments,database,textmining,Physical Combined Score
0,9606.ENSP00000000233,9606.ENSP00000272298,0,0,0,41
1,9606.ENSP00000000233,9606.ENSP00000253401,0,0,0,41
2,9606.ENSP00000000233,9606.ENSP00000401445,0,0,0,41
3,9606.ENSP00000000233,9606.ENSP00000418915,0,0,542,542
4,9606.ENSP00000000233,9606.ENSP00000327801,0,0,0,41
...,...,...,...,...,...,...
11759449,9606.ENSP00000485678,9606.ENSP00000310488,0,0,0,41
11759450,9606.ENSP00000485678,9606.ENSP00000342448,0,0,0,41
11759451,9606.ENSP00000485678,9606.ENSP00000350222,0,0,0,41
11759452,9606.ENSP00000485678,9606.ENSP00000367590,0,900,0,900


'string_info_df'

Unnamed: 0,Protein ID,Protein,protein_size,annotation
0,9606.ENSP00000000233,ARF5,180,ADP-ribosylation factor 5; GTP-binding protein...
1,9606.ENSP00000000412,M6PR,277,Cation-dependent mannose-6-phosphate receptor;...
2,9606.ENSP00000001008,FKBP4,459,Peptidyl-prolyl cis-trans isomerase FKBP4; Imm...
3,9606.ENSP00000001146,CYP26B1,512,Cytochrome P450 26B1; Involved in the metaboli...
4,9606.ENSP00000002125,NDUFAF7,441,"Protein arginine methyltransferase NDUFAF7, mi..."
...,...,...,...,...
19561,9606.ENSP00000485671,ENSG00000280273,120,HCG1991042
19562,9606.ENSP00000485672,ENSG00000279458,86,annotation not available
19563,9606.ENSP00000485673,ENSG00000279988,243,annotation not available
19564,9606.ENSP00000485675,ENSG00000280116,84,annotation not available





In [4]:
# Build the interaction table that carries, for both partners, the protein ID
# and the preferred name, next to the sub-scores and the combined score.
string_score_df = (
    string_pre_score_df[["Protein 1 ID", "Protein 2 ID", "experiments", "database", "textmining", "Physical Combined Score"]]
    # attach the preferred name of the first partner
    .merge(string_info_df, how='inner', left_on='Protein 1 ID', right_on="Protein ID")
    .drop(columns=["protein_size", "Protein ID", "annotation"])
    .rename(columns={"Protein": "Protein 1"})
    # attach the preferred name of the second partner
    .merge(string_info_df, how='inner', left_on='Protein 2 ID', right_on="Protein ID")
    .drop(columns=["protein_size", "Protein ID", "annotation"])
    .rename(columns={"Protein": "Protein 2"})
)

# Put name and ID of each partner side by side, scores last.
column_names = ["Protein 1", "Protein 1 ID", "Protein 2", "Protein 2 ID", "experiments", "database", "textmining", "Physical Combined Score"]
string_score_df = string_score_df.reindex(columns=column_names)

print()
display("string_score_df", string_score_df)
print()




'string_score_df'

Unnamed: 0,Protein 1,Protein 1 ID,Protein 2,Protein 2 ID,experiments,database,textmining,Physical Combined Score
0,ARF5,9606.ENSP00000000233,CALM2,9606.ENSP00000272298,0,0,0,41
1,FKBP4,9606.ENSP00000001008,CALM2,9606.ENSP00000272298,0,0,0,41
2,CYP51A1,9606.ENSP00000003100,CALM2,9606.ENSP00000272298,0,0,0,41
3,PDK4,9606.ENSP00000005178,CALM2,9606.ENSP00000272298,0,0,104,104
4,RALA,9606.ENSP00000005257,CALM2,9606.ENSP00000272298,313,0,0,313
...,...,...,...,...,...,...,...,...
11759449,DUX4L7,9606.ENSP00000451411,ENSG00000274175,9606.ENSP00000479378,0,0,560,560
11759450,DUX4,9606.ENSP00000458065,ENSG00000274175,9606.ENSP00000479378,0,0,556,556
11759451,DUX4L8,9606.ENSP00000485452,ENSG00000274175,9606.ENSP00000479378,0,0,559,559
11759452,DYNLL2,9606.ENSP00000477310,C17orf47,9606.ENSP00000354874,212,0,0,212





In [5]:
# Self-interactions: rows whose two partners have the same preferred name.
# The comparison is on names, not IDs, so two different IDs that share a name
# also count as a self-interaction.
is_self_interaction = string_score_df["Protein 1"].eq(string_score_df["Protein 2"])
self_interacting_protein = string_score_df.loc[is_self_interaction]

print()
display("self_interacting_protein", self_interacting_protein)
print()




'self_interacting_protein'

Unnamed: 0,Protein 1,Protein 1 ID,Protein 2,Protein 2 ID,experiments,database,textmining,Physical Combined Score
2674693,ENSG00000243667,9606.ENSP00000295121,ENSG00000243667,9606.ENSP00000477980,800,0,0,800
7375638,ENSG00000216937,9606.ENSP00000364165,ENSG00000216937,9606.ENSP00000355078,0,0,0,41
7409667,ENSG00000216937,9606.ENSP00000355078,ENSG00000216937,9606.ENSP00000364165,0,0,0,41
8264678,ENSG00000243667,9606.ENSP00000477980,ENSG00000243667,9606.ENSP00000295121,800,0,0,800





In [6]:
# Non-self interactions: keep only the rows whose two partners have
# different preferred names.
is_non_self_interaction = string_score_df["Protein 1"].ne(string_score_df["Protein 2"])
non_self_interacting_protein = string_score_df.loc[is_non_self_interaction]

print()
display("non_self_interacting_protein", non_self_interacting_protein)
print()




'non_self_interacting_protein'

Unnamed: 0,Protein 1,Protein 1 ID,Protein 2,Protein 2 ID,experiments,database,textmining,Physical Combined Score
0,ARF5,9606.ENSP00000000233,CALM2,9606.ENSP00000272298,0,0,0,41
1,FKBP4,9606.ENSP00000001008,CALM2,9606.ENSP00000272298,0,0,0,41
2,CYP51A1,9606.ENSP00000003100,CALM2,9606.ENSP00000272298,0,0,0,41
3,PDK4,9606.ENSP00000005178,CALM2,9606.ENSP00000272298,0,0,104,104
4,RALA,9606.ENSP00000005257,CALM2,9606.ENSP00000272298,313,0,0,313
...,...,...,...,...,...,...,...,...
11759449,DUX4L7,9606.ENSP00000451411,ENSG00000274175,9606.ENSP00000479378,0,0,560,560
11759450,DUX4,9606.ENSP00000458065,ENSG00000274175,9606.ENSP00000479378,0,0,556,556
11759451,DUX4L8,9606.ENSP00000485452,ENSG00000274175,9606.ENSP00000479378,0,0,559,559
11759452,DYNLL2,9606.ENSP00000477310,C17orf47,9606.ENSP00000354874,212,0,0,212





In [7]:
# Preferred names (preferred_name) that map to two different protein IDs
# (protein_external_id) in the STRING info table. Hard-coded list.
proteins_with_redundant_ids = [
    "ENSG00000258947",
    "ENSG00000239810",
    "ENSG00000253117",
    "ENSG00000183628",
    "ENSG00000205457",
    "ENSG00000166160",
    "ENSG00000216937",
    "ENSG00000197054",
    "ENSG00000242852",
    "ENSG00000243667",
]

In [8]:
# Non-self interactions in which neither partner is one of the names with
# redundant IDs. The ID columns are dropped, and the Max/Avg score columns are
# filled with the record's own Physical Combined Score.
involves_redundant_name = (
    non_self_interacting_protein["Protein 1"].isin(proteins_with_redundant_ids)
    | non_self_interacting_protein["Protein 2"].isin(proteins_with_redundant_ids)
)
proteins_interac_unique_ids = (
    non_self_interacting_protein
    .loc[~involves_redundant_name, ["Protein 1", "Protein 2", "experiments", "database", "textmining", "Physical Combined Score"]]
    .assign(**{
        "Max Physical Combined Score": lambda frame: frame["Physical Combined Score"],
        "Avg Physical Combined Score": lambda frame: frame["Physical Combined Score"],
    })
)

print()
display("proteins_interac_unique_ids", proteins_interac_unique_ids)
print()




'proteins_interac_unique_ids'

Unnamed: 0,Protein 1,Protein 2,experiments,database,textmining,Physical Combined Score,Max Physical Combined Score,Avg Physical Combined Score
0,ARF5,CALM2,0,0,0,41,41,41
1,FKBP4,CALM2,0,0,0,41,41,41
2,CYP51A1,CALM2,0,0,0,41,41,41
3,PDK4,CALM2,0,0,104,104,104,104
4,RALA,CALM2,313,0,0,313,313,313
...,...,...,...,...,...,...,...,...
11759449,DUX4L7,ENSG00000274175,0,0,560,560,560,560
11759450,DUX4,ENSG00000274175,0,0,556,556,556,556
11759451,DUX4L8,ENSG00000274175,0,0,559,559,559,559
11759452,DYNLL2,C17orf47,212,0,0,212,212,212





In [9]:
# Non-self interactions in which at least one partner is a name with
# redundant IDs: first the rows matching on "Protein 1", then the rows
# matching on "Protein 2"; rows selected by both filters are kept once.
first_partner_redundant = non_self_interacting_protein["Protein 1"].isin(proteins_with_redundant_ids)
second_partner_redundant = non_self_interacting_protein["Protein 2"].isin(proteins_with_redundant_ids)
proteins_interac_nonunique_ids_one = non_self_interacting_protein.loc[first_partner_redundant]
proteins_interac_nonunique_ids_two = non_self_interacting_protein.loc[second_partner_redundant]
proteins_interac_nonunique_ids = pd.concat(
    [proteins_interac_nonunique_ids_one, proteins_interac_nonunique_ids_two]
).drop_duplicates()

print()
display("proteins_interac_nonunique_ids", proteins_interac_nonunique_ids)
print()




'proteins_interac_nonunique_ids'

Unnamed: 0,Protein 1,Protein 1 ID,Protein 2,Protein 2 ID,experiments,database,textmining,Physical Combined Score
720,ENSG00000258947,9606.ENSP00000320295,CALM2,9606.ENSP00000272298,0,0,94,94
1737,ENSG00000258947,9606.ENSP00000451560,CALM2,9606.ENSP00000272298,0,0,84,84
1816,ENSG00000243667,9606.ENSP00000477980,CALM2,9606.ENSP00000272298,0,0,0,41
2128,ENSG00000258947,9606.ENSP00000320295,ARHGEF9,9606.ENSP00000253401,0,0,0,41
2524,ENSG00000258947,9606.ENSP00000451560,ARHGEF9,9606.ENSP00000253401,0,0,0,41
...,...,...,...,...,...,...,...,...
11724878,PADI6,9606.ENSP00000483125,ENSG00000239810,9606.ENSP00000480027,0,0,257,256
11724879,PRAMEF7,9606.ENSP00000484237,ENSG00000239810,9606.ENSP00000480027,0,0,0,41
11724880,PRAMEF25,9606.ENSP00000485258,ENSG00000239810,9606.ENSP00000480027,0,0,0,41
11724881,DUX4L8,9606.ENSP00000485452,ENSG00000239810,9606.ENSP00000480027,0,0,420,420





In [10]:
# Count how many records each (Protein 1, Protein 2) name pair has, largest
# first, then summarise how many pairs have 1, 2, 3, ... records.
records_per_pair = (
    proteins_interac_nonunique_ids
    .groupby(["Protein 1", "Protein 2"])
    .count()
    .reset_index()
    .sort_values(by='Physical Combined Score', ascending=False)
)

print()
display(records_per_pair)
print()
print(records_per_pair["Physical Combined Score"].value_counts())
print()




Unnamed: 0,Protein 1,Protein 2,Protein 1 ID,Protein 2 ID,experiments,database,textmining,Physical Combined Score
5269,ENSG00000258947,ENSG00000166160,3,3,3,3,3,3
1540,ENSG00000166160,ENSG00000258947,3,3,3,3,3,3
5415,ENSG00000258947,GFRA2,2,2,2,2,2,2
4717,ENSG00000258947,ANKS1B,2,2,2,2,2,2
8027,LETM1,ENSG00000183628,2,2,2,2,2,2
...,...,...,...,...,...,...,...,...
4086,ENSG00000243667,PRKAR1B,1,1,1,1,1,1
4085,ENSG00000243667,PRKAR1A,1,1,1,1,1,1
4084,ENSG00000243667,PRKACG,1,1,1,1,1,1
4083,ENSG00000243667,PRKACB,1,1,1,1,1,1



1    5608
2    5220
3       2
Name: Physical Combined Score, dtype: int64



In [11]:
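# Note: ENSG00000258947 --- ENSG00000166160 is an anomaly: it is the only pair with 3 records (in each direction), while every other pair has 1 or 2. Both names are in proteins_with_redundant_ids.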

In [12]:
# Collapse the records of each (Protein 1, Protein 2) name pair into a single
# row holding the max and the mean of their Physical Combined Score.
pair_groups = proteins_interac_nonunique_ids.groupby(["Protein 1", "Protein 2"])
corrected_proteins_interac_nonunique_ids = pair_groups.agg(**{
    "Max Physical Combined Score": ("Physical Combined Score", "max"),
    "Avg Physical Combined Score": ("Physical Combined Score", "mean"),
}).reset_index()

# Align with the column layout of proteins_interac_unique_ids. The sub-score
# and Physical Combined Score columns are not aggregated above, so reindex
# adds them filled with NaN.
column_names = ["Protein 1", "Protein 2", "experiments", "database", "textmining", "Physical Combined Score", "Max Physical Combined Score", "Avg Physical Combined Score"]
corrected_proteins_interac_nonunique_ids = corrected_proteins_interac_nonunique_ids.reindex(columns=column_names)

print()
display("corrected_proteins_interac_nonunique_ids", corrected_proteins_interac_nonunique_ids)
print()




'corrected_proteins_interac_nonunique_ids'

Unnamed: 0,Protein 1,Protein 2,experiments,database,textmining,Physical Combined Score,Max Physical Combined Score,Avg Physical Combined Score
0,339010,ENSG00000243667,,,,,41,41.0
1,339010,ENSG00000258947,,,,,41,41.0
2,AAMDC,ENSG00000243667,,,,,41,41.0
3,ABCA4,ENSG00000166160,,,,,145,145.0
4,ABCB1,ENSG00000258947,,,,,329,307.5
...,...,...,...,...,...,...,...,...
10825,ZSCAN5A,ENSG00000239810,,,,,268,267.0
10826,ZSWIM3,ENSG00000197054,,,,,214,214.0
10827,ZSWIM3,ENSG00000239810,,,,,215,213.5
10828,ZSWIM7,ENSG00000166160,,,,,41,41.0





In [13]:
# Final interaction table: the unique-ID records followed by the collapsed
# records of the names with redundant IDs (original row labels are kept).
final_table_parts = [proteins_interac_unique_ids, corrected_proteins_interac_nonunique_ids]
string_score_df = pd.concat(final_table_parts)

print()
display("string_score_df", string_score_df)
print()




'string_score_df'

Unnamed: 0,Protein 1,Protein 2,experiments,database,textmining,Physical Combined Score,Max Physical Combined Score,Avg Physical Combined Score
0,ARF5,CALM2,0.0,0.0,0.0,41.0,41,41.0
1,FKBP4,CALM2,0.0,0.0,0.0,41.0,41,41.0
2,CYP51A1,CALM2,0.0,0.0,0.0,41.0,41,41.0
3,PDK4,CALM2,0.0,0.0,104.0,104.0,104,104.0
4,RALA,CALM2,313.0,0.0,0.0,313.0,313,313.0
...,...,...,...,...,...,...,...,...
10825,ZSCAN5A,ENSG00000239810,,,,,268,267.0
10826,ZSWIM3,ENSG00000197054,,,,,214,214.0
10827,ZSWIM3,ENSG00000239810,,,,,215,213.5
10828,ZSWIM7,ENSG00000166160,,,,,41,41.0





In [14]:
# Divide all scores by 1000 to match with PathFX.
# NOTE: string_score_df is rescaled in place, so running this cell a second
# time divides the scores by 1000 again.
score_columns = [
    "experiments",
    "database",
    "textmining",
    "Physical Combined Score",
    "Max Physical Combined Score",
    "Avg Physical Combined Score",
]
for score_column in score_columns:
    string_score_df[score_column] = string_score_df[score_column] / 1000

print()
display("string_score_df", string_score_df)
print()




'string_score_df'

Unnamed: 0,Protein 1,Protein 2,experiments,database,textmining,Physical Combined Score,Max Physical Combined Score,Avg Physical Combined Score
0,ARF5,CALM2,0.000,0.0,0.000,0.041,0.041,0.0410
1,FKBP4,CALM2,0.000,0.0,0.000,0.041,0.041,0.0410
2,CYP51A1,CALM2,0.000,0.0,0.000,0.041,0.041,0.0410
3,PDK4,CALM2,0.000,0.0,0.104,0.104,0.104,0.1040
4,RALA,CALM2,0.313,0.0,0.000,0.313,0.313,0.3130
...,...,...,...,...,...,...,...,...
10825,ZSCAN5A,ENSG00000239810,,,,,0.268,0.2670
10826,ZSWIM3,ENSG00000197054,,,,,0.214,0.2140
10827,ZSWIM3,ENSG00000239810,,,,,0.215,0.2135
10828,ZSWIM7,ENSG00000166160,,,,,0.041,0.0410





In [15]:
# Record counts at each stage of the filtering and correction process.
n_original = len(string_pre_score_df)
n_self = len(self_interacting_protein)
n_non_self = len(non_self_interacting_protein)
n_unique_ids = len(proteins_interac_unique_ids)
n_nonunique_ids = len(proteins_interac_nonunique_ids)
n_nonunique_ids_corrected = len(corrected_proteins_interac_nonunique_ids)
n_final = len(string_score_df)

print("\nPrinting out counting statistics for entire filtering and correction process")
print("-----------------------------------------------------------------------------")
print("Number of Original Records:", n_original)
print("Number of Self-Interacting Protein Records:", n_self)
print("Number of Non-self Interacting Protein Records: " + str(n_non_self) + "\n")
print("Number of Records containing Non-self Interacting Unique Protein ID:", n_unique_ids)
print("Number of Records containing Non-self Interacting Non-unique Protein ID:", n_nonunique_ids)
print("Number of Records containing Non-self Interacting Non-unique Protein ID (Corrected): " + str(n_nonunique_ids_corrected) + "\n")
print("Number of Records in Final Table:", n_final)
# difference between the original table and the final table
print("Number of Records reduced: " + str(n_original - n_final) + "\n")


Printing out counting statistics for entire filtering and correction process
-----------------------------------------------------------------------------
Number of Original Records: 11759454
Number of Self-Interacting Protein Records: 4
Number of Non-self Interacting Protein Records: 11759450

Number of Records containing Non-self Interacting Unique Protein ID: 11743396
Number of Records containing Non-self Interacting Non-unique Protein ID: 16054
Number of Records containing Non-self Interacting Non-unique Protein ID (Corrected): 10830

Number of Records in Final Table: 11754226
Number of Records reduced: 5228



In [16]:
# Write the final protein interaction table to this notebook's output folder
# (created first if it does not exist).
output_clean_string_folder = "outputs/output_clean_string_score_table/"
check_directory_exists(dir_name=output_clean_string_folder)
save_to_csv_file(
    output_folder_dest=output_clean_string_folder,
    df=string_score_df,
    csv_file_name="string_score_table",
    input_index=False,
)

Constructed and saved outputs/output_clean_string_score_table/string_score_table.csv


In [17]:
# Write the same table into the input folder of the string_network_analysis
# notebook (created first if it does not exist).
input_string_network_analysis_folder = "inputs/input_string_network_analysis/"
check_directory_exists(dir_name=input_string_network_analysis_folder)
save_to_csv_file(
    output_folder_dest=input_string_network_analysis_folder,
    df=string_score_df,
    csv_file_name="string_score_table",
    input_index=False,
)

Constructed and saved inputs/input_string_network_analysis/string_score_table.csv


In [18]:
# Write the same table into the input_filter_string_and_pathfx_score_tables
# folder (created first if it does not exist) -- presumably the input of the
# STRING/PathFX score-table filtering step; verify against that notebook.
input_filter_string_and_pathfx_score_table_folder = "inputs/input_filter_string_and_pathfx_score_tables/"
check_directory_exists(dir_name=input_filter_string_and_pathfx_score_table_folder)
save_to_csv_file(
    output_folder_dest=input_filter_string_and_pathfx_score_table_folder,
    df=string_score_df,
    csv_file_name="string_score_table",
    input_index=False,
)

Constructed and saved inputs/input_filter_string_and_pathfx_score_tables/string_score_table.csv
