In [1]:
from __future__ import print_function
import numpy as np
import pandas as pd
import os
import sys
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

In [2]:
"""
Helper Functions
"""

def check_directory_exists(dir_name):
    """Create ``dir_name`` (including missing parent folders) unless the path already exists.

    Parameters
    ----------
    dir_name : str
        Path of the directory to ensure.
    """
    if os.path.exists(dir_name):
        # Anything already at this path (directory or file) is left untouched.
        return
    os.makedirs(dir_name)

def save_to_text_file(output_folder_dest, input_data, text_file_name):
    """Write ``input_data`` as text to ``<output_folder_dest><text_file_name>.txt``.

    Parameters
    ----------
    output_folder_dest : str
        Folder prefix; it is concatenated as-is, so it must end with a path
        separator.
    input_data : pandas.DataFrame or any object
        A DataFrame is rendered with ``DataFrame.to_string()``; anything else
        (list, set, ...) is rendered with ``str()``.
    text_file_name : str
        File name without the ``.txt`` extension.
    """
    text_file_output = output_folder_dest + text_file_name + ".txt"

    # Render before opening the file so that a failing conversion does not
    # leave a truncated/empty file behind.
    if isinstance(input_data, pd.DataFrame):
        contents = input_data.to_string()
    else:
        contents = str(input_data)

    # The context manager closes the handle even if the write fails.
    with open(text_file_output, 'w') as text_file:
        text_file.write(contents)
    print("Constructed and saved", text_file_output)

def read_pickle_file(file_path):
    """Load and return the object stored in the pickle file at ``file_path``.

    The file is read with ``pd.read_pickle``. When ``file_path`` does not
    exist the function terminates via ``sys.exit`` (raising ``SystemExit``)
    with a message naming the missing path.
    """
    # NOTE(review): unpickling can execute arbitrary code - only point this at
    # files from a trusted source.
    if os.path.exists(file_path):
        return pd.read_pickle(file_path)
    sys.exit("Can't locate input file %s" % file_path)
    
def save_to_pickle_file(output_folder_dest, dict_data, dict_file_name):
    """Pickle ``dict_data`` to ``<output_folder_dest><dict_file_name>.pkl``.

    Parameters
    ----------
    output_folder_dest : str
        Folder prefix; it is concatenated as-is, so it must end with a path
        separator.
    dict_data : object
        Any picklable object (typically a dict).
    dict_file_name : str
        File name without the ``.pkl`` extension.
    """
    # Imported here because the notebook's import cell does not import
    # ``pickle``; without this the function fails with a NameError.
    import pickle

    output_dict_filename = output_folder_dest + dict_file_name + '.pkl'
    with open(output_dict_filename, 'wb') as handle:
        pickle.dump(dict_data, handle, protocol=pickle.HIGHEST_PROTOCOL)
    print("Constructed and saved", output_dict_filename)

def read_csv_file(file_path, input_sep=',', input_delimiter=None, input_index_col=None, input_dtype=None, input_delim_whitespace=False, input_low_memory=True):
    """Read a delimited text file into a DataFrame with ``pd.read_csv``.

    Parameters
    ----------
    file_path : str
        Path of the file to read. If it does not exist the function
        terminates via ``sys.exit`` (``SystemExit``).
    input_sep : str, default ','
        Field separator.
    input_delimiter : str, optional
        Alias for ``input_sep``; when given it takes precedence.
    input_index_col, input_dtype, input_low_memory
        Forwarded to the ``index_col``, ``dtype`` and ``low_memory``
        arguments of ``pd.read_csv``.
    input_delim_whitespace : bool, default False
        Split fields on runs of whitespace instead of a separator character.

    Raises
    ------
    ValueError
        If ``input_delim_whitespace`` is combined with an explicit separator.
    """
    if not os.path.exists(file_path):
        sys.exit("Can't locate input file %s" % file_path)

    # ``delimiter`` is only an alias for ``sep`` in pandas, and passing both can
    # be rejected, so the two arguments are collapsed into one separator.
    separator = input_sep if input_delimiter is None else input_delimiter

    if input_delim_whitespace:
        if input_delimiter is not None or input_sep != ',':
            raise ValueError("Specified a delimiter with both sep and delim_whitespace=True; you can only specify one.")
        # Documented equivalent of the deprecated ``delim_whitespace=True``.
        separator = r"\s+"

    return pd.read_csv(filepath_or_buffer=file_path, sep=separator, index_col=input_index_col, dtype=input_dtype, low_memory=input_low_memory)

def save_to_csv_file(output_folder_dest, df, csv_file_name, input_index=False):
    """Write ``df`` to ``<output_folder_dest><csv_file_name>.csv`` and print the path.

    Parameters
    ----------
    output_folder_dest : str
        Folder prefix; it is concatenated as-is, so it must end with a path
        separator.
    df : pandas.DataFrame
        Object whose ``to_csv`` method is used to write the file.
    csv_file_name : str
        File name without the ``.csv`` extension.
    input_index : bool, default False
        Passed to ``to_csv`` as ``index``.
    """
    output_filename = "".join([output_folder_dest, csv_file_name, ".csv"])
    df.to_csv(path_or_buf=output_filename, index=input_index)
    print("Constructed and saved", output_filename)

In [3]:
# Folder for this notebook's intermediate outputs; created if it is missing.
output_clean_string_folder = "outputs/output_combine_subscores_physical/"
check_directory_exists(dir_name=output_clean_string_folder)

In [4]:
# Prior that is subtracted from every sub-score before the sub-scores are
# combined and added back exactly once afterwards (see compute_prior_away and
# the scoring loop below).
# NOTE(review): 0.041 is presumably the prior STRING uses for its combined
# score - confirm against the STRING documentation.
prior = 0.041

In [5]:
def compute_prior_away(score, prior):
    """Remove ``prior`` from ``score`` and rescale the remainder to ``[0, 1]``.

    Scores below ``prior`` are first raised to ``prior``, so they map to 0.
    The result is ``(score - prior) / (1 - prior)``.
    """
    clamped_score = max(score, prior)
    return (clamped_score - prior) / (1 - prior)

In [6]:
# Locate the STRING links file and report how many lines it has.
input_filename = "inputs/input_combine_subscores_physical/9606.protein.links.full.v11.0.txt"
if not os.path.exists(input_filename):
    sys.exit("Can't locate input file %s" % input_filename)
with open(input_filename) as fp:
    # Count lines lazily: the file has millions of lines, so materialising
    # them all with readlines() just to take len() wastes a lot of memory.
    # The count includes the header line.
    n_rows = sum(1 for _ in fp)
print("Read following file: {}".format(input_filename))
print("Number of rows in File: {}".format(n_rows))

Read following file: inputs/input_combine_subscores_physical/9606.protein.links.full.v11.0.txt
Number of rows in File: 11759455


In [7]:
# Empty table: one (empty) column per field of the physical-interaction scores.
dict_protein = {
    column_name: []
    for column_name in (
        'Protein 1 ID',
        'Protein 2 ID',
        "experiments",
        "database",
        "textmining",
        "string_physical_interaction_scores",
    )
}
reduced_protein_calc = pd.DataFrame(data=dict_protein)
reduced_protein_calc

Unnamed: 0,Protein 1 ID,Protein 2 ID,experiments,database,textmining,string_physical_interaction_scores


In [8]:
# Stream the STRING links file once and write, for every protein pair, the
# experiments / database / textmining sub-scores plus their combined score.
count = 0

# ``with`` closes both files even when a malformed line raises part-way through
# (previously the input handle was never closed at all).
with open(input_filename) as links_file, \
        open(r"outputs/output_combine_subscores_physical/string_pre_score_table.txt", "w") as string_file:
    string_file.write("protein1 protein2 experiments database textmining physical_combined_score\n")

    # Skip the header line of the input file.
    next(links_file, None)

    for line in links_file:
        fields = line.split()
        if not fields:
            # Tolerate blank lines (e.g. a trailing newline at end of file).
            continue

        ## load the line; unpacking also checks that all 16 columns are present
        (protein1, protein2,
         neighborhood, neighborhood_transferred,
         fusion, cooccurrence,
         homology,
         coexpression, coexpression_transferred,
         experiments, experiments_transferred,
         database, database_transferred,
         textmining, textmining_transferred,
         initial_combined) = fields

        ## keep the sub-scores as read, and divide them by 1000 for the maths
        experiments_raw = float(experiments)
        database_raw = float(database)
        textmining_raw = float(textmining)
        experiments = experiments_raw / 1000
        database = database_raw / 1000
        textmining = textmining_raw / 1000

        ## compute prior away for each score
        experiments_prior_corrected = compute_prior_away(experiments, prior)
        database_prior_corrected = compute_prior_away(database, prior)
        textmining_prior_corrected = compute_prior_away(textmining, prior)

        ## next, do the 1 - multiplication:
        combined_score_one_minus = (1.0 - experiments_prior_corrected) * (1.0 - database_prior_corrected) * (1.0 - textmining_prior_corrected)

        ## and lastly, do the 1 - conversion again, and put back the prior *exactly once*
        physical_combined_score = (1.0 - combined_score_one_minus)            ## 1 - conversion
        physical_combined_score *= (1.0 - prior)                              ## scale down
        physical_combined_score += prior                                      ## and add prior.

        ## convert back to the 0-1000 scale; int() truncates rather than rounds,
        ## which is kept unchanged so that the resulting scores stay the same
        physical_combined_score = int(physical_combined_score * 1000)

        # Write the sub-scores from the values as read: sending them through
        # value / 1000 * 1000 in floating point is not guaranteed to give back
        # the same integer once truncated.
        text_line = " ".join([
            protein1,
            protein2,
            str(int(experiments_raw)),
            str(int(database_raw)),
            str(int(textmining_raw)),
            str(physical_combined_score),
        ]) + "\n"
        string_file.write(text_line)

        # Inform how many lines have been read/written
        count = count + 1
        if count % 1000000 == 0:
            print("So far, calculated this many records:", count)

So far, calculated this many records: 1000000
So far, calculated this many records: 2000000
So far, calculated this many records: 3000000
So far, calculated this many records: 4000000
So far, calculated this many records: 5000000
So far, calculated this many records: 6000000
So far, calculated this many records: 7000000
So far, calculated this many records: 8000000
So far, calculated this many records: 9000000
So far, calculated this many records: 10000000
So far, calculated this many records: 11000000


In [9]:
# Read the constructed text file back in as a DataFrame.
# sep=r"\s+" is the documented equivalent of the deprecated delim_whitespace=True.
string_pre_score_df = pd.read_csv("outputs/output_combine_subscores_physical/string_pre_score_table.txt", sep=r"\s+")
string_pre_score_df

Unnamed: 0,protein1,protein2,experiments,database,textmining,physical_combined_score
0,9606.ENSP00000000233,9606.ENSP00000272298,0,0,0,41
1,9606.ENSP00000000233,9606.ENSP00000253401,0,0,0,41
2,9606.ENSP00000000233,9606.ENSP00000401445,0,0,0,41
3,9606.ENSP00000000233,9606.ENSP00000418915,0,0,542,542
4,9606.ENSP00000000233,9606.ENSP00000327801,0,0,0,41
...,...,...,...,...,...,...
11759449,9606.ENSP00000485678,9606.ENSP00000310488,0,0,0,41
11759450,9606.ENSP00000485678,9606.ENSP00000342448,0,0,0,41
11759451,9606.ENSP00000485678,9606.ENSP00000350222,0,0,0,41
11759452,9606.ENSP00000485678,9606.ENSP00000367590,0,900,0,900


In [11]:
# Export the score table as CSV into the input folder of the next pipeline step,
# creating that folder first if needed.
inputs_folder = "inputs/input_string_clean_score_table/"
check_directory_exists(dir_name=inputs_folder)
save_to_csv_file(
    output_folder_dest=inputs_folder,
    df=string_pre_score_df,
    csv_file_name="string_pre_score_table",
    input_index=False,
)

Constructed and saved inputs/input_string_clean_score_table/string_pre_score_table.csv
