In [5]:
import itertools
import os
import pickle
import re
import subprocess
import sys
import warnings

import numpy as np
import pandas as pd
import xlrd

warnings.simplefilter(action='ignore', category=FutureWarning)
np.random.seed(0)

In [6]:
"""
Helper Functions
"""

# function checks if directory exists, if not it constructs it
def check_directory_exists(dir_name):
    """Make sure ``dir_name`` exists on disk.

    When nothing exists at ``dir_name`` the directory is created, together
    with any missing parent directories. When the path already exists
    (whatever it is), the function returns without touching it.
    """
    if os.path.exists(dir_name):
        return
    os.makedirs(dir_name)
        
# Read in Pickle File
def read_pickle_file(file_path):
    """Load and return the object pickled at ``file_path``.

    If ``file_path`` does not exist, the interpreter is asked to exit via
    ``sys.exit`` with a message naming the missing file.
    """
    if os.path.exists(file_path):
        # NOTE: unpickling can execute arbitrary code; only use on trusted files.
        return pd.read_pickle(file_path)
    sys.exit("Can't locate input file %s" % file_path)

def read_csv_file(file_path, input_sep=',', input_delimiter=None, input_index_col=None, input_dtype=None, input_delim_whitespace=False, input_low_memory=True):
    """Read a delimited text file into a DataFrame via ``pd.read_csv``.

    Parameters
    ----------
    file_path : path or file-like object handed to ``pd.read_csv``.
    input_sep : field separator; used when ``input_delimiter`` is None.
    input_delimiter : alias for the separator; when given it takes
        precedence over ``input_sep``.
    input_index_col, input_dtype, input_low_memory : forwarded unchanged as
        ``index_col``, ``dtype`` and ``low_memory``.
    input_delim_whitespace : when True, fields are split on runs of
        whitespace.

    Raises
    ------
    ValueError
        If ``input_delim_whitespace`` is True while a non-default separator
        or delimiter is also given.
    """
    # pandas treats ``delimiter`` as an alias of ``sep`` and rejects calls that
    # set both, so resolve a single effective separator here instead of
    # forwarding both keywords (the delimiter used to be silently dropped).
    separator = input_sep if input_delimiter is None else input_delimiter

    if input_delim_whitespace:
        if separator != ',':
            raise ValueError("Specified a delimiter with both sep and delim_whitespace=True; you can only specify one.")
        # ``delim_whitespace`` is deprecated in pandas; its documented
        # equivalent is the whitespace-run regex separator.
        separator = r"\s+"

    return pd.read_csv(
        filepath_or_buffer=file_path,
        sep=separator,
        index_col=input_index_col,
        dtype=input_dtype,
        low_memory=input_low_memory,
    )

In [None]:
# ----- Database Analysis -----
# Pipeline stage: execute the sibling notebook database_analysis.ipynb from
# this driver notebook. IPython's %run executes a .ipynb file in the current
# interactive namespace, so names it defines become visible to later cells.
# NOTE(review): presumably this stage writes the output_database_analysis/
# files that later cells load — confirm against database_analysis.ipynb.

print("Running database_analysis.ipynb\n")
%run ./database_analysis.ipynb
print("\nFinished Running database_analysis\n")

In [7]:
# Raw (as-shipped) drug -> target dictionaries, one per PathFX version
drug_intome_targets = read_pickle_file("important_database_files/drug_intome_targets.pkl")
pfxDB050620_dint = read_pickle_file("important_database_files/pfxDB050620_dint.pkl")

# The dictionary keys are the drug identifiers (drug names and DrugBank IDs)
drug_db_dict_v1_keys = [*drug_intome_targets.keys()]
drug_db_dict_v2_keys = [*pfxDB050620_dint.keys()]

# Identifiers shared by both versions, compared exactly as stored (case-sensitive)
all_unique_drugs_both_pathfx_versions_lst = list(set(drug_db_dict_v1_keys) & set(drug_db_dict_v2_keys))

# Shared identifiers carrying the DrugBank "DB" prefix
all_unique_drungbank_ids_both_pathfx_versions_lst = [
    drug_id
    for drug_id in all_unique_drugs_both_pathfx_versions_lst
    if drug_id[:2] == "DB"
]

print()
print("---- NOT LOWERCASED CORRECTED Info ----")
print("===========================================================================================================================================================")
print("Number of unique Drugs (names and DrugBank IDs) in Version 1:", len(drug_db_dict_v1_keys))
print("Number of unique Drugs (names and DrugBank IDs) in Version 2:", len(drug_db_dict_v2_keys))
print("Number of unique Drugs (names and DrugBank IDs) in both versions of the PathFX interaction databases (NOT LOWERCASED CORRECTED): ", len(all_unique_drugs_both_pathfx_versions_lst), sep="")
print("Number of unique Drugs (DrugBank ID only) in both versions of the PathFX interaction databases (NOT LOWERCASED CORRECTED): ", len(all_unique_drungbank_ids_both_pathfx_versions_lst), sep="")
print()


# Cleaned counterparts of the two dictionaries (the "LOWERCASED CORRECTED" data)
cleaned_drug_intome_targets = read_pickle_file("output_database_analysis/cleaned_data_files/cleaned_drug_intome_targets.pkl")
cleaned_pfxDB050620_dint = read_pickle_file("output_database_analysis/cleaned_data_files/cleaned_pfxDB050620_dint.pkl")

# Sorted union / intersection of the cleaned identifiers
all_drugs_union_pathfx_versions_lst = sorted(set(cleaned_drug_intome_targets) | set(cleaned_pfxDB050620_dint))
all_drugs_intersecting_pathfx_versions_lst = sorted(set(cleaned_drug_intome_targets) & set(cleaned_pfxDB050620_dint))

# "Noncomplex" names: shared identifiers without '-', '(' or ' ', lowercased,
# de-duplicated, with anything starting with "db" (DrugBank-style IDs) dropped
all_drug_names_both_pathfx_versions_lst = sorted(
    lowered_name
    for lowered_name in {
        drug_name.lower()
        for drug_name in all_drugs_intersecting_pathfx_versions_lst
        if not ("-" in drug_name or "(" in drug_name or " " in drug_name)
    }
    if lowered_name[:2] != "db"
)

# Shared cleaned identifiers carrying the lowercase "db" prefix
all_unique_drungbank_ids_both_pathfx_versions_lst = [
    drug_id
    for drug_id in all_drugs_intersecting_pathfx_versions_lst
    if drug_id[:2] == "db"
]

print("---- LOWERCASED CORRECTED Info ----")
print("===========================================================================================================================================================")
print("Number of unique Drugs (names and DrugBank IDs) in Version 1:", len(cleaned_drug_intome_targets.keys()))
print("Number of unique Drugs (names and DrugBank IDs) in Version 2:", len(cleaned_pfxDB050620_dint.keys()))
print("Number of unique Drugs (names and DrugBank IDs) union in both versions of the PathFX interaction databases: ", len(all_drugs_union_pathfx_versions_lst), sep="")
print("Number of unique Drugs (names and DrugBank IDs) intersecting in both versions of the PathFX interaction databases: ", len(all_drugs_intersecting_pathfx_versions_lst), sep="")
print("Number of unique Drugs (DrugBank ID only) in both versions of the PathFX interaction databases: ", len(all_unique_drungbank_ids_both_pathfx_versions_lst), sep="")
print("Number of unique Drugs (noncomplex name only) in both versions of the PathFX interaction databases: ", len(all_drug_names_both_pathfx_versions_lst), sep="")
print()


---- NOT LOWERCASED CORRECTED Info ----
Number of unique Drugs (names and DrugBank IDs) in Version 1: 12096
Number of unique Drugs (names and DrugBank IDs) in Version 2: 14024
Number of unique Drugs (names and DrugBank IDs) in both versions of the PathFX interaction databases (NOT LOWERCASED CORRECTED): 10958
Number of unique Drugs (DrugBank ID only) in both versions of the PathFX interaction databases (NOT LOWERCASED CORRECTED): 5899

---- LOWERCASED CORRECTED Info ----
Number of unique Drugs (names and DrugBank IDs) in Version 1: 12096
Number of unique Drugs (names and DrugBank IDs) in Version 2: 14024
Number of unique Drugs (names and DrugBank IDs) union in both versions of the PathFX interaction databases: 14884
Number of unique Drugs (names and DrugBank IDs) intersecting in both versions of the PathFX interaction databases: 11236
Number of unique Drugs (DrugBank ID only) in both versions of the PathFX interaction databases: 5899
Number of unique Drugs (noncomplex name only) in bo

In [None]:
# ----- run_two_pathfx_versions -----
# Pipeline stage: call run_two_pathfx_versions.py once per drug in the union
# of both PathFX versions, skipping drugs known to yield empty output files.

# obtaining all listed drugs that have 0 bytes in the .xlsx files constructed from the combine_pathfx_version_association_tables.py script
# these drugs cannot be inputted into the association_table_analysis pipeline
zero_byte_drug_lst = ["tetrahydrodeoxyuridine", "aminomethylcyclohexane", "arabinose-5-phosphate", "bendroflumethiazide", "bromodiphenhydramine", "carboxyatractyloside", "cyclohexyl-pentyl-maltoside", "cyclohexylformamide", "debromohymenialdisine", "dimethylthiambutene", 
                      "gamma-butyrolactone", "hydrochlorothiazide", "indane-5-sulfonamide", "lysophosphotidylserine", "methylphenobarbital", "monoisopropylphosphorylserine", 'n-anthracen-2-yl-5-methyl[1,2,4]triazolo[1,5-a]pyrimidin-7-amine', 
                      "n-cyclopropyl-4-pyrazolo[1,5-b]pyridazin-3-ylpyrimidin-2-amine", "pantothenoylaminoethenethiol", "phenoxymethylpenicillin", "phenylpropanolamine", "phosphatidylethanolamine", "phosphoenolpyruvate", "phosphorylisopropane", "tetrahydrodeoxyuridine", 
                      "thenoyltrifluoroacetone", "trifluoromethionine"]

# Build a filtered copy instead of aliasing the union list and calling
# .remove() on it: the list produced by the previous cell stays intact, so this
# cell gives the same result every time it is re-run.
zero_byte_drug_set = set(zero_byte_drug_lst)
drug_lst = [drug for drug in all_drugs_union_pathfx_versions_lst if drug not in zero_byte_drug_set]

print(len(drug_lst))

total_drug_completed = 0
failed_drug_lst = []  # drugs whose subprocess exited with a non-zero status

for drug in drug_lst:
    # Pass the drug name as a single argv entry with no shell in between: names
    # containing spaces, parentheses or brackets would otherwise be split or
    # interpreted by the shell. stderr is left attached to the notebook.
    completed = subprocess.run(
        ["python", "run_two_pathfx_versions.py", "-d", drug],
        stdout=subprocess.PIPE,
        text=True,
    )
    output = completed.stdout
    if completed.returncode != 0:
        failed_drug_lst.append(drug)
    total_drug_completed = total_drug_completed + 1
    if (total_drug_completed % 100) == 0:
        print("Finished this many drugs:", total_drug_completed)

if failed_drug_lst:
    print("Number of drugs whose run_two_pathfx_versions.py call exited with a non-zero status:", len(failed_drug_lst))

In [8]:
# ----- generate_pathfx_version_drug_association_table -----

output_generate_pathfx_version_drug_association_table_folder = "output_generate_pathfx_version_drug_association_table/"
check_directory_exists(output_generate_pathfx_version_drug_association_table_folder)

ingrediants_df = read_csv_file("output_database_analysis/output_ingrediants_info/ingrediants_table.csv")
identified_ingrediants_lst = list(ingrediants_df[ingrediants_df["Ingrediant Type"] == "identified"]["Ingrediant"])

# obtaining all listed drugs that already have a .xlsx files constructed from the combine_pathfx_version_association_tables.py script
all_finished_rtpv_drug_lst = [drug for drug in os.listdir("output_run_two_pathfx_versions/")]
all_finished_rtpv_drug_lst.sort()
all_finished_rtpv_drug_lst.remove('.DS_Store')
all_finished_rtpv_drug_lst = [drug_file_name[:-4] for drug_file_name in all_finished_rtpv_drug_lst]

drug_lst = all_finished_rtpv_drug_lst

total_drug_completed = 0

for drug in drug_lst:
    
    input_drug_name = drug

    #print("Running generate_pathfx_version_drug_association_table.ipynb for " + drug + " ...")
    %run ./generate_pathfx_version_drug_association_table.ipynb
    #print("Finished Running generate_pathfx_version_drug_association_table.ipynb for " + drug)
    
    total_drug_completed = total_drug_completed + 1
    if (total_drug_completed % 100) == 0:
        print("Finished this many drugs:", total_drug_completed)

Finished this many drugs: 100
Finished this many drugs: 200
Finished this many drugs: 300
Finished this many drugs: 400
Finished this many drugs: 500
Finished this many drugs: 600
Finished this many drugs: 700
Finished this many drugs: 800
Finished this many drugs: 900
Finished this many drugs: 1000
Finished this many drugs: 1100
Finished this many drugs: 1200
Finished this many drugs: 1300
Finished this many drugs: 1400
Finished this many drugs: 1500
Finished this many drugs: 1600
Finished this many drugs: 1700
Finished this many drugs: 1800


In [None]:
# ----- combine_all_pathfx_version_association_tables -----
# Pipeline stage: execute the sibling notebook
# combine_all_pathfx_version_association_tables.ipynb from this driver
# notebook. IPython's %run executes a .ipynb file in the current interactive
# namespace, so it can read names defined by earlier cells.
# NOTE(review): presumably combines the per-drug tables from the previous
# stage into one table — confirm against the notebook itself.

print("Running combine_all_pathfx_version_association_tables.ipynb")
%run ./combine_all_pathfx_version_association_tables.ipynb
print("Finished Running combine_all_pathfx_version_association_tables.ipynb")


In [None]:
# ----- modify_all_pathfx_version_drug_association_super_table -----
# Pipeline stage: execute the sibling notebook
# modify_all_pathfx_version_drug_association_super_table.ipynb from this
# driver notebook. IPython's %run executes a .ipynb file in the current
# interactive namespace, so it can read names defined by earlier cells.
# NOTE(review): presumably post-processes the combined "super table" built by
# the previous stage — confirm against the notebook itself.

print("Running modify_all_pathfx_version_drug_association_super_table.ipynb")
%run ./modify_all_pathfx_version_drug_association_super_table.ipynb
print("Finished Running modify_all_pathfx_version_drug_association_super_table.ipynb")
