In [1]:
import networkx as nx
import random
from random import sample
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns
from multiprocessing import Pool
import numpy as np
import pandas as pd
import os
import sys
import warnings
import xlrd
import itertools
import pickle
from functools import partial
warnings.simplefilter(action='ignore', category=FutureWarning)
import matplotlib as mpl
mpl.use('TkAgg')
from matplotlib import backend_bases
import matplotlib.pyplot as plt
import itertools
import re
import ast

import plotly
import plotly.express as px
import plotly.graph_objects as go

from ast import literal_eval
from collections import Counter

sns.set_theme()
np.random.seed(0)

In [2]:
"""
Helper Functions
"""

# function checks if directory exists, if not it constructs it
def check_directory_exists(dir_name):
    """Create ``dir_name`` (and any missing parent folders) unless the path already exists."""
    if os.path.exists(dir_name):
        return
    os.makedirs(dir_name)

# function saves DataFrame, list, or set as a textfile in a specific folder
def save_to_text_file(output_folder_dest, input_data, text_file_name, show_print_out=True):
    """Write ``input_data`` to ``<output_folder_dest><text_file_name>.txt``.

    Parameters
    ----------
    output_folder_dest : str
        Folder prefix; it is concatenated as-is, so it must end with a path separator.
    input_data : object
        A pandas DataFrame is rendered with ``DataFrame.to_string()``;
        anything else is rendered with ``str()``.
    text_file_name : str
        File name without the ``.txt`` extension.
    show_print_out : bool
        When true, print the path of the file that was written.
    """
    text_file_output = output_folder_dest + text_file_name + ".txt"
    if isinstance(input_data, pd.DataFrame):
        rendered_text = input_data.to_string()
    else:
        rendered_text = str(input_data)
    # The context manager closes the handle even when the write raises.
    with open(text_file_output, 'w') as text_file:
        text_file.write(rendered_text)
    if show_print_out:
        print("Constructed and saved", text_file_output)

# Read in Pickle File
def read_pickle_file(file_path):
    """Load and return the object pickled at ``file_path``; exit with a message when the file is missing."""
    # NOTE: unpickling executes code from the file, so only load trusted inputs.
    if os.path.exists(file_path):
        return pd.read_pickle(file_path)
    sys.exit("Can't locate input file %s" % file_path)
    
# Save data into a pickle file
def save_to_pickle_file(output_folder_dest, dict_data, dict_file_name, show_print_out=False):
    """Pickle ``dict_data`` to ``<output_folder_dest><dict_file_name>.pkl`` using the highest protocol.

    ``output_folder_dest`` is concatenated as-is, so it must end with a path separator.
    The written path is printed when ``show_print_out`` is true.
    """
    pickle_path = "".join([output_folder_dest, dict_file_name, '.pkl'])
    with open(pickle_path, 'wb') as pickle_file:
        pickle.dump(dict_data, pickle_file, protocol=pickle.HIGHEST_PROTOCOL)
    if show_print_out:
        print("Constructed and saved", pickle_path)

# Read in a CSV file
def read_csv_file(file_path, input_sep=',', input_delimiter=None, input_index_col=None, input_dtype=None, input_delim_whitespace=False, input_low_memory=True):
    """Read a delimited text file into a DataFrame.

    Parameters
    ----------
    file_path : str
        File to read; the process exits with a message when it does not exist
        (same convention as the other readers in this notebook).
    input_sep : str
        Field separator, used when neither of the two options below is given.
    input_delimiter : str or None
        Alias for the separator; when not None it takes precedence over ``input_sep``.
    input_index_col, input_dtype, input_low_memory
        Forwarded to ``pandas.read_csv`` as ``index_col``, ``dtype`` and ``low_memory``.
    input_delim_whitespace : bool
        When true, fields are split on runs of whitespace.

    Returns
    -------
    pandas.DataFrame
    """
    if not os.path.exists(file_path):
        sys.exit("Can't locate input file %s" % file_path)
    # pandas rejects an explicit `sep` combined with `delim_whitespace=True`, and
    # the `delim_whitespace` keyword itself is deprecated; the regex separator
    # r"\s+" is the documented equivalent, so exactly one separator is passed.
    if input_delim_whitespace:
        separator = r"\s+"
    elif input_delimiter is not None:
        separator = input_delimiter
    else:
        separator = input_sep
    return pd.read_csv(filepath_or_buffer=file_path, sep=separator, index_col=input_index_col, dtype=input_dtype, low_memory=input_low_memory)

# function saves a DataFrame as a CSV file
def save_to_csv_file(output_folder_dest, df, csv_file_name, input_index=False, show_print_out=False):
    """Write ``df`` to ``<output_folder_dest><csv_file_name>.csv``.

    ``output_folder_dest`` is concatenated as-is, so it must end with a path separator.
    ``input_index`` is forwarded to ``DataFrame.to_csv(index=...)``; the written
    path is printed when ``show_print_out`` is true.
    """
    output_filename = output_folder_dest + csv_file_name + ".csv"
    df.to_csv(output_filename, index=input_index)
    if show_print_out:
        print("Constructed and saved", output_filename)

def save_to_gpickle_file(output_folder_dest, data, file_name, show_print_out=True):
    """Pickle ``data`` (e.g. a networkx graph) to ``<output_folder_dest><file_name>.gpickle``.

    ``nx.write_gpickle`` was removed in NetworkX 3.0; a gpickle file is a plain
    pickle, so the standard-library ``pickle`` module is used directly and the
    helper works on both old and new NetworkX releases.

    ``output_folder_dest`` is concatenated as-is, so it must end with a path separator.
    The written path is printed when ``show_print_out`` is true.
    """
    output_filename = output_folder_dest + file_name + ".gpickle"
    with open(output_filename, "wb") as handle:
        pickle.dump(data, handle, protocol=pickle.HIGHEST_PROTOCOL)
    if show_print_out:
        print("Constructed and saved", output_filename)
    
def read_networkx_gpickle_file(input_file_name):
    """Load and return the object stored in a ``.gpickle`` file.

    ``nx.read_gpickle`` was removed in NetworkX 3.0; a gpickle file is a plain
    pickle, so it is read with the standard-library ``pickle`` module. Files
    ending in ``.gz`` or ``.bz2`` are decompressed transparently, as the old
    NetworkX reader did. The process exits with a message when the file is missing.
    """
    import bz2
    import gzip

    if not os.path.exists(input_file_name):
        sys.exit("Can't locate input file %s" % input_file_name)
    compressed_openers = {".gz": gzip.open, ".bz2": bz2.open}
    opener = compressed_openers.get(os.path.splitext(input_file_name)[1], open)
    # NOTE: unpickling executes code from the file, so only load trusted inputs.
    with opener(input_file_name, "rb") as handle:
        return pickle.load(handle)
    
def set_comparison_info(set_x, set_y):
    """Return ``(union, intersection, only_in_x, only_in_y)`` for two sets."""
    shared = set_x & set_y
    return (set_x | set_y, shared, set_x - shared, set_y - shared)

# function takes in two lists and returns lists constructed via set operations
def perform_basic_set_operation_for_lsts(lst_one, lst_two):
    """Compare two lists as sets.

    Returns a six-element list: the de-duplicated first list, the de-duplicated
    second list (both in set order, i.e. unsorted), then the sorted
    intersection, sorted union, sorted elements only in the first list and
    sorted elements only in the second list.
    """
    unique_one, unique_two = set(lst_one), set(lst_two)
    common = unique_one & unique_two
    return [
        list(unique_one),
        list(unique_two),
        sorted(common),
        sorted(unique_one | unique_two),
        sorted(unique_one - common),
        sorted(unique_two - common),
    ]

In [3]:
"""
Helper Functions for Graphing
"""

def plot_vertical_barplot(output_folder_dest, fig_size, data_df, x_col, y_col, hue_col, graph_title, show_print_out=False):
    """Draw a seaborn bar plot with x tick labels rotated 90 degrees and save it as a PNG.

    The figure is written to ``<output_folder_dest><graph_title>_vbarplot.png``
    and then closed. A legend is placed in the upper right only when ``hue_col``
    is truthy; the saved path is printed when ``show_print_out`` is true.
    """
    plt.figure(figsize=fig_size)
    bar_axes = sns.barplot(data=data_df, x=x_col, y=y_col, hue=hue_col, ci=None)
    # Rotate the category labels so long names do not overlap.
    bar_axes.set_xticklabels(bar_axes.get_xticklabels(), rotation=90, ha="right")
    for set_axis_label, label_text in ((plt.xlabel, x_col), (plt.ylabel, y_col)):
        set_axis_label(label_text, fontsize=18)
    plt.title(graph_title, fontsize=25)

    if hue_col:
        plt.legend(loc='upper right')

    png_path = output_folder_dest + graph_title + "_vbarplot.png"
    plt.savefig(png_path, facecolor='w', edgecolor='w', transparent=False)
    plt.close()
    if show_print_out:
        print("Constructed and saved", png_path)
    
def plot_horizontal_barplot(output_folder_dest, fig_size, data_df, x_col, y_col, hue_col, graph_title, show_print_out=False):
    """Draw a seaborn bar plot (tick labels left unrotated) and save it as a PNG.

    The figure is written to ``<output_folder_dest><graph_title>_hbarplot.png``
    and then closed. A legend is placed in the upper right only when ``hue_col``
    is truthy; the saved path is printed when ``show_print_out`` is true.
    """
    plt.figure(figsize=fig_size)
    sns.barplot(data=data_df, x=x_col, y=y_col, hue=hue_col, ci=None)
    for set_axis_label, label_text in ((plt.xlabel, x_col), (plt.ylabel, y_col)):
        set_axis_label(label_text, fontsize=18)
    plt.title(graph_title, fontsize=25)

    if hue_col:
        plt.legend(loc='upper right')

    png_path = output_folder_dest + graph_title + "_hbarplot.png"
    plt.savefig(png_path, facecolor='w', edgecolor='w', transparent=False)
    plt.close()
    if show_print_out:
        print("Constructed and saved", png_path)
    
def plot_scatterplot(output_folder_dest, fig_size, data_df, x_col, y_col, hue_col, graph_title, show_print_out=False):
    """Draw a seaborn scatter plot and save it to ``<output_folder_dest><graph_title>.png``.

    The figure is closed after saving; the saved path is printed when
    ``show_print_out`` is true.
    """
    plt.figure(figsize=fig_size)
    sns.scatterplot(data=data_df, x=x_col, y=y_col, hue=hue_col)
    plt.title(graph_title)
    png_path = output_folder_dest + graph_title + ".png"
    plt.savefig(png_path, facecolor='w', edgecolor='w', transparent=False)
    plt.close()
    if show_print_out:
        print("Constructed and saved", png_path)

In [4]:
# Root folder for everything this notebook writes; the trailing slash matters
# because the save helpers build paths by plain string concatenation.
database_analysis_output_folder = "output_database_analysis/"
check_directory_exists(database_analysis_output_folder)

## Cleaning Important Source Files

In [5]:
"""
cleaned_data_files
------------------------------------------------------------------------------------------------------------------------------------------------------------------
"""

# Folder that receives every cleaned dictionary produced in this cell.
cleaned_data_files_output_folder = database_analysis_output_folder + "cleaned_data_files/"
check_directory_exists(cleaned_data_files_output_folder)

# drugbank_id_and_drug_name_source_to_gene_symbol_target_dict
# Cleaning step: lowercase the keys, keep the values untouched.
input_filename = "important_database_files/drugbank_data for comparing PathFX Version 2/drugbank_id_and_drug_name_source_to_gene_symbol_target_dict.pkl"
drugbank_id_and_drug_name_source_to_gene_symbol_target_dict = read_pickle_file(input_filename)
cleaned_drugbank_id_and_drug_name_source_to_gene_symbol_target_dict = {
    source.lower(): targets
    for source, targets in drugbank_id_and_drug_name_source_to_gene_symbol_target_dict.items()
}

# drug_intome_targets
# PathFX Version 1.0 file behind drugs_to_drugbank_targets.pkl
# Cleaning step: lowercase the keys, keep the values untouched.
input_filename = "important_database_files/drug_intome_targets.pkl"
drug_intome_targets = dict(read_pickle_file(input_filename))
cleaned_drug_intome_targets = {
    drug.lower(): targets
    for drug, targets in drug_intome_targets.items()
}
"""
'benzylcysteine': ['MGMT'],
'latrunculin a': ['ACTA1', 'GSN'],
'db00799': ['CYP2C8', 'RARG', 'RARB', 'RXRB', 'RARA'],
'db01190': ['CYP3A4'],
'1-ter-butyl-3-p-tolyl-1h-pyrazolo[3,4-d]pyrimidin-4-ylamine': ['RET', 'HCK'],
'db06972': ['ITGAL'],
"""

# all_phens_to_cuis
# Cleaning step: lowercase the keys, keep the values untouched.
input_filename = "important_database_files/all_phens_to_cuis.pkl"
all_phens_to_cuis = read_pickle_file(input_filename)
cleaned_all_phens_to_cuis = {
    phenotype.lower(): cui
    for phenotype, cui in all_phens_to_cuis.items()
}
"""
'anemia, sickle cell': 'C0002895',
'behcet syndrome': 'C0004943',
'craniosynostoses': 'C0010278',
'dermatitis, atopic': 'C0011615',
"""

# drugs_to_sideEffect_cuis
def Convert(tup, di):
    """Fold an iterable of ``(key, values)`` pairs into ``di`` and return ``di``.

    Values for a repeated key are appended to that key's existing list.
    ``di`` is modified in place.
    """
    for pair_key, pair_values in tup:
        if pair_key not in di:
            di[pair_key] = []
        di[pair_key].extend(pair_values)
    return di
# The pickle is unpacked as (key, values) pairs by Convert, which merges the
# values of repeated keys into one list per key.
input_filename = "important_database_files/drugs_to_sideEffect_cuis.pkl"
drugs_to_sideEffect_cuis_lst = read_pickle_file(input_filename)
output_dict = {}
# Convert fills and returns `output_dict`, so both names refer to the same dict.
cleaned_drugs_to_sideEffect_cuis = Convert(drugs_to_sideEffect_cuis_lst, output_dict)
"""
{'DB00783': ['C0027051',
...
'DB01050': ['C3278737',
'C0030305',
"""

# merged_genes_to_cuis
input_filename = "important_database_files/merged_genes_to_cuis.pkl"
merged_genes_to_cuis = dict(read_pickle_file(input_filename))
# No cleaning is applied: the "cleaned" name is an alias of the same dict object.
cleaned_merged_genes_to_cuis = merged_genes_to_cuis
"""
'BRGDA7': ['C2751088', 'C4013699'],
'DFNA39': ['C0011053', 'C0011430', 'C4049050', 'C0011436'],
"""

# merged_unique_cuis2genes
# CUIs to interactome gene lists for Version 1.0
input_filename = "important_database_files/merged_unique_cuis2genes.pkl"
merged_unique_cuis2genes = dict(read_pickle_file(input_filename))
# No cleaning is applied: the "cleaned" name is an alias of the same dict object.
cleaned_merged_unique_cuis2genes = merged_unique_cuis2genes
"""
'C0339959': ['SRSF6', 'NUMA1', 'L3MBTL1'],
'C0026918': ['IFNGR1',
'FRMPD1',
'IL10',
"""

# all_assoc_to_nodes
input_filename = "important_database_files/all_assoc_to_nodes.pkl"
all_assoc_to_nodes = read_pickle_file(input_filename)
# Re-key by lowercased name; keys that differ only by case collapse into one entry.
cleaned_all_assoc_to_nodes = {key.lower():value for key, value in all_assoc_to_nodes.items()}
# Replace every value with a list built from it (only values are reassigned,
# so iterating over the keys while doing this is safe).
for key in cleaned_all_assoc_to_nodes.keys():
    cleaned_all_assoc_to_nodes[key] = list(cleaned_all_assoc_to_nodes[key])
"""
 'fanconi anemia, complementation group j': ['MRPL36', 'BRIP1'],
 ' deafness, autosomal recessive 37, 607821 (3)': ['DFNB37', 'MYO6', 'DFNA22'],
 ' revesz syndrome, 268130 (3)': ['TIN2', 'DKCA3', 'TINF2'],
 'ventral hernia': ['PKD1', 'PRKD1', 'THAS'],
 'malignant tumor of extrahepatic bile duct': ['PROC',
"""

# Pfx050120_all_assoc_to_nodes
input_filename = "important_database_files/Pfx050120_all_assoc_to_nodes.pkl"
Pfx050120_all_assoc_to_nodes = read_pickle_file(input_filename)
# Re-key by lowercased name; keys that differ only by case collapse into one entry.
cleaned_Pfx050120_all_assoc_to_nodes = {key.lower():value for key, value in Pfx050120_all_assoc_to_nodes.items()}
# Replace every value with a list built from it.
for key in cleaned_Pfx050120_all_assoc_to_nodes.keys():
    cleaned_Pfx050120_all_assoc_to_nodes[key] = list(cleaned_Pfx050120_all_assoc_to_nodes[key])
"""
'STS'],
'schneckenbecken dysplasia': ['SLC35D1', 'INPPL1'],
'macrothrombocytopenia, autosomal dominant, tubb1-related': ['TUBB1'],
'nuchal rigidity': ['CYP2D6', 'NPY', 'GCH1', 'PDE8B', 'NTS'],
'tapered toe': ['FIG4', 'CPT2'],
'progressive microcephaly': ['PYCR2',
'EFTUD2',
'BRAT1',
"""

# Pfx050120_all_phens_to_cuis
# Cleaning step: lowercase the keys, keep the values untouched.
input_filename = "important_database_files/Pfx050120_all_phens_to_cuis.pkl"
Pfx050120_all_phens_to_cuis = read_pickle_file(input_filename)
cleaned_Pfx050120_all_phens_to_cuis = {
    phenotype.lower(): cui
    for phenotype, cui in Pfx050120_all_phens_to_cuis.items()
}
"""
'nuchal rigidity': 'C1320474',
'fungal keratitis': 'C1262117',
'liver diseases, parasitic': 'C0023897',
"""

# Pfx050120_dbid2name
# Cleaning step: lowercase both the keys and the values.
input_filename = "important_database_files/Pfx050120_dbid2name.pkl"
Pfx050120_dbid2name = read_pickle_file(input_filename)
cleaned_Pfx050120_dbid2name = {
    drugbank_id.lower(): drug_name.lower()
    for drugbank_id, drug_name in Pfx050120_dbid2name.items()
}
"""
{'db14244': 'garden snail mucin',
'db02423': 'thiopyrophosphate',
'db08458': '(4-bromophenyl)[4-({(2e)-4-[cyclopropyl(methyl)amino]but-2-enyl}oxy)phenyl]methanone',
'db02501': 'n(2)-succinyl-l-arginine',
'db13391': 'levoverbenone',
"""

# pfx050120_name2dbid (CONSTRUCTED FROM SCRATCH)
# Inverse of Pfx050120_dbid2name with both sides lowercased; if two IDs share
# the same lowercased name, the one iterated last wins.
cleaned_Pfx050120_name2dbid = {
    drug_name.lower(): drugbank_id.lower()
    for drugbank_id, drug_name in Pfx050120_dbid2name.items()
}
"""
'5-methoxy-n,n-diisopropyltryptamine': 'db01441',
'alovudine f-18': 'db14930',
'cg-200745': 'db12259',
'telbermin': 'db12639',
"""

# Pfx050120_dint
# "dint" is short for Drug Interaction Targets
input_filename = "important_database_files/Pfx050120_dint.pkl"
Pfx050120_dint = dict(read_pickle_file(input_filename))
# Re-key by lowercased name; keys that differ only by case collapse into one entry.
cleaned_Pfx050120_dint = {key.lower():value for key,value in Pfx050120_dint.items()}
# Replace every value with a list built from it.
for key in cleaned_Pfx050120_dint.keys():
    cleaned_Pfx050120_dint[key] = list(cleaned_Pfx050120_dint[key])
"""
'chymopapain': ['PRG2'],
'apd668': ['GPR119'],
'pozanicline': ['CHRNB2', 'CHRNA4'],
'db05418': ['TACR1'],
"""

# Pfx050120_merged_genes_to_cuis
input_filename = "important_database_files/Pfx050120_merged_genes_to_cuis.pkl"
Pfx050120_merged_genes_to_cuis = dict(read_pickle_file(input_filename))
# The "cleaned" name is an alias of the same dict object, so the list
# conversion below also changes Pfx050120_merged_genes_to_cuis.
cleaned_Pfx050120_merged_genes_to_cuis = Pfx050120_merged_genes_to_cuis
# Replace every value with a list built from it; keys are left as loaded.
for key in cleaned_Pfx050120_merged_genes_to_cuis.keys():
    cleaned_Pfx050120_merged_genes_to_cuis[key] = list(cleaned_Pfx050120_merged_genes_to_cuis[key])
"""
{'MYC': ['C0086404',
  'C0085750',
  'C0024419',
  'C0334299',
  'C0032463',
"""

# Pfx050120_merged_unique_cuis2genes
# CUIs to interactome gene lists for Version 2.0
input_filename = "important_database_files/Pfx050120_merged_unique_cuis2genes.pkl"
Pfx050120_merged_unique_cuis2genes = dict(read_pickle_file(input_filename))
# The "cleaned" name is an alias of the same dict object, so the list
# conversion below also changes Pfx050120_merged_unique_cuis2genes.
cleaned_Pfx050120_merged_unique_cuis2genes = Pfx050120_merged_unique_cuis2genes
# Replace every value with a list built from it; keys are left as loaded.
# NOTE(review): the sample output kept below shows an empty-string CUI key,
# which is not filtered out here - confirm whether it should be.
for key in cleaned_Pfx050120_merged_unique_cuis2genes.keys():
    cleaned_Pfx050120_merged_unique_cuis2genes[key] = list(cleaned_Pfx050120_merged_unique_cuis2genes[key])
"""
{'': ['GJB2', 'PRICKLE1', 'EPM2A', 'CSTB', 'GJB6', 'NHLRC1'],
'C4314198': ['GATA1',
'NHP2',
'ATP7B',
'PCCA',
"""

# pfxDB050620_dbid2name
# Cleaning step: lowercase both the keys and the values.
# (The bare `cleaned_pfxDB050620_dbid2name` expression that used to follow was
# removed: in the middle of a cell it displays nothing and does nothing.)
input_filename = "important_database_files/pfxDB050620_dbid2name.pkl"
pfxDB050620_dbid2name = read_pickle_file(input_filename)
cleaned_pfxDB050620_dbid2name = {key.lower():value.lower() for key,value in pfxDB050620_dbid2name.items()}
"""
'db08258': '6-{[(cyclohexylamino)carbonyl]amino}hexanoic acid',
'db03295': 'glutathionylspermidine',
'db04767': 'n-[1-(4-carbamimidoyl-benzylcarbamoyl)-3-methylsulfanyl-propyl]-3-hydroxy-2-propoxyamino-butyramid',
'db08719': '5-(5-(6-chloro-4-(4,5-dihydro-2-oxazolyl)phenoxy)pentyl)-3-methyl isoxazole',
'db12170': 'veledimex',
"""
# pfxDB050620_name2dbid (CONSTRUCTED FROM SCRATCH)
# Inverse of pfxDB050620_dbid2name with both sides lowercased; if two IDs share
# the same lowercased name, the one iterated last wins.
cleaned_pfxDB050620_name2dbid = {
    drug_name.lower(): drugbank_id.lower()
    for drugbank_id, drug_name in pfxDB050620_dbid2name.items()
}
"""
veledimex': 'db12170',
'2-aminoethanimidic acid': 'db02108',
'3-chlorophenol': 'db01957',
'etrasimod': 'db14766',
"""

# pfxDB050620_dint
# File for PathFX Version 2.0 for drugs_to_drugbank_targets.pkl
# "dint" is short for Drug Interaction Targets
input_filename = "important_database_files/pfxDB050620_dint.pkl"
pfxDB050620_dint = dict(read_pickle_file(input_filename))
# Re-key by lowercased name; keys that differ only by case collapse into one entry.
# NOTE(review): the sample output kept below this block shows mixed-case keys,
# which this lowercasing would not produce - the sample looks stale; confirm.
cleaned_pfxDB050620_dint = {key.lower():value for key,value in pfxDB050620_dint.items()}
# Replace every value with a list built from it.
for key in cleaned_pfxDB050620_dint.keys():
    cleaned_pfxDB050620_dint[key] = list(cleaned_pfxDB050620_dint[key])
"""
'2-deoxyglucose': ['SLC2A1'],
 'DB06897': ['PLK1'],
 'Abatacept': ['CD86', 'CD80'],
 'Saracatinib': ['CYP3A4'],
 'ONT-093': ['ABCB1'],
 'Protirelin': ['TRHR'],
 'DB11359': ['ALB'],
 'Estradiol cypionate': ['ABCC10',
"""

# unique_network_nodes (PathFX Version 1)
input_filename = "important_database_files/unique_network_nodes.pkl"
# The loaded object is converted to a list; no other cleaning is applied.
unique_network_nodes = list(read_pickle_file(input_filename))
# The "cleaned" name is an alias of the same list object.
cleaned_unique_network_nodes = unique_network_nodes
"""
['AIRE',
 'FRMD8',
 'MPHOSPH8',
 'TPP2',
 'HSD3B7',
 'RS2295632',
 'RPP40',
 'COMPLEX-GXYNEYWIHLAYF80AJXFVCZ-DCHQ',
 'C8ORF33',
 'ABT1',
 'LPCAT2',
 'BRD1',
"""

# sourced_phens (PathFX Version 1)
input_filename = "important_database_files/sourced_phens.pkl"
sourced_phens = dict(read_pickle_file(input_filename))
# No cleaning is applied: the "cleaned" name is an alias of the same dict object.
cleaned_sourced_phens = sourced_phens
"""
 ('ST5', 'C0848558'): {'DisGeNet', 'PheGenI'},
 ('SCN10A', 'C1142166'): {'DisGeNet', 'PheGenI'},
 ('HEY2', 'C1142166'): {'DisGeNet', 'PheGenI'},
 ('TRE-CTC1-7', 'C1142166'): {'PheGenI'},
 ('SCN5A', 'C1142166'): {'ClinVar', 'DisGeNet', 'PheGenI'},
 ('PKD2', 'C0018099'): {'PheGenI'},
 ('ABCG2', 'C0018099'): {'DisGeNet', 'PheGenI'},
"""

# pfx041520_unique_nodes (PathFX Version 2)
input_filename = "important_database_files/pfx041520_unique_nodes.pkl"
# Unlike the Version 1 node list, the loaded object is kept as-is (no list() conversion).
pfx041520_unique_nodes = read_pickle_file(input_filename)
# No cleaning is applied: the "cleaned" name is an alias of the same object.
cleaned_pfx041520_unique_nodes = pfx041520_unique_nodes
"""
['LAMA3',
 'rs1053129',
 'STAT1',
 'NDUFA10',
"""

# drugbankid_to_name (PathFX Version 1)
# Cleaning step: lowercase both the keys and the values.
input_filename = "important_database_files/drugbankid_to_name.pkl"
drugbankid_to_name = dict(read_pickle_file(input_filename))
cleaned_drugbankid_to_name = {
    drugbank_id.lower(): drug_name.lower()
    for drugbank_id, drug_name in drugbankid_to_name.items()
}
"""
'db03384': 'fica',
'db05712': 'azd-9684',
'db00229': 'cefotiam',
"""

# Pfx050120_sourced_phens (PathFX Version 2)
# (The bare `cleaned_Pfx050120_sourced_phens` expression that used to follow
# was removed: in the middle of a cell it displays nothing and does nothing.)
input_filename = "important_database_files/Pfx050120_sourced_phens.pkl"
Pfx050120_sourced_phens = dict(read_pickle_file(input_filename))
# No cleaning is applied: the "cleaned" name is an alias of the same dict object.
cleaned_Pfx050120_sourced_phens = Pfx050120_sourced_phens
"""
{('TP53', 'C0750986'): {'DisGeNet'},
 ('FOSL1', 'C1458155'): {'DisGeNet'},
 ('IGFBP3', 'C1168327'): {'DisGeNet'},
"""

# cuis_to_all_phens
input_filename = "important_database_files/cuis_to_all_phens.pkl"
cuis_to_all_phens = dict(read_pickle_file(input_filename))
# Step 1: lowercase every phenotype name, keeping the stored order.
cleaned_cuis_to_all_phens = {
    cui: [phenotype.lower() for phenotype in phenotypes]
    for cui, phenotypes in cuis_to_all_phens.items()
}
# Step 2: Jen said she chose the FIRST phenotype in each value list, so keep only that one.
cleaned_cuis_to_all_phens = {
    cui: phenotypes[0]
    for cui, phenotypes in cleaned_cuis_to_all_phens.items()
}
"""
'C0018099': 'gout',
'C0002895': 'anemia, sickle cell',
'C0004943': 'behcet syndrome',
"""

# Pfx050120_cui_to_phens
input_filename = "important_database_files/Pfx050120_cui_to_phens.pkl"
Pfx050120_cui_to_phens = dict(read_pickle_file(input_filename))
# Lowercase every phenotype name while KEEPING the stored order. The previous
# version round-tripped the values through set(), whose iteration order for
# strings changes between kernel restarts (hash randomization), so the "first"
# phenotype picked below was arbitrary and not reproducible. De-duplication is
# unnecessary because only the first element is kept. This now mirrors the
# Version 1 handling of cuis_to_all_phens.
# NOTE(review): assumes the pickled values are ordered sequences (lists); if
# they are sets, no meaningful "first" phenotype exists - confirm.
cleaned_Pfx050120_cui_to_phens = {key: [phen.lower() for phen in value] for key, value in Pfx050120_cui_to_phens.items()}
# Jen said she chose the FIRST phenotype in the list in the values of the dictionary
cleaned_Pfx050120_cui_to_phens = {key: value[0] for key, value in cleaned_Pfx050120_cui_to_phens.items()}
"""
'C4314198': 'hemoglobin low',
'C0019191': 'infectious canine hepatitis',
'C4231515': 'hypertension',
'C2256526': 'blood coagulation factors',
"""
# Save the cleaned objects as pickle files, one file per object, each named
# after the variable it holds.
# Fixes relative to the previous version of this block:
#   * "cleaned_all_assoc_to_nodes" now stores cleaned_all_assoc_to_nodes; it
#     used to store the raw all_assoc_to_nodes under the cleaned file name.
#   * the repeated saves of cleaned_pfx041520_unique_nodes and
#     cleaned_Pfx050120_sourced_phens were dropped (each is written once).
save_to_pickle_file(cleaned_data_files_output_folder, cleaned_drugbankid_to_name, "cleaned_drugbankid_to_name")
save_to_pickle_file(cleaned_data_files_output_folder, cleaned_pfx041520_unique_nodes, "cleaned_pfx041520_unique_nodes")
save_to_pickle_file(cleaned_data_files_output_folder, cleaned_Pfx050120_sourced_phens, "cleaned_Pfx050120_sourced_phens")
save_to_pickle_file(cleaned_data_files_output_folder, cleaned_drugbank_id_and_drug_name_source_to_gene_symbol_target_dict, "cleaned_drugbank_id_and_drug_name_source_to_gene_symbol_target_dict")
save_to_pickle_file(cleaned_data_files_output_folder, cleaned_drug_intome_targets, "cleaned_drug_intome_targets")
save_to_pickle_file(cleaned_data_files_output_folder, cleaned_all_phens_to_cuis, "cleaned_all_phens_to_cuis")
save_to_pickle_file(cleaned_data_files_output_folder, cleaned_cuis_to_all_phens, "cleaned_cuis_to_all_phens")
save_to_pickle_file(cleaned_data_files_output_folder, cleaned_drugs_to_sideEffect_cuis, "cleaned_drugs_to_sideEffect_cuis")
save_to_pickle_file(cleaned_data_files_output_folder, cleaned_merged_genes_to_cuis, "cleaned_merged_genes_to_cuis")
save_to_pickle_file(cleaned_data_files_output_folder, cleaned_merged_unique_cuis2genes, "cleaned_merged_unique_cuis2genes")
save_to_pickle_file(cleaned_data_files_output_folder, cleaned_all_assoc_to_nodes, "cleaned_all_assoc_to_nodes")
save_to_pickle_file(cleaned_data_files_output_folder, cleaned_Pfx050120_all_assoc_to_nodes, "cleaned_Pfx050120_all_assoc_to_nodes")
save_to_pickle_file(cleaned_data_files_output_folder, cleaned_Pfx050120_all_phens_to_cuis, "cleaned_Pfx050120_all_phens_to_cuis")
save_to_pickle_file(cleaned_data_files_output_folder, cleaned_Pfx050120_cui_to_phens, "cleaned_Pfx050120_cui_to_phens")
save_to_pickle_file(cleaned_data_files_output_folder, cleaned_Pfx050120_dbid2name, "cleaned_Pfx050120_dbid2name")
save_to_pickle_file(cleaned_data_files_output_folder, cleaned_Pfx050120_name2dbid, "cleaned_Pfx050120_name2dbid")
save_to_pickle_file(cleaned_data_files_output_folder, cleaned_Pfx050120_dint, "cleaned_Pfx050120_dint")
save_to_pickle_file(cleaned_data_files_output_folder, cleaned_Pfx050120_merged_genes_to_cuis, "cleaned_Pfx050120_merged_genes_to_cuis")
save_to_pickle_file(cleaned_data_files_output_folder, cleaned_Pfx050120_merged_unique_cuis2genes, "cleaned_Pfx050120_merged_unique_cuis2genes")
save_to_pickle_file(cleaned_data_files_output_folder, cleaned_pfxDB050620_dbid2name, "cleaned_pfxDB050620_dbid2name")
save_to_pickle_file(cleaned_data_files_output_folder, cleaned_pfxDB050620_name2dbid, "cleaned_pfxDB050620_name2dbid")
save_to_pickle_file(cleaned_data_files_output_folder, cleaned_pfxDB050620_dint, "cleaned_pfxDB050620_dint")
save_to_pickle_file(cleaned_data_files_output_folder, cleaned_unique_network_nodes, "cleaned_unique_network_nodes")
save_to_pickle_file(cleaned_data_files_output_folder, cleaned_sourced_phens, "cleaned_sourced_phens")

## Comparing Important Source Files between different PathFX Versions

In [6]:
# Folder that receives every version-comparison table built in the cells below.
pathfx_source_files_comparison_folder = database_analysis_output_folder + "output_pathfx_version_source_files_comparison_folder/"
check_directory_exists(pathfx_source_files_comparison_folder)

In [7]:
def invert_dictionary_with_list(d):
    """Invert a ``key -> iterable of items`` mapping into ``item -> list of keys``.

    Keys are appended in the iteration order of ``d``; a key is appended once
    per occurrence of the item in its value.
    """
    inverse = {}
    for key, items in d.items():
        for item in items:
            # Start a list the first time an item is seen, then append to it.
            inverse.setdefault(item, []).append(key)
    return inverse

def invert_simple_dictionary(d):
    """Swap keys and values of ``d``; when values repeat, the key iterated last wins."""
    inverted = {}
    for key, value in d.items():
        inverted[value] = key
    return inverted

### cleaned_drugbankid_to_name vs. cleaned_pfxDB050620_dbid2name

In [8]:
# Comparing Drugbank IDs: cleaned_drugbankid_to_name (version 1) vs. cleaned_pfxDB050620_dbid2name (version 2)

# Note: cleaned_Pfx050120_dbid2name and cleaned_pfxDB050620_dbid2name have the same exact content

# Output folder for the DrugBank-ID-to-drug-name comparison table.
drugbank_ids_to_drug_name_file_version_comparison_folder = pathfx_source_files_comparison_folder + "drugbank_ids_to_drug_name_file_version_comparison/"
check_directory_exists(drugbank_ids_to_drug_name_file_version_comparison_folder)

# One single-column frame per version, indexed by lowercased DrugBank ID.
cleaned_drugbankid_to_name_df = (
    pd.DataFrame.from_dict(cleaned_drugbankid_to_name, orient='index')
    .rename(columns={0: "Drug Name from drugbankid_to_name"})
)
cleaned_pfxDB050620_dbid2name_df = (
    pd.DataFrame.from_dict(cleaned_pfxDB050620_dbid2name, orient='index')
    .rename(columns={0: "Drug Name from pfxDB050620"})
)

# Side-by-side join on the ID index; an ID missing from one version gets NaN there.
drug_id_to_name_version_comparison_df = pd.concat(
    [cleaned_drugbankid_to_name_df, cleaned_pfxDB050620_dbid2name_df],
    axis=1,
).sort_index(ascending=True)

# NaN never compares equal, so IDs missing from either version are flagged False here.
drug_id_to_name_version_comparison_df["same drug name for both versions"] = (
    drug_id_to_name_version_comparison_df["Drug Name from drugbankid_to_name"]
    == drug_id_to_name_version_comparison_df["Drug Name from pfxDB050620"]
)

drug_id_to_name_version_comparison_df["v1 NaN?"] = drug_id_to_name_version_comparison_df["Drug Name from drugbankid_to_name"].isnull()
drug_id_to_name_version_comparison_df["v2 NaN?"] = drug_id_to_name_version_comparison_df["Drug Name from pfxDB050620"].isnull()

# Move the ID index into a regular column.
drug_id_to_name_version_comparison_df = (
    drug_id_to_name_version_comparison_df
    .reset_index()
    .rename(columns={"index": "DrugBank ID"})
)

# Save the comparison table.
save_to_csv_file(drugbank_ids_to_drug_name_file_version_comparison_folder, drug_id_to_name_version_comparison_df, "drug_id_to_name_version_comparison_table", show_print_out=False)

In [9]:
# Summary counts for the DrugBank ID / drug name comparison table.
print("\ndrug_id_to_name_version_comparison_df")
display(drug_id_to_name_version_comparison_df)
print("Number of Drug IDs from the Union in PathFX Version 1 and Version 2:", len(drug_id_to_name_version_comparison_df))
print("Number of Drug IDs present in PathFX Version 1 (cleaned_drugbankid_to_name):", len(cleaned_drugbankid_to_name))
print("Number of Drug IDs present in PathFX Version 2 (cleaned_pfxDB050620_dbid2name):", len(cleaned_pfxDB050620_dbid2name))
print("Number of Drug IDs NOT recognized in PathFX Version 1 (cleaned_drugbankid_to_name):", drug_id_to_name_version_comparison_df["v1 NaN?"].sum())
print("Number of Drug IDs NOT recognized in PathFX Version 2 (cleaned_pfxDB050620_dbid2name):", drug_id_to_name_version_comparison_df["v2 NaN?"].sum())
print("Number of Drug NAMES that are NOT the same for corresponding Drug IDs in both PathFX Versions (including non-recognized Drug IDs Only):", len(drug_id_to_name_version_comparison_df) - drug_id_to_name_version_comparison_df["same drug name for both versions"].sum())

# Keep only the IDs that both versions recognize, mismatching names first.
temp_def = drug_id_to_name_version_comparison_df[
    ~drug_id_to_name_version_comparison_df["v1 NaN?"]
    & ~drug_id_to_name_version_comparison_df["v2 NaN?"]
].sort_values(by="same drug name for both versions", ascending=True)
print("Number of Drug NAMES that are NOT the same for corresponding Drug IDs in both PathFX Versions (Recognized Drug IDs Only):", len(temp_def) - temp_def["same drug name for both versions"].sum())

# IDs known to both versions whose names differ, highest ID first.
not_same_names_for_same_db_id_df = (
    temp_def[~temp_def["same drug name for both versions"]]
    .sort_values(by="DrugBank ID", ascending=False)
)

print()
not_same_names_for_same_db_id_df


drug_id_to_name_version_comparison_df


Unnamed: 0,DrugBank ID,Drug Name from drugbankid_to_name,Drug Name from pfxDB050620,same drug name for both versions,v1 NaN?,v2 NaN?
0,db00001,lepirudin,lepirudin,True,False,False
1,db00002,cetuximab,cetuximab,True,False,False
2,db00003,dornase alfa,dornase alfa,True,False,False
3,db00004,denileukin diftitox,denileukin diftitox,True,False,False
4,db00005,etanercept,etanercept,True,False,False
...,...,...,...,...,...,...
13669,db15595,,"ebola zaire vaccine (live, attenuated)",False,True,False
13670,db15596,,aminopromazine,False,True,False
13671,db15597,,aminopentamide,False,True,False
13672,db15598,,ferric maltol,False,True,False


Number of Drug IDs from the Union in PathFX Version 1 and Version 2: 13674
Number of Drug IDs present in PathFX Version 1 (cleaned_drugbankid_to_name): 8283
Number of Drug IDs present in PathFX Version 2 (cleaned_pfxDB050620_dbid2name): 13475
Number of Drug IDs NOT recognized in PathFX Version 1 (cleaned_drugbankid_to_name): 5391
Number of Drug IDs NOT recognized in PathFX Version 2 (cleaned_pfxDB050620_dbid2name): 199
Number of Drug NAMES that are NOT the same for corresponding Drug IDs in both PathFX Versions (including non-recognized Drug IDs Only): 6418
Number of Drug NAMES that are NOT the same for corresponding Drug IDs in both PathFX Versions (Recognized Drug IDs Only): 828



Unnamed: 0,DrugBank ID,Drug Name from drugbankid_to_name,Drug Name from pfxDB050620,same drug name for both versions,v1 NaN?,v2 NaN?
11288,db13146,fluciclovine,fluciclovine (18f),False,False,False
11284,db13142,calcium glubionate,calcium glubionate anhydrous,False,False,False
10278,db12095,telotristat,telotristat ethyl,False,False,False
9797,db11604,human clostridium tetani toxoid immune globulin,tetanus immune globulin,False,False,False
9627,db11365,senna glycoside,sennosides,False,False,False
...,...,...,...,...,...,...
43,db00045,ospa lipoprotein,lyme disease vaccine (recombinant ospa),False,False,False
38,db00040,glucagon recombinant,glucagon,False,False,False
27,db00028,immune globulin human,human immunoglobulin g,False,False,False
24,db00025,antihemophilic factor (recombinant),"antihemophilic factor, human recombinant",False,False,False


In [10]:
"""
RECALL
-------
'drugbank_id_and_drug_name_source_to_gene_symbol_target_table' was constructed by the following 3 files from the 
'drugbank_data for comparing PathFX Version 2'/ folder inside the 'important_datavase_files'/ folder

'proteins_050120.tsv'             # tab-delimited text file, with drugbank to target information
'drugbank_050120.tsv'             # tab-delimited file with drugbank to name mapping
'uniprot_to_gene_name_table.txt'  # tab-delimited file mapping entrex gene IDs to gene symbol
"""
# Comments
# The information pulled from DrugBank contains more DB IDs than cleaned_Pfx050120_dbid2name and cleaned_pfxDB050620_dbid2name.
# TODO: the load/save/display steps below are disabled, so this cell currently
# only prints an empty line - either re-enable them or delete the cell.
# drugbank_id_and_drug_name_source_to_gene_symbol_target_df = read_csv_file("important_database_files/drugbank_data for comparing PathFX Version 2/drugbank_id_and_drug_name_source_to_gene_symbol_target_table.csv")
# save_to_csv_file(drugbank_ids_to_drug_name_file_version_comparison_folder, drugbank_id_and_drug_name_source_to_gene_symbol_target_df, "drugbank_id_and_drug_name_source_to_gene_symbol_target_table", show_print_out=False)
# print("DrugBank Info Table 050120")
# display(drugbank_id_and_drug_name_source_to_gene_symbol_target_df)
print("")




### cleaned_drug_intome_targets.pkl (version 1) vs. cleaned_pfxDB050620_dint.pkl (version 2) and self-drugbank v2

In [None]:
# Comparing Drugs and their targets from drugbank: cleaned_drug_intome_targets.pkl (version 1) vs. cleaned_pfxDB050620_dint.pkl (version 2)
# there should be a symbolic link from "drugs_to_drugbank_targets.pkl" to each file in their respective branches

# cleaned_Pfx050120_dint is seen in the version 2 folder but cleaned_pfxDB050620_dbid2name was used

# cleaned_Pfx050120_dint was NOT analyzed for any comparisons for content

# Output folder for comparing drugs and their DrugBank targets, Version 1.0 vs Version 2.0
drugs_to_drugbank_interactome_targets_file_version_comparison_folder = pathfx_source_files_comparison_folder + "drugs_to_drugbank_interactome_targets_file_version_comparison/"
check_directory_exists(drugs_to_drugbank_interactome_targets_file_version_comparison_folder)

# ------- Drug Dataframe -------

# Every drug key known to either version. Sorted so the row order of the saved
# CSV is the same on every run (a bare set has no stable order); the keys are
# lowercased strings, so they are always sortable.
all_drugs_lst = sorted(set(cleaned_drug_intome_targets) | set(cleaned_pfxDB050620_dint))

# Collect one record per drug and build the DataFrame once at the end; growing
# a DataFrame row by row with .loc re-allocates it on every insertion.
drug_target_records = []
for drug in all_drugs_lst:
    # A drug missing from one version simply has no targets there.
    version_one_drug_targets = cleaned_drug_intome_targets.get(drug, [])
    version_two_drug_targets = cleaned_pfxDB050620_dint.get(drug, [])
    (set_union_x_and_y, set_intersection_x_and_y, x_set_difference, y_set_difference) = set_comparison_info(set(version_one_drug_targets), set(version_two_drug_targets))
    drug_target_records.append({
        "Drug": drug,
        "v1_targets": version_one_drug_targets,
        "v2_targets": version_two_drug_targets,
        "drug_targets_version_union": list(set_union_x_and_y),
        "drug_targets_version_intersection": list(set_intersection_x_and_y),
        "drug_targets_version_set_difference_v1": list(x_set_difference),
        "drug_targets_version_set_difference_v2": list(y_set_difference),
    })

drug_target_list_columns = [
    "v1_targets",
    "v2_targets",
    "drug_targets_version_union",
    "drug_targets_version_intersection",
    "drug_targets_version_set_difference_v1",
    "drug_targets_version_set_difference_v2",
]
# Passing `columns` keeps the expected layout even when there are no drugs at all.
all_drugs_for_targets_pathfx_version_df = pd.DataFrame(drug_target_records, columns=["Drug"] + drug_target_list_columns)

# One "<column>_size" count per list-valued column, appended in the same order.
for list_column in drug_target_list_columns:
    all_drugs_for_targets_pathfx_version_df[list_column + "_size"] = all_drugs_for_targets_pathfx_version_df[list_column].map(len)

save_to_csv_file(drugs_to_drugbank_interactome_targets_file_version_comparison_folder, all_drugs_for_targets_pathfx_version_df, "all_drugs_for_targets_pathfx_version_table", show_print_out=False)

In [None]:
# Show the per-drug target comparison table (rendered as the cell's last expression).
all_drugs_for_targets_pathfx_version_df

In [None]:
# cleaned_Pfx050120_dint was NOT analyzed for any comparisons in terms of content

In [None]:
# Summary statistics for every "*_size" column of the per-drug table,
# transposed so that each size column becomes one row.
summary_functions = ['count', 'mean', 'std', 'var', 'max', 'min']
size_columns = ["v1_targets_size", "v2_targets_size", "drug_targets_version_union_size", "drug_targets_version_intersection_size",
                "drug_targets_version_set_difference_v1_size", "drug_targets_version_set_difference_v2_size"]
all_drugs_for_targets_pathfx_version_stats_df = all_drugs_for_targets_pathfx_version_df[size_columns].agg(summary_functions).T

# The per-drug table has a row for every drug of either version, so drugs
# absent from a version contribute empty target lists there. The two
# per-version rows are therefore recomputed from that version's own keys only.

# Version 1 row
lst = list(map(len, cleaned_drug_intome_targets.values()))
temp_df = pd.DataFrame(lst).agg(summary_functions).T
all_drugs_for_targets_pathfx_version_stats_df.loc["v1_targets_size"] = temp_df.iloc[0]

# Version 2 row
lst = list(map(len, cleaned_pfxDB050620_dint.values()))
temp_df = pd.DataFrame(lst).agg(summary_functions).T
all_drugs_for_targets_pathfx_version_stats_df.loc["v2_targets_size"] = temp_df.iloc[0]

# Replace the column-derived row labels with descriptive ones for the saved table.
stats_row_labels = {
    'v1_targets_size': 'Distribution of Drug Target Gene List Size for Drugs in Version 1',
    'v2_targets_size': 'Distribution of Drug Target Gene List Size for Drugs in Version 2',
    'drug_targets_version_union_size': 'Distribution of Drug Target Gene List Size for the Union of Drug Target genes in Both PathFX Versions',
    'drug_targets_version_intersection_size': 'Distribution of Drug Target Gene List Size for the Intersection of Drug Target genes in Both PathFX Versions',
    'drug_targets_version_set_difference_v1_size': 'Distribution of Drug Target Gene List Size for PathFX Version 1 Exclusive Drug Target genes',
    'drug_targets_version_set_difference_v2_size': 'Distribution of Drug Target Gene List Size for PathFX Version 2 Exclusive Drug Target genes',
}
all_drugs_for_targets_pathfx_version_stats_df = all_drugs_for_targets_pathfx_version_stats_df.rename(index=stats_row_labels)

save_to_csv_file(drugs_to_drugbank_interactome_targets_file_version_comparison_folder, all_drugs_for_targets_pathfx_version_stats_df, "all_drugs_for_targets_pathfx_version_stats_table", input_index=True, show_print_out=False)

In [None]:
# Simple comparison of the drug -> target dictionaries related to PathFX V2.
# Each "Number of ..." figure is the number of keys in the dictionary.
# A key counts as "shared" when it is present in both dictionaries and the two
# stored values compare equal with ==.

print("Number of Drug-Target pairs in drugbank_id_and_drug_name_source_to_gene_symbol_target_dict: ", len(drugbank_id_and_drug_name_source_to_gene_symbol_target_dict), sep="")
print("Number of Drug-Target pairs in PathFX Version 1 (cleaned_drug_intome_targets): ", len(cleaned_drug_intome_targets), sep="")
print("Number of Drug-Target pairs in PathFX Version 2 (cleaned_pfxDB050620_dint): ", len(cleaned_pfxDB050620_dint), sep="")
print("Number of Drug-Target pairs in PathFX Version 2 (cleaned_Pfx050120_dint): ", len(cleaned_Pfx050120_dint), sep="")

# cleaned self-DrugBank v2 vs. cleaned_pfxDB050620_dint
shared_keys = [
    drug
    for drug, targets in cleaned_drugbank_id_and_drug_name_source_to_gene_symbol_target_dict.items()
    if drug in cleaned_pfxDB050620_dint and targets == cleaned_pfxDB050620_dint[drug]
]
print("\nNumber of shared Drug-Target pairs between cleaned self-DrugBank v2 and cleaned_pfxDB050620_dint: ", len(shared_keys), sep="")

# cleaned self-DrugBank v2 vs. cleaned_Pfx050120_dint
shared_keys = [
    drug
    for drug, targets in cleaned_drugbank_id_and_drug_name_source_to_gene_symbol_target_dict.items()
    if drug in cleaned_Pfx050120_dint and targets == cleaned_Pfx050120_dint[drug]
]
print("\nNumber of shared Drug-Target pairs between cleaned self-DrugBank v2 and cleaned_Pfx050120_dint: ", len(shared_keys), sep="")

# cleaned_Pfx050120_dint vs. cleaned_pfxDB050620_dint
shared_keys = [
    drug
    for drug, targets in cleaned_Pfx050120_dint.items()
    if drug in cleaned_pfxDB050620_dint and cleaned_pfxDB050620_dint[drug] == targets
]
print("\nNumber of shared Drug-Target pairs between Pfx050120_dint and cleaned_pfxDB050620_dint: ", len(shared_keys), sep="")

# cleaned_drug_intome_targets vs. cleaned_pfxDB050620_dint
shared_keys = [
    drug
    for drug, targets in cleaned_drug_intome_targets.items()
    if drug in cleaned_pfxDB050620_dint and cleaned_pfxDB050620_dint[drug] == targets
]
print("\nNumber of shared Drug-Target pairs between cleaned_drug_intome_targets and cleaned_pfxDB050620_dint: ", len(shared_keys), sep="")

# cleaned_drug_intome_targets vs. cleaned_Pfx050120_dint
shared_keys = [
    drug
    for drug, targets in cleaned_drug_intome_targets.items()
    if drug in cleaned_Pfx050120_dint and cleaned_Pfx050120_dint[drug] == targets
]
print("\nNumber of shared Drug-Target pairs between cleaned_drug_intome_targets and cleaned_Pfx050120_dint: ", len(shared_keys), sep="")

print()
all_drugs_for_targets_pathfx_version_stats_df

### cleaned_all_assoc_to_nodes.pkl (version 1) vs. cleaned_Pfx050120_all_assoc_to_nodes.pkl (version 2)

In [None]:
# Compare the association -> nodes dictionaries between PathFX versions:
# cleaned_all_assoc_to_nodes.pkl (version 1) vs. cleaned_Pfx050120_all_assoc_to_nodes.pkl (version 2)

# Output folder for the version 1.0 vs. version 2.0 assoc-to-nodes comparison.
assoc_to_nodes_comparison_file_version_comparison_folder = pathfx_source_files_comparison_folder + "assoc_to_nodes_comparison_file_version_comparison/"
check_directory_exists(assoc_to_nodes_comparison_file_version_comparison_folder)

# ------- Assoc Dataframe -------

# Every association (phenotype) that is a key in at least one of the two dictionaries.
all_assoc_lst = []
all_assoc_lst.extend(cleaned_all_assoc_to_nodes.keys())
all_assoc_lst.extend(cleaned_Pfx050120_all_assoc_to_nodes.keys())
all_assoc_lst = list(set(all_assoc_lst))

# Columns that hold a collection of genes; each one also gets a "<name>_size" column below.
assoc_gene_list_columns = ["v1_genes", "v2_genes", "genes_version_union", "genes_version_intersection",
                           "genes_version_set_difference_v1", "genes_version_set_difference_v2"]

# Collect one record per phenotype and build the frame once at the end:
# assigning rows one at a time with .loc re-allocates the frame on every
# insert, which is quadratic in the number of phenotypes.
assoc_gene_records = []
for phenotype in all_assoc_lst:
    # A phenotype missing from one version is treated as having no genes there.
    version_one_genes = cleaned_all_assoc_to_nodes.get(phenotype, [])
    version_two_genes = cleaned_Pfx050120_all_assoc_to_nodes.get(phenotype, [])
    (set_union_x_and_y, set_intersection_x_and_y, x_set_difference, y_set_difference) = set_comparison_info(set(version_one_genes), set(version_two_genes))
    assoc_gene_records.append({
        "Phenotype": phenotype,
        "v1_genes": version_one_genes,
        "v2_genes": version_two_genes,
        "genes_version_union": list(set_union_x_and_y),
        "genes_version_intersection": list(set_intersection_x_and_y),
        "genes_version_set_difference_v1": list(x_set_difference),
        "genes_version_set_difference_v2": list(y_set_difference),
    })

# Passing `columns` keeps the expected layout even when there are no phenotypes at all.
all_assoc_for_nodes_df = pd.DataFrame(assoc_gene_records, columns=["Phenotype"] + assoc_gene_list_columns)

# Number of entries in every list-valued column.
for list_column in assoc_gene_list_columns:
    all_assoc_for_nodes_df[list_column + "_size"] = all_assoc_for_nodes_df[list_column].map(len)

# Give the empty-string phenotype key a readable label; the relabelling is
# done on the index so only exact '' keys are affected.
all_assoc_for_nodes_df = (
    all_assoc_for_nodes_df
    .set_index("Phenotype")
    .rename(index={'':'Unknown/Missing Phenotype(s)'})
    .reset_index()
)

save_to_csv_file(assoc_to_nodes_comparison_file_version_comparison_folder, all_assoc_for_nodes_df, "all_assoc_for_nodes_table", show_print_out=False)

In [None]:
# Preview the first ten rows of the per-phenotype gene comparison table.
all_assoc_for_nodes_df.head(10)

In [None]:
# Summary statistics for every "*_size" column of the per-phenotype table,
# transposed so that each size column becomes one row.
summary_functions = ['count', 'mean', 'std', 'var', 'max', 'min']
size_columns = ["v1_genes_size", "v2_genes_size", "genes_version_union_size", "genes_version_intersection_size",
                "genes_version_set_difference_v1_size", "genes_version_set_difference_v2_size"]
all_assoc_for_nodes_stats_df = all_assoc_for_nodes_df[size_columns].agg(summary_functions).T

# The per-phenotype table has a row for every phenotype of either version, so
# phenotypes absent from a version contribute empty gene lists there. The two
# per-version rows are therefore recomputed from that version's own keys only.

# Version 1 row
lst = list(map(len, cleaned_all_assoc_to_nodes.values()))
temp_df = pd.DataFrame(lst).agg(summary_functions).T
all_assoc_for_nodes_stats_df.loc["v1_genes_size"] = temp_df.iloc[0]

# Version 2 row
lst = list(map(len, cleaned_Pfx050120_all_assoc_to_nodes.values()))
temp_df = pd.DataFrame(lst).agg(summary_functions).T
all_assoc_for_nodes_stats_df.loc["v2_genes_size"] = temp_df.iloc[0]

# Replace the column-derived row labels with descriptive ones for the saved table.
stats_row_labels = {
    'v1_genes_size': 'Distribution of Gene List Size for Phenotypes in Version 1',
    'v2_genes_size': 'Distribution of Gene List Size for Phenotypes in Version 2',
    'genes_version_union_size': 'Distribution of Gene List Size for the Union of Genes in Both PathFX Versions',
    'genes_version_intersection_size': 'Distribution of Gene List Size for the Intersection of Genes in Both PathFX Versions',
    'genes_version_set_difference_v1_size': 'Distribution of Gene List Size for PathFX Version 1 Exclusive Genes',
    'genes_version_set_difference_v2_size': 'Distribution of Gene List Size for PathFX Version 2 Exclusive Genes',
}
all_assoc_for_nodes_stats_df = all_assoc_for_nodes_stats_df.rename(index=stats_row_labels)

save_to_csv_file(assoc_to_nodes_comparison_file_version_comparison_folder, all_assoc_for_nodes_stats_df, "all_assoc_for_nodes_stats_table", input_index=True, show_print_out=False)

In [None]:
# Key counts for the assoc -> nodes comparison. Each dictionary figure is the
# number of keys it holds; the union figure is the row count of the table.
union_assoc_count = len(all_assoc_for_nodes_df)
version_one_assoc_count = len(cleaned_all_assoc_to_nodes)
version_two_assoc_count = len(cleaned_Pfx050120_all_assoc_to_nodes)
print("Number of Associations (or Phenotypes) from the Union in PathFX Version 1 and Version 2:", union_assoc_count)
print("Number of assoc-node pairs in PathFX Version 1 (cleaned_all_assoc_to_nodes):", version_one_assoc_count)
print("Number of assoc-node pairs in PathFX Version 2 (cleaned_Pfx050120_all_assoc_to_nodes):", version_two_assoc_count)

# Keys present in both versions whose stored values compare equal with ==.
shared_keys = [
    assoc
    for assoc, nodes in cleaned_all_assoc_to_nodes.items()
    if assoc in cleaned_Pfx050120_all_assoc_to_nodes and nodes == cleaned_Pfx050120_all_assoc_to_nodes[assoc]
]
print("\nNumber of shared assoc-node pairs between cleaned_all_assoc_to_nodes and cleaned_Pfx050120_all_assoc_to_nodes: ", len(shared_keys), sep="")

print()
all_assoc_for_nodes_stats_df

### cleaned_merged_unique_cuis2genes.pkl (version 1) vs. cleaned_Pfx050120_merged_unique_cuis2genes.pkl (version 2)

In [None]:
# Compare the CUI -> interactome gene list dictionaries between PathFX versions:
# cleaned_merged_unique_cuis2genes.pkl (version 1) vs. cleaned_Pfx050120_merged_unique_cuis2genes.pkl (version 2)

# Output folder for the version 1.0 vs. version 2.0 CUI-to-genes comparison.
cui_to_interactome_genes_file_version_comparison_folder = pathfx_source_files_comparison_folder + "cui_to_interactome_genes_file_version_comparison/"
check_directory_exists(cui_to_interactome_genes_file_version_comparison_folder)

# Note: the dictionaries cleaned_merged_genes_to_cuis (version 1) and cleaned_Pfx050120_merged_genes_to_cuis (version 2) are NOT perfectly matched in terms of content
# with the dictionaries merged_unique_cuis2genes.pkl (version 1) and Pfx050120_merged_unique_cuis2genes.pkl (version 2). This can be checked using the invert_dict function.
# The invert_dictionary_with_list function was created solely to help construct the dataframes and to create inverted dictionaries for
# version 1 and version 2 respectively. The two inverted dictionaries are not used further in this cell.

inverted_cleaned_merged_unique_cuis2genes = invert_dictionary_with_list(cleaned_merged_unique_cuis2genes)
inverted_cleaned_Pfx050120_merged_unique_cuis2genes = invert_dictionary_with_list(cleaned_Pfx050120_merged_unique_cuis2genes)


# ------- CUI Dataframe -------

# Every CUI that is a key in at least one of the two dictionaries.
all_cuis_lst = []
all_cuis_lst.extend(cleaned_merged_unique_cuis2genes.keys())
all_cuis_lst.extend(cleaned_Pfx050120_merged_unique_cuis2genes.keys())
all_cuis_lst = list(set(all_cuis_lst))

# Columns that hold a collection of genes; each one also gets a "<name>_size" column below.
cui_gene_list_columns = ["v1_genes", "v2_genes", "genes_version_union", "genes_version_intersection",
                         "genes_version_set_difference_v1", "genes_version_set_difference_v2"]

# Collect one record per CUI and build the frame once at the end: assigning
# rows one at a time with .loc re-allocates the frame on every insert, which
# is quadratic in the number of CUIs.
cui_gene_records = []
for cui in all_cuis_lst:
    # A CUI missing from one version is treated as having no genes there.
    version_one_genes = cleaned_merged_unique_cuis2genes.get(cui, [])
    version_two_genes = cleaned_Pfx050120_merged_unique_cuis2genes.get(cui, [])
    (set_union_x_and_y, set_intersection_x_and_y, x_set_difference, y_set_difference) = set_comparison_info(set(version_one_genes), set(version_two_genes))
    cui_gene_records.append({
        "CUI": cui,
        "v1_genes": version_one_genes,
        "v2_genes": version_two_genes,
        "genes_version_union": list(set_union_x_and_y),
        "genes_version_intersection": list(set_intersection_x_and_y),
        "genes_version_set_difference_v1": list(x_set_difference),
        "genes_version_set_difference_v2": list(y_set_difference),
    })

# Passing `columns` keeps the expected layout even when there are no CUIs at all.
all_cuis_to_genes_pathfx_version_df = pd.DataFrame(cui_gene_records, columns=["CUI"] + cui_gene_list_columns)

# Number of entries in every list-valued column.
for list_column in cui_gene_list_columns:
    all_cuis_to_genes_pathfx_version_df[list_column + "_size"] = all_cuis_to_genes_pathfx_version_df[list_column].map(len)

# Give the empty-string CUI key a readable label; the relabelling is done on
# the index so only exact '' keys are affected.
all_cuis_to_genes_pathfx_version_df = (
    all_cuis_to_genes_pathfx_version_df
    .set_index("CUI")
    .rename(index={'':'Unknown/Missing CUI(s)'})
    .reset_index()
)

save_to_csv_file(cui_to_interactome_genes_file_version_comparison_folder, all_cuis_to_genes_pathfx_version_df, "all_cuis_to_genes_pathfx_version_table", show_print_out=False)

In [None]:
# Preview the first ten rows of the per-CUI gene comparison table.
all_cuis_to_genes_pathfx_version_df.head(10)

In [None]:
# Summary statistics for every "*_size" column of the per-CUI table,
# transposed so that each size column becomes one row.
summary_functions = ['count', 'mean', 'std', 'var', 'max', 'min']
size_columns = ["v1_genes_size", "v2_genes_size", "genes_version_union_size", "genes_version_intersection_size",
                "genes_version_set_difference_v1_size", "genes_version_set_difference_v2_size"]
all_cuis_to_genes_pathfx_version_stats_df = all_cuis_to_genes_pathfx_version_df[size_columns].agg(summary_functions).T

# The per-CUI table has a row for every CUI of either version, so CUIs absent
# from a version contribute empty gene lists there. The two per-version rows
# are therefore recomputed from that version's own keys only.

# Version 1 row
lst = list(map(len, cleaned_merged_unique_cuis2genes.values()))
temp_df = pd.DataFrame(lst).agg(summary_functions).T
all_cuis_to_genes_pathfx_version_stats_df.loc["v1_genes_size"] = temp_df.iloc[0]

# Version 2 row
lst = list(map(len, cleaned_Pfx050120_merged_unique_cuis2genes.values()))
temp_df = pd.DataFrame(lst).agg(summary_functions).T
all_cuis_to_genes_pathfx_version_stats_df.loc["v2_genes_size"] = temp_df.iloc[0]

# Replace the column-derived row labels with descriptive ones for the saved table.
stats_row_labels = {
    'v1_genes_size': 'Distribution of Gene List Size for CUIs in Version 1',
    'v2_genes_size': 'Distribution of Gene List Size for CUIs in Version 2',
    'genes_version_union_size': 'Distribution of Gene List Size for Union of CUIs in Both PathFX Versions',
    'genes_version_intersection_size': 'Distribution of Gene List Size for Intersection of CUIs in Both PathFX Versions',
    'genes_version_set_difference_v1_size': 'Distribution of Gene List Size for PathFX Version 1 Exclusive Genes',
    'genes_version_set_difference_v2_size': 'Distribution of Gene List Size for PathFX Version 2 Exclusive Genes',
}
all_cuis_to_genes_pathfx_version_stats_df = all_cuis_to_genes_pathfx_version_stats_df.rename(index=stats_row_labels)

save_to_csv_file(cui_to_interactome_genes_file_version_comparison_folder, all_cuis_to_genes_pathfx_version_stats_df, "all_cuis_to_genes_pathfx_version_stats_table", input_index=True, show_print_out=False)

In [None]:
# Key counts for the CUI -> genes comparison. Each dictionary figure is the
# number of keys it holds; the union figure is the row count of the table.
union_cui_count = len(all_cuis_to_genes_pathfx_version_df)
version_one_cui_count = len(cleaned_merged_unique_cuis2genes)
version_two_cui_count = len(cleaned_Pfx050120_merged_unique_cuis2genes)
print("Number of CUIs from the Union in PathFX Version 1 and Version 2:", union_cui_count)
print("Number of CUI-Gene pairs in PathFX Version 1 (cleaned_merged_unique_cuis2genes):", version_one_cui_count)
print("Number of CUI-Gene pairs in PathFX Version 2 (cleaned_Pfx050120_merged_unique_cuis2genes):", version_two_cui_count)

# Keys present in both versions whose stored values compare equal with ==.
shared_keys = [
    cui
    for cui, genes in cleaned_merged_unique_cuis2genes.items()
    if cui in cleaned_Pfx050120_merged_unique_cuis2genes and genes == cleaned_Pfx050120_merged_unique_cuis2genes[cui]
]
print("\nNumber of shared CUI-Gene pairs between cleaned_merged_unique_cuis2genes and cleaned_Pfx050120_merged_unique_cuis2genes: ", len(shared_keys), sep="")

print()
all_cuis_to_genes_pathfx_version_stats_df

### cleaned_all_phens_to_cuis.pkl (version 1) vs. cleaned_Pfx050120_all_phens_to_cuis.pkl (version 2)

In [None]:
# Compare the phenotype -> CUIs dictionaries between PathFX versions:
# cleaned_all_phens_to_cuis.pkl (version 1) vs. cleaned_Pfx050120_all_phens_to_cuis.pkl (version 2)

# Output folder for the version 1.0 vs. version 2.0 phenotype-to-CUIs comparison.
phenotype_to_cuis_version_comparison_output_folder = pathfx_source_files_comparison_folder + "phenotype_to_cuis_version_comparison/"
check_directory_exists(phenotype_to_cuis_version_comparison_output_folder)

# ------- Phenotype Dataframe -------

# Every phenotype that is a key in at least one of the two dictionaries.
all_phenotypes_lst = []
all_phenotypes_lst.extend(cleaned_all_phens_to_cuis.keys())
all_phenotypes_lst.extend(cleaned_Pfx050120_all_phens_to_cuis.keys())
all_phenotypes_lst = list(set(all_phenotypes_lst))

# Collect one record per phenotype and build the frame once at the end:
# assigning rows one at a time with .loc re-allocates the frame on every
# insert, which is quadratic in the number of phenotypes.
phenotype_cui_records = []
for phenotype in all_phenotypes_lst:
    # NaN marks a phenotype that is missing from that version.
    version_one_cuis = cleaned_all_phens_to_cuis.get(phenotype, np.nan)
    version_two_cuis = cleaned_Pfx050120_all_phens_to_cuis.get(phenotype, np.nan)

    # NaN never compares equal, so a phenotype missing from either version is always "No".
    same_cui = "Yes" if version_one_cuis == version_two_cuis else "No"

    phenotype_cui_records.append({
        "phenotype": phenotype,
        "v1_cuis": version_one_cuis,
        "v2_cuis": version_two_cuis,
        "Same CUI?": same_cui,
    })

# Passing `columns` keeps the expected layout even when there are no phenotypes at all.
all_phenotypes_to_cuis_pathfx_version_df = pd.DataFrame(phenotype_cui_records, columns=["phenotype", "v1_cuis", "v2_cuis", "Same CUI?"])

# Flag the phenotypes that are missing from each version.
all_phenotypes_to_cuis_pathfx_version_df["v1 NaN?"] = all_phenotypes_to_cuis_pathfx_version_df["v1_cuis"].isnull()
all_phenotypes_to_cuis_pathfx_version_df["v2 NaN?"] = all_phenotypes_to_cuis_pathfx_version_df["v2_cuis"].isnull()

save_to_csv_file(phenotype_to_cuis_version_comparison_output_folder, all_phenotypes_to_cuis_pathfx_version_df, "all_phenotypes_to_cuis_pathfx_version_table", show_print_out=False)

In [None]:
# Key counts for the phenotype -> CUIs comparison. Each dictionary figure is
# the number of keys it holds; the union figure is the row count of the table.
union_phenotype_count = len(all_phenotypes_to_cuis_pathfx_version_df)
version_one_phenotype_count = len(cleaned_all_phens_to_cuis)
version_two_phenotype_count = len(cleaned_Pfx050120_all_phens_to_cuis)
print("Number of Phenotypes from the Union in PathFX Version 1 and Version 2:", union_phenotype_count)
print("Number of Phenotypes present in PathFX Version 1 (cleaned_all_phens_to_cuis):", version_one_phenotype_count)
print("Number of Phenotypes present in PathFX Version 2 (cleaned_Pfx050120_all_phens_to_cuis):", version_two_phenotype_count)

# Keep only the rows whose phenotype is present in both versions
# (temp_def is read again by the next cell).
present_in_both_versions = ~all_phenotypes_to_cuis_pathfx_version_df["v1 NaN?"] & ~all_phenotypes_to_cuis_pathfx_version_df["v2 NaN?"]
temp_def = all_phenotypes_to_cuis_pathfx_version_df[present_in_both_versions]
matching_cui_count = (temp_def["Same CUI?"] == "Yes").sum()
print("Number of CUIs that are NOT the same for corresponding Phenotypes in both PathFX Versions (Recognized Phenotypes Only):", len(temp_def) - matching_cui_count)

# Keys present in both versions whose stored values compare equal with ==.
shared_keys = [
    phenotype_name
    for phenotype_name, cuis in cleaned_all_phens_to_cuis.items()
    if phenotype_name in cleaned_Pfx050120_all_phens_to_cuis and cuis == cleaned_Pfx050120_all_phens_to_cuis[phenotype_name]
]
print("\nNumber of shared Phenotype-CUI pairs between cleaned_all_phens_to_cuis and cleaned_Pfx050120_all_phens_to_cuis: ", len(shared_keys), sep="")

print()
all_phenotypes_to_cuis_pathfx_version_df

In [None]:
# Rows of temp_def (as left by the previous cell) whose "Same CUI?" flag is "No".
temp_def[temp_def["Same CUI?"]=="No"]

### cleaned_cuis_to_all_phens.pkl (version 1) vs. cleaned_Pfx050120_cui_to_phens.pkl (version 2)

In [None]:
# Compare the CUI -> phenotype dictionaries between PathFX versions:
# cleaned_cuis_to_all_phens.pkl (version 1) vs. cleaned_Pfx050120_cui_to_phens.pkl (version 2)

# Output folder for the version 1.0 vs. version 2.0 CUI-to-phenotype comparison.
cuis_to_phenotypes_version_comparison_output_folder = pathfx_source_files_comparison_folder + "cuis_to_phenotypes_version_comparison/"
check_directory_exists(cuis_to_phenotypes_version_comparison_output_folder)

# ------- CUI Dataframe -------

# One frame per version, indexed by the dictionary keys; column 0 is given a
# version-specific name.
v1_df = pd.DataFrame.from_dict(cleaned_cuis_to_all_phens, orient='index').rename(columns={0: "Phenotype Version 1"})
v2_df = pd.DataFrame.from_dict(cleaned_Pfx050120_cui_to_phens, orient='index').rename(columns={0: "Phenotype Version 2"})

# Put the two versions side by side, aligned on the index.
all_cuis_to_phenotype_pathfx_version_df = pd.concat([v1_df, v2_df], axis=1)

# Flag rows where both versions agree, and rows where a version has no value.
phenotype_version_one = all_cuis_to_phenotype_pathfx_version_df["Phenotype Version 1"]
phenotype_version_two = all_cuis_to_phenotype_pathfx_version_df["Phenotype Version 2"]
all_cuis_to_phenotype_pathfx_version_df["Same Phenotype?"] = phenotype_version_one == phenotype_version_two
all_cuis_to_phenotype_pathfx_version_df["v1 NaN?"] = phenotype_version_one.isna()
all_cuis_to_phenotype_pathfx_version_df["v2 NaN?"] = phenotype_version_two.isna()

save_to_csv_file(cuis_to_phenotypes_version_comparison_output_folder, all_cuis_to_phenotype_pathfx_version_df, "all_cuis_to_phenotype_pathfx_version_table", input_index=True, show_print_out=False)

In [None]:
# Key counts for the CUI -> phenotype comparison. Each dictionary figure is
# the number of keys it holds; the union figure is the row count of the table.
union_cui_count = len(all_cuis_to_phenotype_pathfx_version_df)
version_one_cui_count = len(cleaned_cuis_to_all_phens)
version_two_cui_count = len(cleaned_Pfx050120_cui_to_phens)
print("Number of CUIs present from the Union of Both PathFX Version 1 and Version 2", union_cui_count)
print("Number of CUI-Phenotype pairs in PathFX Version 1 (cleaned_cuis_to_all_phens):", version_one_cui_count)
print("Number of CUI-Phenotype pairs in PathFX Version 2 (cleaned_Pfx050120_cui_to_phens):", version_two_cui_count)

# Keep only the rows that have a value in both versions.
present_in_both_versions = ~all_cuis_to_phenotype_pathfx_version_df["v1 NaN?"] & ~all_cuis_to_phenotype_pathfx_version_df["v2 NaN?"]
temp_def = all_cuis_to_phenotype_pathfx_version_df[present_in_both_versions]
matching_phenotype_count = (temp_def["Same Phenotype?"] == True).sum()
print("Number of CUIs that are NOT the same for corresponding Phenotypes in both PathFX Versions (Recognized Phenotypes Only):", len(temp_def) - matching_phenotype_count)

# Keys present in both versions whose stored values compare equal with ==.
shared_keys = [
    cui
    for cui, phenotypes in cleaned_cuis_to_all_phens.items()
    if cui in cleaned_Pfx050120_cui_to_phens and phenotypes == cleaned_Pfx050120_cui_to_phens[cui]
]
print("\nNumber of shared CUI-Phenotype pairs between cleaned_cuis_to_all_phens and cleaned_Pfx050120_cui_to_phens: ", len(shared_keys), sep="")

print()
all_cuis_to_phenotype_pathfx_version_df

### cleaned_merged_genes_to_cuis.pkl (version 1) vs. cleaned_Pfx050120_merged_genes_to_cuis.pkl (version 2)

In [None]:
# Compare the gene -> CUIs dictionaries between PathFX versions:
# cleaned_merged_genes_to_cuis.pkl (version 1) vs. cleaned_Pfx050120_merged_genes_to_cuis.pkl (version 2)

# Output folder for the version 1.0 vs. version 2.0 gene-to-CUIs comparison.
genes_cui_version_comparison_output_folder = pathfx_source_files_comparison_folder + "genes_cui_version_comparison/"
check_directory_exists(genes_cui_version_comparison_output_folder)

# ------- Gene Dataframe -------

# Every gene that is a key in at least one of the two dictionaries.
all_genes_lst = list(set([*cleaned_merged_genes_to_cuis.keys(), *cleaned_Pfx050120_merged_genes_to_cuis.keys()]))

# One row per gene; a gene missing from a version maps to NaN in that version's column.
all_genes_to_cui_df = pd.DataFrame(all_genes_lst).rename(columns={0:"Gene"})
all_genes_to_cui_df["Version 1 CUIs"] = all_genes_to_cui_df["Gene"].map(cleaned_merged_genes_to_cuis)
all_genes_to_cui_df["Version 2 CUIs"] = all_genes_to_cui_df["Gene"].map(cleaned_Pfx050120_merged_genes_to_cuis)

def replace_null(x):
    """Return ``x`` when it is a collection of values, otherwise an empty list.

    Used to turn the NaN that ``Series.map`` produces for a missing key into
    an empty list so that later ``set(...)`` / ``len(...)`` calls work.

    Lists (including list subclasses), tuples, sets and frozensets are
    returned unchanged; an exact ``type(x) == list`` check would silently
    replace the non-list ones with ``[]`` and drop their contents. Anything
    else (NaN, None, strings, numbers) becomes ``[]``.
    """
    if isinstance(x, (list, tuple, set, frozenset)):
        return x
    return []

def get_set_columns(x):
    """Add the sorted set comparisons of the two version CUI lists to a row.

    Parameters
    ----------
    x : mapping-like row (e.g. a row passed by ``DataFrame.apply(axis=1)``)
        Must provide the iterables ``x["Version 1 CUIs"]`` and ``x["Version 2 CUIs"]``.

    Returns
    -------
    The same ``x``, with four sorted lists added:
    ``cui_version_intersection``, ``cui_version_union``,
    ``cui_version_set_difference_v1`` (only in version 1) and
    ``cui_version_set_difference_v2`` (only in version 2).
    """
    version_one_cuis = set(x["Version 1 CUIs"])
    version_two_cuis = set(x["Version 2 CUIs"])

    x["cui_version_intersection"] = sorted(version_one_cuis & version_two_cuis)
    x["cui_version_union"] = sorted(version_one_cuis | version_two_cuis)
    # Removing the other version's set is the same as removing the intersection
    x["cui_version_set_difference_v1"] = sorted(version_one_cuis - version_two_cuis)
    x["cui_version_set_difference_v2"] = sorted(version_two_cuis - version_one_cuis)
    return x

# Normalise every cell that is not a list (e.g. a gene absent from that version) to an empty list
for version_column in ('Version 1 CUIs', 'Version 2 CUIs'):
    all_genes_to_cui_df[version_column] = all_genes_to_cui_df[version_column].apply(replace_null)

# Row-wise intersection / union / set differences of the two CUI lists
all_genes_to_cui_df = all_genes_to_cui_df.apply(get_set_columns, axis=1)

# Length of each list column, stored in a matching "<name>_size" column (insertion order = column order)
size_column_to_list_column = {
    "cui_version_1_size": "Version 1 CUIs",
    "cui_version_2_size": "Version 2 CUIs",
    "cui_version_intersection_size": "cui_version_intersection",
    "cui_version_union_size": "cui_version_union",
    "cui_version_set_difference_v1_size": "cui_version_set_difference_v1",
    "cui_version_set_difference_v2_size": "cui_version_set_difference_v2",
}
for size_column, list_column in size_column_to_list_column.items():
    all_genes_to_cui_df[size_column] = all_genes_to_cui_df[list_column].map(len)

save_to_csv_file(genes_cui_version_comparison_output_folder, all_genes_to_cui_df, "all_genes_to_cui_table", show_print_out=False)

In [None]:
all_genes_to_cui_df

In [None]:
# Summary statistics (one row per size column) of the list sizes in all_genes_to_cui_df
summary_statistics = ['count', 'mean', 'std', 'var', 'max', 'min']
size_columns = ["cui_version_1_size", "cui_version_2_size", "cui_version_union_size", "cui_version_intersection_size",
                "cui_version_set_difference_v1_size", "cui_version_set_difference_v2_size"]
all_genes_to_cui_stats_df = all_genes_to_cui_df[size_columns].agg(summary_statistics).T

# The per-version rows are recomputed from the version's own dictionary so that only genes
# actually present in that version contribute to its distribution
for stats_row_label, genes_to_cuis in (
    ("cui_version_1_size", cleaned_merged_genes_to_cuis),
    ("cui_version_2_size", cleaned_Pfx050120_merged_genes_to_cuis),
):
    lst = [len(cuis) for cuis in genes_to_cuis.values()]
    temp_df = pd.DataFrame(lst).agg(summary_statistics).T
    all_genes_to_cui_stats_df.loc[stats_row_label] = temp_df.iloc[0]

# Human-readable row labels for the saved / displayed table
all_genes_to_cui_stats_df = all_genes_to_cui_stats_df.rename(index={
    'cui_version_1_size': 'Distribution of CUI List Size for Genes in Version 1',
    'cui_version_2_size': 'Distribution of CUI List Size for Genes in Version 2',
    'cui_version_union_size': 'Distribution of CUI List Size for Union of Genes in Both PathFX Versions',
    'cui_version_intersection_size': 'Distribution of CUI List Size for Intersection of Genes in Both PathFX Versions',
    'cui_version_set_difference_v1_size': 'Distribution of CUI List Size for Genes in PathFX Version 1 Exclusive Genes',
    'cui_version_set_difference_v2_size': 'Distribution of CUI List Size for Genes in PathFX Version 2 Exclusive Genes',
})

save_to_csv_file(genes_cui_version_comparison_output_folder, all_genes_to_cui_stats_df, "all_genes_to_cui_stats_table", input_index=True, show_print_out=False)

In [None]:
# all_genes_to_cui_df holds one row per gene of the union of both versions; the stats table
# holds one row per summarised size column, so its length is NOT a gene count.
print("Number of Genes present from the Union of Both PathFX Version 1 and Version 2:", len(all_genes_to_cui_df))
print("Number of Genes-CUIs pairs in PathFX Version 1 (cleaned_merged_genes_to_cuis):", len(cleaned_merged_genes_to_cuis))
print("Number of Genes-CUIs pairs in PathFX Version 2 (cleaned_Pfx050120_merged_genes_to_cuis):", len(cleaned_Pfx050120_merged_genes_to_cuis))

# Genes present in both versions whose stored CUI values compare equal with ==
# NOTE(review): if the values are lists, == is order-sensitive, so the same CUIs in a different order do not count as shared - confirm this is intended
shared_keys = [
    gene
    for gene, version_one_cuis in cleaned_merged_genes_to_cuis.items()
    if gene in cleaned_Pfx050120_merged_genes_to_cuis and version_one_cuis == cleaned_Pfx050120_merged_genes_to_cuis[gene]
]
print("\nNumber of shared Genes-CUIs pairs between cleaned_merged_genes_to_cuis and cleaned_Pfx050120_merged_genes_to_cuis: " + str(len(shared_keys)))

print()
all_genes_to_cui_stats_df

### unique_network_nodes vs. pfx041520_unique_nodes

In [None]:
# Comparing unique network nodes: cleaned_unique_network_nodes.pkl (version 1) vs. cleaned_pfx041520_unique_nodes.pkl (version 2)

# output folder for Comparing unique_nodes for Version 1.0 vs Version 2.0
unique_nodes_version_comparison_output_folder = pathfx_source_files_comparison_folder + "unique_nodes_version_comparison/"
check_directory_exists(unique_nodes_version_comparison_output_folder)

# ------- Node Dataframe -------

# De-duplicated node collections of each version
version_one_nodes = set(cleaned_unique_network_nodes)
version_two_nodes = set(cleaned_pfx041520_unique_nodes)

# One row per node of the union, flagged by version membership
all_unique_nodes_df = pd.DataFrame(list(version_one_nodes | version_two_nodes)).rename(columns={0: "Node"})
all_unique_nodes_df["In v1?"] = all_unique_nodes_df["Node"].isin(version_one_nodes)
all_unique_nodes_df["In v2?"] = all_unique_nodes_df["Node"].isin(version_two_nodes)

# Logical AND states "present in both" directly (== would also be True for a row absent from both)
all_unique_nodes_df["In v1 and v2?"] = all_unique_nodes_df["In v1?"] & all_unique_nodes_df["In v2?"]

save_to_csv_file(unique_nodes_version_comparison_output_folder, all_unique_nodes_df, "all_unique_nodes_table", show_print_out=False)

# Counts derived from the membership flags so they stay consistent with the table
num_nodes_in_both = int(all_unique_nodes_df["In v1 and v2?"].sum())
num_nodes_v1_only = int((all_unique_nodes_df["In v1?"] & ~all_unique_nodes_df["In v2?"]).sum())
num_nodes_v2_only = int((all_unique_nodes_df["In v2?"] & ~all_unique_nodes_df["In v1?"]).sum())

# The union size is the number of rows of the node table built above
print("Number of Nodes from the Union of Both PathFX Version 1 and Version 2:", len(all_unique_nodes_df))
print("Number of Nodes in PathFX Version 1 (cleaned_unique_network_nodes):", len(cleaned_unique_network_nodes))
print("Number of Nodes in PathFX Version 1 Only (cleaned_unique_network_nodes):", num_nodes_v1_only)
print("Number of Nodes in PathFX Version 2 (cleaned_pfx041520_unique_nodes):", len(cleaned_pfx041520_unique_nodes))
print("Number of Nodes in PathFX Version 2 Only (cleaned_pfx041520_unique_nodes):", num_nodes_v2_only)
print("Number of Nodes from the Intersection of Both PathFX Version 1 and Version 2:", num_nodes_in_both)
print()
all_unique_nodes_df

### sourced_phens vs. Pfx050120_sourced_phens

In [None]:
# Pool every source name referenced by either version's records, then de-duplicate
# (the order of the resulting list is arbitrary because it comes from a set)
all_sources = list(set(itertools.chain.from_iterable(
    itertools.chain(cleaned_sourced_phens.values(), Pfx050120_sourced_phens.values())
)))
all_sources

In [None]:
# Comparing sourced_phens: sourced_phens.pkl (version 1) vs. Pfx050120_sourced_phens.pkl (version 2)

# output folder for Comparing sourced_phens for Version 1.0 vs Version 2.0
sourced_phens_version_comparison_output_folder = pathfx_source_files_comparison_folder + "sourced_phens_version_comparison/"
check_directory_exists(sourced_phens_version_comparison_output_folder)

# Source databases reported as boolean columns of the comparison table
sourced_phens_source_columns = ["HumPhenOnt", "DisGeNet", "ClinVar", "PheGenI"]


def build_sourced_phens_membership_df(sourced_phens_dict, source_names):
    """Build a (Gene, CUI) table with one boolean column per source.

    Parameters
    ----------
    sourced_phens_dict : dict
        Maps a 2-item key ``(gene, cui)`` to a collection of source names.
    source_names : list of str
        Sources to report; each becomes a boolean column that is True when the
        source is contained in the key's collection.

    Returns
    -------
    pd.DataFrame with columns ``["Gene", "CUI", *source_names]``, one row per key,
    in dictionary order. Membership is tested on the collection itself, so the
    result does not depend on how many sources the longest entry has.
    """
    records = []
    for (gene, cui), sources in sourced_phens_dict.items():
        record = {"Gene": gene, "CUI": cui}
        for source_name in source_names:
            record[source_name] = source_name in sources
        records.append(record)
    return pd.DataFrame(records, columns=["Gene", "CUI"] + list(source_names))


# NOTE(review): version 1 reads `sourced_phens` while the cell collecting all source names reads
# `cleaned_sourced_phens` - confirm which of the two objects is intended here
v1_df = build_sourced_phens_membership_df(sourced_phens, sourced_phens_source_columns)
v2_df = build_sourced_phens_membership_df(Pfx050120_sourced_phens, sourced_phens_source_columns)

# Outer merge keeps (Gene, CUI) pairs that exist in only one version; their flags for the other version are missing values
all_sourced_phens_df = v1_df.merge(v2_df, how="outer", left_on=['Gene', 'CUI'], right_on=['Gene', 'CUI'], suffixes=("_v1", "_v2"))

save_to_csv_file(sourced_phens_version_comparison_output_folder, all_sourced_phens_df, "all_sourced_phens_table", show_print_out=False)

In [None]:
# Size of the merged table = number of distinct (Gene, CUI) pairs over both versions
print("Number of (Gene, CUI) in Union of PathFX Version 1 and Version 2", len(all_sourced_phens_df))
print()

# Per version, count the rows whose flag for each source is True
# (== True also excludes the missing values introduced by the outer merge)
for version_label, column_suffix in (("1", "_v1"), ("2", "_v2")):
    print("PathFX Version " + version_label)
    print("---------------------------------------------------------------------------------------------")
    for source_name in ("HumPhenOnt", "DisGeNet", "ClinVar", "PheGenI"):
        print("Number of (Gene, CUI) references from " + source_name + " for PathFX Version " + version_label + ":",
              sum(all_sourced_phens_df[source_name + column_suffix]==True))
    print()

print()
all_sourced_phens_df

In [None]:
# OLD CODE -- SAVED FOR LATER --- comparison

"""
temp_list = [list(list(itertools.product([k], v)) for k, v in cleaned_Pfx050120_dint.items())]
temp = list(itertools.chain.from_iterable(temp_list))
cleaned_Pfx050120_dint_set_list = list(itertools.chain.from_iterable(temp))
cleaned_Pfx050120_dint_set = set(cleaned_Pfx050120_dint_set_list)
temp_list = [list(list(itertools.product([k], v)) for k, v in drugbank_id_and_drug_name_source_to_gene_symbol_target_dict.items())]
temp = list(itertools.chain.from_iterable(temp_list))
drugbank_id_and_drug_name_source_to_gene_symbol_target_set_list = list(itertools.chain.from_iterable(temp))
drugbank_id_and_drug_name_source_to_gene_symbol_target_set = set(drugbank_id_and_drug_name_source_to_gene_symbol_target_set_list)
set_one = cleaned_Pfx050120_dint_set
set_two = cleaned_pfxDB050620_dint_set
set_three = drugbank_id_and_drug_name_source_to_gene_symbol_target_set
"""
# OLD CODE -- SAVED FOR LATER --- comparison
print()

## Drugbank ID, Name, and Synonym Table

In [None]:
"""
drug_synonym/ Folder
------------------------------------------------------------------------------------------------------------------------------------------------------------------
Using the drugbank_vocabulary.csv file (a DrugBank Vocabulary file that was downloaded from the original DrugBank database) To construct a DataFrame Table and 3 dictionaries:
- common drug name to DrugBank ID
- DrugBank ID to a list of all the known synonym Drug name(s) for a specific drug
- a synonym drug name to a DrugBank ID
"""

drug_synonym_output_folder = database_analysis_output_folder + "drug_synonym/"
check_directory_exists(drug_synonym_output_folder)

def get_drug_synonyms(record):
    """Split one ``Synonyms`` cell into a list of synonym names.

    Parameters
    ----------
    record : str or missing value
        Synonym names separated by ``" | "``.

    Returns
    -------
    list of str
        The individual names; an empty list when the cell is missing (None / NaN),
        instead of a list holding the stringified placeholder ``'nan'``.
    """
    if pd.api.types.is_scalar(record) and pd.isna(record):
        return []
    return str(record).split(" | ")

def construct_all_drug_synonym_names(record):
    """Return a new list: the row's ``"Common name"`` followed by every entry of its ``"drug_synonyms"``."""
    return [record["Common name"], *record["drug_synonyms"]]

def construct_drugbank_id_and_name_set(record):
    """Pair the row's DrugBank ID with each of its names.

    Parameters
    ----------
    record : mapping-like row providing ``"DrugBank ID"`` and the iterable ``"All Names"``.

    Returns
    -------
    list of tuple
        ``(DrugBank ID, name)`` for every name, in the order of ``"All Names"``.
    """
    drugbank_id = record["DrugBank ID"]
    return [(drugbank_id, drug_name) for drug_name in record["All Names"]]

# Reading in drugbank_vocabulary.csv file from the local directory folder
drugbank_vocabulary_df = read_csv_file("important_database_files/drugbank_vocabulary.csv")

# Explicit copy: the column assignments below must modify an independent frame, not a slice of
# drugbank_vocabulary_df (assigning to a slice can trigger pandas' SettingWithCopyWarning)
drugbank_vocabulary_synonyms_and_drugbank_id_df = drugbank_vocabulary_df.loc[:, ["DrugBank ID", "Common name", "Synonyms"]].copy()

# lowercase all (in-column data) content in the DataFrame Table
for col in ["Common name", "Synonyms"]:
    drugbank_vocabulary_synonyms_and_drugbank_id_df[col] = drugbank_vocabulary_synonyms_and_drugbank_id_df[col].str.lower()

# Add in 2 columns, one with all synonym names correctly formatted in a list and another column that includes the common name in the synonym list
drugbank_vocabulary_synonyms_and_drugbank_id_df["drug_synonyms"] = drugbank_vocabulary_synonyms_and_drugbank_id_df["Synonyms"].apply(func=get_drug_synonyms)
drugbank_vocabulary_synonyms_and_drugbank_id_df["all_drug_synonym_names"] = drugbank_vocabulary_synonyms_and_drugbank_id_df.apply(func=construct_all_drug_synonym_names, axis=1)

# small modifications and cleaning up dataframe: the raw "Synonyms" text column is replaced by the list column
drugbank_vocabulary_synonyms_and_drugbank_id_df = drugbank_vocabulary_synonyms_and_drugbank_id_df.drop("Synonyms", axis=1)
drugbank_vocabulary_synonyms_and_drugbank_id_df = drugbank_vocabulary_synonyms_and_drugbank_id_df.rename(columns={"Common name": "Drug", "drug_synonyms":"Synonyms", "all_drug_synonym_names":"All Names"})
def remove_nan(x):
    """Remove the first ``'nan'`` entry from ``x`` in place, if there is one; always returns None."""
    if "nan" not in x:
        return None
    x.remove('nan')
    return None
# Strip the 'nan' placeholder from both name-list columns. remove_nan edits each list in place,
# so the Series returned by apply is intentionally discarded.
for names_column in ("Synonyms", "All Names"):
    drugbank_vocabulary_synonyms_and_drugbank_id_df[names_column].apply(remove_nan)

# Flat list of (DrugBank ID, drug name) pairs covering every name of every drug
drugbank_vocabulary_drugbank_id_and_all_synonyms = drugbank_vocabulary_synonyms_and_drugbank_id_df[["DrugBank ID", "All Names"]]
drug_name_to_drugbank_id_with_all_synonyms_set_list = list(itertools.chain.from_iterable(
    list(drugbank_vocabulary_drugbank_id_and_all_synonyms.apply(func=construct_drugbank_id_and_name_set, axis=1))
))

# Dictionary: drug name (common name or synonym) -> DrugBank ID
# (a name listed under several IDs keeps the ID of the last pair in the list)
drug_synonym_name_to_drugbank_id_dict = {
    drug_name: drugbank_id
    for drugbank_id, drug_name in drug_name_to_drugbank_id_with_all_synonyms_set_list
}

# Dictionary: DrugBank ID -> list of all names of the drug; the unknown-ID marker "???" maps to the string "NONE"
drugbank_id_to_all_synonym_names_dict = drugbank_vocabulary_drugbank_id_and_all_synonyms.set_index("DrugBank ID")['All Names'].to_dict()
drugbank_id_to_all_synonym_names_dict["???"] = "NONE"

# Dictionary: DrugBank ID -> common name of the drug
drugbank_id_to_common_name_dict = drugbank_vocabulary_synonyms_and_drugbank_id_df.set_index("DrugBank ID")["Drug"].to_dict()

# Saving file associated with Drug vocabulary Data Table (drugbank_vocabulary)
save_to_csv_file(drug_synonym_output_folder, drugbank_vocabulary_synonyms_and_drugbank_id_df, "drugbank_vocabulary_synonyms_and_drugbank_id_table")

# Save the pair list and the three dictionaries as Pickle Files
for pickled_object, pickle_file_name in (
    (drug_name_to_drugbank_id_with_all_synonyms_set_list, "drug_name_to_drugbank_id_with_all_synonyms_set_list"),
    (drugbank_id_to_common_name_dict, "drugbank_id_to_common_name_dict"),
    (drugbank_id_to_all_synonym_names_dict, "drugbank_id_to_all_synonym_names_dict"),
    (drug_synonym_name_to_drugbank_id_dict, "drug_synonym_name_to_drugbank_id_dict"),
):
    save_to_pickle_file(drug_synonym_output_folder, pickled_object, pickle_file_name)

drugbank_vocabulary_synonyms_and_drugbank_id_df

## FDA File Analysis

In [None]:
"""
opening Drugs_labeled_for_AEs.txt file and looking at its content
"""

# Tab-separated text file: each column name is a severe side effect (AE = Adverse Event) and the
# cells of a column are the drugs carrying that side effect on their label
input_filename = "important_database_files/Drugs_labeled_for_AEs.txt"

# Read everything as strings, drop rows that are empty in every column, lowercase the column names
drug_names_for_side_effects_df = (
    read_csv_file(input_filename, input_sep="\t", input_dtype=str)
    .dropna(axis=0, how='all')
    .rename(columns=str.lower)
)

# Adverse events in the file's column order (captured before the columns are sorted below)
all_adverse_events_for_labeled_drugs = list(drug_names_for_side_effects_df.columns)

# lowercase all content
for adverse_event_column in all_adverse_events_for_labeled_drugs:
    drug_names_for_side_effects_df[adverse_event_column] = drug_names_for_side_effects_df[adverse_event_column].str.lower()

# Alphabetical column order for display
alphabetical_columns = sorted(drug_names_for_side_effects_df.columns)
drug_names_for_side_effects_df = drug_names_for_side_effects_df.reindex(alphabetical_columns, axis=1)
drug_names_for_side_effects_df

### Initial Adverse Events DataFrame Construction (prototype)

In [None]:
"""
Constructing DataFrame Table to show all the identified and unidentified ingrediants for each adverse event (identified ingrediants are those with found DrugBank IDs)
"""

# Column layout of the per-adverse-event table (the keys define the column names and their order)
adverse_events_and_ingrediant_info_dict = {'Adverse Event':[], 'CUI':[], 'Number of Ingrediants': [], 'Number of Identified Ingrediants':[], 'Number of Undentified Ingrediants': [], 'Ingrediants':[], 'Identified Ingrediants':[],
                                           'Undentified Ingrediants':[]}

# ingrediant -> list of the adverse-event columns it appears in (one entry per occurrence)
unique_ingrediants_dict = {}

# ingrediant -> "identified" / "unidentified" (identified = a DrugBank ID is known for that name)
ingrediant_type_dict = {}

# One record per adverse event; the DataFrame is built once after the loop instead of being grown row by row
adverse_event_records = []

# Iterating through each column of the drug_names_for_side_effects DataFrame to gather data
for col in all_adverse_events_for_labeled_drugs:

    # WARNING  <-------------------------------------------------------------------------------------- CUIs for each adverse event in FDA file were determined HERE!!!

    # CUI of the adverse event: the version 2 dictionary is preferred, version 1 is the fallback,
    # and "???" marks an adverse event found in neither
    if col in cleaned_Pfx050120_all_phens_to_cuis:
        cui = cleaned_Pfx050120_all_phens_to_cuis[col]
    elif col in cleaned_all_phens_to_cuis:
        cui = cleaned_all_phens_to_cuis[col]
    else:
        cui = "???"

    # all non-missing entries of the column
    ingrediant_lst = list(drug_names_for_side_effects_df[col].dropna())

    identified_ingrediants = []
    unidentified_ingrediants = []

    for ingrediant in ingrediant_lst:

        # remember every adverse event this ingrediant is listed under
        unique_ingrediants_dict.setdefault(ingrediant, []).append(col)

        # identified = the name is a key of drug_synonym_name_to_drugbank_id_dict
        # (direct membership test; no per-ingrediant copy of the dictionary)
        #      alternatives considered in the original notes:
        #      cleaned_pfxDB050620_dint   <-- LESSS
        #      cleaned_Pfx050120_name2dbid <--- MORE
        if ingrediant in drug_synonym_name_to_drugbank_id_dict:
            ingrediant_type_dict[ingrediant] = "identified"
            identified_ingrediants.append(ingrediant)
        else:
            ingrediant_type_dict[ingrediant] = "unidentified"
            unidentified_ingrediants.append(ingrediant)

    adverse_event_records.append({
        'Adverse Event': col,
        'CUI': cui,
        'Number of Ingrediants': len(ingrediant_lst),
        'Number of Identified Ingrediants': len(identified_ingrediants),
        'Number of Undentified Ingrediants': len(unidentified_ingrediants),
        'Ingrediants': sorted(ingrediant_lst),
        'Identified Ingrediants': sorted(identified_ingrediants),
        'Undentified Ingrediants': sorted(unidentified_ingrediants),
    })

# Organizing DataFrame for presentation: largest adverse events first
adverse_events_and_ingrediant_info_df = pd.DataFrame(adverse_event_records, columns=list(adverse_events_and_ingrediant_info_dict))
adverse_events_and_ingrediant_info_df = adverse_events_and_ingrediant_info_df.sort_values(by="Number of Ingrediants", ascending=False)
adverse_events_and_ingrediant_info_df

### Adding Synonyms info to Constructed DataFrame Table for FDA File

In [None]:
"""
Helper Functions for building DataFrame
"""

def map_adverse_event_lst_to_cui_lst(adverse_event_lst):
    """Return the CUIs that both phenotype dictionaries yield for a list of adverse-event names.

    Every name is looked up in ``cleaned_all_phens_to_cuis`` and in
    ``cleaned_Pfx050120_all_phens_to_cuis``; the result is the intersection of the
    two resulting CUI sets, with the ``None`` produced by failed lookups removed.
    The order of the returned list is arbitrary.

    NOTE(review): a CUI found in only one of the two dictionaries is dropped by the
    intersection - confirm that a union was not intended.
    """
    version_one_cuis = {cleaned_all_phens_to_cuis.get(name) for name in adverse_event_lst}
    version_two_cuis = {cleaned_Pfx050120_all_phens_to_cuis.get(name) for name in adverse_event_lst}
    shared_cuis = version_one_cuis & version_two_cuis
    shared_cuis.discard(None)
    return list(shared_cuis)

def replace_marked_null_with_empty_lst(x):
    """Return ``x`` when it is exactly a ``list``; return a new empty list for anything else."""
    return x if type(x) is list else []

# Fold the row's own "Adverse Event" and "CUI" into its synonym list columns
def add_original_to_synonym_list(record):
    """Append the row's own adverse event / CUI to its synonym lists, de-duplicate, and count.

    For each (list column, value column, count column) triple the value is appended to the
    existing list, the list is replaced by a duplicate-free list (arbitrary order), and its
    length is stored in the count column. Returns the updated ``record``.
    """
    column_triples = (
        ("Synonym Adverse Events", "Adverse Event", "Number of Synonym Adverse Events"),
        ("Synonym Adverse Event CUIs", "CUI", "Number of Synonym Adverse Event CUIs"),
    )
    for list_column, value_column, count_column in column_triples:
        # add original
        record[list_column].append(record[value_column])
        # ensure no duplicates exists
        record[list_column] = list(set(record[list_column]))
        record[count_column] = len(record[list_column])
    return record

In [None]:
"""
Modifying Recently Constructed DataFrame Table from For-loop to show all the identified and unidentified ingrediants for each adverse event with the folowing additional information:
- ingrediant list (different types)
- Adverse Event Synonym
- Adverse Event Synonym CUIs
- Number of synonym adverse events
"""

# Synonym discovery by substring match: every phenotype name that CONTAINS an adverse-event name is
# treated as a synonym of that adverse event. Each dictionary is keyed by phenotype name, so a
# phenotype containing several adverse-event names keeps only the last adverse event iterated.
synonyms_for_adverse_events_source_one_dict = {
    phenotype_name: adverse_event
    for adverse_event in all_adverse_events_for_labeled_drugs
    for phenotype_name in cleaned_all_phens_to_cuis
    if adverse_event in phenotype_name
}
synonyms_for_adverse_events_source_one_df = (
    pd.DataFrame.from_dict(synonyms_for_adverse_events_source_one_dict, orient='index')
    .reset_index()
    .rename(columns={0:"Adverse Event", "index":"Synonym Adverse Events"})
)
synonyms_for_adverse_events_source_two_dict = {
    phenotype_name: adverse_event
    for adverse_event in all_adverse_events_for_labeled_drugs
    for phenotype_name in cleaned_Pfx050120_all_phens_to_cuis
    if adverse_event in phenotype_name
}
synonyms_for_adverse_events_source_two_df = (
    pd.DataFrame.from_dict(synonyms_for_adverse_events_source_two_dict, orient='index')
    .reset_index()
    .rename(columns={0:"Adverse Event", "index":"Synonym Adverse Events"})
)

# Pool both sources and collect, per adverse event, the de-duplicated list of its synonyms
synonyms_for_adverse_events_df = pd.concat([synonyms_for_adverse_events_source_one_df, synonyms_for_adverse_events_source_two_df])
synonyms_for_adverse_events_df = synonyms_for_adverse_events_df.groupby("Adverse Event").agg(set)
synonyms_for_adverse_events_df = synonyms_for_adverse_events_df.reset_index()
synonyms_for_adverse_events_df["Synonym Adverse Events"] = synonyms_for_adverse_events_df["Synonym Adverse Events"].map(list)
synonyms_for_adverse_events_df = synonyms_for_adverse_events_df.set_index("Adverse Event")
adverse_events_to_synonym_adverse_events_dict = synonyms_for_adverse_events_df.to_dict("dict")["Synonym Adverse Events"]

# Attach the synonym list to every adverse event; adverse events without any synonym get an empty list
adverse_events_and_ingrediant_info_df["Synonym Adverse Events"] = (
    adverse_events_and_ingrediant_info_df["Adverse Event"]
    .map(adverse_events_to_synonym_adverse_events_dict)
    .apply(func=replace_marked_null_with_empty_lst)
)

# Adding in more synoynm terms given from a FDA member, these terms originate for the synoynms.txt file
# -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
# adverse event -> extra synonym adverse events to append to its synonym list
fda_supplied_synonym_adverse_events = {
    # "embolic stroke" was previously spelled "emolic stroke", which cannot match any phenotype name
    "cerebral infarction": ["embolic stroke", "thrombotic stroke", "ischaemic stroke"],
    "cardiac arrest": ["cardio-respiratory arrest"],
    "generalized tonic-clonic seizure": ["seizure", "status epilepticus"],
    "peripheral neuropathy": ["neuropathy peripheral", "peripheral motor neuropathy", "peripheral sensory neuropathy"],
    "hyperlipidemia": ["blood cholesterol increased", "hypercholesteraemia"],
    "hepatic necrosis": ["hepatic failure", "acute hepatic failure", "subacute hepatic failure"],
    "edema": ["pulmonary edema", "acute pulmonary edema", "respiratory tract edema"],
    "pulmonary edema": ["acute pulmonary edema", "respiratory tract edema"],
    "tardive dyskinesia": ["dyskinesia"],
    "myocardial infarction": ["acute myocardial infarction", "silent myocardial infarction"],
    "pancreatitis": ["pancreatitis acute"],
    "hemorrhage": ["gastrointestinal hemorrhage", "injection site hemorrhage", "vaginal hemorrhage", "rectal hemorrhage"],
}

# Fail loudly (KeyError, like a failed .loc lookup) when a listed adverse event is not in the table
known_adverse_events = set(adverse_events_and_ingrediant_info_df["Adverse Event"])
missing_adverse_events = [name for name in fda_supplied_synonym_adverse_events if name not in known_adverse_events]
if missing_adverse_events:
    raise KeyError("Adverse events not present in the table: " + str(missing_adverse_events))

# DataFrame.copy(deep=True) does not copy the Python list objects stored in the cells, so the synonym
# lists are rebuilt as new lists here instead of being extended in place (which would also change
# the lists held by adverse_events_and_ingrediant_info_df).
adverse_events_with_synonyms_and_ingrediant_info_df = adverse_events_and_ingrediant_info_df.copy(deep=True).reset_index(drop=True)
adverse_events_with_synonyms_and_ingrediant_info_df["Synonym Adverse Events"] = [
    list(existing_synonyms) + fda_supplied_synonym_adverse_events.get(adverse_event, [])
    for adverse_event, existing_synonyms in zip(
        adverse_events_with_synonyms_and_ingrediant_info_df["Adverse Event"],
        adverse_events_with_synonyms_and_ingrediant_info_df["Synonym Adverse Events"],
    )
]
# -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

# CUIs of every synonym adverse event of the row (looked up by map_adverse_event_lst_to_cui_lst)
synonym_adverse_event_cui_lists = adverse_events_with_synonyms_and_ingrediant_info_df['Synonym Adverse Events'].apply(map_adverse_event_lst_to_cui_lst)
adverse_events_with_synonyms_and_ingrediant_info_df['Synonym Adverse Event CUIs'] = synonym_adverse_event_cui_lists

# Fold each row's own adverse event and CUI into its synonym lists and add the two count columns
adverse_events_with_synonyms_and_ingrediant_info_df = adverse_events_with_synonyms_and_ingrediant_info_df.apply(add_original_to_synonym_list, axis=1)

def remove_question(x):
    """Remove the first ``'???'`` entry from ``x`` in place, if there is one; always returns None."""
    if "???" not in x:
        return None
    x.remove('???')
    return None
# Drop the unknown-CUI marker from every CUI list; remove_question edits each list in place,
# so the Series returned by apply is intentionally discarded
adverse_events_with_synonyms_and_ingrediant_info_df["Synonym Adverse Event CUIs"].apply(remove_question)

# Recompute both count columns from the final lists
for count_column, list_column in (
    ("Number of Synonym Adverse Events", "Synonym Adverse Events"),
    ("Number of Synonym Adverse Event CUIs", "Synonym Adverse Event CUIs"),
):
    adverse_events_with_synonyms_and_ingrediant_info_df[count_column] = adverse_events_with_synonyms_and_ingrediant_info_df[list_column].map(len)

# Clean and Organize DataFrame for presentation: alphabetical by adverse event
adverse_events_with_synonyms_and_ingrediant_info_df = adverse_events_with_synonyms_and_ingrediant_info_df.sort_values(by="Adverse Event")

adverse_events_with_synonyms_and_ingrediant_info_df

### Constructing Support Files and Saving

In [None]:
# Two keyed views of the table, reused for every dictionary built below
adverse_event_indexed_df = adverse_events_with_synonyms_and_ingrediant_info_df.set_index("Adverse Event")
cui_indexed_df = adverse_events_with_synonyms_and_ingrediant_info_df.set_index("CUI")

# Dictionary: Adverse Event -> CUI
adverse_event_to_cui_dict = adverse_event_indexed_df["CUI"].to_dict()

# Dictionary: CUI -> Adverse Event
# (NOTE: for unknowns, must filter and find ones labeled ???)
# NO ADVERSE EVENT SHARES SAME CUI - except the unknown marker "???", whose entry is replaced by the
# list of all adverse events without a CUI
cui_to_adverse_events_dict = cui_indexed_df["Adverse Event"].to_dict()
has_unknown_cui = adverse_events_with_synonyms_and_ingrediant_info_df["CUI"] == "???"
unidentified_adverse_events = list(adverse_events_with_synonyms_and_ingrediant_info_df[has_unknown_cui]["Adverse Event"])
cui_to_adverse_events_dict["???"] = unidentified_adverse_events

# Dictionaries: Adverse Event -> ingrediants / identified ingrediants / unidentified ingrediants
adverse_event_to_ingrediants_dict = adverse_event_indexed_df["Ingrediants"].to_dict()
adverse_event_to_identified_ingrediants_dict = adverse_event_indexed_df["Identified Ingrediants"].to_dict()
adverse_event_to_unidentified_ingrediants_dict = adverse_event_indexed_df["Undentified Ingrediants"].to_dict()

# Dictionary: Adverse Event -> synonym adverse events
adverse_event_to_synonym_adverse_events_dict = adverse_event_indexed_df['Synonym Adverse Events'].to_dict()

# Dictionary: Adverse Event -> synonym adverse event CUIs
adverse_event_to_synonym_adverse_event_cuis_dict = adverse_event_indexed_df["Synonym Adverse Event CUIs"].to_dict()

# Dictionaries keyed by the adverse event's CUI: -> synonym adverse event CUIs / -> synonym adverse events
# (rows sharing a CUI, e.g. several "???" rows, collapse to a single key that keeps the last row's value)
cui_to_synonym_adverse_event_cuis_dict = cui_indexed_df["Synonym Adverse Event CUIs"].to_dict()
cui_to_synonym_adverse_events_dict = cui_indexed_df["Synonym Adverse Events"].to_dict()

output_side_effects_table_info_folder = database_analysis_output_folder + "output_side_effects_table_info/"
check_directory_exists(output_side_effects_table_info_folder)

# Scatterplot of "Number of Identified Ingrediants" vs "Number of Undentified Ingrediants" for each side effect / adverse event
plot_scatterplot(output_side_effects_table_info_folder, (20, 20), adverse_events_with_synonyms_and_ingrediant_info_df, "Number of Identified Ingrediants", "Number of Undentified Ingrediants", "Adverse Event", "Adverse Events Vs Ingrediant Type Counts")

# One horizontal bar plot per numeric ingrediant column
output_ingrediants_info_folder = database_analysis_output_folder + "output_ingrediants_info/"
check_directory_exists(output_ingrediants_info_folder)
numeric_cols_of_interest = ["Number of Ingrediants", "Number of Identified Ingrediants", "Number of Undentified Ingrediants", "Number of Synonym Adverse Events"]
for col in numeric_cols_of_interest:
    plot_title = "Adverse Events vs. " + col
    plot_horizontal_barplot(output_ingrediants_info_folder, (30, 15), adverse_events_with_synonyms_and_ingrediant_info_df, col, "Adverse Event", None, plot_title)

# Saving DataFrames
save_to_csv_file(output_side_effects_table_info_folder, drug_names_for_side_effects_df, "drug_names_for_side_effects_table")
save_to_csv_file(output_side_effects_table_info_folder, adverse_events_with_synonyms_and_ingrediant_info_df, "adverse_events_with_synonyms_and_ingrediant_info_table")

# Save Dictionaries as Pickle Files (file name = variable name)
for dictionary_to_save, dictionary_file_name in (
    (adverse_event_to_cui_dict, "adverse_event_to_cui_dict"),
    (cui_to_adverse_events_dict, "cui_to_adverse_events_dict"),
    (adverse_event_to_ingrediants_dict, "adverse_event_to_ingrediants_dict"),
    (adverse_event_to_identified_ingrediants_dict, "adverse_event_to_identified_ingrediants_dict"),
    (adverse_event_to_unidentified_ingrediants_dict, "adverse_event_to_unidentified_ingrediants_dict"),
    (adverse_event_to_synonym_adverse_events_dict, "adverse_event_to_synonym_adverse_events_dict"),
    (adverse_event_to_synonym_adverse_event_cuis_dict, "adverse_event_to_synonym_adverse_event_cuis_dict"),
    (cui_to_synonym_adverse_event_cuis_dict, "cui_to_synonym_adverse_event_cuis_dict"),
    (cui_to_synonym_adverse_events_dict, "cui_to_synonym_adverse_events_dict"),
):
    save_to_pickle_file(output_side_effects_table_info_folder, dictionary_to_save, dictionary_file_name)

### DataFrame and support files for Ingredients in the FDA file

In [None]:
# One row per ingrediant: its name and the list of adverse events it is listed under
ingrediants_df = (
    pd.Series(unique_ingrediants_dict)
    .to_frame()
    .reset_index()
    .rename(columns={0:"Adverse Events", "index":"Ingrediant"})
)
ingrediants_df["Number of Adverse Events"] = ingrediants_df["Adverse Events"].map(len)

def map_ingrediant_to_drugbank_id(ingrediant):
    """
    Look up the DrugBank ID stored for an ingredient name.

    Parameters
    ----------
    ingrediant : hashable
        Name searched among the keys of ``drug_synonym_name_to_drugbank_id_dict``.

    Returns
    -------
    The value stored under ``ingrediant``, or the placeholder string ``"???"``
    when the name is not a key of the dictionary.
    """
    # A lookup with a default replaces the explicit membership test + indexing
    return drug_synonym_name_to_drugbank_id_dict.get(ingrediant, "???")
 
def map_ingrediant_synonym_lst_to_drugbank_id_lst(record):
    """
    Translate a list of ingredient names into their DrugBank IDs.

    Parameters
    ----------
    record : object
        Only values whose type is exactly ``list`` are translated.

    Returns
    -------
    list or None
        For a list: a new list of the same length holding, per name, the value
        found in ``drug_synonym_name_to_drugbank_id_dict`` (``None`` for names
        that are not keys). For anything that is not a list: ``None``.
    """
    # Anything that is not exactly a list is left untranslated
    if type(record) != list:
        return None
    return [drug_synonym_name_to_drugbank_id_dict.get(name) for name in record]

def replace_none_in_list_with_unknown_mark(lst):
    """
    Return a copy of ``lst`` in which every ``None`` entry is replaced by "???".

    Parameters
    ----------
    lst : iterable
        Entries to copy; the input itself is not modified.

    Returns
    -------
    list
        New list with the same length and order as ``lst``.
    """
    # The equality comparison (rather than an identity check) is kept on purpose
    # so that entries are matched exactly as before
    return ["???" if entry == None else entry for entry in lst]
    
def _count_synonym_ingrediant_names(names):
    """
    Count the synonym names held in one "Synonym Ingrediant Names" cell.

    Strings (such as the "NONE" placeholder assigned to unidentified
    ingredients below) and values without a length (such as the missing value
    produced when a DrugBank ID is not a key of the synonym dictionary) count
    as 0 synonyms; any other sized collection is counted with ``len``.
    """
    if isinstance(names, str) or not hasattr(names, "__len__"):
        return 0
    return len(names)

# Construct column to show Adverse Event CUIs list
ingrediants_df["Adverse Event CUIs"] = ingrediants_df["Adverse Events"].apply(func=map_adverse_event_lst_to_cui_lst)

# Identify each ingredient's DrugBank ID ("???" when the name is unknown)
ingrediants_df["DrugBank ID"] = ingrediants_df["Ingrediant"].apply(func=map_ingrediant_to_drugbank_id)

# Identify the type of each ingredient
ingrediants_df["Ingrediant Type"] = ingrediants_df["Ingrediant"].map(ingrediant_type_dict)

# Add the synonym names of each ingredient; unidentified ingredients ("???")
# receive the placeholder string "NONE" instead of a collection of names
drugbank_id_to_all_synonym_names_dict["???"] = "NONE"
ingrediants_df["Synonym Ingrediant Names"] = ingrediants_df["DrugBank ID"].map(drugbank_id_to_all_synonym_names_dict)

# Add the number of synonym names of each ingredient. A plain len() would
# report 4 (the length of the string "NONE") for unidentified ingredients and
# fail on missing cells, so the count goes through the helper above.
ingrediants_df["Number of Synonym Ingrediant Names"] = ingrediants_df["Synonym Ingrediant Names"].apply(_count_synonym_ingrediant_names)

# Obtain the DrugBank IDs of the synonym ingredient names of each row
# (cells that are not a list, e.g. the "NONE" placeholder, map to None)
ingrediants_df['Synonym Ingrediant DrugBank IDs'] = ingrediants_df['Synonym Ingrediant Names'].apply(func=map_ingrediant_synonym_lst_to_drugbank_id_lst)

# Clean and Organize DataFrame for presentation
column_names = ["Ingrediant", "DrugBank ID", "Number of Adverse Events", "Adverse Events", "Adverse Event CUIs", "Ingrediant Type", "Synonym Ingrediant Names", 'Synonym Ingrediant DrugBank IDs', "Number of Synonym Ingrediant Names"]
ingrediants_df = ingrediants_df.reindex(columns=column_names)

# Mark adverse events without a CUI (None entries) with "???"
ingrediants_df["Adverse Event CUIs"] = ingrediants_df["Adverse Event CUIs"].apply(func=replace_none_in_list_with_unknown_mark)


# For each Adverse Event listed in the "Adverse Events" column, the Adverse Event's Synonyms and their respective CUIs will be added as a list
# For each drug, information about the Adverse Event's Synonyms and the corresponding CUIs will be added based on the Adverse Events the drug is shown to be a part of

def add_synonym_adverse_events(adverse_event_lst):
    """
    Collect the synonym entry of every adverse event in ``adverse_event_lst``.

    Parameters
    ----------
    adverse_event_lst : iterable
        Adverse events; each one must be a key of
        ``adverse_event_to_synonym_adverse_events_dict``.

    Returns
    -------
    list
        One dictionary value per adverse event, in input order.

    Raises
    ------
    KeyError
        If an adverse event is not a key of the dictionary.
    """
    return [
        adverse_event_to_synonym_adverse_events_dict[event_name]
        for event_name in adverse_event_lst
    ]
    
def add_synonym_adverse_event_cuis(adverse_event_lst):
    """
    Collect the synonym-CUI entry of every adverse event in ``adverse_event_lst``.

    Parameters
    ----------
    adverse_event_lst : iterable
        Adverse events; each one must be a key of
        ``adverse_event_to_synonym_adverse_event_cuis_dict``.

    Returns
    -------
    list
        One dictionary value per adverse event, in input order.

    Raises
    ------
    KeyError
        If an adverse event is not a key of the dictionary.
    """
    return [
        adverse_event_to_synonym_adverse_event_cuis_dict[event_name]
        for event_name in adverse_event_lst
    ]

# Attach the synonym information of each ingredient's adverse events: every new
# cell holds one entry per adverse event listed in the "Adverse Events" column
for _synonym_column, _synonym_builder in (
    ("Synonym Adverse Events", add_synonym_adverse_events),
    ("Synonym Adverse Event CUIs", add_synonym_adverse_event_cuis),
):
    ingrediants_df[_synonym_column] = ingrediants_df["Adverse Events"].apply(_synonym_builder)

def clean_list(x):
    """
    Return the entry stored at position (or key) 0 of ``x``.

    Parameters
    ----------
    x : indexable
        Any object supporting ``x[0]``.

    Returns
    -------
    The object found at ``x[0]``; every other entry is discarded.

    Raises
    ------
    IndexError
        If ``x`` is an empty sequence.

    NOTE(review): when applied to the per-adverse-event lists built for the
    synonym columns, this presumably keeps only the first adverse event's
    synonyms -- confirm that dropping the remaining entries is intended.
    """
    first_entry = x[0]
    return first_entry

# Reduce both synonym columns to the first entry of each cell (see clean_list)
for _synonym_column in ("Synonym Adverse Events", "Synonym Adverse Event CUIs"):
    ingrediants_df[_synonym_column] = ingrediants_df[_synonym_column].apply(clean_list)

# Order the table so that ingredients with the most adverse events come first,
# then display it as the cell output
ingrediants_df = ingrediants_df.sort_values("Number of Adverse Events", ascending=False)
ingrediants_df

In [None]:
# Index the table once per key; every lookup dictionary below is read from one of these two views
_by_ingrediant = ingrediants_df.set_index("Ingrediant")
_by_drugbank_id = ingrediants_df.set_index("DrugBank ID")

# Dictionary: Ingrediant -> DrugBank ID
ingrediant_to_drugbank_id_dict = _by_ingrediant["DrugBank ID"].to_dict()

# Dictionary: DrugBank ID -> Ingrediant
# (NOTE: for unknowns, must filter and find ones labeled ???; the "???" key
# is overwritten with the list of all unidentified ingredient names)
drugbank_id_to_ingrediant_dict = _by_drugbank_id["Ingrediant"].to_dict()
_is_unidentified = ingrediants_df["DrugBank ID"] == "???"
unidentified_ingrediants = list(ingrediants_df[_is_unidentified]["Ingrediant"])
drugbank_id_to_ingrediant_dict["???"] = unidentified_ingrediants

# Dictionary: Ingrediant -> Adverse Events
ingrediant_to_adverse_events_dict = _by_ingrediant["Adverse Events"].to_dict()

# Dictionary: Ingrediant -> Adverse Event CUIs
ingrediant_to_adverse_event_cuis_dict = _by_ingrediant["Adverse Event CUIs"].to_dict()

# Dictionary: Ingrediant -> Synonym Adverse Event CUIs
ingrediant_to_synonym_adverse_event_cuis_dict = _by_ingrediant["Synonym Adverse Event CUIs"].to_dict()

# Dictionary: Ingrediant -> Synonym Adverse Events
ingrediant_to_synonym_adverse_events_dict = _by_ingrediant["Synonym Adverse Events"].to_dict()

# Dictionary: DrugBank ID -> Adverse Events
# NOTE(review): unlike the CUI dictionary below, the "???" entry is not
# overwritten here; presumably it ends up holding the adverse events of a
# single unidentified ingredient -- confirm whether that is intended.
drugbank_id_to_adverse_events_dict = _by_drugbank_id["Adverse Events"].to_dict()

# Dictionary: DrugBank ID -> Adverse Event CUIs
# (NOTE for unknowns, it is possible to have known Adverse Events so the mapping is not unique)
drugbank_id_to_adverse_event_cuis_dict = _by_drugbank_id["Adverse Event CUIs"].to_dict()
drugbank_id_to_adverse_event_cuis_dict["???"] = "NOT APPLICABLE"

# Persist the ingredient lookup dictionaries as pickle files (file stem, dictionary);
# every file stem is spelled exactly like the variable it stores
for _pickle_stem, _pickle_dict in (
    ("ingrediant_to_drugbank_id_dict", ingrediant_to_drugbank_id_dict),
    ("drugbank_id_to_ingrediant_dict", drugbank_id_to_ingrediant_dict),
    ("ingrediant_to_adverse_events_dict", ingrediant_to_adverse_events_dict),
    ("ingrediant_to_adverse_event_cuis_dict", ingrediant_to_adverse_event_cuis_dict),
    ("drugbank_id_to_adverse_events_dict", drugbank_id_to_adverse_events_dict),
    ("drugbank_id_to_adverse_event_cuis_dict", drugbank_id_to_adverse_event_cuis_dict),
    ("ingrediant_to_synonym_adverse_event_cuis_dict", ingrediant_to_synonym_adverse_event_cuis_dict),
    ("ingrediant_to_synonym_adverse_events_dict", ingrediant_to_synonym_adverse_events_dict),
):
    save_to_pickle_file(output_ingrediants_info_folder, _pickle_dict, _pickle_stem)

# Persist the ingredient table itself as a CSV file
save_to_csv_file(output_ingrediants_info_folder, ingrediants_df, "ingrediants_table")