In [None]:
import pandas as pd
import numpy as np
import os, pickle
from typing import List, Union, Tuple, Iterable, Optional
import json
from tqdm import tqdm

from scipy.spatial.distance import pdist, squareform, cosine, cdist
from scipy.stats import spearmanr
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib

from madrigal.utils import DATA_DIR

# --- Dataset selection -------------------------------------------------------
# Both values are interpolated into the split-file paths built by get_ddi_info.
data_source = 'DrugBank'
split_method = 'split_by_classes'

# --- Knowledge-graph / loader settings ---------------------------------------
# NOTE(review): none of these are referenced by the cells visible in this
# notebook; presumably kept for parity with the training scripts — confirm.
kg_encoder = 'hgt'
kg_sampling_num_neighbors = None
kg_sampling_num_layers = None
repeat = None
num_workers = 0

# --- Text-construction flags --------------------------------------------------
# NOTE(review): also not referenced by the visible cells — confirm before removing.
use_drug_names = False
use_only_label = False
concat = True

In [None]:
def get_ddi_info(name, description_df, use_para=True):
    """Load one split's edge list and attach a description to every edge.

    Reads ``{name}_df.csv`` (``eval_suffle_df.csv`` when ``name == 'eval'``) from
    ``DATA_DIR/polypharmacy_new/{data_source}/{split_method}/``. The positive
    edges (``head``/``tail``) are stacked with the two corrupted variants stored
    in the same rows (``head``/``neg_tail`` and ``neg_head``/``tail``); labels and
    texts of the positive edges are repeated three times to line up with that
    stacked edge list.

    NOTE(review): a later cell of this notebook defines ``get_ddi_info`` again;
    once that cell has run, it shadows this definition.

    Parameters
    ----------
    name : str
        Split name used to build the CSV file name.
    description_df : pandas.DataFrame
        Must provide ``drug_index_1``, ``drug_index_2``, ``description``,
        ``label`` and ``generalized_description`` (plus
        ``paraphrased_descriptions`` when ``use_para`` is true).
    use_para : bool, default True
        If true, use the ``paraphrased_descriptions`` column as the edge text,
        otherwise ``generalized_description``.

    Returns
    -------
    tuple
        ``(full_descriptions, full_drug_descriptions, edgelist, labels,
        pos_neg, label_text)``; every array has three times as many rows as the
        split file. ``pos_neg`` is 1.0 for the positive third and 0.0 for the
        two corrupted thirds. ``label_text`` is 2-D with a single column.

    Raises
    ------
    ValueError
        If merging with ``description_df`` does not yield exactly one row per
        positive edge, in the original edge order.
    """
    if name == 'eval':
        # NOTE(review): 'suffle' is kept verbatim; presumably it matches the
        # file name on disk — confirm before "fixing" the spelling.
        edgelist_df_fname = f"polypharmacy_new/{data_source}/{split_method}/eval_suffle_df.csv"
    else:
        edgelist_df_fname = f"polypharmacy_new/{data_source}/{split_method}/{name}_df.csv"
    edge_df = pd.read_csv(os.path.join(DATA_DIR, edgelist_df_fname))

    pos_edgelist = edge_df[['head', 'tail']].values
    neg_edgelist1 = edge_df[['head', 'neg_tail']].values
    neg_edgelist2 = edge_df[['neg_head', 'tail']].values

    labels = edge_df['label_indexed'].values

    key_columns = ['drug_index_1', 'drug_index_2']
    tmp_df = pd.DataFrame(pos_edgelist, columns=key_columns)
    merged_df = tmp_df.merge(description_df, on=key_columns)

    # The default inner merge drops edges without a description and duplicates
    # edges whose pair occurs several times in description_df. Either case would
    # shift the texts relative to edgelist/labels, so fail loudly instead.
    if not np.array_equal(merged_df[key_columns].values, pos_edgelist):
        raise ValueError(
            f"description_df does not map one-to-one onto the '{name}' split: "
            f"{len(pos_edgelist)} positive edges but {len(merged_df)} merged rows "
            "(or a different key order)"
        )

    drug_descriptions = merged_df['description'].values
    if use_para:
        descriptions = merged_df['paraphrased_descriptions'].values
    else:
        descriptions = merged_df['generalized_description'].values
    label_text = merged_df[['label']].values

    # Positive block first, then the two corrupted blocks; the per-edge
    # attributes are tiled three times to follow the same layout.
    full_descriptions = np.concatenate([descriptions] * 3)
    full_drug_descriptions = np.concatenate([drug_descriptions] * 3)
    label_text = np.concatenate([label_text] * 3)

    edgelist = np.concatenate((pos_edgelist, neg_edgelist1, neg_edgelist2))
    labels = np.concatenate([labels] * 3)
    pos_neg = np.concatenate((
        np.ones(pos_edgelist.shape[0]),
        np.zeros(neg_edgelist1.shape[0]),
        np.zeros(neg_edgelist2.shape[0]),
    ))

    return full_descriptions, full_drug_descriptions, edgelist, labels, pos_neg, label_text

# Per-pair description table; get_ddi_info merges it onto each split on
# drug_index_1 / drug_index_2.
# NOTE(review): the file carries a .tsv extension but is parsed with read_csv's
# default delimiter (no sep= is passed) — confirm it is really comma-separated.
description_df = pd.read_csv(os.path.join(DATA_DIR, 'polypharmacy_new/DrugBank/drugbank_ddi_directed_final_cleaned.tsv'))

## Only DrugBank descriptions

In [None]:
def get_new_df(split, description_df):
    """Flatten one split into a DataFrame that uses the generalized descriptions.

    Calls ``get_ddi_info(split, description_df, use_para=False)`` and keeps the
    edge endpoints, the indexed label, the positive/negative flag, the edge
    text and the label text. The per-drug descriptions returned by
    ``get_ddi_info`` are not used.

    NOTE(review): a later cell of this notebook defines ``get_new_df`` again and
    shadows this version once run.
    """
    texts, _drug_texts, edges, label_ids, is_positive, label_names = get_ddi_info(
        split, description_df, use_para=False
    )
    columns = {
        'head': edges[:, 0],
        'tail': edges[:, 1],
        'labels': label_ids,
        'pos_neg': is_positive,
        'descriptions': texts,
        # label_names has a single column; take it as a flat vector
        'label_descriptions': label_names[:, 0],
    }
    return pd.DataFrame(columns)

In [27]:
# Build the three per-split tables (train, val, test — evaluated in that order).
train_df, val_df, test_df = (
    get_new_df(split_name, description_df)
    for split_name in ('train', 'val', 'test')
)

In [29]:
# Merge validation and test rows into one evaluation table and shuffle it.
# The fixed random_state makes the written CSV identical on every
# Restart & Run All; without it each run produced a different row order.
eval_df = (
    pd.concat([val_df, test_df], ignore_index=True)
    .sample(frac=1, random_state=42)
    .reset_index(drop=True)
)
# to_csv does not create missing parent directories.
os.makedirs('./label_dataset', exist_ok=True)
eval_df.to_csv('./label_dataset/eval_df.csv')

In [30]:
# Shuffle the training rows with a fixed seed so the written CSV is reproducible.
# NOTE: train_df is overwritten in place, so re-running only this cell shuffles
# the already-shuffled frame again; rebuild train_df first for a clean re-run.
train_df = train_df.sample(frac=1, random_state=42).reset_index(drop=True)
# to_csv does not create missing parent directories.
os.makedirs('./label_dataset', exist_ok=True)
train_df.to_csv('./label_dataset/train_df.csv')

## Paraphrased descriptions

In [3]:
def replace_descriptions(description_df, paraphrase_file, how='rephrase'):
    """Map every unique generalized description to its first generated text.

    ``paraphrase_file`` is a JSON-lines file. Each non-blank line must decode to
    a two-element list: element 0 is a dict holding ``idx`` and ``request``,
    element 1 is the response. ``idx`` is a position in
    ``description_df['generalized_description'].unique()``. A record is kept
    only if the content of the request's second message contains ``how``.

    Parameters
    ----------
    description_df : pandas.DataFrame
        Needs a ``generalized_description`` column.
    paraphrase_file : str
        Path to the JSON-lines results file.
    how : str, default 'rephrase'
        Substring that must occur in the prompt for the record to be used.

    Returns
    -------
    dict
        ``{generalized_description: content of the choice with index 0}``.
        When several kept records share an ``idx``, the last one wins.

    Raises
    ------
    json.JSONDecodeError
        A non-blank line is not valid JSON (line number and text are printed).
    KeyError
        A record lacks an expected key, or a kept record has no choice with
        index 0.

    Notes
    -----
    Records that raise ``TypeError`` while being unpacked (for example when the
    response is not a mapping) are printed and skipped.
    """
    unique_ddis = description_df['generalized_description'].unique()
    data = []

    RESULTS_FILENAMES = [paraphrase_file]

    for result_filename in RESULTS_FILENAMES:
        # Explicit encoding: do not depend on the platform's locale default.
        with open(result_filename, "r", encoding="utf-8") as f:
            for line_num, line in enumerate(f):
                if not line.strip():
                    # Blank lines (e.g. a trailing newline) carry no record.
                    continue

                try:
                    json_object = json.loads(line)
                except json.JSONDecodeError:
                    print(line_num)
                    print(line)
                    raise

                # Reset per line so the handlers below never print values left
                # over from a previous line, and never hit an unbound name when
                # the unpacking statement itself is what fails.
                request = response = None

                try:
                    # Note: this is how the openai_api_request_parallel_processor.py has been amended to save results (along with the request and idx)
                    idx, request, response = json_object[0]['idx'], json_object[0]['request'], json_object[1]
                    prompt = request['messages'][1]['content']

                    if how not in prompt:
                        continue

                    row = {
                        'id': unique_ddis[idx]
                    }

                    for choice in response['choices']:
                        i = choice['index']

                        response_text = choice['message']['content']
                        row[f"text_{i}"] = response_text

                    data.append(row)

                except TypeError:
                    # Best effort: report the malformed record and move on.
                    print(line_num)
                    print(request)
                    print(response)

                except KeyError:
                    print(line_num)
                    print(request)
                    print(response)
                    raise
    return {d['id']: d['text_0'] for d in data}

In [None]:
# Reload the description table, then look up the generated text for every row
# through its generalized description.
description_df = pd.read_csv(os.path.join(DATA_DIR, 'polypharmacy_new/DrugBank/drugbank_ddi_directed_final_cleaned.tsv'))

paraphrase_file = './api_requests_results_multi_new.jsonl'
replace_dict = replace_descriptions(description_df, paraphrase_file, how='extend')

# Plain indexing (not .get): a description without a generated text raises KeyError.
paraphrased_descriptions = [
    replace_dict[description]
    for description in description_df['generalized_description']
]

In [None]:
def delete_before_word(s, word):
    """Drop everything in ``s`` that precedes the first occurrence of ``word``.

    The result starts with ``word`` itself. If ``word`` does not occur in ``s``,
    ``s`` is returned unchanged.
    """
    if word not in s:
        return s
    # maxsplit=1 yields exactly two pieces: the text before and after the first match
    _, remainder = s.split(word, 1)
    return word + remainder

def delete_after_word(text, word):
    """Truncate ``text`` right after the first occurrence of ``word``.

    ``word`` itself is kept at the end of the result. If ``word`` does not occur,
    ``text`` is returned unchanged.
    """
    head, found, _tail = text.partition(word)
    # partition reports a missing separator as an empty middle element
    return head + found if found else text
    
def process_text(text):
    """Cut a dict literal out of a raw model reply and evaluate it.

    Steps: keep the text from the first ``"medical`` onwards, cut it right after
    the first ``}``, prepend ``{``, turn right curly quotes (”) into straight
    double quotes, and evaluate the result as a Python expression.
    """
    trimmed = delete_after_word(delete_before_word(text, '"medical'), '}')
    literal = ("{" + trimmed).replace('”', '"')
    # NOTE(review): eval() executes arbitrary Python, and the input here is
    # model-generated text. Consider ast.literal_eval or json.loads — not swapped
    # in here because that would change which exceptions malformed replies raise.
    return eval(literal)

In [8]:
# Parse every raw model reply into a dict. The list is overwritten in place, so
# entries that are already dicts are passed through unchanged: re-running this
# cell no longer feeds parsed dicts back into process_text (which raised).
paraphrased_descriptions = [
    reply if isinstance(reply, dict) else process_text(reply)
    for reply in paraphrased_descriptions
]

In [9]:
def list_of_dicts_to_dict_of_lists(list_of_dicts):
    """Transpose a sequence of dicts into one dict of lists.

    For every key seen in any input dict, the result holds the list of values
    found under that key, in input order. Keys appear in first-seen order. A
    dict that lacks a key contributes nothing to that key's list, so lists can
    differ in length when the inputs do not share the same keys.
    """
    columns = {}
    for record in list_of_dicts:
        for field, item in record.items():
            # setdefault creates the list on first sight of a key
            columns.setdefault(field, []).append(item)
    return columns

In [10]:
# Regroup the per-row dicts by key: one list of values per key, in row order.
paraphrased_descriptions_dict = list_of_dicts_to_dict_of_lists(paraphrased_descriptions)

In [11]:
# One DataFrame column per key of paraphrased_descriptions_dict.
paraphrased_descriptions_df = pd.DataFrame.from_dict(paraphrased_descriptions_dict)

In [12]:
# Attach the paraphrase columns to the description table.
# concat(axis=1) aligns rows on the index and pads with NaN where one side has
# no row, so a length mismatch would otherwise go unnoticed.
if len(description_df) != len(paraphrased_descriptions_df):
    raise ValueError(
        f"description_df has {len(description_df)} rows but "
        f"paraphrased_descriptions_df has {len(paraphrased_descriptions_df)}"
    )
# description_df is overwritten in place: drop any paraphrase columns attached
# by a previous run of this cell so re-running does not create duplicate columns.
description_df = pd.concat(
    [
        description_df.drop(columns=paraphrased_descriptions_df.columns, errors='ignore'),
        paraphrased_descriptions_df,
    ],
    axis=1,
)

In [None]:
def get_ddi_info(name, description_df, use_para=True):
    """Load one split's edge list and attach its (multi-variant) descriptions.

    Reads ``{name}_df.csv`` (``eval_suffle_df.csv`` when ``name == 'eval'``) from
    ``DATA_DIR/polypharmacy_new/{data_source}/{split_method}/``. The positive
    edges (``head``/``tail``) are stacked with the two corrupted variants stored
    in the same rows (``head``/``neg_tail`` and ``neg_head``/``tail``); labels and
    texts of the positive edges are repeated three times to line up with that
    stacked edge list.

    NOTE(review): this redefines the ``get_ddi_info`` from an earlier cell of
    this notebook; the two differ only in the ``use_para`` branch.

    Parameters
    ----------
    name : str
        Split name used to build the CSV file name.
    description_df : pandas.DataFrame
        Must provide ``drug_index_1``, ``drug_index_2``, ``description``,
        ``label`` and ``generalized_description``; with ``use_para`` also the
        nine ``medical_doctor_*`` / ``pharmacologist_*`` / ``toxicologist_*``
        columns (suffixes 1-3).
    use_para : bool, default True
        If true, the edge text is a 2-D array with ten columns (the generalized
        description followed by the nine paraphrase columns); otherwise a 1-D
        array of generalized descriptions.

    Returns
    -------
    tuple
        ``(full_descriptions, full_drug_descriptions, edgelist, labels,
        pos_neg, label_text)``; every array has three times as many rows as the
        split file. ``pos_neg`` is 1.0 for the positive third and 0.0 for the
        two corrupted thirds. ``label_text`` is 2-D with a single column.

    Raises
    ------
    ValueError
        If merging with ``description_df`` does not yield exactly one row per
        positive edge, in the original edge order.
    """
    if name == 'eval':
        # NOTE(review): 'suffle' is kept verbatim; presumably it matches the
        # file name on disk — confirm before "fixing" the spelling.
        edgelist_df_fname = f"polypharmacy_new/{data_source}/{split_method}/eval_suffle_df.csv"
    else:
        edgelist_df_fname = f"polypharmacy_new/{data_source}/{split_method}/{name}_df.csv"
    edge_df = pd.read_csv(os.path.join(DATA_DIR, edgelist_df_fname))

    pos_edgelist = edge_df[['head', 'tail']].values
    neg_edgelist1 = edge_df[['head', 'neg_tail']].values
    neg_edgelist2 = edge_df[['neg_head', 'tail']].values

    labels = edge_df['label_indexed'].values

    key_columns = ['drug_index_1', 'drug_index_2']
    tmp_df = pd.DataFrame(pos_edgelist, columns=key_columns)
    merged_df = tmp_df.merge(description_df, on=key_columns)

    # The default inner merge drops edges without a description and duplicates
    # edges whose pair occurs several times in description_df. Either case would
    # shift the texts relative to edgelist/labels, so fail loudly instead.
    if not np.array_equal(merged_df[key_columns].values, pos_edgelist):
        raise ValueError(
            f"description_df does not map one-to-one onto the '{name}' split: "
            f"{len(pos_edgelist)} positive edges but {len(merged_df)} merged rows "
            "(or a different key order)"
        )

    drug_descriptions = merged_df['description'].values

    if use_para:
        # Column 0 is the original text; columns 1-9 are the paraphrases.
        descriptions = merged_df[['generalized_description',
                                  'medical_doctor_1', 'medical_doctor_2', 'medical_doctor_3',
                                  'pharmacologist_1', 'pharmacologist_2', 'pharmacologist_3',
                                  'toxicologist_1', 'toxicologist_2', 'toxicologist_3']].values
    else:
        descriptions = merged_df['generalized_description'].values

    label_text = merged_df[['label']].values

    # Positive block first, then the two corrupted blocks; the per-edge
    # attributes are tiled three times to follow the same layout.
    full_descriptions = np.concatenate([descriptions] * 3)
    full_drug_descriptions = np.concatenate([drug_descriptions] * 3)
    label_text = np.concatenate([label_text] * 3)

    edgelist = np.concatenate((pos_edgelist, neg_edgelist1, neg_edgelist2))
    labels = np.concatenate([labels] * 3)
    pos_neg = np.concatenate((
        np.ones(pos_edgelist.shape[0]),
        np.zeros(neg_edgelist1.shape[0]),
        np.zeros(neg_edgelist2.shape[0]),
    ))

    return full_descriptions, full_drug_descriptions, edgelist, labels, pos_neg, label_text

In [None]:
def get_new_df(split, description_df):
    """Flatten one split into a DataFrame with ten description variants per edge.

    Calls ``get_ddi_info(split, description_df, use_para=True)`` and spreads the
    ten text columns it returns over ``descriptions_0`` ... ``descriptions_9``.
    The per-drug descriptions returned by ``get_ddi_info`` are not used.

    NOTE(review): this redefines the ``get_new_df`` from an earlier cell of this
    notebook.
    """
    texts, _drug_texts, edges, label_ids, is_positive, label_names = get_ddi_info(
        split, description_df, use_para=True
    )
    columns = {
        'head': edges[:, 0],
        'tail': edges[:, 1],
        'labels': label_ids,
        'pos_neg': is_positive,
        'descriptions_0': texts[:, 0],
        # label_names has a single column; take it as a flat vector
        'label_descriptions': label_names[:, 0],
    }
    # descriptions_1 .. descriptions_9 come after label_descriptions, which
    # keeps the column order of the written CSV unchanged.
    for variant in range(1, 10):
        columns[f'descriptions_{variant}'] = texts[:, variant]
    return pd.DataFrame(columns)

In [21]:
# Build the three per-split tables (train, val, test — evaluated in that order).
train_df, val_df, test_df = (
    get_new_df(split_name, description_df)
    for split_name in ('train', 'val', 'test')
)

In [23]:
# Merge validation and test rows into one evaluation table and shuffle it.
# The fixed random_state makes the written CSV identical on every
# Restart & Run All; without it each run produced a different row order.
eval_df = (
    pd.concat([val_df, test_df], ignore_index=True)
    .sample(frac=1, random_state=42)
    .reset_index(drop=True)
)
# to_csv does not create missing parent directories.
os.makedirs('./paraphrased_dataset_new', exist_ok=True)
eval_df.to_csv('./paraphrased_dataset_new/eval_df.csv')

In [24]:
# Shuffle the training rows with a fixed seed so the written CSV is reproducible.
# NOTE: train_df is overwritten in place, so re-running only this cell shuffles
# the already-shuffled frame again; rebuild train_df first for a clean re-run.
train_df = train_df.sample(frac=1, random_state=42).reset_index(drop=True)
# to_csv does not create missing parent directories.
os.makedirs('./paraphrased_dataset_new', exist_ok=True)
train_df.to_csv('./paraphrased_dataset_new/train_df.csv')