This notebook prepares three matrices (X0, X1 and X2, listed after the setup steps below) from the ESP dataset for use by NCMF.

Before running the notebook, download `ESP_data_original.tar.bz2` from https://drive.google.com/file/d/1r_SJJRoQQ-K12IooYunDoj2DgaKQgqJe/view?usp=sharing, place it in the current directory, and extract it:

```
bzip2 -d ESP_data_original.tar.bz2
tar -xvf ESP_data_original.tar
```
After extraction, a folder called `polypharmacy` must be available in the current directory.

- **X0** — drug × (drug, side effect) matrix built from `decagon_split_train_ddi.tsv`; shape 645 × (645 · 963).

- **X1** — drug × protein matrix built from `decagon_split_train_drug_protein.tsv`; shape 645 × 22583.

- **X2** — protein × protein matrix built from `decagon_split_train_ppi.tsv`; shape 22583 × 22583.



```
            drug_se      protein
           ++++++++      ++++++++
           +      +      +      +
    drug   +  X0  +      +  X1  +
           +      +      +      +
           ++++++++      ++++++++
           
                         ++++++++
                         +      +
   protein               +  X2  +  
                         +      +
                         ++++++++
```

In [2]:
import pandas as pd
import numpy as np

In [3]:
import warnings
warnings.filterwarnings('ignore')

In [24]:
import os
import os.path

In [4]:
esp_data_folder = "./polypharmacy/"

In [5]:
# Training protein-protein interaction triples: (proteinA, predicate, proteinB).
ppi_path = f"{esp_data_folder}decagon_split_train_ppi.tsv"
ppi_df = pd.read_csv(ppi_path, sep="\t", header=None)
ppi_df.columns = ["proteinA", "predicate", "proteinB"]
# Expose both endpoints under the generic subject/object names used by the matrix-filling cells.
for alias, source_col in (("subject", "proteinA"), ("object", "proteinB")):
    ppi_df[alias] = ppi_df[source_col]
print(ppi_df.shape)
ppi_df.head()

(2289960, 5)


Unnamed: 0,proteinA,predicate,proteinB,subject,object
0,114787,INTERACTS_WITH,375519,114787,375519
1,114787,INTERACTS_WITH,285613,114787,285613
2,114787,INTERACTS_WITH,7448,114787,7448
3,114787,INTERACTS_WITH,4914,114787,4914
4,114787,INTERACTS_WITH,51343,114787,51343


In [6]:
# Training drug-protein triples. The file has no header row, so the column names are set here.
dpi_path = f"{esp_data_folder}decagon_split_train_drug_protein.tsv"
dpi_df = pd.read_csv(dpi_path, sep="\t", header=None)
dpi_df.columns = ["drug", "predicate", "protein"]
dpi_df.head()

Unnamed: 0,drug,predicate,protein
0,CID000003345,TARGETS,3757
1,CID000003345,TARGETS,2850
2,CID000003345,TARGETS,4157
3,CID000003345,TARGETS,1902
4,CID000003345,TARGETS,2925


In [7]:
# Training drug-drug interaction triples: (drugA, sideeffect, drugB).
ddi_path = f"{esp_data_folder}decagon_split_train_ddi.tsv"
ddi_df = pd.read_csv(ddi_path, sep="\t", header=None)
ddi_df.columns = ["drugA", "sideeffect", "drugB"]
print(ddi_df.shape)
ddi_df.head()

(7323790, 3)


Unnamed: 0,drugA,sideeffect,drugB
0,CID000002173,C0004144,CID000005651
1,CID000002173,C0004144,CID000003440
2,CID000002173,C0004144,CID000003016
3,CID000002173,C0004144,CID000005538
4,CID000002173,C0004144,CID000004159


In [8]:
# Positive test triples; every row gets label 1.
ddi_test_pos_path = f"{esp_data_folder}decagon_split_test_ddi_pos.tsv"
ddi_test_pos_df = pd.read_csv(ddi_test_pos_path, sep="\t", header=None)
ddi_test_pos_df.columns = ["drugA", "sideeffect", "drugB"]
ddi_test_pos_df["label"] = 1
print(ddi_test_pos_df.shape)
ddi_test_pos_df.head()

(914392, 4)


Unnamed: 0,drugA,sideeffect,drugB,label
0,CID000002244,C0004144,CID000003365,1
1,CID000003440,C0004144,CID000004691,1
2,CID000003957,C0004144,CID000004075,1
3,CID000004168,C0004144,CID000005090,1
4,CID000001117,C0004144,CID000077993,1


In [9]:
# Negative test triples; every row gets label 0.
ddi_test_neg_path = f"{esp_data_folder}decagon_split_test_ddi_neg.tsv"
ddi_test_neg_df = pd.read_csv(ddi_test_neg_path, sep="\t", header=None)
ddi_test_neg_df.columns = ["drugA", "sideeffect", "drugB"]
ddi_test_neg_df["label"] = 0
print(ddi_test_neg_df.shape)
ddi_test_neg_df.head()

(914392, 4)


Unnamed: 0,drugA,sideeffect,drugB,label
0,CID000057469,C0004144,CID000004607,0
1,CID000005466,C0004144,CID000153941,0
2,CID009571074,C0004144,CID000093860,0
3,CID000001206,C0004144,CID000060795,0
4,CID000004675,C0004144,CID004659569,0


In [10]:
# Positive and negative test triples in one frame; the "label" column tells them apart.
# ignore_index=True gives every row a unique 0..n-1 label. A plain concat would keep the
# row labels of both inputs, so each label would occur twice.
ddi_test_df = pd.concat([ddi_test_pos_df, ddi_test_neg_df], ignore_index=True)
print(ddi_test_df.shape)
ddi_test_df.head()

(1828784, 4)


Unnamed: 0,drugA,sideeffect,drugB,label
0,CID000002244,C0004144,CID000003365,1
1,CID000003440,C0004144,CID000004691,1
2,CID000003957,C0004144,CID000004075,1
3,CID000004168,C0004144,CID000005090,1
4,CID000001117,C0004144,CID000077993,1


In [11]:
drug_names_df = pd.read_csv(f"{esp_data_folder}drug_names.csv", header=None)
drug_names_df.columns = ["drug_ID", "drug_name"]
# Normalise display names: spaces become underscores, everything lower case.
drug_names_df["drug_name"] = [
    raw_name.replace(" ", "_").lower() for raw_name in drug_names_df["drug_name"]
]
# Lookup table: drug ID -> normalised drug name.
drug_names = drug_names_df.set_index("drug_ID")["drug_name"].to_dict()
drug_names_df.head()

Unnamed: 0,drug_ID,drug_name
0,CID000000085,carnitine
1,CID000000119,gaba
2,CID000000143,leucovorin
3,CID000000158,pge2
4,CID000000159,prostacyclin


In [12]:
sideeffects_df = pd.read_csv(f"{esp_data_folder}side_effect_names.tsv", sep="\t", header=None)
sideeffects_df.columns = ["sideeffect_ID", "sideeffect_name"]
# Normalise display names: spaces become underscores, everything upper case.
sideeffects_df["sideeffect_name"] = [
    raw_name.replace(" ", "_").upper() for raw_name in sideeffects_df["sideeffect_name"]
]
# Lookup table: side-effect ID -> normalised side-effect name.
side_effect_names = sideeffects_df.set_index("sideeffect_ID")["sideeffect_name"].to_dict()
sideeffects_df.head()

Unnamed: 0,sideeffect_ID,sideeffect_name
0,C0151714,HYPERMAGNESEMIA
1,C0035344,RETINOPATHY_OF_PREMATURITY
2,C0004144,ATELECTASIS
3,C0002063,ALKALOSIS
4,C0004604,BACK_ACHE


In [13]:
def resolve_drug_id(drug_id):
    """Return the readable name for ``drug_id``, or ``drug_id`` itself when no name is known.

    Looks the ID up in the module-level ``drug_names`` dict.
    """
    return drug_names.get(drug_id, drug_id)


def resolve_side_effect_id(side_effect_id):
    """Return the readable name for a side-effect ID.

    A trailing ``"-2"`` on the ID is stripped before the lookup, so the
    suffixed and unsuffixed forms resolve to the same name.

    Raises
    ------
    KeyError
        If the (suffix-stripped) ID is not in the module-level ``side_effect_names`` dict.
    """
    if side_effect_id.endswith("-2"):
        side_effect_id = side_effect_id[:-2]
    return side_effect_names[side_effect_id]


def resolve_names_ddi_df(df):
    """Add readable ``subject`` / ``predicate`` / ``object`` columns to a DDI frame, in place.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``drugA``, ``sideeffect`` and ``drugB``.

    Returns
    -------
    None
        ``df`` is modified in place.
    """
    df["subject"] = df["drugA"].apply(resolve_drug_id)
    df["predicate"] = df["sideeffect"].apply(resolve_side_effect_id)
    df["object"] = df["drugB"].apply(resolve_drug_id)
    

In [14]:
resolve_names_ddi_df(ddi_df)
ddi_df.head()

Unnamed: 0,drugA,sideeffect,drugB,subject,predicate,object
0,CID000002173,C0004144,CID000005651,ampicillin,ATELECTASIS,vancomycin
1,CID000002173,C0004144,CID000003440,ampicillin,ATELECTASIS,furosemide
2,CID000002173,C0004144,CID000003016,ampicillin,ATELECTASIS,diazepam
3,CID000002173,C0004144,CID000005538,ampicillin,ATELECTASIS,retinoic_acid
4,CID000002173,C0004144,CID000004159,ampicillin,ATELECTASIS,methylprednisolone


In [15]:
resolve_names_ddi_df(ddi_test_df)
ddi_test_df.head()

Unnamed: 0,drugA,sideeffect,drugB,label,subject,predicate,object
0,CID000002244,C0004144,CID000003365,1,aspirin,ATELECTASIS,fluconazole
1,CID000003440,C0004144,CID000004691,1,furosemide,ATELECTASIS,paroxetine
2,CID000003957,C0004144,CID000004075,1,loratadine,ATELECTASIS,5-aminosalicylic_acid
3,CID000004168,C0004144,CID000005090,1,metoclopramide,ATELECTASIS,rofecoxib
4,CID000001117,C0004144,CID000077993,1,sulfate,ATELECTASIS,eletriptan


In [16]:
print(len(set(ddi_test_df.subject) | set(ddi_test_df.object)))
print(len(set(ddi_test_df.predicate)))

645
963


In [17]:
def resolve_drug_dpi(row):
    """Return ``(subject, object)`` = (drug name, protein ID) for one drug-protein row.

    Parameters
    ----------
    row : mapping with "drug", "predicate" and "protein" entries
        For ``"TARGETS"`` rows the drug ID is in ``row["drug"]`` and the protein
        ID in ``row["protein"]``. For ``"TARGETS-2"`` rows the two are read the
        other way round: drug ID from ``row["protein"]``, protein ID from ``row["drug"]``.

    Returns
    -------
    tuple
        The drug name from the module-level ``drug_names`` dict (the raw drug ID
        when no name is known) and the protein ID as found in the row.

    Raises
    ------
    ValueError
        If the predicate is neither ``"TARGETS"`` nor ``"TARGETS-2"``. Previously
        this case failed with an UnboundLocalError at the return statement.
    """
    predicate = row["predicate"]
    if predicate == "TARGETS":
        drug_id, protein_id = row["drug"], row["protein"]
    elif predicate == "TARGETS-2":
        drug_id, protein_id = row["protein"], row["drug"]
    else:
        raise ValueError(f"Unexpected drug-protein predicate: {predicate!r}")
    return drug_names.get(drug_id, drug_id), protein_id

In [18]:
# Row-wise resolution: resolve_drug_dpi returns a (subject, object) pair per row and
# result_type="expand" spreads that pair over the two new columns.
dpi_df[["subject", "object"]] = dpi_df.apply(resolve_drug_dpi, axis = 1, result_type="expand")
# "object" is not a plain copy of the "protein" column: for TARGETS-2 rows resolve_drug_dpi
# takes the protein ID from the "drug" column instead.
print(dpi_df.shape)
dpi_df.head()

(29756, 5)


Unnamed: 0,drug,predicate,protein,subject,object
0,CID000003345,TARGETS,3757,fentanyl,3757
1,CID000003345,TARGETS,2850,fentanyl,2850
2,CID000003345,TARGETS,4157,fentanyl,4157
3,CID000003345,TARGETS,1902,fentanyl,1902
4,CID000003345,TARGETS,2925,fentanyl,2925


In [19]:
dpi_df.tail()

Unnamed: 0,drug,predicate,protein,subject,object
29751,9283,TARGETS-2,CID000004011,maprotiline,9283
29752,2652,TARGETS-2,CID000004011,maprotiline,2652
29753,118442,TARGETS-2,CID000004011,maprotiline,118442
29754,64805,TARGETS-2,CID000002182,anagrelide,64805
29755,1586,TARGETS-2,CID000005482,tioconazole,1586


In [20]:
# Exploring the single-drug side-effect dataset from SNAP.
# Read through esp_data_folder, like the loading cells above, so the data location
# is configured in one place.
single_se_df = pd.read_csv(esp_data_folder + "bio-decagon-mono.csv")
single_se_df.head()

Unnamed: 0,STITCH,Individual Side Effect,Side Effect Name
0,CID003062316,C1096328,central nervous system mass
1,CID003062316,C0162830,Photosensitivity reaction
2,CID003062316,C1611725,leukaemic infiltration brain
3,CID003062316,C0541767,platelet adhesiveness abnormal
4,CID003062316,C0242973,Ventricular dysfunction


In [21]:
(set(ddi_df.sideeffect)) - (set(sideeffects_df.sideeffect_ID) & set(single_se_df["Individual Side Effect"]))

{'C0024117',
 'C0595939',
 'C0014869-2',
 'C0221776',
 'C0002884',
 'C0035854-2',
 'C0497156',
 'C0020440-2',
 'C0003962-2',
 'C0221245',
 'C0836924',
 'C0023467',
 'C0001546',
 'C0018801-2',
 'C0700200',
 'C0085662',
 'C0241353',
 'C0392525',
 'C0575081-2',
 'C0032290',
 'C0025061-2',
 'C1510431-2',
 'C0156404-2',
 'C0024205',
 'C0152025-2',
 'C0030319-2',
 'C0016199',
 'C0973461',
 'C0001122-2',
 'C0699791-2',
 'C1442903-2',
 'C0037287',
 'C0035258-2',
 'C0015544-2',
 'C0010055',
 'C0042963',
 'C0221264',
 'C0040997',
 'C0018524',
 'C0010032-2',
 'C0016242-2',
 'C0014130-2',
 'C0039503-2',
 'C0038220',
 'C0024586',
 'C0020490-2',
 'C0009421-2',
 'C0019360-2',
 'C0023418',
 'C0029925-2',
 'C0152031',
 'C0019284',
 'C0003611',
 'C0085166',
 'C0151611',
 'C0005687',
 'C0011603-2',
 'C0019189-2',
 'C0080194',
 'C0206062-2',
 'C0151966-2',
 'C0476427-2',
 'C0151825-2',
 'C0008767',
 'C0042514',
 'C0020505-2',
 'C0012833',
 'C0007876-2',
 'C0190211-2',
 'C0041909-2',
 'C0024902',
 'C000581

#### Helper files creation

In [29]:
se_mapping_path = esp_data_folder + "se_963.csv"
if os.path.isfile(se_mapping_path):
    # Reuse the saved ordering so side-effect indices match previously written files.
    print("Reading existing file")
    sideeffects = list(pd.read_csv(se_mapping_path)["sideeffect"])
else:
    # A sorted list, not a set: the position in this sequence becomes the side-effect
    # index further down, and the iteration order of a set of strings is not stable
    # from one kernel session to the next.
    sideeffects = sorted(ddi_df.predicate.unique())
sideeffect_count = len(sideeffects)
print("Side effect count =" ,sideeffect_count)

Reading existing file
Side effect count = 963


In [26]:
drug_mapping_path = esp_data_folder + "drug_ID_mapping.csv"
protein_mapping_path = esp_data_folder + "protein_ID_mapping.csv"
# Both files are read in the first branch, so both must exist before taking it.
if os.path.isfile(drug_mapping_path) and os.path.isfile(protein_mapping_path):
    # Reuse the saved ordering so drug/protein indices match previously written files.
    print("Reading existing file")
    set_of_drugs = list(pd.read_csv(drug_mapping_path)["drug"])
    set_of_proteins = list(pd.read_csv(protein_mapping_path)["protein"])
else:
    # Sorted lists, not sets: the position in each sequence becomes the entity index
    # further down, so the order has to be reproducible.
    set_of_drugs = sorted(
        set(ddi_df.subject.unique()) | set(ddi_df.object.unique()) | set(dpi_df.subject.unique())
    )
    protein_ids = set(ppi_df.subject.unique()) | set(ppi_df.object.unique()) | set(dpi_df.object.unique())
    # Normalise to int before de-duplicating. NOTE(review): fill_x1 applies int(...) to
    # dpi_df["object"], which suggests those IDs can arrive as strings — confirm.
    set_of_proteins = sorted({int(protein_id) for protein_id in protein_ids})
drug_count = len(set_of_drugs)
protein_count = len(set_of_proteins)
print(f"Drug count = {drug_count}")
print(f"Protein count = {protein_count}")

Reading existing file
Drug count = 645
Protein count = 22583


In [27]:
# Protein ID -> matrix index mapping; idx follows the order of set_of_proteins.
proteins_df = pd.DataFrame(set_of_proteins, columns = ["protein"], dtype=int)
proteins_df["idx"] = proteins_df.index
print(proteins_df.shape)
proteins_df.head()
# Uncomment to persist the mapping at the location the loading cell above reads from:
# proteins_df.to_csv(esp_data_folder + "protein_ID_mapping.csv", index=False)

(22583, 2)


Unnamed: 0,protein,idx
0,1,0
1,2,1
2,131076,2
3,9,3
4,10,4


In [28]:
# Drug name -> matrix row index mapping; idx follows the order of set_of_drugs.
drugs_df = pd.DataFrame(set_of_drugs, columns = ["drug"])
drugs_df["idx"] = drugs_df.index
print(drugs_df.shape)
drugs_df.head()
# Uncomment to persist the mapping at the location the loading cell above checks for:
# drugs_df.to_csv(esp_data_folder + "drug_ID_mapping.csv", index=False)

(645, 2)


Unnamed: 0,drug,idx
0,clobetasol,0
1,ibandronate,1
2,minoxidil,2
3,cefuroxime,3
4,chloramphenicol,4


In [30]:
# Side-effect name -> index mapping; idx follows the order of sideeffects.
sideeffect_df = pd.DataFrame(sideeffects, columns = ["sideeffect"])
sideeffect_df["idx"] = sideeffect_df.index
print(sideeffect_df.shape)
sideeffect_df.head()
# Uncomment to persist the mapping at the location the loading cell above checks for:
# sideeffect_df.to_csv(esp_data_folder + "se_963.csv", index=False)

(963, 2)


Unnamed: 0,sideeffect,idx
0,LOWER_GI_BLEEDING,0
1,OSTEOARTHRITIS,1
2,ALLERGIES,2
3,SEPTIC_SHOCK,3
4,ACUTE_KIDNEY_FAILURE,4


In [31]:
# Cross join via a constant helper key: every drug is paired with every side effect.
for mapping_frame in (drugs_df, sideeffect_df):
    mapping_frame["key"] = 0
drug_se_df = drugs_df.merge(sideeffect_df, on="key", how="outer", suffixes=("_drug", "_se"))
drug_se_df = drug_se_df.drop(columns="key")

In [33]:
drug_se_df.head()

Unnamed: 0,drug,idx_drug,sideeffect,idx_se
0,clobetasol,0,LOWER_GI_BLEEDING,0
1,clobetasol,0,OSTEOARTHRITIS,1
2,clobetasol,0,ALLERGIES,2
3,clobetasol,0,SEPTIC_SHOCK,3
4,clobetasol,0,ACUTE_KIDNEY_FAILURE,4


In [34]:
# Label every (drug, side effect) combination twice: by names and by the two entity indices.
drug_se_df["drug_se_pair"] = list(zip(drug_se_df["drug"], drug_se_df["sideeffect"]))
drug_se_df["drug_se_pair_idx"] = list(zip(drug_se_df["idx_drug"], drug_se_df["idx_se"]))
# Position of the pair in the flat list of X0 columns.
drug_se_df["idx"] = drug_se_df.index
# drug_se_df.to_csv(esp_data_folder + "drug_se_ID_mapping.csv", index=False)
drug_se_df.head()

Unnamed: 0,drug,idx_drug,sideeffect,idx_se,drug_se_pair,drug_se_pair_idx,idx
0,clobetasol,0,LOWER_GI_BLEEDING,0,"(clobetasol, LOWER_GI_BLEEDING)","(0, 0)",0
1,clobetasol,0,OSTEOARTHRITIS,1,"(clobetasol, OSTEOARTHRITIS)","(0, 1)",1
2,clobetasol,0,ALLERGIES,2,"(clobetasol, ALLERGIES)","(0, 2)",2
3,clobetasol,0,SEPTIC_SHOCK,3,"(clobetasol, SEPTIC_SHOCK)","(0, 3)",3
4,clobetasol,0,ACUTE_KIDNEY_FAILURE,4,"(clobetasol, ACUTE_KIDNEY_FAILURE)","(0, 4)",4


In [35]:
# One flat list of all entities, in the order: drugs, (drug, side effect) pairs, proteins.
entity_names = [*drugs_df.drug, *drug_se_df.drug_se_pair, *proteins_df.protein]
entity_df = pd.DataFrame(columns = ["entity_name"])
entity_df["entity_name"] = entity_names
print(entity_df.shape)
entity_df.head()

(644363, 1)


Unnamed: 0,entity_name
0,clobetasol
1,ibandronate
2,minoxidil
3,cefuroxime
4,chloramphenicol


In [36]:
entity_df["entity_ID"] = entity_df.index
# entity_df.to_csv("/data/ajayago/dataset/polypharmacy/entity_df.csv")

#### Matrix creation

##### X0 -- drug x drug_se pair matrix

In [37]:
# X0 (training): one row per drug, one column per (drug, side effect) pair, initialised to 0.
# NOTE(review): this is a dense float64 array of drug_count x len(drug_se_df) cells
# (645 x 621,135 with the counts printed above, roughly 3.2 GB) — confirm memory headroom.
# NOTE(review): the column labels passed here are tuple objects, whereas fill_x0 looks
# columns up by str(tuple) — confirm that the two agree.
X0_df = pd.DataFrame(np.zeros((drug_count, drug_se_df.shape[0])), columns = list(drug_se_df.drug_se_pair))
X0_df.index = list(drugs_df.drug)
X0_df.head()

Unnamed: 0,"(clobetasol, LOWER_GI_BLEEDING)","(clobetasol, OSTEOARTHRITIS)","(clobetasol, ALLERGIES)","(clobetasol, SEPTIC_SHOCK)","(clobetasol, ACUTE_KIDNEY_FAILURE)","(clobetasol, MOUTH_PAIN)","(clobetasol, PLEURAL_FIBROSIS)","(clobetasol, POSTHERPETIC_NEURALGIA)","(clobetasol, ADENOCARCINOMA)","(clobetasol, ERYSIPELAS)",...,"(methocarbamol, STRABISMUS)","(methocarbamol, AUTONOMIC_INSTABILITY)","(methocarbamol, EYE_SWELLING)","(methocarbamol, COLLAGEN_DISEASE)","(methocarbamol, ASPIRATION_PNEUMONIA)","(methocarbamol, ESOPHAGEAL_CANCER)","(methocarbamol, KERATITIS_SICCA)","(methocarbamol, POLYMYOSITIS)","(methocarbamol, DYSMENORRHEA)","(methocarbamol, HYPERALIMENTATION)"
clobetasol,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
ibandronate,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
minoxidil,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
cefuroxime,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
chloramphenicol,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [40]:
X0_df.loc["clobetasol"][('clobetasol', 'LOWER_GI_BLEEDING')]

0.0

In [41]:
def fill_x0(row):
    """Mark one training drug-drug interaction in the global ``X0_df``, in both directions.

    For a triple (subject, predicate, object) two cells are set to 1:
    row ``subject`` / column ``str((object, predicate))`` and
    row ``object`` / column ``str((subject, predicate))``.

    Parameters
    ----------
    row : mapping with "subject", "predicate" and "object" entries
        One row of the name-resolved DDI frame.

    Notes
    -----
    Each write is a single ``.at[row_label, column_label]`` call. The earlier
    ``X0_df.loc[r][c] = 1`` form is chained assignment: it writes into an
    intermediate Series, and whether that reaches ``X0_df`` depends on the
    pandas version (it never does under Copy-on-Write).

    NOTE(review): the cell that creates ``X0_df`` passes tuple objects as column
    labels, while this function looks columns up by their string form —
    confirm which labels are actually in use.
    """
    subject, predicate, obj = row["subject"], row["predicate"], row["object"]
    X0_df.at[subject, str((obj, predicate))] = 1
    X0_df.at[obj, str((subject, predicate))] = 1

In [None]:
ddi_df.apply(lambda row: fill_x0(row), axis = 1)

In [46]:
X0_df.head()

Unnamed: 0,"(clobetasol, LOWER_GI_BLEEDING)","(clobetasol, OSTEOARTHRITIS)","(clobetasol, ALLERGIES)","(clobetasol, SEPTIC_SHOCK)","(clobetasol, ACUTE_KIDNEY_FAILURE)","(clobetasol, MOUTH_PAIN)","(clobetasol, PLEURAL_FIBROSIS)","(clobetasol, POSTHERPETIC_NEURALGIA)","(clobetasol, ADENOCARCINOMA)","(clobetasol, ERYSIPELAS)",...,"(methocarbamol, STRABISMUS)","(methocarbamol, AUTONOMIC_INSTABILITY)","(methocarbamol, EYE_SWELLING)","(methocarbamol, COLLAGEN_DISEASE)","(methocarbamol, ASPIRATION_PNEUMONIA)","(methocarbamol, ESOPHAGEAL_CANCER)","(methocarbamol, KERATITIS_SICCA)","(methocarbamol, POLYMYOSITIS)","(methocarbamol, DYSMENORRHEA)","(methocarbamol, HYPERALIMENTATION)"
clobetasol,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
ibandronate,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
minoxidil,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
cefuroxime,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
chloramphenicol,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [47]:
np.count_nonzero(X0_df.values)

8787042

In [None]:
# X0_df.to_csv("/data/ajayago/dataset/polypharmacy/drug_drug_se.csv")

In [None]:
# np.save("/data/ajayago/dataset/polypharmacy/X0.npy", X0_df.values)

##### X1 - drug x protein matrix

In [None]:
# X1: one row per drug, one column per protein, initialised to 0.
X1_df = pd.DataFrame(
    np.zeros((drug_count, protein_count)),
    index=drugs_df.drug,
    columns=proteins_df.protein,
)
print(X1_df.shape)
X1_df.head()

In [None]:
X1_df.loc["clobetasol"][1]

In [None]:
def fill_x1(row):
    """Mark one drug-protein link in the global ``X1_df``.

    Sets row ``row["subject"]`` / column ``int(row["object"])`` to 1.

    Parameters
    ----------
    row : mapping with "subject" and "object" entries
        ``subject`` is the drug row label; ``object`` is the protein ID, converted
        with ``int`` before the column lookup.

    Notes
    -----
    The write is a single ``.at[row_label, column_label]`` call. The earlier
    ``X1_df.loc[r][c] = 1`` form is chained assignment and can be lost
    (it always is under pandas Copy-on-Write).
    """
    X1_df.at[row["subject"], int(row["object"])] = 1

In [None]:
dpi_df.apply(fill_x1, axis = 1)

In [None]:
X1_df.head()

In [None]:
np.count_nonzero(X1_df.values)

In [None]:
# X1_df.to_csv("/data/ajayago/dataset/polypharmacy/drug_protein.csv")

In [None]:
# Reload X1 from disk; this replaces the X1_df built in the cells above.
# NOTE(review): the only to_csv for this matrix visible in the notebook is commented out,
# and after a CSV round trip the column labels are presumably strings rather than ints —
# confirm before relying on integer column lookups.
X1_df = pd.read_csv("./polypharmacy/drug_protein.csv", index_col=0)
X1_df.head()

In [None]:
# np.save("/data/ajayago/dataset/polypharmacy/X1.npy", X1_df.values)

##### X2 - protein x protein matrix

In [None]:
# X2: square protein x protein matrix, initialised to 0.
X2_df = pd.DataFrame(
    np.zeros((protein_count, protein_count)),
    index=proteins_df.protein,
    columns=proteins_df.protein,
)
print(X2_df.shape)
X2_df.head()

In [None]:
def fill_x2(row):
    """Mark one protein-protein interaction in the global ``X2_df``, in both directions.

    Sets the cells (subject, object) and (object, subject) to 1.

    Parameters
    ----------
    row : mapping with "subject" and "object" entries
        The two protein IDs; both are used as row and as column labels.

    Notes
    -----
    Each write is a single ``.at[row_label, column_label]`` call. The earlier
    ``X2_df.loc[r][c] = 1`` form is chained assignment and can be lost
    (it always is under pandas Copy-on-Write).
    """
    protein_a, protein_b = row["subject"], row["object"]
    X2_df.at[protein_a, protein_b] = 1
    X2_df.at[protein_b, protein_a] = 1

In [None]:
ppi_df.apply(fill_x2, axis=1)

In [None]:
X2_df.head()

In [None]:
np.count_nonzero(X2_df.values)

In [None]:
# X2_df.to_csv("/data/ajayago/dataset/polypharmacy/protein_protein.csv")

In [None]:
# np.save("/data/ajayago/dataset/polypharmacy/X2.npy", X2_df.values)

##### Test matrix csv file creation

In [None]:
drugs_list = pd.read_csv("./polypharmacy/drug_ID_mapping.csv")
drugs_list.head()

In [None]:
drug_se_list = pd.read_csv("./polypharmacy/drug_se_ID_mapping.csv")
drug_se_list.head()

In [None]:
# X0 (test): same layout as the training X0, initialised to 0.
# The shape is derived from the same two frames that supply the labels, so this cell does
# not depend on drug_count / drug_se_df still being in memory and cannot disagree with them.
X0_df_test = pd.DataFrame(
    np.zeros((len(drugs_list), len(drug_se_list))),
    columns = list(drug_se_list.drug_se_pair),
)
X0_df_test.index = list(drugs_list.drug)
X0_df_test.head()

In [None]:
X0_df_test["('fluconazole', 'ATELECTASIS')"]

In [None]:
def fill_x0_test(row):
    """Mark one test interaction in the global ``X0_df_test`` (one direction only).

    Sets row ``subject`` / column ``str((object, predicate))`` to 1. Unlike
    ``fill_x0``, the mirrored cell (row ``object``) is not written; that write
    was commented out in the original cell.

    Parameters
    ----------
    row : mapping with "subject", "predicate" and "object" entries
        One row of the name-resolved test DDI frame.

    Notes
    -----
    The write is a single ``.at[row_label, column_label]`` call. The earlier
    ``X0_df_test.loc[r][c] = 1`` form is chained assignment and can be lost
    (it always is under pandas Copy-on-Write).
    """
    X0_df_test.at[row["subject"], str((row["object"], row["predicate"]))] = 1

In [None]:
ddi_test_df[ddi_test_df.label == 1].apply(lambda row: fill_x0_test(row), axis = 1)

In [51]:
np.count_nonzero(X0_df_test.values)

914392

In [52]:
X0_df_test.head()

Unnamed: 0,"(clobetasol, LOWER_GI_BLEEDING)","(clobetasol, OSTEOARTHRITIS)","(clobetasol, ALLERGIES)","(clobetasol, SEPTIC_SHOCK)","(clobetasol, ACUTE_KIDNEY_FAILURE)","(clobetasol, MOUTH_PAIN)","(clobetasol, PLEURAL_FIBROSIS)","(clobetasol, POSTHERPETIC_NEURALGIA)","(clobetasol, ADENOCARCINOMA)","(clobetasol, ERYSIPELAS)",...,"(methocarbamol, STRABISMUS)","(methocarbamol, AUTONOMIC_INSTABILITY)","(methocarbamol, EYE_SWELLING)","(methocarbamol, COLLAGEN_DISEASE)","(methocarbamol, ASPIRATION_PNEUMONIA)","(methocarbamol, ESOPHAGEAL_CANCER)","(methocarbamol, KERATITIS_SICCA)","(methocarbamol, POLYMYOSITIS)","(methocarbamol, DYSMENORRHEA)","(methocarbamol, HYPERALIMENTATION)"
clobetasol,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
ibandronate,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
minoxidil,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
cefuroxime,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
chloramphenicol,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [None]:
# X0_df_test.to_csv("/data/ajayago/dataset/polypharmacy/drug_drug_se_test.csv")

In [None]:
# np.save("/data/ajayago/dataset/polypharmacy/X0_test.npy", X0_df_test.values)

##### Test idx file creation

In [None]:
X0_df_test.values.flatten()

In [None]:
drugs_list["key"] = 0
drug_se_list["key"] = 0
flattened_df = pd.merge(drugs_list, drug_se_list, how="outer", on = "key")
print(flattened_df.shape)
flattened_df.head()

In [None]:
flattened_df["matrix_values"] = X0_df_test.values.flatten()
flattened_df["test_idx"] = flattened_df.index
flattened_df.head()

In [None]:
test_idx_df = ddi_test_df.merge(flattened_df, left_on=["subject", "predicate", "object"], right_on=["drug_x", "sideeffect", "drug_y"])
print(test_idx_df.shape)
test_idx_df.head()

In [None]:
test_idx_df.tail()

In [None]:
# test_idx_df['test_idx'].to_csv("/data/ajayago/dataset/polypharmacy/drug_drug_se_test_idx.csv", index=False)

In [None]:
ent_df = pd.read_csv("./polypharmacy/entity_df.csv")
ent_df.iloc[646]

#### Reducing X0 and X0_test

In [53]:
sum_df = pd.DataFrame(X0_df.sum(), columns = ["total_num"])
sum_df

Unnamed: 0,total_num
"(clobetasol, LOWER_GI_BLEEDING)",9.0
"(clobetasol, OSTEOARTHRITIS)",55.0
"(clobetasol, ALLERGIES)",29.0
"(clobetasol, SEPTIC_SHOCK)",8.0
"(clobetasol, ACUTE_KIDNEY_FAILURE)",34.0
...,...
"(methocarbamol, ESOPHAGEAL_CANCER)",0.0
"(methocarbamol, KERATITIS_SICCA)",8.0
"(methocarbamol, POLYMYOSITIS)",0.0
"(methocarbamol, DYSMENORRHEA)",0.0


In [54]:
sum_df[sum_df.total_num == 0]

Unnamed: 0,total_num
"(clobetasol, POSTHERPETIC_NEURALGIA)",0.0
"(clobetasol, ERYSIPELAS)",0.0
"(clobetasol, GLOMERULONEPHRITIS)",0.0
"(clobetasol, TRIGGER_FINGER)",0.0
"(clobetasol, COLONIC_POLYP)",0.0
...,...
"(methocarbamol, AUTONOMIC_INSTABILITY)",0.0
"(methocarbamol, COLLAGEN_DISEASE)",0.0
"(methocarbamol, ESOPHAGEAL_CANCER)",0.0
"(methocarbamol, POLYMYOSITIS)",0.0


In [55]:
sum_df_test = pd.DataFrame(X0_df_test.sum(), columns = ["total_num"])
sum_df_test

Unnamed: 0,total_num
"(clobetasol, LOWER_GI_BLEEDING)",2.0
"(clobetasol, OSTEOARTHRITIS)",3.0
"(clobetasol, ALLERGIES)",1.0
"(clobetasol, SEPTIC_SHOCK)",1.0
"(clobetasol, ACUTE_KIDNEY_FAILURE)",7.0
...,...
"(methocarbamol, ESOPHAGEAL_CANCER)",0.0
"(methocarbamol, KERATITIS_SICCA)",1.0
"(methocarbamol, POLYMYOSITIS)",0.0
"(methocarbamol, DYSMENORRHEA)",0.0


In [56]:
sum_df_test[sum_df_test.total_num == 0]

Unnamed: 0,total_num
"(clobetasol, PLEURAL_FIBROSIS)",0.0
"(clobetasol, POSTHERPETIC_NEURALGIA)",0.0
"(clobetasol, ERYSIPELAS)",0.0
"(clobetasol, GLOMERULONEPHRITIS)",0.0
"(clobetasol, TRIGGER_FINGER)",0.0
...,...
"(methocarbamol, AUTONOMIC_INSTABILITY)",0.0
"(methocarbamol, COLLAGEN_DISEASE)",0.0
"(methocarbamol, ESOPHAGEAL_CANCER)",0.0
"(methocarbamol, POLYMYOSITIS)",0.0


In [67]:
# X0 column that each test triple refers to: (object drug, side effect).
ddi_test_df["drug_se_pair"] = list(zip(ddi_test_df["object"], ddi_test_df["predicate"]))
ddi_test_df.head()

Unnamed: 0,drugA,sideeffect,drugB,label,subject,predicate,object,drug_se_pair
0,CID000002244,C0004144,CID000003365,1,aspirin,ATELECTASIS,fluconazole,"(fluconazole, ATELECTASIS)"
1,CID000003440,C0004144,CID000004691,1,furosemide,ATELECTASIS,paroxetine,"(paroxetine, ATELECTASIS)"
2,CID000003957,C0004144,CID000004075,1,loratadine,ATELECTASIS,5-aminosalicylic_acid,"(5-aminosalicylic_acid, ATELECTASIS)"
3,CID000004168,C0004144,CID000005090,1,metoclopramide,ATELECTASIS,rofecoxib,"(rofecoxib, ATELECTASIS)"
4,CID000001117,C0004144,CID000077993,1,sulfate,ATELECTASIS,eletriptan,"(eletriptan, ATELECTASIS)"


In [76]:
len(ddi_test_df.drug_se_pair.unique()) # Keep the drug se pairs that are there in the test df

425883

In [77]:
len(ddi_test_df.subject.unique())

645

In [81]:
drug_se_pairs_to_retain = list(ddi_test_df.drug_se_pair.unique())
len(drug_se_pairs_to_retain)

425883

In [82]:
X0_reduced_train_df = X0_df[drug_se_pairs_to_retain]
print(X0_reduced_train_df.shape)
X0_reduced_train_df.head()

(645, 425883)


Unnamed: 0,"(fluconazole, ATELECTASIS)","(paroxetine, ATELECTASIS)","(5-aminosalicylic_acid, ATELECTASIS)","(rofecoxib, ATELECTASIS)","(eletriptan, ATELECTASIS)","(docetaxel, ATELECTASIS)","(simvastatin, ATELECTASIS)","(zopiclone, ATELECTASIS)","(vitamin_k1, ATELECTASIS)","(vancomycin, ATELECTASIS)",...,"(naltrexone, BALANITIS)","(thiotepa, BALANITIS)","(bimatoprost, BALANITIS)","(nortriptyline, BALANITIS)","(oxybutynin, BALANITIS)","(procainamide, BALANITIS)","(alendronate, BALANITIS)","(quinapril, BALANITIS)","(ifosfamide, BALANITIS)","(amantadine, BALANITIS)"
clobetasol,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
ibandronate,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
minoxidil,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
cefuroxime,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
chloramphenicol,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [83]:
np.count_nonzero(X0_reduced_train_df.values)

8581324

In [84]:
X0_reduced_test_df = X0_df_test[drug_se_pairs_to_retain]
print(X0_reduced_test_df.shape)
X0_reduced_test_df.head()

(645, 425883)


Unnamed: 0,"(fluconazole, ATELECTASIS)","(paroxetine, ATELECTASIS)","(5-aminosalicylic_acid, ATELECTASIS)","(rofecoxib, ATELECTASIS)","(eletriptan, ATELECTASIS)","(docetaxel, ATELECTASIS)","(simvastatin, ATELECTASIS)","(zopiclone, ATELECTASIS)","(vitamin_k1, ATELECTASIS)","(vancomycin, ATELECTASIS)",...,"(naltrexone, BALANITIS)","(thiotepa, BALANITIS)","(bimatoprost, BALANITIS)","(nortriptyline, BALANITIS)","(oxybutynin, BALANITIS)","(procainamide, BALANITIS)","(alendronate, BALANITIS)","(quinapril, BALANITIS)","(ifosfamide, BALANITIS)","(amantadine, BALANITIS)"
clobetasol,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
ibandronate,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
minoxidil,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
cefuroxime,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
chloramphenicol,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [85]:
np.count_nonzero(X0_reduced_test_df.values)

914392

In [89]:
# Mapping for the reduced column set: column 0 holds the drug, column 1 the side effect.
reduced_drug_se_df = pd.DataFrame(drug_se_pairs_to_retain)
reduced_drug_se_df["idx"] = reduced_drug_se_df.index
reduced_drug_se_df["drug_se_pair"] = list(zip(reduced_drug_se_df[0], reduced_drug_se_df[1]))
print(reduced_drug_se_df.shape)
reduced_drug_se_df.head()

(425883, 4)


Unnamed: 0,0,1,idx,drug_se_pair
0,fluconazole,ATELECTASIS,0,"(fluconazole, ATELECTASIS)"
1,paroxetine,ATELECTASIS,1,"(paroxetine, ATELECTASIS)"
2,5-aminosalicylic_acid,ATELECTASIS,2,"(5-aminosalicylic_acid, ATELECTASIS)"
3,rofecoxib,ATELECTASIS,3,"(rofecoxib, ATELECTASIS)"
4,eletriptan,ATELECTASIS,4,"(eletriptan, ATELECTASIS)"


In [91]:
reduced_drug_se_df[["drug_se_pair", "idx"]].to_csv("./polypharmacy/reduced_drug_se_ID_mapping.csv")

In [92]:
# X0_reduced_train_df.to_csv("./polypharmacy/reduced_drug_drug_se.csv")
# np.save("/data/ajayago/dataset/polypharmacy/reduced_X0_train.npy", X0_reduced_train_df.values)

In [93]:
# np.save("/data/ajayago/dataset/polypharmacy/reduced_X0_test.npy", X0_reduced_test_df.values)

In [109]:
# Flat entity list for the reduced problem: drugs, retained (drug, side effect) pairs, proteins.
reduced_entity_names = [*drugs_df.drug, *reduced_drug_se_df.drug_se_pair, *proteins_df.protein]
reduced_entity_df = pd.DataFrame(columns = ["Entity Names"])
reduced_entity_df["Entity Names"] = reduced_entity_names
print(reduced_entity_df.shape)
reduced_entity_df.head()

(449111, 1)


Unnamed: 0,Entity Names
0,clobetasol
1,ibandronate
2,minoxidil
3,cefuroxime
4,chloramphenicol


In [110]:
reduced_entity_df["entity_ID"] = reduced_entity_df.index
# reduced_entity_df.to_csv("./polypharmacy/reduced_entity_df.csv")

In [97]:
# Cross join via a constant helper key: one row per cell of the reduced X0. The output below
# shows the rows ordered drug-major (all pairs for the first drug, then the next drug, ...).
# NOTE(review): this materialises 645 x 425,883 = 274,694,535 rows (see the shape below).
# The test_idx values printed further down equal idx_x * 425883 + idx_y, so the index could
# presumably be computed arithmetically without this frame — confirm before changing.
drugs_df["key"] = 0
reduced_drug_se_df["key"] = 0
flattened_df = pd.merge(drugs_df, reduced_drug_se_df, how="outer", on = "key")
print(flattened_df.shape)
flattened_df.head()

(274694535, 7)


Unnamed: 0,drug,idx_x,key,0,1,idx_y,drug_se_pair
0,clobetasol,0,0,fluconazole,ATELECTASIS,0,"(fluconazole, ATELECTASIS)"
1,clobetasol,0,0,paroxetine,ATELECTASIS,1,"(paroxetine, ATELECTASIS)"
2,clobetasol,0,0,5-aminosalicylic_acid,ATELECTASIS,2,"(5-aminosalicylic_acid, ATELECTASIS)"
3,clobetasol,0,0,rofecoxib,ATELECTASIS,3,"(rofecoxib, ATELECTASIS)"
4,clobetasol,0,0,eletriptan,ATELECTASIS,4,"(eletriptan, ATELECTASIS)"


In [98]:
# Attach the value of every matrix cell. flatten() walks the array row by row, so this
# assumes flattened_df is ordered drug-major with pairs in the same order as the matrix
# columns — TODO confirm.
flattened_df["matrix_values"] = X0_reduced_test_df.values.flatten()
# Position of each cell in the flattened matrix.
flattened_df["test_idx"] = flattened_df.index
flattened_df.head()

Unnamed: 0,drug,idx_x,key,0,1,idx_y,drug_se_pair,matrix_values,test_idx
0,clobetasol,0,0,fluconazole,ATELECTASIS,0,"(fluconazole, ATELECTASIS)",0.0,0
1,clobetasol,0,0,paroxetine,ATELECTASIS,1,"(paroxetine, ATELECTASIS)",0.0,1
2,clobetasol,0,0,5-aminosalicylic_acid,ATELECTASIS,2,"(5-aminosalicylic_acid, ATELECTASIS)",0.0,2
3,clobetasol,0,0,rofecoxib,ATELECTASIS,3,"(rofecoxib, ATELECTASIS)",0.0,3
4,clobetasol,0,0,eletriptan,ATELECTASIS,4,"(eletriptan, ATELECTASIS)",0.0,4


In [100]:
test_idx_df = ddi_test_df.merge(flattened_df, left_on=["subject", "predicate", "object"], right_on=["drug", 1, 0])
print(test_idx_df.shape)
test_idx_df.head()

(1828784, 17)


Unnamed: 0,drugA,sideeffect,drugB,label,subject,predicate,object,drug_se_pair_x,drug,idx_x,key,0,1,idx_y,drug_se_pair_y,matrix_values,test_idx
0,CID000002244,C0004144,CID000003365,1,aspirin,ATELECTASIS,fluconazole,"(fluconazole, ATELECTASIS)",aspirin,143,0,fluconazole,ATELECTASIS,0,"(fluconazole, ATELECTASIS)",1.0,60901269
1,CID000003440,C0004144,CID000004691,1,furosemide,ATELECTASIS,paroxetine,"(paroxetine, ATELECTASIS)",furosemide,427,0,paroxetine,ATELECTASIS,1,"(paroxetine, ATELECTASIS)",1.0,181852042
2,CID000003957,C0004144,CID000004075,1,loratadine,ATELECTASIS,5-aminosalicylic_acid,"(5-aminosalicylic_acid, ATELECTASIS)",loratadine,210,0,5-aminosalicylic_acid,ATELECTASIS,2,"(5-aminosalicylic_acid, ATELECTASIS)",1.0,89435432
3,CID000004168,C0004144,CID000005090,1,metoclopramide,ATELECTASIS,rofecoxib,"(rofecoxib, ATELECTASIS)",metoclopramide,445,0,rofecoxib,ATELECTASIS,3,"(rofecoxib, ATELECTASIS)",1.0,189517938
4,CID000001117,C0004144,CID000077993,1,sulfate,ATELECTASIS,eletriptan,"(eletriptan, ATELECTASIS)",sulfate,282,0,eletriptan,ATELECTASIS,4,"(eletriptan, ATELECTASIS)",1.0,120099010


In [101]:
# test_idx_df['test_idx'].to_csv("./polypharmacy/reduced_drug_drug_se_test_idx.csv", index=False)

In [102]:
X0_reduced_test_df.values.shape

(645, 425883)

In [103]:
X0_reduced_train_df.values.shape

(645, 425883)

In [104]:
# Combine train and test interactions by element-wise addition.
# NOTE(review): cells set in both matrices become 2.0 (visible in the output below), so the
# result is not strictly 0/1 — confirm this is intended; np.maximum would keep it binary.
X0_full = X0_reduced_train_df.values + X0_reduced_test_df.values
print(X0_full.shape)
X0_full

(645, 425883)


array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [2., 1., 0., ..., 0., 0., 0.]])

In [105]:
# Labelled view of X0_full, reusing the row and column labels of the reduced test matrix.
X0_full_df = pd.DataFrame(
    X0_full,
    index=X0_reduced_test_df.index,
    columns=list(X0_reduced_test_df.columns),
)
print(X0_full_df.shape)
X0_full_df.head()

(645, 425883)


Unnamed: 0,"(fluconazole, ATELECTASIS)","(paroxetine, ATELECTASIS)","(5-aminosalicylic_acid, ATELECTASIS)","(rofecoxib, ATELECTASIS)","(eletriptan, ATELECTASIS)","(docetaxel, ATELECTASIS)","(simvastatin, ATELECTASIS)","(zopiclone, ATELECTASIS)","(vitamin_k1, ATELECTASIS)","(vancomycin, ATELECTASIS)",...,"(naltrexone, BALANITIS)","(thiotepa, BALANITIS)","(bimatoprost, BALANITIS)","(nortriptyline, BALANITIS)","(oxybutynin, BALANITIS)","(procainamide, BALANITIS)","(alendronate, BALANITIS)","(quinapril, BALANITIS)","(ifosfamide, BALANITIS)","(amantadine, BALANITIS)"
clobetasol,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
ibandronate,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
minoxidil,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
cefuroxime,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
chloramphenicol,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [106]:
# np.save("./polypharmacy/X0_complete.npy", X0_full)
# X0_full_df.to_csv("./polypharmacy/X0_full_dca.csv", header=False)
# X0_full_df.to_csv("./polypharmacy/X0_full.csv")

In [118]:
# Swap the axes: rows become (drug, side effect) pairs, columns become drugs.
X0_full_transpose = X0_full.T
X0_full_transpose.shape

(425883, 645)

In [119]:
# Label the transposed matrix: the original column labels become the row
# index and the original row labels become the column names.
X0_full_df_transpose = pd.DataFrame(
    X0_full_transpose,
    index=list(X0_reduced_test_df.columns),
    columns=list(X0_reduced_test_df.index),
)
print(X0_full_df_transpose.shape)
X0_full_df_transpose.head()

(425883, 645)


Unnamed: 0,clobetasol,ibandronate,minoxidil,cefuroxime,chloramphenicol,phenytoin,pregabalin,ketoprofen,orphenadrine,oxcarbazepine,...,diazepam,indapamide,moexipril,nicardipine,balsalazide,rifaximin,metoprolol,ciclesonide,imipramine,methocarbamol
"(fluconazole, ATELECTASIS)",0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0
"(paroxetine, ATELECTASIS)",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,...,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0
"(5-aminosalicylic_acid, ATELECTASIS)",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
"(rofecoxib, ATELECTASIS)",1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,...,2.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0
"(eletriptan, ATELECTASIS)",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [120]:
# Optional export of the transposed matrix (no header row) into the DCA
# data-preparation folder; left disabled.
# NOTE(review): presumably this file is the input to the DCA run — confirm.
# X0_full_df_transpose.to_csv("../../NCMF/data_preparation/dca/X0_full_dca_transpose.csv", header=False)

#### DCA-based drug representation
Execute this section only after the DCA-based representations have been obtained (i.e. `../../NCMF/data_preparation/dca/results_reduced_drug/latent.tsv` exists); it creates the input files for NCMF.

In [112]:
# Load the DCA latent output: tab-separated, no header row, with the first
# column used as the row index.
drug_dca_df = pd.read_csv(
    "../../NCMF/data_preparation/dca/results_reduced_drug/latent.tsv",
    sep="\t",
    header=None,
    index_col=0,
)
drug_dca_df

Unnamed: 0_level_0,1,2,3,4,5,6,7,8,9,10,...,1015,1016,1017,1018,1019,1020,1021,1022,1023,1024
0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
clobetasol,-1.477965,0.055263,-0.387671,-0.410348,-0.779983,-0.211626,0.187983,-0.400219,0.229604,-0.813527,...,-1.226239,-1.029161,-0.252736,-1.222882,0.145419,-0.578294,-0.137620,-0.482548,-1.017901,-0.282062
ibandronate,-0.979523,0.512495,-1.118406,-0.957976,-1.058672,0.806900,0.854696,0.127340,0.057910,-0.765330,...,-1.267329,-1.394762,0.070129,-0.254322,0.476076,-0.315635,0.462407,-0.230711,-0.824041,-0.972451
minoxidil,-0.496299,0.310231,-0.380913,-0.520707,0.230287,0.045624,0.132420,0.117775,0.342519,-0.476457,...,-0.438298,-0.743557,-0.095661,-0.312968,0.416863,-0.113540,0.205695,-0.092409,-0.494010,-0.232103
cefuroxime,-1.815878,1.354533,1.591419,-0.453772,-2.323653,-0.734762,1.673545,-0.283735,0.745561,-2.169232,...,-1.600488,-2.387323,-0.145609,-1.595881,0.854974,-0.008406,-0.108984,-0.801288,-1.197318,-1.016796
chloramphenicol,-0.375469,-0.144983,0.126306,-0.098677,-0.219148,-0.025716,0.148492,-0.026394,-0.117954,-0.396537,...,-0.142247,-0.471953,-0.148310,-0.240029,0.175427,-0.129073,-0.015071,-0.233254,-0.197914,0.036223
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
rifaximin,-0.385021,0.182271,-0.086517,-0.221110,-0.131803,0.052850,0.130658,0.048632,0.095234,-0.308210,...,-0.274282,-0.668339,-0.041375,-0.165037,0.181825,-0.114359,0.000330,-0.148656,-0.172120,-0.273700
metoprolol,-2.398308,1.909733,-3.190359,-0.370883,-2.649178,-0.840486,0.681998,-0.394658,1.424900,-1.174966,...,-3.723595,-3.354362,0.440026,-0.819633,1.242410,-0.753205,-0.196449,-0.589857,-3.368162,-2.261014
ciclesonide,-0.407766,0.162416,-0.098271,-0.211982,-0.163655,0.052205,0.128414,0.050991,0.086685,-0.305937,...,-0.300587,-0.706917,-0.043945,-0.151815,0.157449,-0.104454,0.006980,-0.163499,-0.169896,-0.280388
imipramine,-1.004041,1.269748,-1.099304,-0.662551,-0.633680,0.206425,0.536312,-0.095614,0.923682,-1.300600,...,-1.457985,-1.702855,-0.300546,-0.945898,1.229130,-0.442815,-0.301082,0.304847,-1.120130,-1.268883


In [113]:
# Plain NumPy view of the DCA latent table (row labels dropped).
X0_drug_dca = drug_dca_df.to_numpy()
# np.save("./polypharmacy/X0_drug_dca.npy", X0_drug_dca)

In [114]:
# Entity list for the DCA-based setup, in this order:
#   1. the row labels of drug_dca_df,
#   2. one "drug_rep_<k>" label per latent column of drug_dca_df (1-based),
#   3. the values of proteins_df.protein.
# The number of latent labels is taken from drug_dca_df itself rather than a
# hard-coded 1024, so the list stays in sync if DCA is rerun with a different
# latent size.
entities = (
    list(drug_dca_df.index)
    + [f"drug_rep_{i}" for i in range(1, drug_dca_df.shape[1] + 1)]
    + list(proteins_df.protein)
)
# Build the single-column frame in one step instead of filling an empty frame.
entity_dca_df = pd.DataFrame({"entity_name": entities})
print(entity_dca_df.shape)
entity_dca_df.head()

(24252, 1)


Unnamed: 0,entity_name
0,clobetasol
1,ibandronate
2,minoxidil
3,cefuroxime
4,chloramphenicol


In [115]:
# Inspect the last five entries of the entity list.
entity_dca_df.tail(5)

Unnamed: 0,entity_name
24247,729438
24248,26046
24249,3417
24250,26049
24251,26050


In [116]:
# Optional export of the entity list (left disabled); uncomment to write it
# to ./polypharmacy/entity_dca.csv.
# entity_dca_df.to_csv("./polypharmacy/entity_dca.csv")