In [1]:
import numpy as np
import pandas as pd
import torch

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Create dataset for training and test

## Preprocess for train, test, validation data

In [2]:
train_data = pd.read_csv("data/train.csv")
val_data = pd.read_csv("data/val.csv")
test_data = pd.read_csv("data/test.csv")

In [3]:
train_data.head()

Unnamed: 0,Drug,Cell
0,682298,PC_3
1,141549,OVCAR_8
2,56410,MDA_MB_231
3,603077,U251
4,757804,IGROV1


## Convert drug and cell name to graph nodes' index

In [4]:
converter = dict(
    pd.DataFrame(np.load("data/idxs.npy", allow_pickle=True)).T[[1, 0]].values
)

In [5]:
def get_idx(X):
    X["Drug"] = [converter[(i)] for i in X["Drug"]]
    X["Cell"] = [converter[(i)] for i in X["Cell"]]
    return X

In [6]:
train_data = get_idx(train_data)
val_data = get_idx(val_data)
test_data = get_idx(test_data)

In [7]:
train_data.head()

Unnamed: 0,Drug,Cell
0,194,319
1,67,316
2,23,270
3,146,279
4,242,312


# Get edge index

In [8]:
edge_index = pd.DataFrame(np.load("data/edges.npy")).T
edge_index

Unnamed: 0,0,1
0,0,269
1,0,274
2,0,275
3,0,276
4,0,279
...,...,...
1500627,3046,324
1500628,3046,325
1500629,3046,326
1500630,3046,327


# Masking

In [9]:
edge_index = (
    edge_index.merge(
        test_data.rename(columns={"Drug": 0, "Cell": 1}),
        on=[0, 1],
        how="outer",
        indicator=True,
    )
    .query('_merge == "left_only"')
    .drop("_merge", axis=1)
)
edge_index

Unnamed: 0,0,1
5,0,279
7,0,281
8,0,282
9,0,283
10,0,284
...,...,...
1502351,3046,324
1502352,3046,325
1502353,3046,326
1502354,3046,327


In [10]:
edge_index = torch.tensor(edge_index.values.T).int()
edge_index = edge_index.type(torch.int64)
edge_index

tensor([[   0,    0,    0,  ..., 3046, 3046, 3046],
        [ 279,  281,  282,  ...,  326,  327,  328]])

## Process for torch

In [11]:
train_drug = train_data.values[:, 0]
train_cell = train_data.values[:, 1]
val_drug = val_data.values[:, 0]
val_cell = val_data.values[:, 1]

In [12]:
train_labels = np.load("data/train_labels.npy")
val_labels = np.load("data/val_labels.npy")

train_labels = torch.tensor(train_labels).float()
val_labels = torch.tensor(val_labels).float()

## Get feature matrix

In [13]:
drug = pd.read_csv("data/drug_sim.csv", index_col=0)
cell = pd.read_csv("data/cell_sim.csv", index_col=0)
gene = pd.read_csv("data/gene_sim.csv", index_col=0)

In [14]:
drug = torch.tensor(drug.values).float()
cell = torch.tensor(cell.values).float()
gene = torch.tensor(gene.values).float()

# Create the dataset

In [15]:
data = [
    drug,
    cell,
    gene,
    edge_index,
    train_drug,
    train_cell,
    val_drug,
    val_cell,
    train_labels,
    val_labels,
]
data

[tensor([[1.0000, 0.9683, 0.9683,  ..., 0.9394, 0.9603, 0.9575],
         [0.9683, 1.0000, 0.9932,  ..., 0.9477, 0.9659, 0.9650],
         [0.9683, 0.9932, 1.0000,  ..., 0.9514, 0.9697, 0.9688],
         ...,
         [0.9394, 0.9477, 0.9514,  ..., 1.0000, 0.9472, 0.9500],
         [0.9603, 0.9659, 0.9697,  ..., 0.9472, 1.0000, 0.9598],
         [0.9575, 0.9650, 0.9688,  ..., 0.9500, 0.9598, 1.0000]]),
 tensor([[1.0000, 0.0170, 0.0069,  ..., 0.4732, 0.0140, 0.2338],
         [0.0170, 1.0000, 0.4351,  ..., 0.0421, 0.4509, 0.0916],
         [0.0069, 0.4351, 1.0000,  ..., 0.0196, 0.2926, 0.0337],
         ...,
         [0.4732, 0.0421, 0.0196,  ..., 1.0000, 0.0267, 0.3622],
         [0.0140, 0.4509, 0.2926,  ..., 0.0267, 1.0000, 0.0919],
         [0.2338, 0.0916, 0.0337,  ..., 0.3622, 0.0919, 1.0000]]),
 tensor([[1., 0., 0.,  ..., 0., 0., 0.],
         [0., 1., 0.,  ..., 0., 0., 0.],
         [0., 0., 1.,  ..., 0., 0., 0.],
         ...,
         [0., 0., 0.,  ..., 1., 0., 0.],
         [

In [16]:
# torch.save(data, 'train.pt')

## Create test data

In [17]:
test_drug = test_data.values[:, 0]
test_cell = test_data.values[:, 1]

test_labels = np.load("data/test_labels.npy")
test_labels = torch.tensor(test_labels).float()

In [18]:
test = [drug, cell, gene, edge_index, test_drug, test_cell, test_labels]

In [19]:
# torch.save(test, 'test.pt')