In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import random

import torch
import torch.nn as nn
import torch_geometric
import torch_geometric.transforms as T
from torch.nn.functional import relu, sigmoid
from torch.nn import Linear, Module, Dropout, MSELoss, CrossEntropyLoss, BatchNorm1d

from torch_geometric.nn import GCNConv, GATConv, GraphNorm
from torch_geometric.data import Data
from torch_sparse import SparseTensor

In [2]:
import drGAT

In [3]:
?drGAT

[0;31mType:[0m        module
[0;31mString form:[0m <module 'drGAT' from '/panfs/jay/groups/33/kuangr/inoue019/drGAT/model/drGAT.py'>
[0;31mFile:[0m        /panfs/jay/groups/33/kuangr/inoue019/drGAT/model/drGAT.py
[0;31mDocstring:[0m  
This is the official implementation of "drGAT: Attention-Guided Gene Assessment 
for Drug Response in Drug-Cell-Gene Heterogeneous Network."

Written by inoue0426
If you have any questions, feel free to open an issue at https://github.com/inoue0426/drGAT

# Preprocess

In [4]:
# Read the train / validation / test splits, one CSV file per split.
train_data, val_data, test_data = map(
    pd.read_csv,
    ('data/train.csv', 'data/val.csv', 'data/test.csv'),
)

In [5]:
# Load the matrix; the first CSV column supplies the row labels.
df = pd.read_csv('data/drug_cell_gene.csv.gz', index_col=0)

# Lookup table pairing each positional id with its original label (as str).
idx = (
    pd.DataFrame(df.index)
    .reset_index()
    .set_axis(['id', 'Original'], axis=1)
    .astype({'Original': str})
)

# Relabel both axes with the positional ids. The same id list is assigned to
# rows and columns, which requires as many columns as rows.
df.index = df.columns = idx['id'].tolist()

  df = pd.read_csv('../data/drug_cell_gene.csv.gz', index_col=0)


In [6]:
# Map each original label (str) to its positional id.
converter = dict(zip(idx['Original'], idx['id']))

In [7]:
def get_idx(X):
    """Translate the 'Drug' and 'Cell' labels of X into positional ids.

    Each value is stringified and looked up in the module-level ``converter``
    dict. The lookup runs on a copy, so the caller's frame is left untouched
    and the cell that calls this function can be re-run safely (mutating the
    input would make a second run look up already-converted ids).

    Parameters
    ----------
    X : pandas.DataFrame
        Frame with 'Drug' and 'Cell' columns holding original labels.

    Returns
    -------
    pandas.DataFrame
        Copy of X whose 'Drug' and 'Cell' columns hold the mapped ids.

    Raises
    ------
    KeyError
        If a label (after ``str()``) is not a key of ``converter``.
    """
    converted = X.copy()
    for column in ('Drug', 'Cell'):
        converted[column] = [converter[str(label)] for label in converted[column]]
    return converted

In [8]:
# Convert the Drug/Cell labels of every split to positional ids.
train_data, val_data, test_data = map(
    get_idx,
    (train_data, val_data, test_data),
)

In [9]:
# Preview the first training rows after the label-to-id conversion.
train_data.head()

Unnamed: 0,Drug,Cell
0,194,319
1,67,316
2,23,270
3,146,279
4,242,312


In [10]:
# Preview the first test rows after the label-to-id conversion.
test_data.head()

Unnamed: 0,Drug,Cell
0,9,292
1,30,303
2,217,284
3,114,269
4,206,282


# Masking to remove leakage

In [11]:
# Sum of the element-wise signs of the matrix. This equals the number of
# nonzero entries provided no entry is negative.
np.sign(df.values).sum()

3512405.0

In [12]:
# Zero both mirrored entries for every test row so the held-out pairs do not
# appear in the matrix. Columns 0 and 1 of test_data are read positionally.
for pair in test_data.values:
    first_id, second_id = pair[0], pair[1]
    df.loc[first_id, second_id] = 0
    df.loc[second_id, first_id] = 0

In [13]:
# Recompute the sign sum after masking; compare with the value shown before
# the masking loop to see how many entries were zeroed.
np.sign(df.values).sum()

3510746.0

# Convert input to tensor

In [14]:
def get_data(X):
    """Build the dense feature tensor and sparse adjacency from a DataFrame.

    Parameters
    ----------
    X : pandas.DataFrame
        Numeric matrix. Every nonzero entry becomes an edge. The adjacency is
        sized from the row count on both axes, so X is expected to be square.

    Returns
    -------
    x : torch.Tensor
        ``X.values`` converted to a float tensor.
    adj : SparseTensor
        Sparse structure holding the (row, col) positions of the nonzero
        entries of X, with ``sparse_sizes=(n_rows, n_rows)``.
    """
    f = X.values
    x = torch.tensor(f).float()

    # Scan the dense matrix for nonzeros a single time and reuse both
    # coordinate arrays, instead of running the full scan once per coordinate.
    rows, cols = f.nonzero()
    adj = SparseTensor(
        row=torch.tensor(rows),
        col=torch.tensor(cols),
        sparse_sizes=(x.shape[0], x.shape[0]),
    )

    # Nothing in this function allocates on the GPU, so one cache release
    # at the end is sufficient.
    torch.cuda.empty_cache()

    return x, adj

In [15]:
# Build the dense feature tensor and the sparse adjacency from the matrix,
# then display the feature tensor.
x, adj = get_data(df)
x

tensor([[1.0000, 0.0000, 0.0000,  ..., 0.0000, 0.0000, 0.0000],
        [0.0000, 1.0000, 0.9932,  ..., 0.0000, 0.0000, 0.0000],
        [0.0000, 0.9932, 1.0000,  ..., 0.0000, 0.0000, 0.0000],
        ...,
        [0.0000, 0.0000, 0.0000,  ..., 1.0000, 0.0000, 0.0000],
        [0.0000, 0.0000, 0.0000,  ..., 0.0000, 1.0000, 0.0000],
        [0.0000, 0.0000, 0.0000,  ..., 0.0000, 0.0000, 1.0000]])

In [16]:
# Display the sparse adjacency summary (size, nnz, density).
adj

SparseTensor(row=tensor([   0,    0,    0,  ..., 4429, 4429, 4429]),
             col=tensor([   0,   18,   30,  ..., 4421, 4426, 4429]),
             size=(4430, 4430), nnz=3510746, density=17.89%)

In [17]:
# Split each frame into its first column (drug ids) and second column
# (cell ids), read positionally from the underlying array.
train_drug, train_cell = train_data.values[:, 0], train_data.values[:, 1]
val_drug, val_cell = val_data.values[:, 0], val_data.values[:, 1]

In [18]:
# Load the saved label arrays and convert them straight to float tensors.
train_labels = torch.tensor(np.load('data/train_values.npy')).float()
val_labels = torch.tensor(np.load('data/val_values.npy')).float()

In [19]:
# Bundle the training inputs as one positional list: graph tensors first,
# then the train triplet, then the validation triplet.
data = [
    x, adj,
    train_drug, train_cell, train_labels,
    val_drug, val_cell, val_labels,
]
data

[tensor([[1.0000, 0.0000, 0.0000,  ..., 0.0000, 0.0000, 0.0000],
         [0.0000, 1.0000, 0.9932,  ..., 0.0000, 0.0000, 0.0000],
         [0.0000, 0.9932, 1.0000,  ..., 0.0000, 0.0000, 0.0000],
         ...,
         [0.0000, 0.0000, 0.0000,  ..., 1.0000, 0.0000, 0.0000],
         [0.0000, 0.0000, 0.0000,  ..., 0.0000, 1.0000, 0.0000],
         [0.0000, 0.0000, 0.0000,  ..., 0.0000, 0.0000, 1.0000]]),
 SparseTensor(row=tensor([   0,    0,    0,  ..., 4429, 4429, 4429]),
              col=tensor([   0,   18,   30,  ..., 4421, 4426, 4429]),
              size=(4430, 4430), nnz=3510746, density=17.89%),
 array([194,  67,  23, ...,  53, 193,   7]),
 array([319, 316, 270, ..., 272, 327, 296]),
 tensor([0., 0., 0.,  ..., 0., 0., 0.]),
 array([217, 257,  75, ..., 202, 205,   8]),
 array([285, 283, 308, ..., 273, 270, 325]),
 tensor([0., 0., 0.,  ..., 0., 0., 0.])]

# Train model 

In [20]:
# Run training on the bundled inputs. The call returns two values, unpacked
# here as `model` and `attention`.
# NOTE(review): the evaluation section reloads 'model.pt' from disk, which is
# presumably written by drGAT.train; confirm against the module.
model, attention = drGAT.train(data)

Epoch:  10
Train Loss:  0.6834854483604431
Val Loss:  0.6732016205787659
Train Accuracy:  0.5533870301528294
Val Accuracy:  0.5762081784386617 

Epoch:  20
Train Loss:  0.681586742401123
Val Loss:  0.6671024560928345
Train Accuracy:  0.5614415530772409
Val Accuracy:  0.588909541511772 

Epoch:  30
Train Loss:  0.6692971587181091
Val Loss:  0.6775858402252197
Train Accuracy:  0.5766212308963239
Val Accuracy:  0.5920074349442379 

Epoch:  40
Train Loss:  0.6617752909660339
Val Loss:  0.6457549333572388
Train Accuracy:  0.6070838496489054
Val Accuracy:  0.6267038413878563 

Epoch:  50
Train Loss:  0.6422420144081116
Val Loss:  0.6241583824157715
Train Accuracy:  0.6215406856670798
Val Accuracy:  0.644361833952912 

Epoch:  60
Train Loss:  0.6275438070297241
Val Loss:  0.5993730425834656
Train Accuracy:  0.6383725733168112
Val Accuracy:  0.6765799256505576 

Epoch:  70
Train Loss:  0.6136031150817871
Val Loss:  0.5809856653213501
Train Accuracy:  0.6490086741016109
Val Accuracy:  0.6889714

# Eval model

In [21]:
# First and second columns of the test frame, read positionally.
test_drug, test_cell = test_data.values[:, 0], test_data.values[:, 1]

# Load the saved test labels and convert them straight to a float tensor.
test_labels = torch.tensor(np.load('data/test_values.npy')).float()

In [22]:
# Rebind `data` to the five-element test bundle. This replaces the
# eight-element training list built earlier under the same name.
data = [x, adj, test_drug, test_cell, test_labels]

In [23]:
# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# map_location remaps the stored tensors onto `device` while loading, so a
# checkpoint saved on a GPU machine also loads on a CPU-only one.
# NOTE(review): torch.load unpickles the file, so only load checkpoints you trust.
# NOTE(review): recent PyTorch releases default to weights_only=True, which
# rejects a fully pickled module; if 'model.pt' stores the whole model, pass
# weights_only=False there. Confirm against the installed version.
model = torch.load('model.pt', map_location=device)
model = model.to(device)

In [24]:
# Evaluate the loaded model on the test bundle; the result is displayed as
# the metrics table below.
drGAT.eval(model, data)

Unnamed: 0,Accuracy,Precision,Recall,F1 Score,True Positive,True Negative,False Positive,False Negative
0,0.773854,0.75869,0.754654,0.756667,1135,1363,361,369


# Attention coefficient
The attention coefficients returned by training are used in the following notebook:
[Fig2.ipynb](https://github.com/inoue0426/drGAT/blob/main/results/Fig2.ipynb)

In [25]:
# Wrap the attention values returned by training in a DataFrame for tabular
# display. The name `attention` is rebound to the DataFrame.
attention = pd.DataFrame(attention)
attention

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,4420,4421,4422,4423,4424,4425,4426,4427,4428,4429
0,0.036216,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.0,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
1,0.000000,0.005549,0.004894,0.011156,0.010854,0.006821,0.0,0.005473,0.006567,0.039358,...,0.000000,0.000000,0.0,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
2,0.000000,0.006484,0.005803,0.011456,0.010799,0.007105,0.0,0.006154,0.007456,0.034012,...,0.000000,0.000000,0.0,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
3,0.000000,0.007834,0.007079,0.011270,0.011469,0.008628,0.0,0.007543,0.009150,0.029340,...,0.000000,0.000000,0.0,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
4,0.000000,0.003840,0.003597,0.006058,0.005412,0.004176,0.0,0.003709,0.004691,0.014150,...,0.000000,0.000000,0.0,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4425,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,...,0.002038,0.000000,0.0,0.0,0.000000,0.002070,0.001779,0.002060,0.000000,0.000000
4426,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,...,0.001489,0.001396,0.0,0.0,0.000000,0.001450,0.001275,0.000000,0.000000,0.001298
4427,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.0,0.0,0.002059,0.002451,0.000000,0.002457,0.000000,0.000000
4428,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.0,0.0,0.000000,0.000000,0.000000,0.000000,0.232858,0.000000


In [26]:
# attention.to_csv('attention.csv.gz', compression='gzip')