In [8]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import sklearn
from datetime import datetime
from torch.nn.functional import mse_loss
from torch.utils.data import Dataset, DataLoader
from torchsummary import summary
from sklearn.model_selection import train_test_split
from utils_pp import replace_cell_names_with_id
from torch.utils.tensorboard import SummaryWriter
%load_ext autoreload
%autoreload 2

if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [9]:
columns = ["cell_line", "drugA_name", "drugB_name", "drugA_conc", "drugB_conc", "target"]
df_train = pd.read_csv("../data_raw/oneil.csv", usecols=(1,2,3,4,5,12)).iloc[:,[0,1,3,2,4,5]].set_axis(columns, axis=1)
# df_train["cell_line"]
df_test = pd.read_csv("../data/test_yosua.csv").set_axis(columns + ["std"], axis=1).convert_dtypes()

drug_data = pd.read_pickle("../data/drug_data.pkl.compress", compression="gzip")
cell_data = pd.read_pickle("../data/cell_line_data.pkl.compress", compression="gzip")

df_train = replace_cell_names_with_id(dataframe=df_train, mapping_file="../data/mappingccl.csv")
df_test = replace_cell_names_with_id(dataframe=df_test, mapping_file="../data/mappingccl.csv")
df_train = df_train[df_train.cell_line.isin(cell_data.index)]
df_train, df_val = train_test_split(df_train, test_size=0.2, shuffle=True)
cell_data = cell_data[cell_data.index.isin(pd.concat([df_train.cell_line, df_test.cell_line]))]
print("oneil", df_train.memory_usage().sum()/1e6, df_train.shape,"\n", df_train.dtypes)
print("drug_feat", drug_data.memory_usage().sum()/1e6, drug_data.shape)
print("cell_feat", cell_data.memory_usage().sum()/1e6, cell_data.shape)


oneil 13.13536 (234560, 6) 
 cell_line      object
drugA_name     object
drugB_name     object
drugA_conc    float64
drugB_conc    float64
target        float64
dtype: object
drug_feat 0.407338 (42, 2416)
cell_feat 0.686136 (32, 5011)


In [12]:
class Dataset_from_pd(Dataset):
    def __init__(self, drug_comb_data, drug_feat, cell_feat):
        self.drug_comb_data = drug_comb_data.to_numpy()
        self.drug_feat = drug_feat.to_numpy()
        self.cell_feat = cell_feat.to_numpy()
        self.drug_mapping = pd.Series(range(len(self.drug_feat)), index=drug_feat.index).to_dict()
        self.cell_mapping = pd.Series(range(len(self.cell_feat)), index=cell_feat.index).to_dict()
        # print(self.cell_mapping, self.drug_mapping)

        print()
    def __len__(self):
        return len(self.drug_comb_data)
    
    def __getitem__(self, idx):
        combi = self.drug_comb_data[idx]
        drug_A = self.drug_feat[self.drug_mapping[combi[1]]]
        drug_B = self.drug_feat[self.drug_mapping[combi[2]]]
        cell_line = self.cell_feat[self.cell_mapping[combi[0]]]
        # if np.isnan(drug_A).sum() >0 or np.isnan(drug_B).sum() >0 or np.isnan(cell_line).sum() > 0:             
        #     print(np.isnan(drug_A),np.isnan(drug_A).sum(),"\n", np.isnan(drug_B),np.isnan(drug_B).sum(), "\n", np.isnan(cell_line),np.isnan(cell_line).sum())
        #     raise NotImplementedError
        return np.concatenate([drug_A, drug_B, cell_line, combi[3:5].astype("float32")], dtype="float32"), combi[5:6].astype("float32")

train_set  = Dataset_from_pd(df_train, drug_data, cell_data)
train_dl = DataLoader(train_set, batch_size=256, shuffle=True)
xi, yi = next(iter(train_dl))
print(xi.shape, yi.shape)
# print(np.argwhere(xi>1e2))
# print(tuple(np.argwhere(xi>100)))
print(xi.numpy()[tuple(np.argwhere(xi>100))])
# print(xi.numpy()[:,tuple(np.argwhere(xi>100))[1]])



torch.Size([256, 9845]) torch.Size([256, 1])
[250. 250. 250. 250. 250. 250. 250.]


In [4]:
class MLP(torch.nn.Module):
    def __init__(self, h_sizes):
        super().__init__()
        self.hidden = nn.ModuleList()
        for k in range(len(h_sizes)-1):
            self.hidden.append(nn.Linear(h_sizes[k], h_sizes[k+1]))
            self.hidden.append(nn.Dropout(0.2))
            self.hidden.append(nn.ReLU())
        self.hidden.append(nn.Linear(h_sizes[-1], 1))
    def forward(self, x):
        for lay in self.hidden:
            # print(torch.sum(torch.isnan(x)))
            # print(torch.sum(x>1e3))
            x = lay(x)
        return x
model = MLP([xi.shape[1],32])
yi = model.forward(xi)
print(yi.shape)

torch.Size([256, 1])


In [59]:
def train_one_epoch(epoch_index, tb_writer, training_loader, optimizer, loss_fn):
    running_loss = 0.
    last_loss = 0.
    for i, data in enumerate(training_loader):
        inputs, labels = data
        inputs = inputs.to(device=device)
        labels = labels.to(device=device)
        # Zero your gradients for every batch!
        optimizer.zero_grad()
        outputs = model(inputs)

        # Compute the loss and its gradients
        loss = loss_fn(outputs, labels)
        loss.backward()

        # Adjust learning weights
        optimizer.step()

        running_loss += loss.item() / inputs.shape[0]
        if i % 10 == 9:
            last_loss = running_loss / 10 # loss per batch
            print('  batch {} loss: {}'.format(i + 1, last_loss), outputs[0][0].item(), labels[0][0].item())
            # tb_x = epoch_index * len(training_loader) + i + 1
            # tb_writer.add_scalar('Loss/train', last_loss, tb_x)
            running_loss = 0.

    return last_loss

In [58]:
batch_size = 1024
train_set  = Dataset_from_pd(df_train, drug_data, cell_data)
val_set = Dataset_from_pd(df_val, drug_data, cell_data)
test_set  = Dataset_from_pd(df_test, drug_data, cell_data)
train_dl = DataLoader(train_set, batch_size=batch_size)
xi, yi = next(iter(train_dl))
val_dl = DataLoader(val_set, batch_size=batch_size)
test_dl = DataLoader(test_set)

model = MLP([xi.shape[1],256,256,256, 128,128,128,64,64,64])
print(summary(model.to("cuda"), xi.shape))
optimizer = torch.optim.Adam(model.parameters(), lr=1e-5)
loss_fn = torch.nn.MSELoss()
# Initializing in a separate cell so we can easily add more epochs to the same run
# timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
# writer = SummaryWriter('runs/fashion_trainer_{}'.format(timestamp))
epoch_number = 0

EPOCHS = 1

for epoch in range(EPOCHS):
    print('EPOCH {}:'.format(epoch_number + 1))

    # Make sure gradient tracking is on, and do a pass over the data
    model.train(True)
    model = model.to(device=device)

    avg_loss = train_one_epoch(epoch_number, "writer", train_dl, optimizer, loss_fn)

    running_vloss = 0.0
    # Set the model to evaluation mode, disabling dropout and using population
    # statistics for batch normalization.
    model.eval()

    # Disable gradient computation and reduce memory consumption.
    with torch.no_grad():
        for i, vdata in enumerate(val_dl):
            vinputs, vlabels = vdata
            vinputs = vinputs.to(device)
            vlabels = vlabels.to(device)
            voutputs = model(vinputs)
            vloss = loss_fn(voutputs, vlabels) / vinputs.shape[0]
            running_vloss += vloss

    avg_vloss = running_vloss / (i + 1) 
    print('LOSS train {} valid {}'.format(avg_loss, avg_vloss))

    epoch_number += 1

with torch.no_grad():
    for i, data in enumerate(test_dl):
        inputs, labels = data
        inputs = inputs.to(device=device)
        labels = labels.to(device=device)
        outputs = model(inputs)
        print(outputs.detach().to("cpu").numpy(), labels.detach().to("cpu").numpy())




----------------------------------------------------------------
        Layer (type)               Output Shape         Param #
            Linear-1            [-1, 1024, 256]       2,520,576
           Dropout-2            [-1, 1024, 256]               0
              ReLU-3            [-1, 1024, 256]               0
            Linear-4            [-1, 1024, 256]          65,792
           Dropout-5            [-1, 1024, 256]               0
              ReLU-6            [-1, 1024, 256]               0
            Linear-7            [-1, 1024, 256]          65,792
           Dropout-8            [-1, 1024, 256]               0
              ReLU-9            [-1, 1024, 256]               0
           Linear-10            [-1, 1024, 128]          32,896
          Dropout-11            [-1, 1024, 128]               0
             ReLU-12            [-1, 1024, 128]               0
           Linear-13            [-1, 1024, 128]          16,512
          Dropout-14            [-1,

In [69]:
for i, parameters in enumerate(model.hidden.parameters()):
    print(parameters)

Parameter containing:
tensor([[ 0.0118, -0.0108,  0.0058,  ...,  0.0058,  0.0030,  0.0047],
        [ 0.0059,  0.0063, -0.0077,  ...,  0.0031,  0.0070,  0.0044],
        [-0.0090,  0.0068,  0.0027,  ..., -0.0026,  0.0033, -0.0075],
        ...,
        [-0.0022, -0.0079,  0.0012,  ..., -0.0086,  0.0008, -0.0007],
        [ 0.0047,  0.0080,  0.0099,  ..., -0.0008,  0.0015,  0.0003],
        [-0.0066, -0.0032, -0.0019,  ..., -0.0002,  0.0064,  0.0058]],
       device='cuda:0', requires_grad=True)
Parameter containing:
tensor([ 0.0037,  0.0079, -0.0049, -0.0013, -0.0060, -0.0011, -0.0059, -0.0074,
         0.0092,  0.0003,  0.0023, -0.0005,  0.0023,  0.0075,  0.0081,  0.0131,
         0.0061, -0.0007, -0.0036, -0.0055,  0.0009, -0.0043,  0.0064,  0.0067,
         0.0021, -0.0007, -0.0025, -0.0038, -0.0101, -0.0062,  0.0035, -0.0015,
        -0.0019, -0.0007, -0.0024, -0.0053,  0.0043, -0.0020, -0.0054,  0.0084,
         0.0039, -0.0064, -0.0081,  0.0058, -0.0035,  0.0050, -0.0108, -0.0071