In [1]:
# Print the version of the CUDA compiler (nvcc) available in this runtime.
!nvcc --version

nvcc: NVIDIA (R) Cuda compiler driver
Copyright (c) 2005-2021 NVIDIA Corporation
Built on Sun_Feb_14_21:12:58_PST_2021
Cuda compilation tools, release 11.2, V11.2.152
Build cuda_11.2.r11.2/compiler.29618528_0


In [2]:
!pip3 install torch torchvision torchaudio

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [17]:
!pip3 install pandas duckdb

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [59]:
import os
import time
import numpy as np
import pandas as pd
import duckdb
import torch
import torch.nn as nn
from torch.utils.data import TensorDataset, Dataset

In [4]:
from google.colab import drive
# Mount Google Drive and make the project folder the working directory.
# The project path is anchored on the absolute mount point rather than on
# os.getcwd(): after the chdir below the current directory is already the
# project folder, so a getcwd()-based path would break when this cell is re-run.
mount_point = '/content/drive'
drive.mount(mount_point)
wd = os.path.join(mount_point, 'MyDrive', 'GT', 'bd4h_proj')
os.chdir(wd)
os.getcwd()

Mounted at /content/drive


'/content/drive/MyDrive/GT/bd4h_proj'

In [18]:
# DuckDB connection; the cells below use it to run SQL over the pandas DataFrames.
con = duckdb.connect()

In [32]:
# Load patient_feature.csv from the working directory and preview its first two rows.
feature_df = pd.read_csv('patient_feature.csv')
feature_df.head(2)

Unnamed: 0,SUBJECT_ID,1,2,3,4,5,6,7,8,9,...,1706,1707,1708,1709,1710,1711,1712,1713,1714,1715
0,4367,0,0,12,0,17,0,0,7,6,...,0,0,0,42,0,0,0,0,0,62
1,83395,4,34,26,1,4,4,12,5,5,...,28,8,1,58,5,5,4,1,1,8


In [33]:
# Load patient_target.csv from the working directory and preview its first two rows.
target_df = pd.read_csv('patient_target.csv')
target_df.head(2)

Unnamed: 0,SUBJECT_ID,1,2,3,4,5,6,7,8,9,...,2794,2795,2796,2797,2798,2799,2800,2801,2802,2803
0,2,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,3,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [34]:
# Keep only the feature rows whose subject_id also occurs in target_df, ordered
# by subject_id. The query names the pandas DataFrames feature_df and target_df
# directly as tables.
# Note: this rebinds feature_df to the filtered result.
feature_df = con.execute("""
select *
from feature_df f 
where exists (select 1 from target_df t 
  where f.subject_id = t.subject_id)
order by subject_id
""").df()
feature_df.head(2)

Unnamed: 0,SUBJECT_ID,1,2,3,4,5,6,7,8,9,...,1706,1707,1708,1709,1710,1711,1712,1713,1714,1715
0,4367,0,0,12,0,17,0,0,7,6,...,0,0,0,42,0,0,0,0,0,62
1,83395,4,34,26,1,4,4,12,5,5,...,28,8,1,58,5,5,4,1,1,8


In [35]:
# Mirror of the feature filter: keep only the target rows whose subject_id also
# occurs in feature_df, ordered by subject_id.
# Note: this rebinds target_df to the filtered result.
target_df = con.execute("""
select *
from target_df t
where exists (select 1 from feature_df f
  where f.subject_id = t.subject_id)
order by subject_id
""").df()
target_df.head(2)

Unnamed: 0,SUBJECT_ID,1,2,3,4,5,6,7,8,9,...,2794,2795,2796,2797,2798,2799,2800,2801,2802,2803
0,4367,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,83395,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [50]:
def set_device():
  """Select the compute device, preferring the GPU when it is usable.

  Prints whether CUDA is available, then returns ``torch.device("cuda")`` if
  it is and ``torch.device("cpu")`` otherwise, so the notebook also runs on
  machines without a GPU instead of failing when tensors are first moved.

  Returns:
    torch.device: the device to place the model and batches on.
  """
  cuda_available = torch.cuda.is_available()
  print(f"cuda: {cuda_available}")
  device = torch.device("cuda" if cuda_available else "cpu")
  return device
# Resolve the device once and fix PyTorch's RNG seed so runs are repeatable.
device = set_device()
torch.manual_seed(1)
# On a GPU, additionally request deterministic cuDNN behaviour and disable
# cuDNN benchmarking.
if device.type == "cuda":
	torch.backends.cudnn.benchmark = False
	torch.backends.cudnn.deterministic = True

cuda: True


In [46]:
# Build tensors from every column except the first one of each frame
# (presumably the SUBJECT_ID key shown in the previews -- TODO confirm that no
# extra index column precedes it).
feature = torch.tensor(feature_df.iloc[:, 1:].values, dtype=torch.float)
# The targets are fed to nn.BCEWithLogitsLoss, which needs floating-point
# targets of the same shape as the logits; integer targets make the loss raise.
target = torch.tensor(target_df.iloc[:, 1:].values, dtype=torch.float)
dataset = TensorDataset(feature, target)
# 80/20 train/validation split computed from the dataset size, so the lengths
# always sum to len(dataset) as random_split requires (a hard-coded [1, 1]
# split only works for a two-row dataset, for which this yields the same sizes).
train_size = int(0.8 * len(dataset))
valid_size = len(dataset) - train_size
split = [train_size, valid_size]
# A dedicated, seeded generator keeps the split identical across runs.
train_dataset, valid_dataset = torch.utils.data.random_split(dataset, split,
                                                  generator=torch.Generator().manual_seed(42))

In [53]:
# Training configuration.
NUM_EPOCHS = 1
BATCH_SIZE = 1
# Model input/output widths: every column except the first one of each frame.
feature_count = feature_df.shape[1] - 1
target_count = target_df.shape[1] - 1
# Shuffle only the training batches; validation keeps a fixed order.
train_loader = torch.utils.data.DataLoader(
    train_dataset,
    batch_size=BATCH_SIZE,
    shuffle=True,
)
valid_loader = torch.utils.data.DataLoader(
    valid_dataset,
    batch_size=BATCH_SIZE,
    shuffle=False,
)

In [57]:
class PatientReprMlp(nn.Module):
	"""Feed-forward network producing one raw score (logit) per target column.

	Layer stack, in order: Dropout(0.2) -> Linear(feature_count, 1000) -> ReLU
	-> Linear(1000, target_count). No activation is applied to the final layer.

	Args:
		feature_count: width of the input vector; defaults to the notebook-level
			``feature_count`` captured when the class is defined.
		target_count: number of outputs; defaults to the notebook-level
			``target_count`` captured when the class is defined.
	"""

	def __init__(self, feature_count=feature_count,
				 target_count=target_count):
		super().__init__()
		self.input_dim = feature_count
		self.hidden_size = 1000
		self.num_labels = target_count
		self.p = 0.2
		stack = [
			nn.Dropout(self.p),
			nn.Linear(self.input_dim, self.hidden_size),
			nn.ReLU(),
			nn.Linear(self.hidden_size, self.num_labels),
		]
		self.linear_layer = nn.Sequential(*stack)

	def forward(self, x):
		"""Pass ``x`` through the layer stack and return the resulting scores."""
		return self.linear_layer(x)

In [61]:
# Build the model, loss and optimizer, placing the modules on the chosen device.
device = set_device()
model = PatientReprMlp().to(device)
criterion = nn.BCEWithLogitsLoss().to(device)
optimizer = torch.optim.SGD(model.parameters(), lr=1e-1)
# File the training loop writes the best model to.
save_file = 'model.pth'

cuda: True


In [60]:
class AverageMeter(object):
	"""Computes and stores the average and current value.

	Attributes (all set to 0 by ``reset``):
		val: value passed to the most recent ``update`` call.
		sum: weighted sum of every value recorded so far.
		count: total weight recorded so far.
		avg: ``sum / count``, or 0 while ``count`` is still 0.
	"""

	def __init__(self):
		self.reset()

	def reset(self):
		"""Clear all statistics back to zero."""
		self.val = 0
		self.avg = 0
		self.sum = 0
		self.count = 0

	def update(self, val, n=1):
		"""Record ``val`` with weight ``n`` (e.g. a batch mean over ``n`` samples)."""
		self.val = val
		self.sum += val * n
		self.count += n
		# An update with n=0 on a fresh meter would otherwise divide by zero;
		# report 0, which matches the value avg has right after reset().
		self.avg = self.sum / self.count if self.count else 0


def compute_batch_accuracy(output, target):
	"""Computes the accuracy (in percent) for a batch.

	Two target layouts are handled:

	* Multi-label: ``target`` has the same shape as ``output`` (one 0/1
	  indicator per logit, as used with ``BCEWithLogitsLoss``). A logit above 0
	  (i.e. sigmoid above 0.5) counts as a positive prediction and the result is
	  the percentage of individual label entries predicted correctly.
	* Single-label: ``target`` holds one class index per sample. The prediction
	  is the argmax over dimension 1 and the result is the percentage of samples
	  classified correctly.

	Args:
		output: model scores of shape (batch, num_outputs).
		target: ground truth in one of the two layouts above.

	Returns:
		A 0-dim tensor with the accuracy in [0, 100]; callers use ``.item()``.
	"""
	with torch.no_grad():
		if target.shape == output.shape:
			# Comparing an argmax index against an indicator matrix is
			# meaningless (and fails to broadcast for most batch sizes), so
			# threshold every logit and compare element-wise instead.
			pred = (output > 0).to(target.dtype)
			total = target.numel()
		else:
			_, pred = output.max(1)
			total = target.size(0)
		correct = pred.eq(target).sum()

		return correct * 100.0 / total


def train(model, device, data_loader, criterion, optimizer, epoch, print_freq=10):
	"""Run one optimisation pass over ``data_loader``.

	Each batch is moved to ``device``, pushed through ``model``, scored with
	``criterion`` and used for one ``optimizer`` step. A progress line is
	printed every ``print_freq`` batches; ``epoch`` is only used in that line.

	Returns:
		(average loss, average accuracy) over the samples seen in this pass.

	Raises:
		AssertionError: if a batch loss evaluates to NaN.
	"""
	batch_time, data_time = AverageMeter(), AverageMeter()
	losses, accuracy = AverageMeter(), AverageMeter()
	progress_line = ('Epoch: [{0}][{1}/{2}]\t'
					 'Time {batch_time.val:.3f} ({batch_time.avg:.3f})\t'
					 'Data {data_time.val:.3f} ({data_time.avg:.3f})\t'
					 'Loss {loss.val:.4f} ({loss.avg:.4f})\t'
					 'Accuracy {acc.val:.3f} ({acc.avg:.3f})')

	model.train()

	tick = time.time()
	for step, (inputs, labels) in enumerate(data_loader):
		# Time spent waiting for the loader to hand over this batch.
		data_time.update(time.time() - tick)

		# Move the batch to the device; tuple inputs are moved element by
		# element and non-tensor members are passed through untouched.
		if not isinstance(inputs, tuple):
			inputs = inputs.to(device)
		else:
			inputs = tuple(e.to(device) if type(e) == torch.Tensor else e for e in inputs)
		labels = labels.to(device)

		optimizer.zero_grad()
		scores = model(inputs)
		loss = criterion(scores, labels)
		assert not np.isnan(loss.item()), 'Model diverged with loss = NaN'

		loss.backward()
		optimizer.step()

		# Wall-clock time of the whole iteration, data loading included.
		batch_time.update(time.time() - tick)
		tick = time.time()

		n_samples = labels.size(0)
		losses.update(loss.item(), n_samples)
		accuracy.update(compute_batch_accuracy(scores, labels).item(), n_samples)

		if step % print_freq == 0:
			print(progress_line.format(
				epoch, step, len(data_loader), batch_time=batch_time,
				data_time=data_time, loss=losses, acc=accuracy))

	return losses.avg, accuracy.avg


def evaluate(model, device, data_loader, criterion, print_freq=10):
	"""Score ``model`` on ``data_loader`` without updating its weights.

	The model is switched to eval mode and gradients are disabled. A progress
	line is printed every ``print_freq`` batches.

	Predictions stored in ``results`` follow the target layout:

	* Multi-label (``target`` has the same shape as the model output, as used
	  with ``BCEWithLogitsLoss``): ``y_pred`` is a 0/1 list per sample, with a
	  logit above 0 (sigmoid above 0.5) counted as positive.
	* Single-label (one class index per sample): ``y_pred`` is the argmax
	  class index.

	Returns:
		(average loss, average accuracy, results) where ``results`` is a list
		of ``(y_true, y_pred)`` pairs, one per evaluated sample.
	"""
	batch_time = AverageMeter()
	losses = AverageMeter()
	accuracy = AverageMeter()

	results = []

	model.eval()

	with torch.no_grad():
		end = time.time()
		for i, (inputs, target) in enumerate(data_loader):

			# Move the batch to the device; tuple inputs are moved element by
			# element and non-tensor members are passed through untouched.
			if isinstance(inputs, tuple):
				inputs = tuple(e.to(device) if isinstance(e, torch.Tensor) else e for e in inputs)
			else:
				inputs = inputs.to(device)
			target = target.to(device)

			output = model(inputs)
			loss = criterion(output, target)

			# measure elapsed time
			batch_time.update(time.time() - end)
			end = time.time()

			losses.update(loss.item(), target.size(0))
			accuracy.update(compute_batch_accuracy(output, target).item(), target.size(0))

			y_true = target.detach().to('cpu').numpy().tolist()
			scores = output.detach().to('cpu')
			if target.shape == output.shape:
				# Multi-label targets: an argmax index cannot be compared with
				# an indicator vector, so threshold every logit instead.
				y_pred = (scores > 0).long().numpy().tolist()
			else:
				y_pred = scores.max(1)[1].numpy().tolist()
			results.extend(list(zip(y_true, y_pred)))

			if i % print_freq == 0:
				print('Test: [{0}/{1}]\t'
					  'Time {batch_time.val:.3f} ({batch_time.avg:.3f})\t'
					  'Loss {loss.val:.4f} ({loss.avg:.4f})\t'
					  'Accuracy {acc.val:.3f} ({acc.avg:.3f})'.format(
					i, len(data_loader), batch_time=batch_time, loss=losses, acc=accuracy))

	return losses.avg, accuracy.avg, results

In [65]:
# Per-epoch history plus the best validation accuracy seen so far.
best_val_acc = 0.0
train_losses, valid_losses = [], []
train_accuracies, valid_accuracies = [], []

for epoch in range(NUM_EPOCHS):
	# One pass over the training split, then score the validation split.
	train_loss, train_accuracy = train(
		model=model, device=device, data_loader=train_loader,
		criterion=criterion, optimizer=optimizer, epoch=epoch)
	valid_loss, valid_accuracy, valid_results = evaluate(
		model=model, device=device, data_loader=valid_loader,
		criterion=criterion)

	train_losses.append(train_loss)
	train_accuracies.append(train_accuracy)
	valid_losses.append(valid_loss)
	valid_accuracies.append(valid_accuracy)

	# Keep the model with the best validation accuracy (another metric could
	# be used instead); only a strict improvement overwrites the saved file.
	is_best = valid_accuracy > best_val_acc
	if is_best:
		best_val_acc = valid_accuracy
		torch.save(model, save_file)