## Main

In [1]:
# import necessary packages
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import random_split, Dataset, DataLoader
import numpy as np
import time
import os
import torchvision
import matplotlib.pyplot as plt
from sklearn.metrics.ranking import roc_auc_score
from myDataset import trainDataset, otherDataset
from myDataset import tv_collate
#from myModel import LAS
from torch.utils import data
from modelTrainer import ModelTrainer
import pickle
import os
import time
import collections
import torch.nn.functional as F

%matplotlib inline

# Pick the GPU when one is visible to PyTorch, otherwise fall back to the CPU.
# Here `device` is a plain string; later cells rebind it to a torch.device object.
device = "cuda" if torch.cuda.is_available() else "cpu"
print(device)

cuda


In [4]:
# Dataset with augmentation: paths to the pre-split arrays under ./data.
# NOTE(review): `batch_size` is assigned in a later cell of this notebook, so this cell
# fails on a fresh kernel unless that cell has been run first.
# NOTE(review): the "without augmentation" cell further down rebinds train_x_str,
# train_y_str, vali_x_str, vali_y_str, train_dataset and train_dataloader; whichever
# of the two cells ran last decides what the trainer sees.
train_x_str = './data/train_x'
train_y_str = './data/train_y.npy'
vali_x_str = './data/val_x.npy'
vali_y_str = './data/val_y.npy'
test_x_str = './data/test_x.npy'

# with augmentation
# The meaning of the [1,2] argument is defined by myDataset.trainDataset, which is not
# shown here — presumably it selects augmented variants; TODO confirm.
train_dataset = trainDataset(train_x_str, [1,2], train_y_str)
train_dataloader = data.DataLoader(dataset=train_dataset, batch_size=batch_size, 
	collate_fn=tv_collate, shuffle=True, num_workers=4)

In [3]:
# Load the vocabulary mapping used to size the decoder (len(word2idx) becomes vocab_size).
# SECURITY: pickle.load executes arbitrary code from the file; only load a
# word2idx.pickle that you produced yourself.
with open(r'word2idx.pickle','rb') as inputf:
    word2idx = pickle.load(inputf)

In [5]:
# Dataset without augmentation: paths to the raw arrays under ./raw_data.
# This cell rebinds the same path and loader names as the "with augmentation" cell
# above, so running it replaces the augmented training loader.
# NOTE(review): `batch_size` is assigned in a later cell of this notebook.
train_x_str = './raw_data/x_train.npy'
train_y_str = './raw_data/y_train.npy'
vali_x_str = './raw_data/x_vali.npy'
vali_y_str = './raw_data/y_vali.npy'

# without augmentation
train_dataset = otherDataset(train_x_str, train_y_str)
train_dataloader = data.DataLoader(dataset=train_dataset, batch_size=batch_size, 
	collate_fn=tv_collate, shuffle=True, num_workers=4)

In [4]:
# Hyper-parameters shared by the data loaders, the model constructor and the trainer.
batch_size = 32  # samples per DataLoader batch
vocab_size = len(word2idx)  # number of entries in the pickled vocabulary mapping
epoch = 20  # number of passes made by the training loop
# Teacher-forcing threshold: Speller.forward feeds back its own prediction only when
# random.random() > tf_num, so 1 means the ground-truth token is always fed.
tf_num = 1
# Passed to Listener as the input width of its linear projection; the backbone used in
# this notebook is DenseNet-121 despite the "resnet" in the name.
output_dim_resnet = 1024
context_dim = 256 # key, values, query, energy, attention should all have this dim.
embedding_dim = 256  # width of the decoder's token embedding
lstmcell_hidden_dim = 512  # hidden size of each of the three decoder LSTM cells

In [5]:
# Validation loader: same collate function as training, but not shuffled.
# Uses whichever vali_x_str / vali_y_str were assigned by the most recently run
# dataset cell.
vali_dataset = otherDataset(vali_x_str, vali_y_str)
vali_dataloader = data.DataLoader(dataset=vali_dataset, batch_size=batch_size, 
	collate_fn=tv_collate, shuffle=False, num_workers=4)

In [10]:
# Build the encoder/decoder model.
# NOTE(review): LAS is defined further down in this notebook (the `myModel` import at
# the top is commented out), so the model-definition cells must be run before this one.
model = LAS(vocab_size, output_dim_resnet, context_dim, embedding_dim, lstmcell_hidden_dim)
# Uncomment the two lines below to resume from a saved checkpoint instead of
# starting from the freshly constructed model.
#cp = torch.load('./experiments/1544067728/model-4.pkl')
#model.load_state_dict(cp['state_dict'])



In [11]:
# Name this run after the current Unix time (whole seconds) and create its output
# directory. os.makedirs creates ./experiments as needed and, with exist_ok=True, does
# not fail when the cell is re-run within the same second or when another process
# created the parent directory between a check and the mkdir.
run_id = str(int(time.time()))
os.makedirs('./experiments/%s' % run_id, exist_ok=True)
print("Saving models, predictions, and generated words to ./experiments/%s" % run_id)

Saving models, predictions, and generated words to ./experiments/1544593722


In [6]:
import torch
import torch.nn as nn

import numpy as np
from torch.autograd import Variable
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
from torch.distributions.gumbel import Gumbel
import torchvision
import math
import random


class MLP(nn.Module):
	"""A single linear projection from ``input_dim`` to ``output_dim`` features.

	``hidden_dim`` is accepted for call compatibility but is not used: the module
	holds one ``nn.Linear`` layer named ``fc1`` and nothing else.
	"""

	def __init__(self, input_dim, output_dim, hidden_dim=1024):
		super().__init__()
		self.fc1 = nn.Linear(in_features=input_dim, out_features=output_dim)

	def forward(self, x):
		"""Apply the linear layer to the last dimension of ``x``."""
		return self.fc1(x)

In [9]:
import collections 
# Load the pretrained classifier checkpoint and rename its dense-layer parameters so
# they can be matched by name against a DenseNet state dict.
# map_location='cpu' lets the checkpoint load on a machine without a GPU; the tensors
# are copied into the model's own parameters later by load_state_dict.
checkpoint = torch.load('chestxnet_batch_75_best.pth.tar', map_location='cpu')
pretrained_dict = collections.OrderedDict()
for key, value in checkpoint['state_dict'].items():
    # Drop the first 19 characters of the key — presumably a wrapper prefix such as
    # "module.densenet121."; TODO confirm against the checkpoint.
    parts = key[19:].split('.')
    if len(parts) < 6:
        # Keys with fewer than six dot-separated components cannot be remapped.
        # The previous bare `except: pass` skipped exactly these keys (they raised
        # IndexError); the skip is kept but made explicit so real errors surface.
        # NOTE(review): this means only six-component keys are carried over; shorter
        # keys are left out of pretrained_dict — confirm that is intended.
        continue
    # Fuse components 3 and 4 and keep component 5 as the parameter name, e.g.
    # a.b.c.norm.1.weight -> a.b.c.norm1.weight
    new_key = '.'.join(parts[:3] + [parts[3] + parts[4], parts[5]])
    pretrained_dict[new_key] = value

## Model Architecture

In [10]:
# visual features
class Listener(nn.Module):
	"""Image encoder: DenseNet-121 feature layers followed by a linear projection.

	The input is treated as two images placed side by side along the last axis.
	Each half is encoded by the same DenseNet, the two feature maps are joined
	along the width, and every spatial position is projected to ``output_dim``.
	"""
	def __init__(self, hidden_dim, output_dim, pretrained_dict = pretrained_dict):
		"""Build the backbone and load matching pretrained weights.

		hidden_dim: input width of the linear projection; it must equal the
			channel count produced by the backbone.
		output_dim: width of each projected feature vector.
		pretrained_dict: name -> tensor mapping. The default is the module-level
			``pretrained_dict`` as it exists when this class statement is executed.
		"""
		super(Listener, self).__init__()
		model = torchvision.models.densenet121(pretrained=False)
		model_dict = model.state_dict()

		# 1. keep only the checkpoint entries whose names exist in this DenseNet
		pretrained_dict = {k: v for k, v in pretrained_dict.items() if k in model_dict}
		# 2. overwrite entries in the existing state dict
		model_dict.update(pretrained_dict) 
		# 3. load the new state dict
		model.load_state_dict(model_dict)

		# Keep every child module except the last one (the classification head).
		# NOTE(review): torchvision's DenseNet.forward applies a ReLU after the feature
		# layers; that step is not reproduced here — confirm this is intended.
		self.dense121 = nn.Sequential(*list(model.children())[:-1])
		self.mlp = nn.Linear(hidden_dim, output_dim)

	def forward(self, x):
		"""Encode ``x`` and return one projected vector per spatial position.

		Returns a tensor of shape (N, H*W, output_dim), where H and W are the
		height and width of the concatenated feature map.
		"""
		# Split along the last axis at column 224: left image and right image.
		x1 = x[:, :, :, :224]
		x2 = x[:, :, :, 224:]    
		# Both halves go through the same backbone. For 224x224 halves DenseNet-121
		# is expected to give (N, 1024, 7, 7) per half — TODO confirm.
		x1 = self.dense121(x1)
		x2 = self.dense121(x2)
		# Join the two feature maps along the width: (N, C, H, 2*W_half).
		x = torch.cat((x1, x2), dim = -1)        
		x = x.permute(0,2,3,1) # channels last: (N, H, W, C)
		values = self.mlp(x) # (N, H, W, output_dim)
		bs, h, w, c = values.shape
		values = values.view(bs, -1, c) # (N, H*W, output_dim); 98 positions for a 7x14 map
		return values

In [7]:
class Attention(nn.Module):
	"""Parameter-free dot-product attention over a set of value vectors."""

	def __init__(self):
		super().__init__()

	def forward(self, query, values):
		"""Attend over ``values`` with a single query per batch element.

		query:  (N, C) tensor.
		values: (N, L, C) tensor; it serves as both keys and values.

		Returns ``(context, weights)`` where ``context`` is the (N, C)
		weighted sum of the values and ``weights`` is the (N, 1, L) softmax
		distribution over the L positions.
		"""
		keys_t = values.transpose(1, 2)                   # (N, C, L)
		scores = torch.bmm(query.unsqueeze(1), keys_t)    # (N, 1, L)
		weights = F.softmax(scores, dim=-1)               # normalise over positions
		context = torch.bmm(weights, values).squeeze(1)   # (N, C)
		return context, weights

class Speller(nn.Module):
	"""Attention-based decoder: token embedding, three stacked LSTM cells and an
	output layer over the vocabulary.

	At every step the previous token's embedding is concatenated with the current
	attention context and fed through the three cells; the top cell's hidden state
	is projected to a query, a new context is attended from ``values``, and
	``[h3, context]`` is mapped to vocabulary logits.

	The initial hidden and cell states of all three cells are learned parameters
	(``h1``..``c3``), expanded to the batch size at the start of every decode.
	"""

	def __init__(self, vocab_size, context_dim, embedding_dim, lstmcell_hidden_dim):
		super(Speller, self).__init__()

		self.embedding = nn.Embedding(vocab_size, embedding_dim)
		self.lstmcell1 = nn.LSTMCell(context_dim+embedding_dim, lstmcell_hidden_dim)
		self.lstmcell2 = nn.LSTMCell(lstmcell_hidden_dim, lstmcell_hidden_dim)
		self.lstmcell3 = nn.LSTMCell(lstmcell_hidden_dim, lstmcell_hidden_dim)
		self.queryProjection = nn.Linear(lstmcell_hidden_dim, context_dim)
		self.characterDistribution = nn.Linear(lstmcell_hidden_dim+context_dim, vocab_size)
		self.attention = Attention()
		# Learned initial (hidden, cell) state for each of the three LSTM cells.
		self.h1 = nn.Parameter(torch.zeros(1, lstmcell_hidden_dim))
		self.c1 = nn.Parameter(torch.zeros(1, lstmcell_hidden_dim))
		self.h2 = nn.Parameter(torch.zeros(1, lstmcell_hidden_dim))
		self.c2 = nn.Parameter(torch.zeros(1, lstmcell_hidden_dim))
		self.h3 = nn.Parameter(torch.zeros(1, lstmcell_hidden_dim))
		self.c3 = nn.Parameter(torch.zeros(1, lstmcell_hidden_dim))
		# Scratch lists refilled by every decode call.
		self.result = []
		self.attention_map = []
		self.vocab_size = vocab_size

	def _initial_state(self, batch_size):
		"""Return (h1, c1, h2, c2, h3, c3) expanded to ``batch_size`` rows."""
		return tuple(
			p.expand(batch_size, -1)
			for p in (self.h1, self.c1, self.h2, self.c2, self.h3, self.c3)
		)

	def _step(self, embedded, context, state, values):
		"""Run one decoder step.

		embedded: embedding of the input token, (N, embedding_dim).
		context:  attention context computed from the current ``h3``.
		state:    (h1, c1, h2, c2, h3, c3).
		values:   encoder outputs attended over.

		Returns ``(logits, new_context, attention, new_state)``.
		"""
		h1, c1, h2, c2, h3, c3 = state
		cell_input = torch.cat((embedded, context), dim=1)
		h1, c1 = self.lstmcell1(cell_input, (h1, c1))
		h2, c2 = self.lstmcell2(h1, (h2, c2))
		h3, c3 = self.lstmcell3(h2, (h3, c3))
		query = self.queryProjection(h3)
		context, attention_distribution = self.attention(query, values)
		logits = self.characterDistribution(torch.cat((h3, context), dim=1))
		return logits, context, attention_distribution, (h1, c1, h2, c2, h3, c3)

	def forward(self, values, y, tf_num):
		"""Decode a batch with (optional) teacher forcing.

		values: encoder outputs, (N, L, context_dim).
		y:      target token ids, (N, T) integer tensor.
		tf_num: teacher-forcing threshold. At each step after the first, the
			model's own previous argmax is fed instead of the ground-truth token
			when ``random.random() > tf_num``; ``tf_num >= 1`` therefore always
			feeds the ground truth.

		Returns ``(result, attention_map)``: ``result`` is an (N, T, vocab_size)
		tensor whose first time step is a one-hot row for token 0 and whose
		remaining steps are logits; ``attention_map`` is an (N, T, L) NumPy array.
		"""
		state = self._initial_state(values.shape[0])

		self.result = []
		self.attention_map = []

		# Position 0 is a fixed one-hot "start" row rather than a model output.
		# It is created on the same device as the encoder output instead of relying
		# on a module-level `device` variable.
		start_symbol = torch.zeros(y.shape[0], self.vocab_size, device=values.device)
		start_symbol[:, 0] = 1
		self.result.append(start_symbol)

		embedding_y = self.embedding(y)

		context, attention_distribution = self.attention(self.queryProjection(state[4]), values)
		self.attention_map.append(attention_distribution)

		teacher_force = None
		for i in range(y.shape[1] - 1):
			# Draw on every step (including the first) so the random stream is
			# consumed exactly once per step.
			use_own_prediction = random.random() > tf_num
			if use_own_prediction and i > 0:
				embedding_yi = self.embedding(teacher_force)
			else:
				embedding_yi = embedding_y[:, i, :]
			output, context, attention_distribution, state = self._step(
				embedding_yi, context, state, values)
			self.result.append(output)
			self.attention_map.append(attention_distribution)
			_, teacher_force = output.max(1)
		result = torch.stack(self.result).permute(1, 0, 2)
		attention_map = torch.cat(self.attention_map, dim=1)
		attention_map = attention_map.detach().cpu().numpy()
		return result, attention_map

	def beam_decoder(self, values, x_lengths, beam_width, max_length=399, eos_idx=1667):
		"""Beam search for a single example (``values`` must have batch size 1,
		because one token is fed per step).

		values:     encoder outputs, (1, L, context_dim).
		x_lengths:  unused; kept for call compatibility.
		beam_width: number of hypotheses kept after each step.
		max_length: maximum number of decoding steps.
		eos_idx:    token id that marks a finished hypothesis.

		Hypotheses are scored by their mean per-token log-probability. Returns
		the best hypothesis as a list ``[tokens, score, h1, c1, h2, c2, h3, c3]``.
		"""
		state = self._initial_state(values.shape[0])
		sequence = [[list(), 0] + list(state)]

		for count in range(max_length):
			candidate_set = []
			expanded_any = False
			for entry in sequence:
				seq, prob = entry[0], entry[1]
				state = tuple(entry[2:])
				y = 0 if count == 0 else seq[-1]
				if y == eos_idx and count != 0:
					# Finished hypothesis: carry it over unchanged.
					candidate_set.append(entry)
					continue
				expanded_any = True
				tensor_y = torch.tensor([[y]], device=values.device)
				embedding_y = self.embedding(tensor_y).squeeze(1)
				context, _ = self.attention(self.queryProjection(state[4]), values)
				output, _, _, new_state = self._step(embedding_y, context, state, values)
				output = F.log_softmax(output, dim=1)
				# detach() so this also works when autograd is enabled.
				output = output.view(output.shape[1],).detach().cpu().numpy()
				for k in range(len(output)):
					if len(seq) != 0:
						# Running mean of the per-token log-probabilities.
						new_prob = (output[k] + (prob * len(seq))) / (len(seq) + 1)
					else:
						new_prob = output[k]
					candidate_set.append([seq + [k], new_prob] + list(new_state))
			if not expanded_any:
				# Every hypothesis has ended; further iterations would only
				# re-sort the same list.
				break
			# Keep plain Python lists: converting these ragged records (lists,
			# floats and tensors) to a NumPy array fails on current NumPy.
			candidate_set.sort(key=lambda cand: cand[1], reverse=True)
			sequence = candidate_set[:beam_width]
		return sequence[0]

	def greedy_decoder(self, values, x_lengths, y, max_length=299, eos_idx=1667):
		"""Decode a single example by sampling one token per step.

		Despite the name, each token is drawn from the softmax distribution with
		``np.random.choice`` rather than taken as the argmax, so the output is
		random unless NumPy's global seed is fixed.

		values:     encoder outputs, (1, L, context_dim).
		x_lengths:  unused; kept for call compatibility.
		y:          id of the first input token.
		max_length: maximum number of tokens to generate.
		eos_idx:    token id that stops decoding once it has been generated.

		Returns ``(tokens, attentions)``: ``tokens`` starts with 0 followed by
		the sampled ids; ``attentions`` holds one NumPy attention array per
		generated token.
		"""
		state = self._initial_state(values.shape[0])

		self.result = [0]
		self.attentions = []
		for count in range(max_length):
			if y == eos_idx and count != 0:
				break
			tensor_y = torch.tensor([[y]], device=values.device)
			embedding_y = self.embedding(tensor_y).squeeze(1)
			context, _ = self.attention(self.queryProjection(state[4]), values)
			output, _, attention_distribution, state = self._step(
				embedding_y, context, state, values)
			prob = F.softmax(output, dim=-1).squeeze()
			y = np.random.choice(self.vocab_size, p=prob.cpu().detach().numpy())
			self.result.append(y)
			self.attentions.append(attention_distribution.cpu().detach().numpy())
		return self.result, self.attentions

class LAS(nn.Module):
	"""Encoder/decoder wrapper: a ``Listener`` feeding a ``Speller``.

	Every entry point first encodes ``x`` with the listener and then hands the
	encoded values to the matching speller method.
	"""

	def __init__(self, vocab_size, output_dim_resnet, context_dim, embedding_dim, lstmcell_hidden_dim):
		super().__init__()
		self.listener = Listener(output_dim_resnet, context_dim)
		self.speller = Speller(vocab_size, context_dim, embedding_dim, lstmcell_hidden_dim)

	def forward(self, x, y, tf_num):
		"""Encode ``x`` and return whatever the speller's forward pass returns."""
		return self.speller(self.listener(x), y, tf_num)

	def beam_decoder(self, x, x_lengths, y):
		"""Encode ``x`` and run the speller's beam search.

		``y`` is passed through as the speller's third positional argument,
		which the speller names ``beam_width``.
		"""
		encoded = self.listener(x)
		return self.speller.beam_decoder(encoded, x_lengths, y)

	def greedy_decoder(self, x, x_lengths, y):
		"""Encode ``x`` and return the speller's ``(prediction, attention)`` pair."""
		encoded = self.listener(x)
		tokens, attention = self.speller.greedy_decoder(encoded, x_lengths, y)
		return tokens, attention


def init_weights(m):
	"""Re-initialise the weights of ``m`` in place according to its exact type.

	- ``nn.Linear``: Xavier-uniform on ``weight``.
	- ``nn.Conv2d``: Kaiming-uniform on ``weight``.
	- ``nn.LSTM``:   Xavier-uniform on every parameter whose name contains
	  ``'weight'``.

	Any other module type (including subclasses of the above and
	``nn.LSTMCell``) is left untouched. A message is printed for every tensor
	that is initialised. Intended for use with ``module.apply(init_weights)``.
	"""
	kind = type(m)
	if kind is nn.Linear:
		print("initilziing linear.....")
		torch.nn.init.xavier_uniform_(m.weight)
	elif kind is nn.Conv2d:
		print("initilziing cnn.....")
		torch.nn.init.kaiming_uniform_(m.weight)
	elif kind is nn.LSTM:
		weight_tensors = (p for name, p in m.named_parameters() if 'weight' in name)
		for param in weight_tensors:
			print("initilziing lstm.....")
			torch.nn.init.xavier_uniform_(param)
## Training

In [None]:
# Train for `epoch` epochs, saving after every epoch.
# ModelTrainer comes from the modelTrainer module, which is not shown in this
# notebook. Judging by the method names and the log printed below, train_val_epoch()
# presumably runs one training pass followed by one validation pass, and save()
# writes a checkpoint under ./experiments/<run_id> — confirm against that module.
trainer = ModelTrainer(model, train_dataloader, vali_dataloader, epoch, tf_num, run_id)

for i in range(epoch):
	trainer.train_val_epoch()
	trainer.save()

[TRAIN]  Epoch [1/20]   Loss: 4.2064 
[TRAIN]  Epoch [1/20]   Perplexity: 67.1166 
[VAL]  Epoch [1/20]   Loss: 2.9883
[VAL]  Epoch [1/20]   Perplexity: 19.8524
[TRAIN]  Epoch [2/20]   Loss: 2.9754 
[TRAIN]  Epoch [2/20]   Perplexity: 19.5981 
[VAL]  Epoch [2/20]   Loss: 2.5931
[VAL]  Epoch [2/20]   Perplexity: 13.3717
[TRAIN]  Epoch [3/20]   Loss: 2.6389 
[TRAIN]  Epoch [3/20]   Perplexity: 13.9980 
[VAL]  Epoch [3/20]   Loss: 2.4956
[VAL]  Epoch [3/20]   Perplexity: 12.1296
[TRAIN]  Epoch [4/20]   Loss: 2.2216 
[TRAIN]  Epoch [4/20]   Perplexity: 9.2222 
[VAL]  Epoch [4/20]   Loss: 2.4800
[VAL]  Epoch [4/20]   Perplexity: 11.9409
[TRAIN]  Epoch [5/20]   Loss: 1.9264 
[TRAIN]  Epoch [5/20]   Perplexity: 6.8648 
[VAL]  Epoch [5/20]   Loss: 2.4738
[VAL]  Epoch [5/20]   Perplexity: 11.8679
[TRAIN]  Epoch [6/20]   Loss: 2.0642 
[TRAIN]  Epoch [6/20]   Perplexity: 7.8786 
[VAL]  Epoch [6/20]   Loss: 2.5150
[VAL]  Epoch [6/20]   Perplexity: 12.3664
Epoch     6: reducing learning rate of grou

## Generator

In [14]:
import numpy as np
import os
from testDataset import testDataset
from testDataset import test_collate
from torch.utils import data
import torch
from torch.autograd import Variable
from torch import nn
import torch.optim as optim
import time
import pickle

# gcloud: use the first CUDA device when available, otherwise the CPU.
# This rebinds `device` to a torch.device object.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print(device)

cuda:0


In [11]:
# gcloud: use the first CUDA device when available, otherwise the CPU
# (repeats the device selection from the previous cell).
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print(device)
# Load the raw test inputs and their reference token sequences into memory; they are
# used by the visualisation and CSV cells further down.
test_x_str = './raw_data/x_test.npy'
test_y_str = './raw_data/y_test.npy'
test_y = np.load(test_y_str)
test_X = np.load(test_x_str)

# SECURITY: pickle.load executes arbitrary code from the file; only load a
# word2idx.pickle that you produced yourself.
with open(r'word2idx.pickle','rb') as inputf:
    word2idx = pickle.load(inputf)
# Inverse mapping (index -> word) used to turn predicted ids back into text.
idx2word = {idx:word for word, idx in word2idx.items()}

cuda:0


In [22]:
# Inference-time settings. The model dimensions repeat the values from the training
# configuration cell and must match the checkpoint that is loaded below; only
# batch_size differs.
# One example per batch: the decoders in Speller feed a single token per step.
batch_size = 1
vocab_size = len(word2idx)  # number of entries in the pickled vocabulary mapping
epoch = 20  # not used by the inference cells visible in this notebook
tf_num = 1  # not used by the inference cells visible in this notebook
# Passed to Listener as the input width of its linear projection.
output_dim_resnet = 1024
context_dim = 256 # key, values, query, energy, attention should all have this dim.
embedding_dim = 256  # width of the decoder's token embedding
lstmcell_hidden_dim = 512  # hidden size of each of the three decoder LSTM cells

In [16]:
# Test loader: unshuffled so that the i-th prediction lines up with test_y[i];
# num_workers=0 loads batches in the main process.
test_dataset = testDataset(test_x_str)
test_dataloader = data.DataLoader(dataset=test_dataset, batch_size=batch_size, 
	collate_fn=test_collate, shuffle=False, num_workers=0)

(403, 3, 224, 448)


In [12]:
# Rebuild the model and restore the weights saved by a specific earlier run
# (run id 1544589762, file model-7.pkl); edit the path to evaluate another checkpoint.
model = LAS(vocab_size, output_dim_resnet, context_dim, embedding_dim, lstmcell_hidden_dim)
# NOTE(review): no map_location is given, so a checkpoint written from a GPU
# presumably needs a GPU to load — confirm before running on a CPU-only machine.
cp = torch.load('./experiments/1544589762/model-7.pkl')
model.load_state_dict(cp['state_dict'])
model.to(device)
model.eval()  # inference mode for dropout / batch-norm layers
# Accumulator for the decoded token sequences, one entry per test example.
result = []



In [None]:
# Decode every test example with the model's sampling decoder and collect the
# predicted token ids together with the per-step attention weights.
model.eval()
# Both accumulators are (re)created here so that re-running this cell starts from
# empty lists; otherwise result[i] would stop lining up with test_y[i].
result = []
attentions = []
with torch.no_grad():
	for count, (x, x_lengths) in enumerate(test_dataloader, start=1):
		# The collate function yields NumPy arrays; torch.from_numpy already returns
		# a tensor, so the legacy Variable wrapper is not needed.
		x = torch.from_numpy(x).to(device)
		x_lengths = torch.from_numpy(x_lengths).to(device)
		# 0 is the id of the first token fed to the decoder.
		pred, attention = model.greedy_decoder(x, x_lengths, 0)
		print(count)  # progress: number of examples decoded so far
		result.append(pred)
		attentions.append(attention)
# Reference token sequences, used by the CSV export cell.
gt = test_y

In [31]:
# Persist the decoded token sequences.
# NOTE(review): `result` is a list of Python lists whose lengths can differ (decoding
# stops early at the end token). Recent NumPy versions refuse to build an array from
# such ragged input unless dtype=object is requested — confirm this call still works
# with the installed NumPy.
np.save('Prediction.npy', result)

## Visualization

In [None]:
def rgb2gray(rgb):
    """Collapse the last axis of a channels-last image to a single grey channel.

    Only the first three entries of the last axis are used; they are combined
    with the weights 0.299, 0.587 and 0.114. The result has the shape of
    ``rgb`` without its last axis.
    """
    luma_weights = [0.299, 0.587, 0.114]
    colour_channels = rgb[..., :3]
    return np.dot(colour_channels, luma_weights)

In [None]:
def toString(pred):
    """Translate a sequence of token ids into a space-separated string.

    Each id is looked up in the module-level ``idx2word`` mapping; an id that
    is missing from the mapping raises ``KeyError``.
    """
    return " ".join(idx2word[idx] for idx in pred)

In [None]:
from scipy.interpolate import Rbf
def smoothen_attention(attention, eps=64):
    """Upsample a 7x14 attention grid to a 224x448 map with RBF interpolation.

    The 98 attention values are placed at the centres of 32x32-pixel cells
    (coordinates 16, 48, ...), an ``Rbf`` interpolant with ``epsilon=eps`` is
    fitted through them, and the interpolant is evaluated at every pixel.

    attention: array with 98 elements in row-major (7 rows x 14 columns) order.
    Returns a (224, 448) array.
    """
    cell_rows, cell_cols = np.indices((7, 14))
    centre_y = cell_rows.reshape(-1) * 32 + 16
    centre_x = cell_cols.reshape(-1) * 32 + 16
    rbf = Rbf(centre_y, centre_x, attention.reshape(-1), epsilon=eps)

    pixel_y, pixel_x = np.indices((224, 448))
    z = rbf(pixel_y.reshape(-1), pixel_x.reshape(-1))
    return z.reshape(224, 448)

In [None]:
def plot_attention(img, pred, attention):
    """Print the decoded sentence and plot the raw attention grid of every step.

    img:       not used; accepted so the signature matches ``plot_img``.
    pred:      sequence of predicted token ids (printed via ``toString``).
    attention: sequence of per-step attention arrays, each with 98 elements.

    One subplot is drawn per step, showing the weights as a 7x14 image on a
    fixed 0..1 colour scale so that steps are comparable.
    """
    print(toString(pred))
    n_steps = len(attention)
    plt.figure(figsize=(6, 3 * n_steps))
    for step, weights in enumerate(attention):
        plt.subplot(n_steps, 1, step + 1)
        plt.imshow(weights.reshape(7, 14), vmin=0, vmax=1)
        plt.colorbar()

In [None]:
def plot_img(img, pred, attention):
    """Print the decoded sentence and overlay each step's attention on the image.

    img:       channels-first image array; it is transposed to channels-last
               and converted to grey with ``rgb2gray``.
    pred:      sequence of predicted token ids (printed via ``toString``).
    attention: sequence of per-step attention arrays, each with 98 elements.

    One subplot is drawn per step: half of the grey image plus half of the
    grey image multiplied by the smoothed attention map, shown on the grey
    image's own min..max intensity range.
    """
    print(toString(pred))
    n_steps = len(attention)
    plt.figure(figsize=(6, 3 * n_steps))
    grey = rgb2gray(img.transpose(1, 2, 0))
    vmax, vmin = np.amax(grey), np.amin(grey)
    grey_cmap = plt.get_cmap('gray')
    for step in range(n_steps):
        plt.subplot(n_steps, 1, step + 1)
        smoothed_attention = smoothen_attention(attention[step])
        blended = grey * 0.5 + 0.5 * np.multiply(grey, smoothed_attention)
        plt.imshow(blended, vmax=vmax, vmin=vmin, cmap=grey_cmap)
        plt.colorbar()

In [None]:
# Spot check: print the reference sentence and the model's prediction for one
# test example (index 60).
i = 60
print(toString(test_y[i]))
print(toString(result[i]))
# Uncomment to also draw the attention overlay for this example.
#plot_img(test_X[i], result[i][:], attentions[i][:])

In [None]:
# Full visualisation for one test example (index 20): reference sentence, raw
# attention grids, then the attention overlaid on the input image.
i = 20
print(toString(test_y[i]))
plot_attention(test_X[i], result[i][:], attentions[i][:])
plot_img(test_X[i], result[i][:], attentions[i][:])

In [None]:
# Write predictions next to their references in test.csv.
# After the "ID,Predicted" header, each example produces two lines: "<index>," followed
# by the predicted words, then a line holding the reference words from gt[i]. Every
# word is followed by a single space.
with open("test.csv", "w") as fh:
	fh.write("ID,Predicted\n")
	for i, sample in enumerate(result):
		fh.write(str(i) + ",")
		fh.writelines(idx2word[c_index] + ' ' for c_index in sample)
		fh.write("\n")

		fh.writelines(idx2word[c_index] + ' ' for c_index in gt[i])
		fh.write("\n")