In [1]:
import numpy as np
import json
import gensim
from text_prep import *
import torch
import torch.nn as nn
from torch.nn.utils.rnn import pack_sequence, pad_sequence, pad_packed_sequence, pack_padded_sequence
import matplotlib.pyplot as plt
from torch import optim



In [2]:
# Load the SQuAD v2.0 training file into `data`.
# JSON text is UTF-8, so the encoding is passed explicitly; without it the read would use the
# platform's default codec (e.g. cp1252 on Windows) and could mis-decode non-ASCII characters.
# NOTE(review): the absolute, user-specific path makes the notebook non-portable — consider a
# configurable data directory.
with open("C:/Users/blackbak/Documents/github/data/squad_data/train-v2.0.json", encoding="utf-8") as f:
    data = json.load(f)

In [3]:
# Flatten the nested layout (articles -> paragraphs -> qas) into one list holding every
# question string, in file order.
questions = [
    qa["question"]
    for article in data["data"]
    for paragraph in article["paragraphs"]
    for qa in paragraph["qas"]
]

In [4]:
# Build the model used below for token <-> index lookups and for the embedding weights.
# NOTE(review): build_w2v_model is not defined in this notebook (presumably it comes from the
# star import of text_prep); its tokenisation and training settings are not visible here.
model = build_w2v_model(questions)

In [5]:
# Apply sentence2idx(model, sentence) to every question; one result per question, same order.
# NOTE(review): sentence2idx is not defined in this notebook (presumably from text_prep).
# The training loop later evaluates `data+[eos]` on each result, so a list is expected — confirm.
questions_idx = [sentence2idx(model, sentence) for sentence in questions]

In [6]:
# Choose the compute device once: the first CUDA GPU when available, otherwise the CPU.
use_cuda = torch.cuda.is_available()
if use_cuda:
    device = torch.device("cuda:0")
else:
    device = torch.device("cpu:0")

# Turn on the cuDNN benchmark flag.
torch.backends.cudnn.benchmark = True

In [8]:
# NOTE(review): gensim2embedding is not defined in this notebook (presumably from text_prep).
# Later cells read `embedding.weight` and call `embedding(index_tensor)`, so it appears to
# return an nn.Embedding-like module placed on `device` — confirm, including whether the
# weights are frozen.
embedding = gensim2embedding(model, device)

In [41]:
class Discriminator(nn.Module):
    """Bidirectional GRU classifier that scores embedded sequences with a value in (0, 1).

    The final hidden states of the top GRU layer (forward and backward direction) are
    concatenated and passed through a two-layer head: Linear -> SELU -> Linear -> Sigmoid.

    Args:
        embed_dimension: size of each input embedding vector.
        hidden_size: GRU hidden size per direction.
        num_layers: number of stacked GRU layers.
    """

    def __init__(self, embed_dimension, hidden_size, num_layers):
        super(Discriminator, self).__init__()
        self.embed_dimension = embed_dimension
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.rnn = nn.GRU(input_size=embed_dimension, hidden_size=hidden_size, num_layers=num_layers, batch_first=True, bidirectional=True)
        # 2 * hidden_size because the forward and backward states are concatenated.
        self.fc1 = nn.Linear(2*hidden_size, 300)
        self.fc2 = nn.Linear(300, 1)
        self.activation_fc1 = nn.SELU()
        self.activation_fc2 = nn.Sigmoid()

    def forward(self, padded_input, input_lengths, batch_size):
        """Score a padded batch.

        Args:
            padded_input: tensor of shape (batch, seq_len, embed_dimension); rows must be
                ordered by decreasing length (pack_padded_sequence default).
            input_lengths: true length of each row, as a list or a tensor on any device.
            batch_size: number of rows in `padded_input`; used to reshape the hidden state.

        Returns:
            Tensor of shape (batch_size, 1) with sigmoid outputs.
        """
        # pack_padded_sequence requires the lengths on the CPU, while callers in this
        # notebook create them on the training device, so normalise them here.
        lengths = torch.as_tensor(input_lengths).cpu()
        packed_input = pack_padded_sequence(padded_input, lengths, batch_first=True)
        _, last_hidden = self.rnn(packed_input)
        # last_hidden is (num_layers * 2, batch, hidden): move batch first, then keep the
        # last two entries, i.e. the top layer's forward and backward final states.
        last_hidden = last_hidden.permute(1, 0, 2)
        last_hidden = last_hidden[:, -2:, :].reshape([batch_size, -1])
        fc1 = self.activation_fc1(self.fc1(last_hidden))
        discriminator_output = self.activation_fc2(self.fc2(fc1))
        return discriminator_output

In [106]:
class Discriminator(nn.Module):
    """Bidirectional GRU classifier that scores embedded sequences with a value in (0, 1).

    Each sequence is summarised by the final hidden states of the top GRU layer (forward
    and backward direction, concatenated) and passed through Linear -> SELU -> Linear ->
    Sigmoid.

    Args:
        embed_dimension: size of each input embedding vector.
        hidden_size: GRU hidden size per direction.
        num_layers: number of stacked GRU layers.
    """

    def __init__(self, embed_dimension, hidden_size, num_layers):
        super(Discriminator, self).__init__()
        self.embed_dimension = embed_dimension
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.rnn = nn.GRU(input_size=embed_dimension, hidden_size=hidden_size, num_layers=num_layers, batch_first=True, bidirectional=True)
        # 2 * hidden_size because the forward and backward states are concatenated.
        self.fc1 = nn.Linear(2*hidden_size, 300)
        self.fc2 = nn.Linear(300, 1)
        self.activation_fc1 = nn.SELU()
        self.activation_fc2 = nn.Sigmoid()

    def forward(self, padded_input, input_lengths, batch_size):
        """Score a padded batch.

        Args:
            padded_input: tensor of shape (batch, seq_len, embed_dimension); rows must be
                ordered by decreasing length (pack_padded_sequence default).
            input_lengths: true length of each row, as a list or a tensor on any device.
            batch_size: accepted for interface compatibility; the batch dimension is
                taken from the GRU state itself.

        Returns:
            Tensor of shape (batch, 1) with sigmoid outputs.
        """
        # pack_padded_sequence requires the lengths on the CPU, while callers in this
        # notebook create them on the training device, so normalise them here.
        lengths = torch.as_tensor(input_lengths).cpu()
        packed_input = pack_padded_sequence(padded_input, lengths, batch_first=True)
        _, last_hidden = self.rnn(packed_input)
        # Summarise with the final hidden states instead of the padded output at the last
        # time step: that position is zero padding for every sequence shorter than the
        # longest one, and for the backward direction it has only seen a single token.
        # last_hidden is (num_layers * 2, batch, hidden); rows -2 and -1 are the top
        # layer's forward and backward states.
        sequence_summary = torch.cat([last_hidden[-2], last_hidden[-1]], dim=1)
        fc1 = self.activation_fc1(self.fc1(sequence_summary))
        discriminator_output = self.activation_fc2(self.fc2(fc1))
        return discriminator_output

In [70]:
class Generator(nn.Module):
    """Autoregressive GRU that emits a sequence of embedding-sized vectors from noise.

    Args:
        noise_dim: length of the Gaussian noise vector drawn on every forward call.
        embed_dimension: embedding size; also used as the GRU hidden size.
        num_layers: number of stacked GRU layers.
        embedding: module whose `weight` rows are compared against each GRU output.
        eos: value the chosen vocabulary index is compared with to stop generation.
        sos: tensor fed as the very first GRU input (reshaped to 1 x 1 x -1).
    """

    def __init__(self, noise_dim, embed_dimension, num_layers, embedding, eos, sos):
        super().__init__()
        self.noise_dim = noise_dim
        self.embed_dimension = embed_dimension
        self.hidden_size = embed_dimension
        self.num_layers = num_layers
        self.sos_token = sos
        self.eos_token = eos
        self.embedding = embedding
        self.noise2hidden = nn.Linear(noise_dim, num_layers * embed_dimension)
        self.tanh = nn.Tanh()
        self.rnn = nn.GRU(
            input_size=embed_dimension,
            hidden_size=embed_dimension,
            num_layers=num_layers,
            batch_first=True,
        )

    def most_similar(self, emb_input):
        """Return the index of the embedding row with the highest cosine similarity."""
        query = emb_input.squeeze()
        scores = nn.CosineSimilarity(dim=1)(self.embedding.weight, query)
        _, best = torch.max(scores, 0)
        return best

    def forward(self, device):
        """Generate one sequence.

        The first step consumes the sos tensor; every later step consumes the previous
        GRU output. One unconditional step is followed by at most 20 more, and the
        end-of-sequence comparison is only made on those later steps.

        Returns:
            (outputs, indices): the GRU outputs concatenated along dim 0, and the list of
            nearest vocabulary indices, one per step.
        """
        noise = torch.randn(self.noise_dim).view(1, 1, -1).to(device)
        hidden = self.tanh(self.noise2hidden(noise)).view(
            [self.num_layers, 1, self.embed_dimension]
        )
        step_input = self.sos_token.view(1, 1, -1)
        emitted = []
        emitted_idx = []
        # Step 0 is the sos-driven step; steps 1..20 are the autoregressive ones.
        for step in range(21):
            step_input, hidden = self.rnn(step_input, hidden)
            token = self.most_similar(step_input)
            emitted.append(step_input)
            emitted_idx.append(token)
            if step > 0 and token == self.eos_token:
                break
        return torch.cat(emitted), emitted_idx


In [11]:
class Generator(nn.Module):
    """Single-step GRU generator whose first hidden state can be derived from noise.

    Args:
        noise_dim: size of the noise vector accepted by `forward`.
        embed_dimension: embedding size; also used as the GRU hidden size.
        num_layers: number of stacked GRU layers.
    """

    def __init__(self, noise_dim, embed_dimension, num_layers):
        super().__init__()
        self.noise_dim = noise_dim
        self.embed_dimension = embed_dimension
        self.hidden_size = embed_dimension
        self.num_layers = num_layers
        self.noise2hidden = nn.Linear(noise_dim, num_layers * embed_dimension)
        self.tanh = nn.Tanh()
        self.rnn = nn.GRU(
            input_size=embed_dimension,
            hidden_size=embed_dimension,
            num_layers=num_layers,
            batch_first=True,
        )

    def forward(self, o, noise=None, h=None):
        """Run the GRU on `o` and return `(output, hidden)`.

        When `h` is omitted, the starting hidden state is tanh(noise2hidden(noise))
        reshaped to (num_layers, 1, embed_dimension); `noise` must then be provided.
        """
        if h is None:
            # First step: project the noise into the initial hidden state.
            h = self.tanh(self.noise2hidden(noise)).view(
                [self.num_layers, 1, self.embed_dimension]
            )
        return self.rnn(o, h)


In [109]:
class Generator(nn.Module):
    """Single-step GRU generator; the caller drives the autoregressive loop.

    Args:
        noise_dim: size of the noise vector accepted by `init_hidden`.
        embed_dimension: embedding size; also used as the GRU hidden size.
        num_layers: number of stacked GRU layers.
    """

    def __init__(self, noise_dim, embed_dimension, num_layers):
        super().__init__()
        self.noise_dim = noise_dim
        self.embed_dimension = embed_dimension
        self.hidden_size = embed_dimension
        self.num_layers = num_layers
        self.noise2hidden = nn.Linear(noise_dim, num_layers * embed_dimension)
        self.tanh = nn.Tanh()
        # NOTE(review): the author remarked that batch_first "does not work" for the
        # autoregressive use, yet the flag is set; with 1 x 1 x D inputs both layouts
        # coincide — confirm before feeding larger batches or longer inputs.
        self.rnn = nn.GRU(
            input_size=embed_dimension,
            hidden_size=embed_dimension,
            num_layers=num_layers,
            batch_first=True,
        )

    def forward(self, o, h):
        """Advance the GRU by the given input and return `(output, hidden)`."""
        return self.rnn(o, h)

    def init_hidden(self, noise):
        """Map `noise` to an initial hidden state of shape (num_layers, 1, embed_dimension)."""
        projected = self.noise2hidden(noise)
        return self.tanh(projected).view([self.num_layers, 1, self.embed_dimension])


In [110]:
def most_similar(embedding, emb_input):
    """Return the index of the embedding row with the largest dot product with `emb_input`.

    Args:
        embedding: module exposing a 2-D `weight` matrix (one row per vocabulary entry).
        emb_input: tensor that squeezes to a single vector of the embedding size.

    Returns:
        0-dim tensor holding the winning row index.
    """
    query = emb_input.squeeze()
    # Plain (unnormalised) dot product against every vocabulary vector.
    scores = torch.mv(embedding.weight, query)
    return torch.max(scores, 0)[1]

In [111]:
def train(generator, discriminator, input_sequence, embedding, generator_optimizer, discriminator_optimizer, criterion, eos):
    """Run one adversarial update (discriminator step, then generator step) on one sequence.

    Args:
        generator: module exposing `noise_dim`, `init_hidden(noise)` and a
            `(input, hidden) -> (output, hidden)` forward.
        discriminator: module called with `padded_input`, `input_lengths`, `batch_size`.
            It must own at least one parameter: the working device is read from it.
        input_sequence: real example, shape (1, seq_len, embedding_dim).
        embedding: module whose `weight` (vocab x dim) is used to look up the nearest token.
        generator_optimizer: optimizer over the generator parameters.
        discriminator_optimizer: optimizer over the discriminator parameters.
        criterion: binary classification loss applied to the discriminator output.
        eos: value compared with the nearest-token index to stop generation early.

    Returns:
        Tuple of floats `(total_loss, generator_loss, discriminator_loss)`.
    """
    # Use the device the models live on instead of a notebook-level global.
    device = next(discriminator.parameters()).device
    embed_dim = embedding.weight.shape[1]

    # ---- Discriminator update -------------------------------------------------------
    # zero_grad also clears the gradients the previous generator step left on the
    # discriminator parameters.
    discriminator_optimizer.zero_grad()

    # Real example. Lengths stay on the CPU, as pack_padded_sequence requires.
    real_input = input_sequence.to(device)
    real_length = torch.tensor([real_input.shape[1]])
    real_output = discriminator(padded_input=real_input, input_lengths=real_length, batch_size=1)
    # Targets take the exact shape of the output; BCE rejects mismatched shapes.
    real_error = criterion(real_output, torch.ones_like(real_output))
    real_error.backward()

    # Fake example: unroll the generator for at most 20 steps, feeding each output back
    # in, and stop once the nearest vocabulary entry is the end-of-sequence token.
    generated_steps = []
    noise = torch.randn(generator.noise_dim).to(device)
    o_gen = torch.zeros(embed_dim, device=device).view(1, 1, -1)
    h_gen = generator.init_hidden(noise)
    for _ in range(20):
        o_gen, h_gen = generator(o_gen, h_gen)
        generated_steps.append(o_gen)
        if most_similar(embedding, o_gen) == eos:
            break
    generated_sequence = torch.cat(generated_steps).view(1, -1, embed_dim)
    generated_length = torch.tensor([generated_sequence.shape[1]])

    # Detach so this backward pass does not reach the generator.
    fake_output = discriminator(padded_input=generated_sequence.detach(), input_lengths=generated_length, batch_size=1)
    fake_error = criterion(fake_output, torch.zeros_like(fake_output))
    fake_error.backward()
    discriminator_error = real_error + fake_error
    discriminator_optimizer.step()

    # ---- Generator update -----------------------------------------------------------
    # Score the same generated sequence with the updated discriminator and push the
    # generator towards the "real" label.
    generator_optimizer.zero_grad()
    fake_output_gen = discriminator(padded_input=generated_sequence, input_lengths=generated_length, batch_size=1)
    generator_error = criterion(fake_output_gen, torch.ones_like(fake_output_gen))
    generator_error.backward()
    generator_optimizer.step()

    loss = discriminator_error + generator_error
    return loss.item(), generator_error.item(), discriminator_error.item()

In [122]:
def train_iter(generator, discriminator, dataset, embedding, eos, epochs, lr=0.001):
    """Train the GAN sample by sample for `epochs` passes over `dataset`, then plot the loss.

    Args:
        generator: generator module handed to `train`.
        discriminator: discriminator module handed to `train`.
        dataset: iterable of token-index lists; `eos` is appended to each before embedding.
        embedding: embedding module used to turn indices into vectors.
        eos: end-of-sequence token index.
        epochs: number of passes over `dataset`.
        lr: learning rate for both Adam optimizers.
    """
    optimizer_g = optim.Adam(generator.parameters(), lr=lr)
    optimizer_d = optim.Adam(discriminator.parameters(), lr=lr)
    bce = nn.BCELoss()
    embed_dim = embedding.weight.shape[1]
    history = []

    for _ in range(epochs):
        # Running sums since the last report; restarted at every epoch.
        running_total = running_gen = running_dis = 0
        for step, token_ids in enumerate(dataset):
            indices = torch.tensor(token_ids + [eos], device=device)
            real_sequence = embedding(indices).view(1, -1, embed_dim)
            step_total, step_gen, step_dis = train(
                generator=generator,
                discriminator=discriminator,
                input_sequence=real_sequence,
                embedding=embedding,
                generator_optimizer=optimizer_g,
                discriminator_optimizer=optimizer_d,
                criterion=bce,
                eos=eos,
            )
            running_total += step_total
            running_gen += step_gen
            running_dis += step_dis

            if step % 50 != 0:
                continue
            # Every 50th sample (including the first): record, report and reset the sums.
            history.append(running_total)
            print("Loss at iteration {}: {}".format(step, running_total))
            print("Gen loss at iteration {}: {}".format(step, running_gen))
            print("Dis loss at iteration {}: {}".format(step, running_dis))
            running_total = running_gen = running_dis = 0

    plot_loss(history)
            

In [113]:
def plot_loss(loss_list):
    """Draw the given loss values as a line chart on a new matplotlib figure.

    Args:
        loss_list: sequence of numeric loss values, plotted in the order given.
    """
    plt.figure()
    # Plot the argument itself rather than relying on any notebook-level variable.
    plt.plot(loss_list)
    plt.xlabel("Logging step")
    plt.ylabel("Loss")

In [115]:
# Instantiate the generator: 10-dimensional noise, 300-dimensional embeddings, one GRU layer.
# NOTE(review): several Generator classes are defined above; whichever cell ran last is used.
# The training code calls `init_hidden`, which only the last definition provides.
generator = Generator(noise_dim=10, embed_dimension=300, num_layers=1)
# Move the parameters to the selected device; as the cell's last expression this also
# displays the module summary.
generator.to(device)

Generator(
  (noise2hidden): Linear(in_features=10, out_features=300, bias=True)
  (tanh): Tanh()
  (rnn): GRU(300, 300, batch_first=True)
)

In [116]:
# Instantiate the discriminator: 300-dimensional inputs, 32 hidden units per direction,
# one bidirectional GRU layer.
# NOTE(review): Discriminator is defined twice above; whichever cell ran last is used.
discriminator = Discriminator(embed_dimension=300, hidden_size=32, num_layers=1)
# Move the parameters to the selected device; as the cell's last expression this also
# displays the module summary.
discriminator.to(device)

Discriminator(
  (rnn): GRU(300, 32, batch_first=True, bidirectional=True)
  (fc1): Linear(in_features=64, out_features=300, bias=True)
  (fc2): Linear(in_features=300, out_features=1, bias=True)
  (activation_fc1): SELU()
  (activation_fc2): Sigmoid()
)

In [117]:
# Look up the entry for the "</s>" token; it is appended to every real question and used
# as the stop condition during generation.
# NOTE(review): word2idx is not defined in this notebook (presumably from text_prep) —
# confirm it returns a plain integer index and that "</s>" exists in the vocabulary.
eos = word2idx(model, "</s>")

In [127]:
# Run one epoch of adversarial training over every indexed question (learning rate 0.001).
train_iter(generator=generator, discriminator=discriminator, 
           dataset=questions_idx, embedding=embedding, eos=eos, epochs=1, lr=0.001)

In [124]:
def generate_question(generator, embedding, model, eos_token=None, max_length=20):
    """Sample one sequence from the generator and decode it to words.

    Args:
        generator: module exposing `noise_dim`, `init_hidden(noise)` and a
            `(input, hidden) -> (output, hidden)` forward. It must own at least one
            parameter: the working device is read from it.
        embedding: module whose `weight` (vocab x dim) is used to find the nearest token.
        model: vocabulary/model object passed through to `idx2word`.
        eos_token: index that ends generation; defaults to the notebook-level `eos`.
        max_length: maximum number of generation steps (default 20).

    Returns:
        `(words, indices)`: the decoded words and the matching index tensors, one per step.
    """
    if eos_token is None:
        # Backward-compatible default: the end-of-sequence index defined in the notebook.
        eos_token = eos
    # Use the device the generator lives on instead of a notebook-level global.
    device = next(generator.parameters()).device

    generated_idx = []
    generated_words = []
    # Inference only: no autograd graph is needed while sampling.
    with torch.no_grad():
        noise = torch.randn(generator.noise_dim).to(device)
        o_gen = torch.zeros(embedding.weight.shape[1], device=device).view(1, 1, -1)
        h_gen = generator.init_hidden(noise)
        for _ in range(max_length):
            # Feed the previous output back in as the next input.
            o_gen, h_gen = generator(o_gen, h_gen)
            idx = most_similar(embedding, o_gen)
            generated_idx.append(idx)
            generated_words.append(idx2word(model, idx))
            if idx == eos_token:
                break
    return generated_words, generated_idx

In [125]:
# Sample one question from the trained generator: `q` receives the decoded words and `idx`
# the matching vocabulary indices.
q, idx = generate_question(generator, embedding, model)

In [126]:
# Display the generated words.
q

['of',
 'user_click',
 'invisible_item_flow',
 'delete_inappropriate_comments',
 'Dow_Jones_Reprints',
 'of',
 'of',
 'www.iotogo.com_HPWAinfo',
 'Reuters.com_video_mobile',
 'invisible_item_flow',
 'invisible_item_flow',
 'invisible_item_flow',
 'Full_versions',
 'Associated_Press_GfK_Poll',
 'Zacks_Rank',
 'of',
 'of',
 'downtick_trades_---',
 'TradingMarkets_Weekly_Newsletter_covers',
 'to']

In [128]:
#### some changes for tracking