In [30]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 5GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/keyword-data.txt


In [31]:
from __future__ import unicode_literals, print_function, division
from io import open
import unicodedata
import string
import re
import torch
import random
import torch.nn as nn
from torch import optim
from torch.autograd import Variable
import torch.nn.functional as F
import sys

In [32]:
use_cuda = True
TRAIN = False

In [33]:
for arg in sys.argv:
    if arg =='--train':
        TRAIN = True
    elif arg =='--cuda':
        use_cuda = torch.cuda.is_available()

In [34]:
print("CUDA :", use_cuda)
print("TRAIN: ", TRAIN)

CUDA : True
TRAIN:  False


**Indexing words**: making helper class lang that has word to index and index to word mappings.

In [35]:
SOS_token = 0
EOS_token = 1

class Lang:
    def __init__ (self,name):
        self.name = name
        self.word2index = {}
        self.word2count = {}
        self.index2word = {0: "SOS", 1: "EOS"}
        self.n_words = 2 #count of SOS and EOS
        
    def add_sentence(self,sentence):
        for word in sentence.split(' '):
            self.add_word(word)
            
    def add_word(self,word):
        if word not in self.word2index:
            self.word2index[word] = self.n_words
            self.word2count[word] = 1
            self.index2word[self.n_words] = word
            self.n_words +=1
        else:
            self.word2count[word] +=1
            

        

**Reading and decoding files from Unicode to ascii**

In [36]:
def unicode2ascii(s):
    return ''.join(
        c for c in unicodedata.normalize('NFD',s)
        if unicodedata.category(c) != 'Mn'
    )

**Lowercase, trim and remove non letter characters**

In [37]:
def normalize_string(s):
    s = unicode2ascii(s.lower().strip())
    s = re.sub(r"([.!?])", r" \1", s)
    s = re.sub(r"[^a-zA-Z.!?]+", r" ", s)
    return s

Reading the datafile by splitting the file into lines, then splitting the lines into pairs.
file mapping titles --> Keywords
and we want to map from keywords --> titles
We use the reverse flag to reverse the pairs

In [38]:
def read_langs(lang1,lang2, reverse = False):
    print("Reading lines")
    
    #read the file and split into lines
    lines = open('../input/%s-%s.txt' % (lang1, lang2), encoding='utf-8').read().strip().split('\n')
    
    #split every line into pairs and normalize
    pairs = [[normalize_string(s) for s in l.split('\t')] for l in lines]
    
    #reverse pairs, make Language instances
    if reverse:
        pairs = [list(reversed(p)) for p in pairs]
        input_lang = Lang(lang2)
        output_lang = Lang(lang1)
    else:
        input_lang = Lang(lang1)
        output_lang = Lang(lang2)
    
    return input_lang, output_lang, pairs

**Filtering sentences**

In [39]:
MAX_LENGTH = 512

def filter_pair(p):
    return len(p[0].split(' ')) < MAX_LENGTH and len(p[1].split(' ')) < MAX_LENGTH

def filter_pairs(pairs):
    return [pair for pair in pairs if filter_pair(pair)]

# Data Preparation

* Read text file
* Normalization, filter by content
* Make word lists from sentences in pairs

In [40]:
def prepare_data(lang1, lang2, reverse=False):
    input_lang, output_lang, pairs = read_langs(lang1, lang2, reverse)
    pairs                          = filter_pairs(pairs)

    for pair in pairs:
        input_lang.add_sentence(pair[0])
        output_lang.add_sentence(pair[1])

    return input_lang, output_lang, pairs

input_lang, output_lang, pairs = prepare_data('keyword', 'data', False)

#checking
print(random.choice(pairs))

Reading lines
['buy these four knives instead of an expensive set', 'knives']


## **Building the models**

### The Encoder - An RNN that outputs the value for every word from the input sequence. For every word it outputs a vector and a hidden state and uses the hidden state for the next input word.

In [41]:
class EncoderRNN(nn.Module):
    def __init__ (self, input_size, hidden_size):
        super(EncoderRNN, self).__init__()
        
        self.hidden_size = hidden_size
        self.embedding = nn.Embedding(input_size, hidden_size)
        self.gru = nn.GRU(hidden_size, hidden_size)
        
    def forward(self, input, hidden):
        embedded = self.embedding(input).view(1,1,-1)
        output = embedded
        output, hidden = self.gru(output, hidden)
        
        return output, hidden
    
    def init_hidden(self):
        result = Variable(torch.zeros(1,1,self.hidden_size))
        
        if use_cuda:
            return result.cuda()
        else:
            return result
        

### The Decoder - Output conditioned on the previous outputs and some x, where x consists of the current hidden state (that itself takes into account the previous outputs) and the attention "context"

To summarize, our decoder consists of 4 main parts:
* An embedding layer - turning the input into a vector.
* A layer calculating the attention energy per encoder output.
* ek RNN layer
* ek output layer

In [42]:
class DecoderRNN(nn.Module):
    def __init__ (self, hidden_size, output_size):
        super(DecoderRNN, self).__init__()
        
        #defining parameters
        self.hidden_size = hidden_size
        
        #define layers
        self.embedding = nn.Embedding(output_size, hidden_size)
        self.gru = nn.GRU(hidden_size, hidden_size)
        self.out = nn.Linear(hidden_size, output_size)
        self.output = nn.LogSoftmax(dim=1)
        
        def forward(self, input, hidden):
            # we will only be running forward for a single decoder time step, but will use all encoder outputs.
            
            output = self.embedding(input).view = (1,1,-1) #S=1
            output = F.relu(output)
            output,hidden = self.gru(output,hidden)
            output = self.softmax(self.out(output[0]))
            return output, hidden
        
        def init_hidden(self):
            result = Variable(torch.zeros(1,1,self.hidden_size))
            
            if use_cuda:
                return result.cuda()
            else:
                return result

**Attention Decoder (neural machine translation to calculate attention context)**

In [43]:
class AttnDecoderRNN(nn.Module):
    def __init__(self, hidden_size, output_size, dropout_p = 0.1, max_length = MAX_LENGTH):
        super(AttnDecoderRNN, self).__init__()
        
        #Define Parameters
        self.hidden_size = hidden_size
        self.output_size = output_size
        self.dropout_p = dropout_p
        self.max_length = max_length
        
        #define layers
        self.embedding = nn.Embedding(self.output_size, self.hidden_size)
        self.attn = nn.Linear(self.hidden_size * 2, self.max_length)
        self.attn_combine = nn.Linear(self.hidden_size * 2, self.hidden_size)
        self.dropout = nn.Dropout(self.dropout_p)
        self.gru = nn.GRU(self.hidden_size, self.hidden_size)
        self.out = nn.Linear(self.hidden_size, self.output_size)
        
        def forward(self, input, hidden, encoder_outputs):
            #running forward for a single decoder time step but we will use all encoder outputs
            
            #get the embedding of the current input word(last input word)
            embedded = self.embedding(input).view(1,1,-1) # S = 1 X B X N 
            embedded = self.dropout(embedded)
            
            #calculate attn weights and apply to encoder outputs
            attn_weights = F.softmax(self.attn(torch.cat((embedded[0], hidden[0]),1)), dim=1)
            #to incorporate context
            attn_applied = torch.bmm(attn_weights.unsqueeze(0), encoder_outputs.unsqueeze(0))
            
            #final output layer
            
            output = torch.cat((embedded[0], attn_applied[0]),1)
            output = self.attn_combine(output).unsqueeze(0)
            output = F.relu(output)
            output, hidden = self.gru(output,hidden)
            output = F.log_softmax(self.out(output[0]), dim=1)
            
            return output, hidden, attn_weights
        
        def init_hidden(self):
            result = Variable(torch.zeros(1,1,self.hidden_size))
            
            if use_cuda:
                return result.cuda()
            else:
                return result

In [44]:
def indexes_from_sentence(lang, sentence):
    return [lang.word2index[word] for word in sentence.split(' ')]

In [45]:
def variable_from_sentence(lang, sentence):
    indexes = indexes_from_sentence(lang,sentence)
    indexes.append(EOS_token)
    
    result = Variable(torch.LongTensor(indexes).view(-1,1))
    
    if use_cuda:
        return result.cuda()
    else:
        return result

In [46]:
def variables_from_pair(pair):
    input_variable = variable_from_sentence(input_lang, pair[0])
    target_variable = variable_from_sentence(output_lang,pair[1])
    
    return (input_variable, target_variable)

**Training idhar se shuru**

In [47]:
#"Teacher Forcing", or maximum likelihood sampling, means using the real target outputs as each next input when training. 
#The alternative is using the decoder's own guess as the next input. Using teacher forcing may cause the network to converge faster.
teacher_forcing_ratio = 0.5

In [48]:
#we will use scheduled sampling to tackle the teacher-forcing problem, which will alternate between using the taget values and the predicted values when training.
# we will randomy choose to use teacher forcing with training.
# sometimes use decoder output, sometimes ignore.

In [49]:
def train(input_variable, target_variable, encoder, decoder, encoder_optimizer, decoder_optimizer, criterion, max_length=MAX_LENGTH):
    encoder_hidden = encoder.init_hidden()
    
    # Zero gradients of both optimizers
    encoder_optimizer.zero_grad()
    decoder_optimizer.zero_grad()
    
    # Get size of input and target sentences
    input_length = input_variable.size()[0]
    target_length = target_variable.size()[0]
    
    # Run words through encoder
    encoder_outputs = Variable(torch.zeros(max_length, encoder.hidden_size))
    encoder_outputs = encoder_outputs.cuda() if use_cuda else encoder_outputs
    loss = 0
    
    for ei in range(input_length):
        encoder_output, encoder_hidden = encoder(input_variable[ei], encoder_hidden)
        encoder_outputs[ei] = encoder_output[0][0]
        
    # prepare input and output variables
    decoder_input = Variable(torch.LongTensor([[SOS_token]]))
    decoder_input = decoder_input.cuda() if use_cuda else decoder_input
    # use last hidden state from encoder to start decoder.
    decoder_hidden = encoder_hidden
    
    # choose whether to use teacher forcing
    use_teacher_forcing = True if random.random() < teacher_forcing_ratio else False
    
    if use_teacher_forcing:
       # teacher-forcing: use ground truth target as next input
        for di in range(target_length):
            decoder_output, decoder_hidden, decoder_attention = decoder(
            decoder_input, decoder_hidden, encoder_outputs
            )
            
            loss += criterion(decoder_output, target_variable[di])
            decoder_input = target_variable[di]
    else:
        # without teacher forcing: use network's own prediction as the next input
        for di in range(target_length):
            decoder_output, decoder_hidden, decoder_attention = decoder(
            decoder_input, decoder_hidden, encoder_outputs
            )
            
            #get most likely word index (highest value) from output
            topv, topi = decoder_output.data.topk(1)
            ni = topi[0][0]
            #chosen word is next input
            decoder_input = Variable(torch.LongTensor([[ni]]))
            decoder_input = decoder_input.cuda() if use_cuda else decoder_input
            loss += criterion(decoder_output, target_variable[di])
            
            if ni == EOS_token:
                break
                
    #Backpropagation
    loss.backward()
    
    encoder_optimizer.step()
    decoder_optimizer.step()
    
    return loss.data[0]/target_length

Adding helper functions to print time elapsed and estimated time remaining, given the current time and progress.

In [50]:
import math
import time

In [51]:
def as_minutes(s):
    m = math.floor(s / 60)
    s -=m*60
    
    return '%dm %ds' % (m, s)

In [52]:
def time_since(since, percent):
    now = time.time()
    s   = now - since
    es  = s / (percent)
    rs  = es - s

    return '%s (- %s)' % (as_minutes(s), as_minutes(rs))


In [53]:
#now we can start training

In [54]:
#initialize models, optimizers and iterations
def train_iterations(encoder, decoder, n_iters, print_every = 1000, learning_rate =0.01):
    start = time.time()
    print_loss_total = 0
    
    ## Initialize optimizers and criterion
    
    encoder_optimizer = optim.SGD(encoder.parameters(), lr=learning_rate)
    decoder_optimizer = optim.SGD(decoder.parameters(), lr=learning_rate)
    training_pairs    = [variables_from_pair(random.choice(pairs)) for i in range(n_iters)]
    criterion         = nn.NLLLoss()
    
    #beginnn
    for iter in range(1, n_iters + 1):
        training_pair   = training_pairs[iter - 1]
        input_variable  = training_pair[0]
        target_variable = training_pair[1]
        
     #keep track of loss
        loss = train(
            input_variable, target_variable, encoder, decoder, encoder_optimizer, decoder_optimizer, criterion
        )

        print_loss_total += loss

        if iter % print_every == 0:
            print_loss_avg   = print_loss_total / print_every
            print_loss_total = 0  # Reset every print_every

            print('%s (%d %d%%) %.4f' % (
                time_since(start, iter / n_iters), iter, iter / n_iters * 100, print_loss_avg
            ))
    

In [55]:
import numpy as np

In [56]:
def evaluate(encoder, decoder, sentence, max_length=MAX_LENGTH):
    input_variable  = variable_from_sentence(input_lang, sentence)
    input_length    = input_variable.size()[0]
    encoder_hidden  = encoder.init_hidden()
    encoder_outputs = Variable(torch.zeros(max_length, encoder.hidden_size))
    encoder_outputs = encoder_ouputs.cuda() if use_cuda else encoder_ouputs

    for ei in range(input_length):
        encoder_output, encoder_hidden = encoder(input_variable[ei], encoder_hidden)
        encoder_outputs[ei]            = encoder_outputs[ei] + encoder_output[0][0]

    decoder_input      = Variable(torch.LongTensor([[SOS_token]]))
    decoder_input      = decoder_input.cuda() if use_cuda else decoder_input
    decoder_hidden     = encoder_hidden
    decoded_words      = []
    decoder_attentions = torch.zeros(max_length, max_length)

    for di in range(max_length):
        decoder_output, decoder_hidden, decoder_attention = decoder(
            decoder_input, decoder_hidden, encoder_outputs
        )

        decoder_attentions[di] = decoder_attention.data
        topv, topi             = decoder_output.data.topk(1)
        ni                     = topi[0][0]

        if ni == EOS_token:
            decoded_words.append('<EOS>')
            break
        else:
            decoded_words.append(output_lang.index2word[ni])

        decoder_input = Variable(torch.LongTensor([[ni]]))
        decoder_input = decoder_input.cuda() if use_cuda else decoder_input

    return decoded_words, decoder_attentions[:di + 1]

In [57]:
hidden_size = 256

In [58]:
TRAIN = True

In [60]:
if TRAIN is True:
    print("TRAINING...")

    encoder1 = EncoderRNN(input_lang.n_words, hidden_size)
    attn_decoder1 = AttnDecoderRNN(hidden_size, output_lang.n_words, dropout_p=0.1)

    if use_cuda:
        encoder1 = encoder1.cuda()
        attn_decoder1 = attn_decoder1.cuda()

    train_iterations(encoder1, attn_decoder1, 75000, print_every=5000)

    torch.save(encoder1, 'encoder.pt')
    torch.save(attn_decoder1, 'decoder.pt')
else:
    print("LOADING...")

    encoder1      = torch.load('encoder.pt')
    attn_decoder1 = torch.load('decoder.pt')

TRAINING...


NameError: name 'Variable' is not defined

In [None]:
def output_evaluation(input_sentence):
    output_words, attentions = evaluate(
        encoder1, attn_decoder1, input_sentence
    )

    print("input  = ", input_sentence)
    print("output = ", ' '.join(output_words))

while(True):
    try:
        inp = raw_input(">")
        output_evaluation(inp)
    except KeyError:
        pass