In [27]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Recursively walk /kaggle/input and print the full path of every file found there.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 5GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/keyword-data.txt


In [28]:
from __future__ import unicode_literals, print_function, division

import random
import re
import string
import sys
import unicodedata
from io import open

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch import optim
from torch.autograd import variable
# Capital-V `Variable` is the tensor wrapper class; the lowercase name above is only the submodule.
from torch.autograd import Variable

In [29]:
# Runtime switches read by the rest of the notebook.
# use_cuda: enable GPU execution only when a CUDA device is actually present, so that
#           `.cuda()` calls guarded by this flag do not crash on CPU-only machines.
use_cuda = torch.cuda.is_available()
# TRAIN: off by default; the `--train` command-line argument switches it on.
TRAIN = False

In [30]:
# Command-line switches: `--train` turns training on, `--cuda` sets the GPU flag
# from the actual CUDA availability.
if '--train' in sys.argv:
    TRAIN = True
if '--cuda' in sys.argv:
    use_cuda = torch.cuda.is_available()

In [31]:
# Echo the effective run configuration so it is visible in the cell output.
print("CUDA :", use_cuda)
print("TRAIN: ", TRAIN)

CUDA : True
TRAIN:  False


**Indexing words**: a helper class `Lang` that holds the word-to-index and index-to-word mappings, plus a count for every word.

In [32]:
SOS_token = 0
EOS_token = 1


class Lang:
    """Vocabulary for one side of the corpus.

    Keeps three tables in sync: word -> index, index -> word and
    word -> occurrence count. Indices 0 and 1 are pre-assigned to the
    "SOS" and "EOS" markers, so the first real word receives index 2.
    """

    def __init__(self, name):
        self.name = name
        self.word2index = {}
        self.word2count = {}
        self.index2word = {0: "SOS", 1: "EOS"}
        self.n_words = 2  # SOS and EOS are already counted

    def add_sentence(self, sentence):
        """Register every single-space-separated token of `sentence`."""
        for token in sentence.split(' '):
            self.add_word(token)

    def add_word(self, word):
        """Bump the count of a known word, or give a new word the next free index."""
        if word in self.word2index:
            self.word2count[word] += 1
            return

        new_index = self.n_words
        self.word2index[word] = new_index
        self.word2count[word] = 1
        self.index2word[new_index] = word
        self.n_words = new_index + 1
            

        

**Reading and decoding files from Unicode to ascii**

In [33]:
def unicode2ascii(s):
    """Return `s` with all combining marks removed.

    The text is first decomposed with NFD normalization, then every character
    whose Unicode category is 'Mn' is dropped.
    """
    decomposed = unicodedata.normalize('NFD', s)
    kept_chars = [ch for ch in decomposed if unicodedata.category(ch) != 'Mn']
    return ''.join(kept_chars)

**Lowercase, trim and remove non-letter characters**

In [34]:
def normalize_string(s):
    """Lowercase, strip accents, and reduce `s` to letters plus `.`, `!`, `?`.

    Steps:
      1. lowercase, trim, and remove combining marks via `unicode2ascii`;
      2. put a space in front of every `.`, `!` or `?` so it becomes its own token;
      3. replace every run of other characters with a single space;
      4. trim the result.

    Step 4 matters because the vocabulary code splits sentences on single spaces:
    a leading or trailing space left by step 3 (e.g. from "50% off") would otherwise
    create an empty-string token.
    """
    s = unicode2ascii(s.lower().strip())
    s = re.sub(r"([.!?])", r" \1", s)
    s = re.sub(r"[^a-zA-Z.!?]+", r" ", s)
    return s.strip()

Read the data file by splitting it into lines, then splitting each line on the tab character into a pair.
The file maps titles --> keywords.
To map keywords --> titles instead,
pass `reverse=True` to swap the two sides of every pair.

In [35]:
def read_langs(lang1,lang2, reverse = False):
    """Load the tab-separated corpus file and create two empty vocabularies.

    Reads `../input/<lang1>-<lang2>.txt` as UTF-8, splits it into lines, splits every
    line on tabs, and normalizes each field with `normalize_string`.

    Args:
        lang1: name of the first column; also the first half of the file name.
        lang2: name of the second column; also the second half of the file name.
        reverse: when True, the fields of every line are reversed and the two
            vocabularies swap names.

    Returns:
        (input_lang, output_lang, pairs): two `Lang` objects with no words added yet,
        and the list of normalized pairs.
    """
    print("Reading lines")

    # Read the whole file inside a context manager so the handle is closed right away
    # instead of lingering until garbage collection.
    with open('../input/%s-%s.txt' % (lang1, lang2), encoding='utf-8') as corpus_file:
        lines = corpus_file.read().strip().split('\n')

    # Split every line into its tab-separated fields and normalize each one.
    pairs = [[normalize_string(s) for s in l.split('\t')] for l in lines]

    # Reverse pairs if requested, then make the Lang instances.
    if reverse:
        pairs = [list(reversed(p)) for p in pairs]
        input_lang = Lang(lang2)
        output_lang = Lang(lang1)
    else:
        input_lang = Lang(lang1)
        output_lang = Lang(lang2)

    return input_lang, output_lang, pairs

**Filtering sentences**

In [36]:
MAX_LENGTH = 512  # exclusive upper bound on the single-space token count of each side


def filter_pair(p):
    """Tell whether both p[0] and p[1] have fewer than MAX_LENGTH space-separated tokens."""
    return all(len(p[side].split(' ')) < MAX_LENGTH for side in (0, 1))


def filter_pairs(pairs):
    """Return a new list containing only the pairs accepted by `filter_pair`."""
    return list(filter(filter_pair, pairs))

# Data Preparation

* Read text file
* Normalization, filter by content
* Make word lists from sentences in pairs

In [37]:
def prepare_data(lang1, lang2, reverse=False):
    """Read the corpus, drop over-long pairs, and fill both vocabularies.

    Returns the input vocabulary, the output vocabulary, and the filtered pairs.
    """
    input_lang, output_lang, all_pairs = read_langs(lang1, lang2, reverse)
    kept_pairs = filter_pairs(all_pairs)

    # Element 0 of each pair feeds the input vocabulary, element 1 the output vocabulary.
    for entry in kept_pairs:
        source_text, target_text = entry[0], entry[1]
        input_lang.add_sentence(source_text)
        output_lang.add_sentence(target_text)

    return input_lang, output_lang, kept_pairs

# Build both vocabularies and the filtered pair list from ../input/keyword-data.txt.
# Passing False for `reverse` keeps every pair in the order it has in the file.
input_lang, output_lang, pairs = prepare_data('keyword', 'data', False)

# Sanity check: show one randomly chosen pair.
print(random.choice(pairs))

Reading lines
['the corsair hydro series keeps your pc cool and silent with maintenance free water cooling', 'cooling water cooling']


## **Building the models**

### The Encoder - An RNN that outputs the value for every word from the input sequence. For every word it outputs a vector and a hidden state and uses the hidden state for the next input word.

In [38]:
class EncoderRNN(nn.Module):
    """GRU encoder that consumes the input sequence one token at a time.

    Args:
        input_size: number of rows in the embedding table (vocabulary size).
        hidden_size: width of both the embedding vectors and the GRU state.
    """

    def __init__ (self, input_size, hidden_size):
        super(EncoderRNN, self).__init__()

        self.hidden_size = hidden_size
        self.embedding = nn.Embedding(input_size, hidden_size)
        self.gru = nn.GRU(hidden_size, hidden_size)

    def forward(self, input, hidden):
        """Run a single encoder step.

        Args:
            input: tensor holding one token index (its embedding is reshaped to (1, 1, hidden_size)).
            hidden: previous GRU state of shape (1, 1, hidden_size).

        Returns:
            (output, hidden): the GRU output for this step and the updated state.
        """
        embedded = self.embedding(input).view(1, 1, -1)
        output, hidden = self.gru(embedded, hidden)
        return output, hidden

    def init_hidden(self):
        """Return an all-zero initial state of shape (1, 1, hidden_size).

        The tensor is created on the device that already holds this module's weights,
        so it matches the model whether or not it was moved to the GPU. A plain tensor
        is returned; wrapping it in the deprecated `Variable` class is not required.
        """
        return torch.zeros(1, 1, self.hidden_size, device=self.embedding.weight.device)
        

### The Decoder - Output conditioned on the previous outputs and some x, where x consists of the current hidden state (that itself takes into account the previous outputs) and the attention "context"

To summarize, our decoder consists of 4 main parts:
* An embedding layer - turning the input into a vector.
* A layer calculating the attention energy per encoder output.
* An RNN layer
* An output layer

In [39]:
class DecoderRNN(nn.Module):
    """Plain GRU decoder (no attention) that emits one token distribution per step.

    Args:
        hidden_size: width of both the embedding vectors and the GRU state.
        output_size: number of rows in the embedding table and width of the output layer.
    """

    def __init__ (self, hidden_size, output_size):
        super(DecoderRNN, self).__init__()

        # Parameters
        self.hidden_size = hidden_size

        # Layers
        self.embedding = nn.Embedding(output_size, hidden_size)
        self.gru = nn.GRU(hidden_size, hidden_size)
        self.out = nn.Linear(hidden_size, output_size)
        self.output = nn.LogSoftmax(dim=1)

    # NOTE: forward/init_hidden must be defined at class level. Nested inside __init__
    # they are only local functions and nn.Module never sees them.
    def forward(self, input, hidden):
        """Run a single decoder time step.

        Args:
            input: tensor holding one token index (its embedding is reshaped to (1, 1, hidden_size)).
            hidden: previous GRU state of shape (1, 1, hidden_size).

        Returns:
            (output, hidden): log-probabilities of shape (1, output_size) and the updated state.
        """
        output = self.embedding(input).view(1, 1, -1)  # sequence length 1
        output = F.relu(output)
        output, hidden = self.gru(output, hidden)
        # The log-softmax layer is registered as `self.output` in __init__.
        output = self.output(self.out(output[0]))
        return output, hidden

    def init_hidden(self):
        """Return an all-zero initial state of shape (1, 1, hidden_size).

        The tensor is created on the device that already holds this module's weights,
        so it matches the model whether or not it was moved to the GPU.
        """
        return torch.zeros(1, 1, self.hidden_size, device=self.embedding.weight.device)

**Attention Decoder (neural machine translation to calculate attention context)**

In [40]:
class AttnDecoderRNN(nn.Module):
    """GRU decoder that attends over the encoder outputs at every step.

    Args:
        hidden_size: width of the embedding vectors and of the GRU state.
        output_size: number of rows in the embedding table and width of the output layer.
        dropout_p: dropout probability applied to the embedded input.
        max_length: number of attention slots; the attention layer always produces this
            many weights, so `encoder_outputs` passed to `forward` must have exactly
            this many rows.
    """

    def __init__(self, hidden_size, output_size, dropout_p = 0.1, max_length = MAX_LENGTH):
        super(AttnDecoderRNN, self).__init__()

        # Parameters
        self.hidden_size = hidden_size
        self.output_size = output_size
        self.dropout_p = dropout_p
        self.max_length = max_length

        # Layers
        self.embedding = nn.Embedding(self.output_size, self.hidden_size)
        self.attn = nn.Linear(self.hidden_size * 2, self.max_length)
        self.attn_combine = nn.Linear(self.hidden_size * 2, self.hidden_size)
        self.dropout = nn.Dropout(self.dropout_p)
        self.gru = nn.GRU(self.hidden_size, self.hidden_size)
        self.out = nn.Linear(self.hidden_size, self.output_size)

    # NOTE: forward/init_hidden must be defined at class level. Nested inside __init__
    # they are only local functions and nn.Module never sees them.
    def forward(self, input, hidden, encoder_outputs):
        """Run a single decoder time step using all encoder outputs.

        Args:
            input: tensor holding one token index (the previously produced word).
            hidden: previous GRU state of shape (1, 1, hidden_size).
            encoder_outputs: tensor of shape (max_length, hidden_size).

        Returns:
            (output, hidden, attn_weights): log-probabilities of shape (1, output_size),
            the updated state, and the attention weights of shape (1, max_length).
        """
        # Embed the current input word and apply dropout.
        embedded = self.embedding(input).view(1, 1, -1)
        embedded = self.dropout(embedded)

        # Attention weights are computed from the embedded input and the current state.
        attn_weights = F.softmax(self.attn(torch.cat((embedded[0], hidden[0]), 1)), dim=1)
        # Weighted sum of the encoder outputs (the context): (1, 1, L) x (1, L, H) -> (1, 1, H).
        attn_applied = torch.bmm(attn_weights.unsqueeze(0), encoder_outputs.unsqueeze(0))

        # Merge embedding and context, then run the GRU and the output layer.
        output = torch.cat((embedded[0], attn_applied[0]), 1)
        output = self.attn_combine(output).unsqueeze(0)
        output = F.relu(output)
        output, hidden = self.gru(output, hidden)
        output = F.log_softmax(self.out(output[0]), dim=1)

        return output, hidden, attn_weights

    def init_hidden(self):
        """Return an all-zero initial state of shape (1, 1, hidden_size).

        The tensor is created on the device that already holds this module's weights,
        so it matches the model whether or not it was moved to the GPU.
        """
        return torch.zeros(1, 1, self.hidden_size, device=self.embedding.weight.device)

In [48]:
def indexes_from_sentence(lang, sentence):
    """Map each single-space-separated token of `sentence` to its index in `lang.word2index`.

    A token missing from the vocabulary raises KeyError.
    """
    indexes = []
    for token in sentence.split(' '):
        indexes.append(lang.word2index[token])
    return indexes

In [None]:
def variable_from_sentence(lang, sentence):
    indexes = indexes_from_sentence(lang,sentence)
    indexes.append(EOS_token)