In [1]:
from __future__ import unicode_literals, print_function, division
from io import open
import unicodedata
import string
import re
import random
import codecs

import torch
import torch.nn as nn
from torch import optim
import torch.nn.functional as F

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [2]:
def read_file(path):
    
    with codecs.open(path, encoding='utf-8', mode='rb') as f:
        data = f.read()
    
    return data.strip().split('\n')

def preprocess_word(word):
    
    s = re.sub(r'([.?!])', r' \1', word)
    s = re.sub(r'[^a-zA-Z.!?]+', r' ', word)\

    return s

def split_en_fra(line):
    
    en = []
    fra = []
    pairs = []
    
    for l in line:
        pair = l.lower().split('\t')
        en_word = pair[0]
        fra_word = pair[1]

        en.append(preprocess_word(pair[0]))
        fra.append(preprocess_word(pair[1]))
        pairs.append(pair)
        
    return en, fra, pairs

def filter_pairs(pairs, max_length=10):
    
    filtered_pairs = []
    
    eng_prefixes = (
        "i am", "i m ",
        "he is", "he s",
        "she is", "she s",
        "you are", "you re",
        "we are", "we re",
        "they are", "they re"
    )
    
    for p in pairs:
        if len(p[0].split(' ')) < max_length and len(p[1].split(' ')) < max_length and p[0].startswith(eng_prefixes):
            filtered_pairs.append(p)
            
    return filtered_pairs

In [3]:
line = read_file('./data/eng-fra.txt')

In [4]:
line[:5]

['Go.\tVa !',
 'Run!\tCours\u202f!',
 'Run!\tCourez\u202f!',
 'Wow!\tÇa alors\u202f!',
 'Fire!\tAu feu !']

In [5]:
en, fra, pairs = split_en_fra(line)
filtered_pairs = filter_pairs(pairs)

In [6]:
print(en[:5])
print(fra[:5])
print(pairs[:5])
print(filtered_pairs[:5])
print(len(pairs))
print(len(filtered_pairs))

['go.', 'run!', 'run!', 'wow!', 'fire!']
['va !', 'cours !', 'courez !', ' a alors !', 'au feu !']
[['go.', 'va !'], ['run!', 'cours\u202f!'], ['run!', 'courez\u202f!'], ['wow!', 'ça alors\u202f!'], ['fire!', 'au feu !']]
[['i am fat.', 'je suis gras.'], ['he is ill.', 'il est malade.'], ['he is old.', 'il est vieux.'], ['i am busy.', 'je suis occupé.'], ['i am calm.', 'je suis calme.']]
135842
3885


In [7]:
def build_dict(eng_list, fra_list):
    
    start_token = 0
    end_token = 1
    
    en_idx = {}
    fra_idx = {}
    idx_en = {}
    idx_fra = {}
    
    en_idx['SOS'] = start_token
    en_idx['EOS'] = end_token
    fra_idx['SOS'] = start_token
    fra_idx['EOS'] = end_token
    
    idx = 2
    for e in eng_list:
        for word in e.split():
            if word not in en_idx:
                en_idx[word] = idx
                idx += 1
    idx = 2
    for f in fra_list:
        for word in f.split():
            if word not in fra_idx:
                fra_idx[word] = idx
                idx += 1
            
    idx_en[0] = 'SOS'
    idx_en[1] = 'EOS'
    idx_fra[0] = 'SOS'
    idx_fra[1] = 'EOS'
        
    for word, idx in en_idx.items():
        idx_en[idx] = word
    for word, idx in fra_idx.items():
        idx_fra[idx] = word
    
    return en_idx, fra_idx, idx_en, idx_fra

In [8]:
en_idx, fra_idx, idx_en, idx_fra = build_dict(en, fra)

In [13]:
def en_to_idx(sentence):
    return [en_idx[word] for word in sentence.split(' ')]

def fra_to_idx(sentence):
    return [fra_idx[word] for word in sentence.split(' ')]

def to_tensor(indexes):
    indexes.append(1)
    return torch.tensor(indexes, dtype=torch.long, device=device).view(-1, 1)

In [24]:
en_tensor = []
fra_tensor = []

for e in en:
    en_tensor.append(to_tensor(en_to_idx(e.strip())))
for f in fra:
    fra_tensor.append(to_tensor(fra_to_idx(f.strip())))

In [9]:
class EncoderRNN(nn.Module):
    
    def __init__(self, input_size, hidden_size):
        super(EncoderRNN, self).__init__()
        
        self.hidden_size = hidden_size
        self.input_size = input_size
        
        self.embedding = nn.Embedding(input_size, hidden_size)
        self.gru = nn.GRU(hidden_size, hidden_size)
        
    def forward(self, inp, hidden):
        embed = self.embedding(inp, hidden)
        output = embed
        output, hidden = self.gru(output, hidden)
        
        return output, hidden
    
    def init_hidden(self):
        return torch.zeros(1, 1, self.hidden_size, device=device)

In [10]:
class DecoderRNN(nn.Module):
    
    def __init__(self, hidden_size, output_size):
        super(DecoderRNN, self).__init__()
        
        self.hidden_size = hidden_size
        self.output_size = output_size
        
        self.embedding = nn.Embedding(output_size, hidden_size)
        self.gru = nn.GRU(hidden_size, hidden_size)
        self.out = nn.Dense(hidden_size, output_size)
        self.softmax = nn.LogSoftmax(dim=1)
        
    def forward(self, inp, hidden):
        output = self.embedding(inp).view(1, 1, -1)
        output = F.relu(output)
        output, hidden = self.gru(output, hidden)
        output = self.softmax(self.ouut(output[0]))
        
        return output, hidden
    
    def init_hidden(self):
        return torch.zeros(1, 1, self.hidden_size, device=device)

In [12]:
class AttnDecoderRNN(nn.Module):
    
    def __init__(self, hidden_size, output_size, dropout_p=0.1, max_length=10):
        super(AttnDecoderRNN, self).__init__()
        
        self.hidden_size = hidden_size
        self.output_size = output_size
        self.dropout_p = dropout_p
        self.max_length = max_length
        
        self.embedding = nn.Embedding(self.output_size, self.hidden_size)
        self.attn = nn.Linear(self.hidden_size * 2, self.max_length)
        self.attn_combine = nn.Linear(self.hidden_size * 2, self.hidden_size)
        self.dropout = nn.Dropout(self.dropout_p)
        self.gru = nn.GRU(self.hidden_size, self.hidden_size)
        self.out = nn.Linear(self.hidden_size, self.output_size)
        
    def forward(self, inp, hidden, encoder_outputs):
        embed = self.embedding(inp).view(1, 1, -1)
        embed = self.dropout(embed)
        
        attn_weights = F.softmax(self.attn(torch.cat((embed[0], hidden[0]), 1)), dim=1)
        attn_applied = torch.bmm(attn_weights.unsqueeze(0), encoder_outputs.unsqueeze(0))
        
        output = torch.cat((embedded[0], attn_applied[0]), 1)
        output = self.attn_combine(output).unsqueeze(0)
        
        output = F.relu(output)
        output, hidden = self.gru(output, hidden)
        
        output = F.log_softmax(self.out(output[0]), dim=1)
        
        return output, hidden, attn_weights
    
    def init_hidden(self):
        return torch.zeros(1, 1, self.hidden_size, device=device)

In [11]:
def train(input_tensor, target_tensor, encoder, decoder, encoder_opt, decoder_opt, crit, max_length=10):
    
    encoder_hidden = encoder.init_hidden()
    
    encoder_opt.zero_grad()
    decoder_opt.zero_grad()
    
    input_length = input_tensor.size(0)
    output_length = target_tensor.size(0)
    encoder_outputs = torch.zeros(max_length, encoder.hidden_size, device=device)
    
    training_loss = 0
    for i in range(input_length):
        encoder_output, encoder_hidden = encoder(input_tensor[i], encoder_hidden)
        encoder_outputs[i] = encoder_output[0,0]
        
    decoder_input = torch.tensor([[0]], device=device)
    
    decoder_hidden = encoder_hidden
    
    

SyntaxError: unexpected EOF while parsing (<ipython-input-11-13b01967388a>, line 2)