In [1]:
from __future__ import unicode_literals, print_function, division
from io import open
import unicodedata
import string
import re
import random
import codecs

import torch
import torch.nn as nn
from torch import optim
import torch.nn.functional as F

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [63]:
def read_file(path):
    
    with codecs.open(path, encoding='utf-8', mode='rb') as f:
        data = f.read()
    
    return data.strip().split('\n')

def preprocess_word(word):
    
    s = re.sub(r'([.,?!])', r' \1', word)
    s = re.sub(r'[^a-zA-Z.,!?]+', r' ', word)\

    return s

def split_en_fra(line):
    
    for l in line:
        pair = l.lower().split('\t')
        en_word = pair[0]
        fra_word = pair[1]

        en.append(preprocess_word(pair[0]))
        fra.append(preprocess_word(pair[1]))
    
    return en, fra

In [64]:
line = read_file('./data/eng-fra.txt')

In [69]:
line[:5]

['Go.\tVa !',
 'Run!\tCours\u202f!',
 'Run!\tCourez\u202f!',
 'Wow!\tÇa alors\u202f!',
 'Fire!\tAu feu !']

In [67]:
en, fra = split_en_fra(line)

In [68]:
print(en[:5])
print(fra[:5])

['go .', 'run !', 'run !', 'wow !', 'fire !']
['va !', 'cours !', 'courez !', ' a alors !', 'au feu !']


In [33]:
def build_dict(eng_list, fra_list):
    
    start_token = 0
    end_token = 1
    
    en_idx = {}
    fra_idx = {}
    idx_en = {}
    idx_fra = {}
    
    en_idx['SOS'] = start_token
    en_idx['EOS'] = end_token
    fra_idx['SOS'] = start_token
    fra_idx['EOS'] = end_token
    
    idx = 2
    for e in eng_list:
        for word in e.split():
            if word not in en_idx:
                en_idx[word] = idx
                idx += 1
    idx = 2
    for f in fra_list:
        for word in f.split():
            if word not in fra_idx:
                fra_idx[word] = idx
                idx += 1
            
    idx_en[0] = 'SOS'
    idx_en[1] = 'EOS'
    idx_fra[0] = 'SOS'
    idx_fra[1] = 'EOS'
        
    for word, idx in en_idx.items():
        idx_en[idx] = word
    for word, idx in fra_idx.items():
        idx_fra[idx] = word
    
    return en_idx, fra_idx, idx_en, idx_fra

In [34]:
en_idx, fra_idx, idx_en, idx_fra = build_dict(en, fra)