In [1]:
!pip install sentencepiece

Collecting sentencepiece
  Downloading sentencepiece-0.1.96-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.2 MB)
[?25l[K     |▎                               | 10 kB 26.7 MB/s eta 0:00:01[K     |▌                               | 20 kB 9.2 MB/s eta 0:00:01[K     |▉                               | 30 kB 8.0 MB/s eta 0:00:01[K     |█                               | 40 kB 7.4 MB/s eta 0:00:01[K     |█▍                              | 51 kB 4.1 MB/s eta 0:00:01[K     |█▋                              | 61 kB 4.4 MB/s eta 0:00:01[K     |██                              | 71 kB 4.4 MB/s eta 0:00:01[K     |██▏                             | 81 kB 4.9 MB/s eta 0:00:01[K     |██▍                             | 92 kB 5.0 MB/s eta 0:00:01[K     |██▊                             | 102 kB 4.1 MB/s eta 0:00:01[K     |███                             | 112 kB 4.1 MB/s eta 0:00:01[K     |███▎                            | 122 kB 4.1 MB/s eta 0:00:01[K     |███▌         

In [2]:
import sentencepiece as spm

In [4]:
spm.SentencePieceTrainer.train(
    input = './en-fra.txt',
    model_prefix = './model',
    vocab_size=1000,
    user_defined_symbols = []
)

In [5]:
a = spm.SentencePieceProcessor(model_file='model.model')

In [6]:
ids = a.encode("do you want a car", out_type=int)
ids

[89, 35, 214, 31, 485]

In [7]:
a.decode(ids)

'do you want a car'

In [8]:
import logging
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pad_sequence
import torch.nn as nn
import torch.optim as optim
from torch.utils.tensorboard import SummaryWriter
import torch
import unicodedata
import string
from tqdm import tqdm
from pathlib import Path
from typing import List
import datetime
import time
import re
from torch.utils.tensorboard import SummaryWriter
import torch.nn.functional as F
import random

In [9]:
logging.basicConfig(level=logging.INFO)

FILE = "en-fra.txt"

writer = SummaryWriter("runs/translation"+datetime.datetime.now().strftime("%Y%m%d-%H%M%S"))

def normalize(s):
    return re.sub(' +',' ', "".join(c if c in string.ascii_letters else " "
         for c in unicodedata.normalize('NFD', s.lower().strip())
         if  c in string.ascii_letters+" "+string.punctuation)).strip()

In [10]:

class Vocabulary:
    """Permet de gérer un vocabulaire.

    En test, il est possible qu'un mot ne soit pas dans le
    vocabulaire : dans ce cas le token "__OOV__" est utilisé.
    Attention : il faut tenir compte de cela lors de l'apprentissage !

    Utilisation:

    - en train, utiliser v.get("blah", adding=True) pour que le mot soit ajouté
      automatiquement
    - en test, utiliser v["blah"] pour récupérer l'ID du mot (ou l'ID de OOV)
    """
    PAD = 0
    EOS = 1
    SOS = 2
    OOVID = 3

    def __init__(self, oov: bool):
        self.oov = oov
        self.id2word = ["PAD", "EOS", "SOS"]
        self.word2id = {"PAD": Vocabulary.PAD, "EOS": Vocabulary.EOS, "SOS": Vocabulary.SOS}
        if oov:
            self.word2id["__OOV__"] = Vocabulary.OOVID
            self.id2word.append("__OOV__")

    def __getitem__(self, word: str):
        if self.oov:
            return self.word2id.get(word, Vocabulary.OOVID)
        return self.word2id[word]

    def get(self, word: str, adding=True):
        try:
            return self.word2id[word]
        except KeyError:
            if adding:
                wordid = len(self.id2word)
                self.word2id[word] = wordid
                self.id2word.append(word)
                return wordid
            if self.oov:
                return Vocabulary.OOVID
            raise

    def __len__(self):
        return len(self.id2word)

    def getword(self, idx: int):
        if idx < len(self):
            return self.id2word[idx]
        return None

    def getwords(self, idx: List[int]):
        return [self.getword(i) for i in idx]



In [11]:
class TradDataset():
    def __init__(self,data,vocOrig,vocDest,adding=True,max_len=10):
        self.sentences =[]
        for s in tqdm(data.split("\n")):
            if len(s)<1:continue
            orig,dest=map(normalize,s.split("\t")[:2])
            if len(orig)>max_len: continue
            self.sentences.append((torch.tensor([vocOrig.get(o) for o in a.encode(orig)]+[Vocabulary.EOS]),\
                                   torch.tensor([vocDest.get(o) for o in a.encode(dest)]+[Vocabulary.EOS])))
            
            #self.sentences.append((torch.tensor([vocOrig.get(o) for o in orig.split(" ")]+[Vocabulary.EOS]),\
            #                       torch.tensor([vocDest.get(o) for o in dest.split(" ")]+[Vocabulary.EOS])))
    def __len__(self):return len(self.sentences)
    def __getitem__(self,i): return self.sentences[i]

In [12]:
def collate(batch):
    orig,dest = zip(*batch)
    o_len = torch.tensor([len(o) for o in orig])
    d_len = torch.tensor([len(d) for d in dest])
    return pad_sequence(orig),o_len,pad_sequence(dest),d_len

In [13]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [14]:
with open(FILE) as f:
    lines = f.readlines()

lines = [lines[x] for x in torch.randperm(len(lines))]
idxTrain = int(0.8*len(lines))

vocEng = Vocabulary(True)
vocFra = Vocabulary(True)
MAX_LEN=25
BATCH_SIZE=64

datatrain = TradDataset("".join(lines[:idxTrain]),vocEng,vocFra,max_len=MAX_LEN)
datatest = TradDataset("".join(lines[idxTrain:]),vocEng,vocFra,max_len=MAX_LEN)

train_loader = DataLoader(datatrain, collate_fn=collate, batch_size=BATCH_SIZE, shuffle=True)
test_loader = DataLoader(datatest, collate_fn=collate, batch_size=BATCH_SIZE, shuffle=True)

100%|██████████| 136521/136521 [00:13<00:00, 10194.73it/s]
100%|██████████| 34132/34132 [00:03<00:00, 10245.26it/s]


In [15]:
HIDDEN_SIZE = 128
INPUT_SIZE = len(vocEng)
OUTPUT_SIZE = len(vocFra)

In [16]:
class State:
    def __init__(self, encoder, decoder, optim_enc, optim_dec):
        self.encoder = encoder
        self.decoder = decoder
        self.optimizer_enc = optim_enc
        self.optimizer_dec = optim_dec
        self.epoch, self.iteration = 0, 0

In [17]:
class Encoder(nn.Module):
    def __init__(self, input_size, hidden_size):
        super(Encoder, self).__init__()
        self.hidden_size = hidden_size

        self.embedding = nn.Embedding(input_size, hidden_size)
        self.gru = nn.GRU(hidden_size, hidden_size, dropout=0.2)

    def forward(self, input):
        embedded = self.embedding(input)
        output = embedded
        
        _, hidden = self.gru(output)
        return hidden


In [18]:
class Decoder(nn.Module):
    def __init__(self, hidden_size, output_size):
        super(Decoder, self).__init__()
        self.hidden_size = hidden_size

        self.embedding = nn.Embedding(output_size, hidden_size)
        self.gru = nn.GRU(hidden_size, hidden_size)
        self.out = nn.Linear(hidden_size, output_size)
        self.softmax = nn.LogSoftmax(dim=1)

    def forward(self, input, hidden):
        output = self.embedding(input)
        output, hidden = self.gru(output, hidden)
        output = self.out(output)
        return output, hidden

In [19]:
def train_loop(dataloader,state,L):
    train_loss = 0
    for batch, (X, X_sizes, y, y_sizes) in enumerate(dataloader):
        
        X = X.to(device)
        y = y.to(device)
        
        
        input_length = X.size(0)
        target_length = y.size(0)
        
        loss = 0
        context = state.encoder(X)

        decoder_input = torch.ones(1, X.size(1), dtype=torch.long, device=device) * 2 # 2 is the SOS Token
        decoder_hidden = context

        teacher_forcing = True if random.random() < 0.5 else False
        decoder_outputs = []
        if teacher_forcing:
            for di in range(target_length):
                decoder_output, decoder_hidden = state.decoder(decoder_input, decoder_hidden)
                
                decoder_output = decoder_output.view(decoder_output.shape[1], -1)
                #loss += L(decoder_output, y[di])
                decoder_outputs.append(decoder_output)
                decoder_input = y[di].view(1, -1) 
        else:
            for di in range(target_length):
                decoder_output, decoder_hidden = state.decoder(decoder_input, decoder_hidden)
                decoder_output = decoder_output.view(decoder_output.shape[1], -1)
                _, pred = torch.max(decoder_output, 1)
                pred = pred.view(1, -1)
                decoder_input = pred 
                #loss += L(decoder_output, y[di])
                decoder_outputs.append(decoder_output)

        decoder_outputs = torch.stack(decoder_outputs).permute(0, 2, 1)
        #print(decoder_outputs.shape, y.shape)
        loss += L(decoder_outputs, y)
        train_loss += loss
        
        state.optimizer_enc.zero_grad()
        state.optimizer_dec.zero_grad()
        loss.backward()
        state.optimizer_enc.step()
        state.optimizer_dec.step() 
        
    train_loss = train_loss / len(dataloader)
    return train_loss.item()

In [20]:
def test_loop(dataloader,state,L):
    with torch.no_grad():
        test_loss = 0
        for batch, (X, X_sizes, y, y_sizes) in enumerate(dataloader):
            
            X = X.to(device)
            y = y.to(device)
            
            
            input_length = X.size(0)
            target_length = y.size(0)
            
            loss = 0
            context = state.encoder(X)

            decoder_input = torch.ones(1, X.size(1), dtype=torch.long, device=device) * 2 # 2 is the SOS Token
            decoder_hidden = context
            decoder_outputs = []
            for di in range(target_length):
                decoder_output, decoder_hidden = state.decoder(decoder_input, decoder_hidden)
                decoder_output = decoder_output.view(decoder_output.shape[1], -1)
                _, pred = torch.max(decoder_output, 1)

                pred = pred.view(1, -1)
                decoder_input = pred 
                decoder_outputs.append(decoder_output)
                #loss += L(decoder_output, y[di])

            decoder_outputs = torch.stack(decoder_outputs).permute(0, 2, 1)
            loss += L(decoder_outputs, y)
            test_loss += loss
            
        test_loss = test_loss / len(dataloader)
        return test_loss.item()

In [21]:
def train(train_loader, save_path, tensorboard_name, iterations=500):
    if save_path.is_file():
        with save_path.open('rb') as fp:
            state = torch.load(fp, map_location=torch.device(device))
    else :
        enc = Encoder(INPUT_SIZE, HIDDEN_SIZE).to(device)
        dec = Decoder(HIDDEN_SIZE, OUTPUT_SIZE).to(device)
        optimizer_enc = torch.optim.Adam(enc.parameters(), lr=0.001)
        optimizer_dec = torch.optim.Adam(dec.parameters(), lr=0.001)
        state = State(enc, dec, optimizer_enc, optimizer_dec)
    for epoch in range(state.epoch, iterations):
        loss_train = train_loop(train_loader, state, nn.CrossEntropyLoss())
        loss_test = test_loop(test_loader, state, nn.CrossEntropyLoss())
        with save_path.open("wb") as fp:
            state.epoch = epoch + 1
            torch.save(state, fp)
        writer.add_scalar(tensorboard_name+'/train', loss_train, epoch)
        writer.add_scalar(tensorboard_name+'/test', loss_test, epoch)
        print('Epoch: ', epoch, 'Loss train: ',loss_train, 'Loss test: ',loss_test)
    print("Done!")
    return state.encoder, state.decoder

In [24]:
writer = SummaryWriter("runs/segmentation"+datetime.datetime.now().strftime("%Y%m%d-%H%M%S"))
savepath = Path('./segmentation.pt')
encoder, decoder = train(train_loader, savepath, "Segmentation", iterations=50)

  "num_layers={}".format(dropout, num_layers))


Epoch:  0 Loss train:  2.5818679332733154 Loss test:  2.694697856903076
Epoch:  1 Loss train:  2.1269571781158447 Loss test:  2.4664785861968994
Epoch:  2 Loss train:  1.9573062658309937 Loss test:  2.3791210651397705
Epoch:  3 Loss train:  1.8942962884902954 Loss test:  2.346367597579956
Epoch:  4 Loss train:  1.8177053928375244 Loss test:  2.2873775959014893
Epoch:  5 Loss train:  1.741733193397522 Loss test:  2.328838348388672
Epoch:  6 Loss train:  1.7203929424285889 Loss test:  2.279452323913574
Epoch:  7 Loss train:  1.7117818593978882 Loss test:  2.2273752689361572
Epoch:  8 Loss train:  1.6407390832901 Loss test:  2.2567150592803955
Epoch:  9 Loss train:  1.61990487575531 Loss test:  2.218987464904785
Epoch:  10 Loss train:  1.632145643234253 Loss test:  2.1899619102478027
Epoch:  11 Loss train:  1.6027554273605347 Loss test:  2.226349115371704
Epoch:  12 Loss train:  1.5933877229690552 Loss test:  2.2144806385040283
Epoch:  13 Loss train:  1.566565752029419 Loss test:  2.21023

In [None]:
with torch.no_grad():
    for batch, (X, X_sizes, y, y_sizes) in enumerate(test_loader): 
        X = X.to(device)
        y = y.to(device)

        X = X[:, 0:1]
        y = y[:, 0:1]
        
        preds = []
        input_length = X.size(0)

        loss = 0
        context = encoder(X)

        pred = torch.ones(1, X.size(1), dtype=torch.long, device=device) * 2
        decoder_hidden = context

        while pred.item() != 1: #EOS = 1
            decoder_output, decoder_hidden = decoder(pred, decoder_hidden)
            decoder_output = decoder_output.view(decoder_output.shape[1], -1)
            _, pred = torch.max(decoder_output, 1)
            preds.append(pred.item())  
            pred = pred.view(1, -1)
        break

wds = vocEng.getwords(X)
wds = a.decode(list(filter(lambda x: type(x) == int, wds)))
trs = vocFra.getwords(y)
trs = a.decode(list(filter(lambda x: type(x) == int, trs)))
predtrs = vocFra.getwords(preds)
predtrs = a.decode(list(filter(lambda x: type(x) == int, predtrs)))
print(wds)
print(trs)
print(predtrs)

who did it
qui a fait ca
qui l a fait
