In [1]:
import itertools
import logging
from tqdm import tqdm

from datamaestro import prepare_dataset
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pad_sequence
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.tensorboard import SummaryWriter
import torch
import datetime
from typing import List
import time
from pathlib import Path
logging.basicConfig(level=logging.INFO)

ds = prepare_dataset('org.universaldependencies.french.gsd')


# Format de sortie décrit dans
# https://pypi.org/project/conllu/

class Vocabulary:
    """Permet de gérer un vocabulaire.

    En test, il est possible qu'un mot ne soit pas dans le
    vocabulaire : dans ce cas le token "__OOV__" est utilisé.
    Attention : il faut tenir compte de cela lors de l'apprentissage !

    Utilisation:

    - en train, utiliser v.get("blah", adding=True) pour que le mot soit ajouté
      automatiquement s'il n'est pas connu
    - en test, utiliser v["blah"] pour récupérer l'ID du mot (ou l'ID de OOV)
    """
    OOVID = 1
    PAD = 0

    def __init__(self, oov: bool):
        """ oov : autorise ou non les mots OOV """
        self.oov =  oov
        self.id2word = [ "PAD"]
        self.word2id = { "PAD" : Vocabulary.PAD}
        if oov:
            self.word2id["__OOV__"] = Vocabulary.OOVID
            self.id2word.append("__OOV__")

    def __getitem__(self, word: str):
        if self.oov:
            return self.word2id.get(word, Vocabulary.OOVID)
        return self.word2id[word]

    def get(self, word: str, adding=True):
        try:
            return self.word2id[word]
        except KeyError:
            if adding:
                wordid = len(self.id2word)
                self.word2id[word] = wordid
                self.id2word.append(word)
                return wordid
            if self.oov:
                return Vocabulary.OOVID
            raise

    def __len__(self):
        return len(self.id2word)

    def getword(self,idx: int):
        if idx < len(self):
            return self.id2word[idx]
        return None

    def getwords(self,idx: List[int]):
        return [self.getword(i) for i in idx]



class TaggingDataset():
    def __init__(self, data, words: Vocabulary, tags: Vocabulary, adding=True):
        self.sentences = []

        for s in data:
            self.sentences.append(([words.get(token["form"], adding) for token in s], [tags.get(token["upostag"], adding) for token in s]))
    def __len__(self):
        return len(self.sentences)
    def __getitem__(self, ix):
        return self.sentences[ix]


def collate_fn(batch):
    """Collate using pad_sequence"""
    return tuple(pad_sequence([torch.LongTensor(b[j]) for b in batch]) for j in range(2))


logging.info("Loading datasets...")
words = Vocabulary(True)
tags = Vocabulary(False)
train_data = TaggingDataset(ds.train, words, tags, True)
dev_data = TaggingDataset(ds.validation, words, tags, True)
test_data = TaggingDataset(ds.test, words, tags, False)


logging.info("Vocabulary size: %d", len(words))




INFO:root:Loading datasets...
INFO:root:Vocabulary size: 42928


In [2]:
import numpy as np

In [3]:
BATCH_SIZE=64

train_loader = DataLoader(train_data, collate_fn=collate_fn, batch_size=BATCH_SIZE, shuffle=True)
dev_loader = DataLoader(dev_data, collate_fn=collate_fn, batch_size=BATCH_SIZE)
test_loader = DataLoader(test_data, collate_fn=collate_fn, batch_size=BATCH_SIZE)

EMB_DIM = len(words) // 100
HIDDEN_DIM = 10
VOCAB_SIZE = len(words)
TAGSET_SIZE = len(tags)

In [4]:

writer = SummaryWriter("runs/tagger/runs"+datetime.datetime.now().strftime("%Y%m%d-%H%M%S"))

class State:
    def __init__(self, model, optim):
        self.model = model
        self.optimizer = optim
        self.epoch, self.iteration = 0, 0

In [5]:
class LSTMTagger(nn.Module):
    def __init__(self, embedding_dim, hidden_dim, vocab_size, tagset_size):
        super(LSTMTagger, self).__init__()
        self.hidden_dim = hidden_dim
        self.word_embeddings = nn.Embedding(vocab_size, embedding_dim)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim)
        self.hidden2tag = nn.Linear(hidden_dim, tagset_size)

    def forward(self, sentence):
        embeds = self.word_embeddings(sentence)
        lstm_out, _ = self.lstm(embeds)
        tag_space = self.hidden2tag(lstm_out)
        return tag_space

In [6]:
def train_loop(dataloader,state):
    train_loss = 0
    nb_oov = 10
    for batch, (X, y) in enumerate(train_loader): 
        #OOV
        i, j = np.random.randint(0, X.shape[0], size=nb_oov), np.random.randint(0, X.shape[1], size=nb_oov)
        for k,l in zip(i,j):
            if X[k, l] != 0: # On remplace pas les pads!
                X[k, l] = 1
        yhat = state.model(X)
        L = nn.CrossEntropyLoss(ignore_index=0)
        y = y.permute(1, 0)
        yhat = yhat.permute(1, 2, 0)
        loss = L(yhat , y)
        state.optimizer.zero_grad()
        loss.backward()
        state.optimizer.step()
        state.iteration += 1
        train_loss += loss
    train_loss = train_loss / len(dataloader)
    return train_loss.item()

In [7]:
def test_loop(dataloader,model):
    test_loss = 0
    with torch.no_grad():
        for batch, (X, y) in enumerate(train_loader): 
            yhat = model(X)
            L = nn.CrossEntropyLoss(ignore_index=0)
            y = y.permute(1, 0)
            yhat = yhat.permute(1, 2, 0)
            loss = L(yhat , y)
            test_loss += loss
    test_loss = test_loss / len(dataloader)
    return test_loss.item()

In [10]:
def train(data_train, data_test, save_path, Model, tensorboard_name, iterations=500):
    if save_path.is_file():
        with save_path.open('rb') as fp:
            state = torch.load(fp)
    else :
        model = Model(EMB_DIM, HIDDEN_DIM, VOCAB_SIZE, TAGSET_SIZE)
        optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
        state = State(model, optimizer)
    for epoch in range(state.epoch, iterations):
        loss_train = train_loop(data_train, state)
        with save_path.open("wb") as fp:
            state.epoch = epoch + 1
            torch.save(state, fp)
        loss_test = test_loop(data_test, state.model)
        writer.add_scalar(tensorboard_name+'/train', loss_train, epoch)
        writer.add_scalar(tensorboard_name+'/dev',loss_test , epoch)
        print('Epoch: ', epoch, ' Loss dev: ', loss_test, 'Loss train: ',loss_train)
    print("Done!")
    return state.model

In [11]:
savepath = Path('./lstmTagger.pt')
model = train(train_loader, dev_loader, savepath, LSTMTagger, "LSTMTagger", iterations=50)

Epoch:  0  Loss dev:  57.150394439697266 Loss train:  2.27164626121521
Epoch:  1  Loss dev:  38.48114776611328 Loss train:  1.4752479791641235
Epoch:  2  Loss dev:  27.564605712890625 Loss train:  1.0282931327819824
Epoch:  3  Loss dev:  20.744081497192383 Loss train:  0.7607361674308777
Epoch:  4  Loss dev:  16.40595245361328 Loss train:  0.5908533930778503
Epoch:  5  Loss dev:  13.465606689453125 Loss train:  0.47762659192085266
Epoch:  6  Loss dev:  11.350252151489258 Loss train:  0.39774686098098755
Epoch:  7  Loss dev:  9.81848430633545 Loss train:  0.34065738320350647
Epoch:  8  Loss dev:  8.62510871887207 Loss train:  0.29790744185447693
Epoch:  9  Loss dev:  7.686396598815918 Loss train:  0.26438474655151367
Epoch:  10  Loss dev:  6.938488960266113 Loss train:  0.23740534484386444
Epoch:  11  Loss dev:  6.31382417678833 Loss train:  0.2160046547651291
Epoch:  12  Loss dev:  5.804275989532471 Loss train:  0.197427898645401
Epoch:  13  Loss dev:  5.3275837898254395 Loss train:  0

In [16]:
def accuracy_calculator(model, loader):
    test_loss = 0
    test_acc = 0
    with torch.no_grad():
        for batch, (X, y) in enumerate(loader): 
            yhat = model(X)
            L = nn.CrossEntropyLoss()
            y = y.permute(1, 0)
            yhat = yhat.permute(1, 2, 0)
            loss = L(yhat , y)
            _, pred = torch.max(yhat, 1)
            partial_acc = 0
            for (i, y_row) in enumerate(y):
                y_filtered = y_row[y_row != 0]
                pred_filtered = pred[i][:len(y_filtered)]
                partial_acc += torch.sum( pred_filtered == y_filtered) / len(y_filtered)
            acc = partial_acc / loader.batch_size
            test_acc += acc
            test_loss += loss
    return test_acc / len(loader)

In [17]:
accuracy_calculator(model, test_loader)

tensor(0.8573)

In [18]:
accuracy_calculator(model, dev_loader)

tensor(0.8430)

In [19]:
accuracy_calculator(model, train_loader)

tensor(0.9845)

In [26]:
with torch.no_grad():
    for batch, (X, y) in enumerate(test_loader): 
        
        yhat = model(X)
        L = nn.CrossEntropyLoss()
        y = y.permute(1, 0)
        yhat = yhat.permute(1, 2, 0)
        loss = L(yhat , y)
        _, pred = torch.max(yhat, 1)
        acc = torch.sum( pred == y) / (test_loader.batch_size * y.shape[1])
        break
i = 0
s = 55
X = X.T
wds = words.getwords(X[s,:])
tgs = tags.getwords(y[s,:])
predtgs = tags.getwords(pred[s,:])
while wds[i] != 'PAD':
    print("Word:{: <20} Tag:{: <20} Predicted Tag:{: <20}"\
          .format(wds[i], tgs[i], predtgs[i]))
    i += 1

Word:La                   Tag:DET                  Predicted Tag:DET                 
Word:__OOV__              Tag:NOUN                 Predicted Tag:NOUN                
Word:rend                 Tag:VERB                 Predicted Tag:VERB                
Word:aussi                Tag:ADV                  Predicted Tag:ADV                 
Word:__OOV__              Tag:ADJ                  Predicted Tag:DET                 
Word:que                  Tag:SCONJ                Predicted Tag:SCONJ               
Word:les                  Tag:DET                  Predicted Tag:DET                 
Word:drogues              Tag:NOUN                 Predicted Tag:NOUN                
Word:dures                Tag:ADJ                  Predicted Tag:ADJ                 
Word:!                    Tag:PUNCT                Predicted Tag:PUNCT               
