In [1]:
!pip install -r http://webia.lip6.fr/~baskiotisn/requirements-amal.txt

In [1]:
import logging
import re
from pathlib import Path
from tqdm import tqdm
import numpy as np
from torch.nn.utils.rnn import pad_sequence
from datamaestro import prepare_dataset
import torch.nn.functional as F
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader

In [2]:
class FolderText(Dataset):
    """Text-classification dataset backed by one sub-folder per class.

    Every ``*.txt`` file found in ``folder / <class name>`` is one sample whose
    label is the index of that class in ``classes``.

    Args:
        classes: iterable of class names; each name is also a sub-folder name.
        folder: root directory containing one sub-folder per class.
        tokenizer: callable applied to the raw text of a file in ``__getitem__``.
        load: if True, file contents are read once in the constructor; otherwise
            only the paths are stored and files are read on every access.
        encoding: text encoding used for *every* read (eager and lazy).
    """

    def __init__(self, classes, folder: Path, tokenizer, load=False, encoding='cp437'):
        self.tokenizer = tokenizer
        self.encoding = encoding
        self.files = []        # raw texts (load=True) or Path objects (load=False)
        self.filelabels = []   # integer label of each entry in self.files
        self.labels = {key: ix for ix, key in enumerate(classes)}

        for label in classes:
            # Sorted so that the index -> file mapping does not depend on the
            # order in which the filesystem happens to list the directory.
            for file in sorted((folder / label).glob("*.txt")):
                self.files.append(file.read_text(encoding=encoding) if load else file)
                self.filelabels.append(self.labels[label])

    def __len__(self):
        """Number of samples (files) in the dataset."""
        return len(self.filelabels)

    def __getitem__(self, ix):
        """Return ``(tokenizer(text), label)`` for sample ``ix``."""
        s = self.files[ix]
        # Lazy reads must use the same encoding as eager reads; otherwise the
        # two modes yield different text (or raise UnicodeDecodeError).
        text = s if isinstance(s, str) else s.read_text(encoding=self.encoding)
        return self.tokenizer(text), self.filelabels[ix]

In [3]:
def get_imdb_data(embedding_size=50):
    """Return everything needed to train on the IMDB dataset.

    Args:
        embedding_size: size of the GloVe vectors to load (used to build the
            dataset id ``edu.stanford.glove.6b.<embedding_size>``).

    Returns:
        A 4-tuple ``(word2id, embeddings, train_dataset, test_dataset)``:

        - ``word2id``: dict mapping each word to its row index in ``embeddings``,
          including the two extra entries ``"__OOV__"`` and ``"__PAD__"``.
        - ``embeddings``: the GloVe matrix with two extra all-zero rows appended
          (first the OOV row, then the PAD row).
        - ``train_dataset`` / ``test_dataset``: ``FolderText`` datasets that read
          files lazily and tokenize them into lists of word ids.
    """
    WORDS = re.compile(r"\S+")

    logging.info("Loading embeddings")
    words, embeddings = prepare_dataset('edu.stanford.glove.6b.%d' % embedding_size).load()
    OOVID = len(words)
    words.append("__OOV__")
    words.append("__PAD__")

    word2id = {word: ix for ix, word in enumerate(words)}
    # The PAD row must be all zeros: the models in this file build their masks
    # by comparing embeddings to 0, so a non-zero PAD vector would make padded
    # positions count as real tokens in the mean and in the attention.
    embeddings = np.vstack((embeddings, np.zeros(embedding_size), np.zeros(embedding_size)))

    def tokenizer(t):
        """Lower-case, split on whitespace and map each word to its id (OOV id if unknown)."""
        return [word2id.get(x, OOVID) for x in WORDS.findall(t.lower())]

    logging.info("Get the IMDB dataset")
    ds = prepare_dataset("edu.stanford.aclimdb")

    train_dataset = FolderText(ds.train.classes, ds.train.path, tokenizer, load=False)
    test_dataset = FolderText(ds.test.classes, ds.test.path, tokenizer, load=False)
    return word2id, embeddings, train_dataset, test_dataset



In [4]:
# Embedding size: selects the GloVe vectors to load and is passed to the model constructors.
EMB_DIM = 200
# Number of samples per batch for both DataLoaders.
BATCH_SIZE = 64
# collate_fn keeps at most this many tokens per sample.
MAX_LENGTH = 500

In [5]:
# Load the vocabulary, the embedding matrix and the train/test datasets.
word2id, embeddings, data_train, data_test = get_imdb_data(embedding_size=EMB_DIM)

In [6]:
# Use the GPU when CUDA is available, otherwise the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# Lookup table built from the embedding matrix returned by get_imdb_data.
# The layer itself is not moved to `device`; collate_fn moves its output instead.
emb_layer = nn.Embedding.from_pretrained(torch.Tensor(embeddings))
def collate_fn(batch):
    """Build one batch of embedded, padded sequences and their labels.

    Each sample's token ids are truncated to MAX_LENGTH, the sequences are
    padded (batch first) with the id of "__PAD__", looked up in ``emb_layer``
    and moved to ``device`` together with the label tensor.
    """
    sequences = []
    targets = []
    for sample in batch:
        sequences.append(torch.LongTensor(sample[0][:MAX_LENGTH]))
        targets.append(sample[1])

    pad_id = word2id["__PAD__"]
    padded_ids = pad_sequence(sequences, batch_first=True, padding_value=pad_id)
    embedded = emb_layer(padded_ids).to(device)
    label_tensor = torch.LongTensor(targets).to(device)
    return (embedded, label_tensor)

In [7]:
# Training batches are reshuffled at every epoch.
train_loader = DataLoader(data_train, collate_fn=collate_fn, batch_size=BATCH_SIZE, shuffle=True)
# Evaluation gains nothing from shuffling; a fixed order keeps the batch
# composition (and therefore the reported metrics) stable between runs.
test_loader = DataLoader(data_test, collate_fn=collate_fn, batch_size=BATCH_SIZE, shuffle=False)

In [None]:
class BaselineModel(nn.Module):
    """Baseline classifier: sigmoid(linear(mean of the non-padding embeddings)).

    Args:
        emb_dim: size of the last dimension of the input embeddings.
    """

    def __init__(self, emb_dim):
        super().__init__()
        self.emb_dim = emb_dim
        self.linear = nn.Linear(emb_dim, 1)
        self.m = nn.Sigmoid()

    def forward(self, x):
        """Score a batch of embedded sequences.

        Args:
            x: tensor of shape (batch, seq_len, emb_dim); positions whose
                embedding is entirely zero are treated as padding.

        Returns:
            Tensor of shape (batch, 1) with values in (0, 1).
        """
        # A position is a real token as soon as one of its components is non-zero.
        # Counting per token (not per component) keeps the denominator identical
        # across embedding dimensions even when a real token has a zero component.
        token_mask = (x != 0).any(dim=-1, keepdim=True)   # (batch, seq_len, 1)
        # clamp avoids 0/0 = NaN for a sequence made only of zero vectors.
        n_tokens = token_mask.sum(dim=1).clamp(min=1).to(x.dtype)  # (batch, 1)
        mean_emb = x.sum(dim=1) / n_tokens
        return self.m(self.linear(mean_emb))

In [13]:
class State:
    """Checkpointable training state: model, optimizer and progress counters."""

    def __init__(self, model, optim):
        # Counters start at zero; the training code advances them.
        self.epoch = 0
        self.iteration = 0
        self.model, self.optimizer = model, optim

In [8]:
def train_loop(dataloader, state):
    """Run one training epoch and return ``(mean loss, accuracy)`` as floats.

    Args:
        dataloader: iterable of ``(X, y)`` batches; ``y`` holds 0/1 labels.
        state: object with ``model``, ``optimizer`` and ``iteration`` attributes;
            ``iteration`` is incremented once per optimisation step.

    Returns:
        ``(loss, accuracy)``, both averaged over the *samples* seen, so a
        smaller last batch is weighted correctly. ``(0.0, 0.0)`` if the
        dataloader yields no batch.
    """
    criterion = nn.BCELoss()
    loss_sum = 0.0
    n_correct = 0
    n_samples = 0

    state.model.train()
    for X, y in dataloader:
        y = y.reshape(-1, 1).float()
        yhat = state.model(X)
        loss = criterion(yhat, y)

        state.optimizer.zero_grad()
        loss.backward()
        state.optimizer.step()
        state.iteration += 1

        batch_n = y.shape[0]
        # .item() detaches the value: accumulating the tensor itself would keep
        # a reference to every batch's autograd history for the whole epoch.
        loss_sum += loss.item() * batch_n
        # Count against the real batch size, not dataloader.batch_size, which
        # is wrong for a partial last batch.
        n_correct += int(((yhat > 0.5).float() == y).sum().item())
        n_samples += batch_n

    if n_samples == 0:
        return 0.0, 0.0
    return loss_sum / n_samples, n_correct / n_samples

In [9]:
def test_loop(dataloader, model):
    test_loss = 0
    test_accuracy = 0
    L = nn.BCELoss()
    for batch, (X, y) in enumerate(dataloader):  
        with torch.no_grad():
            #X = X.permute(1,0,2)    
            yhat = model(X)
            y = y.reshape(-1, 1).float()
            loss = L(yhat, y)
            test_loss += loss
            acc = (torch.sum( torch.where(yhat > 0.5, 1, 0) == y) / dataloader.batch_size)
            test_accuracy += acc

    test_loss = test_loss / len(dataloader)
    test_accuracy = test_accuracy / len(dataloader)
    return test_loss.item(), test_accuracy.item()

In [10]:
def train(data_train, data_test, save_path, Model, tensorboard_name, iterations=500):
    """Train ``Model`` with per-epoch checkpointing and return the trained model.

    If ``save_path`` exists, the pickled ``State`` it contains is loaded and
    training resumes from ``state.epoch``; otherwise a new ``Model(EMB_DIM)``
    and an Adam optimizer (lr=0.001) are created.

    Args:
        data_train: batches passed to ``train_loop``.
        data_test: batches passed to ``test_loop``.
        save_path: ``pathlib.Path`` of the checkpoint file (read if present,
            rewritten after every epoch).
        Model: callable building the model from the embedding size.
        tensorboard_name: currently unused; kept for interface compatibility.
        iterations: epoch index at which training stops.

    Returns:
        The model held by the training state.
    """
    if save_path.is_file():
        with save_path.open('rb') as fp:
            # map_location lets a checkpoint written on another device (e.g. a
            # GPU) be resumed on the current one.
            # NOTE(review): the checkpoint is a pickle of the whole State object;
            # only load files you created yourself. Recent PyTorch releases may
            # also require weights_only=False for this kind of checkpoint.
            state = torch.load(fp, map_location=device)
    else:
        model = Model(EMB_DIM).to(device)
        optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
        state = State(model, optimizer)

    for epoch in range(state.epoch, iterations):
        loss_train, acc_train = train_loop(data_train, state)
        loss_test, acc_test = test_loop(data_test, state.model)
        # Record the next epoch to run before saving, so that a resumed run
        # does not repeat the epoch that has just finished.
        state.epoch = epoch + 1
        with save_path.open("wb") as fp:
            torch.save(state, fp)
        print('Epoch: ', epoch, 'Loss train: ',loss_train, 'Acc train: ',acc_train)
        print('       ', epoch, 'Loss test: ',loss_test, 'Acc test: ',acc_test)
    print("Done!")
    return state.model

In [None]:
# Train the baseline model (or resume from ./BaselineModel.pt if it exists).
savepath = Path('./BaselineModel.pt')
model = train(train_loader, test_loader, savepath, BaselineModel, "baseline", iterations=20)

In [None]:
class SimpleAttention(nn.Module):
    """Self-attention classifier: every position queries every position.

    Queries are a linear map of the embeddings; keys and values are the
    embeddings themselves. The attended vectors are summed over positions and
    fed to a linear layer followed by a sigmoid.

    Args:
        emb_dim: size of the last dimension of the input embeddings.
    """

    def __init__(self, emb_dim):
        super().__init__()
        self.emb_dim = emb_dim
        self.q = nn.Linear(emb_dim, emb_dim)
        self.linear = nn.Linear(emb_dim, 1)
        self.m = nn.Sigmoid()

    def forward(self, x):
        """Score a batch of embedded sequences.

        Args:
            x: tensor of shape (batch, seq_len, emb_dim); positions whose
                embedding is entirely zero are treated as padding.

        Returns:
            Tensor of shape (batch, 1) with values in (0, 1).
        """
        key_is_real = (x != 0).any(dim=-1)                    # (batch, seq_len)
        has_real = key_is_real.any(dim=-1, keepdim=True)      # (batch, 1)
        # Mask padded keys explicitly instead of guessing them from "score == 0".
        # Sequences without any real token are left unmasked: their values are
        # all zero anyway, and softmax over a row of -inf would yield NaN in
        # both the forward and the backward pass.
        pad_keys = (~key_is_real) & has_real                  # (batch, seq_len)

        scores = torch.bmm(self.q(x), x.permute(0, 2, 1))     # (batch, seq_len, seq_len)
        scores = scores.masked_fill(pad_keys.unsqueeze(1), float("-inf"))
        alpha = F.softmax(scores, dim=-1)

        attended = torch.bmm(alpha, x)                        # (batch, seq_len, emb_dim)
        # NOTE(review): no query mask is applied, so padded query positions
        # still contribute to this sum.
        pooled = attended.sum(dim=1)
        return self.m(self.linear(pooled))

In [None]:
# Train the SimpleAttention model (or resume from ./SimpleAttention.pt if it exists).
savepath = Path('./SimpleAttention.pt')
model = train(train_loader, test_loader, savepath, SimpleAttention, "SimpleAttention", iterations=20)

In [11]:
class QVAttention(nn.Module):
    """Attention classifier with a single query built from the mean embedding.

    The query is a linear map of the mean of the non-padding embeddings; keys
    and values are the embeddings themselves. The attended vector is fed to a
    linear layer followed by a sigmoid.

    Args:
        emb_dim: size of the last dimension of the input embeddings.
    """

    def __init__(self, emb_dim):
        super().__init__()
        self.emb_dim = emb_dim
        self.q = nn.Linear(emb_dim, emb_dim)
        self.linear = nn.Linear(emb_dim, 1)
        self.m = nn.Sigmoid()

    def forward(self, x):
        """Score a batch of embedded sequences.

        Args:
            x: tensor of shape (batch, seq_len, emb_dim); positions whose
                embedding is entirely zero are treated as padding.

        Returns:
            Tensor of shape (batch, 1) with values in (0, 1).
        """
        token_is_real = (x != 0).any(dim=-1)                  # (batch, seq_len)
        n_real = token_is_real.sum(dim=1, keepdim=True)       # (batch, 1)
        # Mean over real tokens; clamp avoids 0/0 = NaN when a sequence
        # contains only zero vectors.
        mean_emb = x.sum(dim=1) / n_real.clamp(min=1).to(x.dtype)

        query = self.q(mean_emb).unsqueeze(1)                 # (batch, 1, emb_dim)
        # The words (x) are the keys.
        scores = torch.bmm(query, x.permute(0, 2, 1))         # (batch, 1, seq_len)

        # Mask padded keys explicitly instead of guessing them from "score == 0".
        # Sequences without any real token are left unmasked so that softmax
        # never sees a row made only of -inf (which would yield NaN).
        pad_keys = (~token_is_real) & (n_real > 0)            # (batch, seq_len)
        scores = scores.masked_fill(pad_keys.unsqueeze(1), float("-inf"))
        alpha = F.softmax(scores, dim=-1)

        pooled = torch.bmm(alpha, x).sum(dim=1)               # (batch, emb_dim)
        return self.m(self.linear(pooled))

In [14]:
# Train the QVAttention model (or resume from ./QVAttention.pt if it exists).
savepath = Path('./QVAttention.pt')
model = train(train_loader, test_loader, savepath, QVAttention, "QVAttention", iterations=20)