In [2]:
pip install -r http://webia.lip6.fr/~baskiotisn/requirements-amal.txt

In [1]:
import math
import click
from torch.utils.tensorboard import SummaryWriter
import logging
import re
from pathlib import Path
from tqdm import tqdm
import numpy as np
import time
from datamaestro import prepare_dataset
import torch.nn.functional as F
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader

In [2]:
# Maximum number of tokens kept per review; `collate` truncates longer token lists to this length.
MAX_LENGTH = 500
# Emit INFO-level log records (used by the logging.info calls in get_imdb_data).
logging.basicConfig(level=logging.INFO)

In [3]:
class FolderText(Dataset):
    """Text dataset stored as one sub-folder per class, one ``*.txt`` file per sample.

    Args:
        classes: iterable of class names; each name is also the sub-folder name,
            and its position in the iterable is the integer label.
        folder: root directory containing the class sub-folders.
        tokenizer: callable applied to the raw text of a sample in ``__getitem__``.
        load: when true, file contents are read eagerly at construction time;
            otherwise only the paths are stored and files are read on access.
    """

    def __init__(self, classes, folder: Path, tokenizer, load=False):
        self.tokenizer = tokenizer
        self.labels = {name: index for index, name in enumerate(classes)}
        self.files = []
        self.filelabels = []

        for name in classes:
            class_id = self.labels[name]
            for path in (folder / name).glob("*.txt"):
                if load:
                    self.files.append(path.read_text(encoding='utf-8'))
                else:
                    self.files.append(path)
                self.filelabels.append(class_id)

    def __len__(self):
        return len(self.filelabels)

    def _raw_text(self, ix):
        """Return the text of sample ``ix``, reading the file if it was not preloaded."""
        entry = self.files[ix]
        if isinstance(entry, str):
            return entry
        return entry.read_text(encoding='utf-8')

    def __getitem__(self, ix):
        """Return ``(tokenizer(text), label)`` for sample ``ix``."""
        return self.tokenizer(self._raw_text(ix)), self.filelabels[ix]

    def get_txt(self, ix):
        """Return ``(raw text, label)`` for sample ``ix`` without tokenizing."""
        return self._raw_text(ix), self.filelabels[ix]

In [4]:
def get_imdb_data(embedding_size=50):
    """Load the GloVe embeddings and the IMDB train/test datasets.

    Args:
        embedding_size: GloVe vector dimension to load (one of 50, 100, 200, 300).

    Returns:
        A 4-tuple ``(word2id, embeddings, train, test)``:

        - ``word2id``: dict mapping each word to its integer id; the extra
          ``"__OOV__"`` entry has the last id.
        - ``embeddings``: numpy array of word vectors with one all-zero row
          appended for ``"__OOV__"``.
        - ``train`` / ``test``: ``FolderText`` datasets whose tokenizer lowercases
          the text, splits on whitespace and maps unknown words to the OOV id.
    """
    token_pattern = re.compile(r"\S+")

    # Log before the load so the message announces the work instead of trailing it.
    logging.info("Loading embeddings")
    words, embeddings = prepare_dataset(
        'edu.stanford.glove.6b.%d' % embedding_size).load()

    # Reserve one extra id, backed by a zero vector, for out-of-vocabulary words.
    oov_id = len(words)
    words.append("__OOV__")
    word2id = {word: ix for ix, word in enumerate(words)}
    embeddings = np.vstack((embeddings, np.zeros(embedding_size)))

    def tokenizer(t):
        return [word2id.get(token, oov_id) for token in token_pattern.findall(t.lower())]

    logging.info("Get the IMDB dataset")
    ds = prepare_dataset("edu.stanford.aclimdb")

    train_set = FolderText(ds.train.classes, ds.train.path, tokenizer, load=False)
    test_set = FolderText(ds.test.classes, ds.test.path, tokenizer, load=False)
    return word2id, embeddings, train_set, test_set


In [5]:
# Experiment settings.
# NOTE(review): `epochs`, `test_iterations` and `modeltype` are not referenced by any
# cell visible in this notebook (the training cells pass iterations=20 explicitly) — confirm
# whether they are still needed.
epochs = 10
test_iterations = 10
modeltype = 1
# GloVe embedding dimension passed to get_imdb_data.
emb_size = 100
# Batch size used by both the train and test DataLoaders.
batch_size = 64

In [6]:
# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Vocabulary, GloVe matrix and the two IMDB datasets.
word2id, embeddings, train_data, test_data = get_imdb_data(emb_size)
id2word = {ix: word for word, ix in word2id.items()}

# Padding reuses the id of the "__OOV__" entry of the vocabulary.
PAD = word2id["__OOV__"]

# Convert the numpy matrix once and build the embedding lookup from it.
embeddings = torch.Tensor(embeddings)
emb_layer = nn.Embedding.from_pretrained(embeddings)

def collate(batch):
    """Collate function for the DataLoaders.

    Each item of ``batch`` is indexed as ``item[0]`` (token ids) and ``item[1]``
    (label). Token lists are truncated to ``MAX_LENGTH``, padded with ``PAD`` to
    the longest sequence of the batch and passed through ``emb_layer``.

    Returns:
        ``(embedded, labels, lengths)``, all moved to ``device``: the embedded
        padded batch, the labels as a LongTensor, and the truncated sequence
        lengths as a float tensor.
    """
    sequences = []
    labels = []
    for sample in batch:
        sequences.append(torch.LongTensor(sample[0][:MAX_LENGTH]))
        labels.append(sample[1])
    lens = list(map(len, sequences))

    padded = torch.nn.utils.rnn.pad_sequence(
        sequences, batch_first=True, padding_value=PAD)
    embedded = emb_layer(padded).to(device)
    label_tensor = torch.LongTensor(labels).to(device)
    length_tensor = torch.Tensor(lens).to(device)
    return embedded, label_tensor, length_tensor


# Training batches are reshuffled on every pass; test batches keep the dataset order.
# Both loaders build their batches with `collate`.
train_loader = DataLoader(train_data, shuffle=True,
                      batch_size=batch_size, collate_fn=collate)
test_loader = DataLoader(test_data, batch_size=batch_size,collate_fn=collate,shuffle=False)

In [8]:
class SelfAttention(nn.Module):
    """Single-head self-attention followed by a ReLU-activated linear layer.

    Args:
        emb_size: size of the last dimension of the input and of the output.
    """

    def __init__(self, emb_size):
        super(SelfAttention, self).__init__()
        self.emb_size = emb_size
        self.out_size1 = 100  # not used by this layer; kept so the attribute still exists

        self.q1 = nn.Linear(self.emb_size, self.emb_size)
        self.k1 = nn.Linear(self.emb_size, self.emb_size)
        self.v1 = nn.Linear(self.emb_size, self.emb_size)
        self.l1 = nn.Linear(self.emb_size, self.emb_size)

    def forward(self, X, d):
        """Apply attention to a padded batch.

        Args:
            X: tensor of shape (batch, seq, emb_size).
            d: tensor with one entry per batch element giving the number of
               non-padded positions of that sequence.

        Returns:
            Tensor of shape (batch, seq, emb_size). Rows at padded query positions
            are still computed; callers should ignore them.
        """
        q = self.q1(X)
        k = self.k1(X)
        v = self.v1(X)

        # Scaled dot-product scores: divide by sqrt(key dimension) so the logits
        # do not grow with the embedding size.
        scores = torch.bmm(q, k.transpose(-2, -1)) / (self.emb_size ** 0.5)

        # Hide padded keys using the true lengths. The clamp keeps at least one
        # key visible so that an empty sequence cannot produce an all -inf row (NaN).
        lengths = d.reshape(-1, 1).clamp(min=1)
        positions = torch.arange(X.size(1), device=X.device).unsqueeze(0)
        key_is_pad = (positions >= lengths).unsqueeze(1)  # (batch, 1, seq)
        scores = scores.masked_fill(key_is_pad, float("-inf"))

        alpha = F.softmax(scores, dim=-1)

        f = torch.bmm(alpha, v)
        return F.relu(self.l1(f))

In [9]:
class Transformer(nn.Module):
    """Stack of ``L`` self-attention layers followed by mean pooling and a binary classifier.

    Args:
        emb_size: size of the last dimension of the input.
        L: number of ``SelfAttention`` layers.
    """

    def __init__(self, emb_size, L):
        super(Transformer, self).__init__()
        self.attentions = nn.ModuleList()
        self.fc = nn.Linear(emb_size, 1)
        for _ in range(L):
            self.attentions.append(SelfAttention(emb_size))

    def forward(self, x, d):
        """Return one probability per sequence, shape (batch, 1).

        Args:
            x: tensor of shape (batch, seq, emb_size).
            d: tensor with one entry per batch element giving the number of
               non-padded positions of that sequence.
        """
        for layer in self.attentions:
            x = layer(x, d)

        # Average over the real tokens only: padded positions would otherwise
        # pull the representation of short sequences towards padding values.
        lengths = d.reshape(-1, 1).clamp(min=1)
        positions = torch.arange(x.size(1), device=x.device).unsqueeze(0)
        valid = (positions < lengths).unsqueeze(-1).to(x.dtype)  # (batch, seq, 1)
        mean = (x * valid).sum(dim=1) / valid.sum(dim=1)

        return torch.sigmoid(self.fc(mean))

In [10]:
class State:
    """Training state saved to and restored from a checkpoint.

    Attributes set at construction: ``model``, ``optimizer``, and the counters
    ``epoch`` and ``iteration``, both starting at 0.
    """

    def __init__(self, model, optim):
        self.model, self.optimizer = model, optim
        self.epoch = 0
        self.iteration = 0

In [11]:
def train_loop(dataloader, state):
    """Run one training pass over ``dataloader`` and update ``state`` in place.

    Args:
        dataloader: iterable yielding ``(X, y, d)`` batches (inputs, labels, lengths).
        state: object exposing ``model``, ``optimizer`` and an ``iteration`` counter,
            which is incremented once per batch.

    Returns:
        ``(loss, accuracy)`` as Python floats, both averaged over the samples seen.
        Both are NaN when the dataloader yields no sample.
    """
    criterion = nn.BCELoss()
    state.model.train()

    total_loss = 0.0
    n_correct = 0
    n_samples = 0
    for X, y, d in dataloader:
        y = y.reshape(-1, 1).float()
        yhat = state.model(X, d)
        loss = criterion(yhat, y)

        state.optimizer.zero_grad()
        loss.backward()
        state.optimizer.step()
        state.iteration += 1

        # Accumulate plain Python numbers so no autograd history is kept alive,
        # and weight by the real batch size (the last batch may be smaller).
        n = y.size(0)
        total_loss += loss.item() * n
        n_correct += ((yhat > 0.5).float() == y).sum().item()
        n_samples += n

    if n_samples == 0:
        return float("nan"), float("nan")
    return total_loss / n_samples, n_correct / n_samples

In [12]:
def test_loop(dataloader, model):
    test_loss = 0
    test_accuracy = 0
    L = nn.BCELoss()
    for batch, (X, y, d) in enumerate(dataloader):  
        with torch.no_grad():
            yhat = model(X, d)
            y = y.reshape(-1, 1).float()
            loss = L(yhat, y)
            test_loss += loss
            acc = (torch.sum( torch.where(yhat > 0.5, 1, 0) == y) / dataloader.batch_size)
            test_accuracy += acc

    test_loss = test_loss / len(dataloader)
    test_accuracy = test_accuracy / len(dataloader)
    return test_loss.item(), test_accuracy.item()

In [13]:
def train(data_train, data_test, save_path, Model, tensorboard_name, iterations=500,
          emb_size=100, n_layers=3):
    """Train ``Model``, checkpointing after every epoch so a later call can resume.

    Args:
        data_train: dataloader passed to ``train_loop``.
        data_test: dataloader passed to ``test_loop``.
        save_path: ``pathlib.Path`` of the checkpoint file. When it exists the saved
            state is loaded and training continues from its ``epoch`` counter.
        Model: class instantiated as ``Model(emb_size, n_layers)`` when no
            checkpoint exists.
        tensorboard_name: currently unused; kept so existing calls keep working.
        iterations: total number of epochs to reach (not the number of batches).
        emb_size: embedding dimension given to ``Model`` (default 100).
        n_layers: number of layers given to ``Model`` (default 3).

    Returns:
        The trained model.
    """
    if save_path.is_file():
        # NOTE(security): the checkpoint is a pickled State object; only load files you trust.
        # NOTE(review): recent PyTorch releases default torch.load to weights_only=True,
        # which rejects pickled custom objects — confirm against the installed version.
        with save_path.open('rb') as fp:
            # map_location lets a checkpoint written on GPU be resumed on a CPU-only machine.
            state = torch.load(fp, map_location=device)
    else:
        model = Model(emb_size, n_layers).to(device)
        optimizer = torch.optim.Adam(model.parameters(), lr=1e-4)
        state = State(model, optimizer)

    for epoch in range(state.epoch, iterations):
        loss_train, acc_train = train_loop(data_train, state)
        loss_test, acc_test = test_loop(data_test, state.model)

        # Record the next epoch to run before saving so a resumed run does not repeat this one.
        state.epoch = epoch + 1
        with save_path.open("wb") as fp:
            torch.save(state, fp)

        print('Epoch: ', epoch, 'Loss train: ',loss_train, 'Acc train: ',acc_train)
        print('       ', epoch, 'Loss test: ',loss_test, 'Acc test: ',acc_test)
    print("Done!")
    return state.model

In [14]:
# Train the baseline Transformer up to 20 epochs; if the checkpoint file already exists,
# train() resumes from the epoch stored in it.
savepath = Path('./Transformer2.pt')
model = train(train_loader, test_loader, savepath, Transformer, "baseline", iterations=20)

In [33]:
class SelfAttentionResidual(nn.Module):
    """Single-head self-attention with a residual connection, then a ReLU-activated linear layer.

    Args:
        emb_size: size of the last dimension of the input and of the output.
    """

    def __init__(self, emb_size):
        super(SelfAttentionResidual, self).__init__()
        self.emb_size = emb_size
        self.out_size1 = 100  # not used by this layer; kept so the attribute still exists

        self.q1 = nn.Linear(self.emb_size, self.emb_size)
        self.k1 = nn.Linear(self.emb_size, self.emb_size)
        self.v1 = nn.Linear(self.emb_size, self.emb_size)
        self.l1 = nn.Linear(self.emb_size, self.emb_size)

    def forward(self, X, d):
        """Apply attention plus residual to a padded batch.

        Args:
            X: tensor of shape (batch, seq, emb_size).
            d: tensor with one entry per batch element giving the number of
               non-padded positions of that sequence.

        Returns:
            Tensor of shape (batch, seq, emb_size). Rows at padded query positions
            are still computed; callers should ignore them.
        """
        q = self.q1(X)
        k = self.k1(X)
        v = self.v1(X)

        # Scaled dot-product scores: divide by sqrt(key dimension) so the logits
        # do not grow with the embedding size.
        scores = torch.bmm(q, k.transpose(-2, -1)) / (self.emb_size ** 0.5)

        # Hide padded keys using the true lengths. The clamp keeps at least one
        # key visible so that an empty sequence cannot produce an all -inf row (NaN).
        lengths = d.reshape(-1, 1).clamp(min=1)
        positions = torch.arange(X.size(1), device=X.device).unsqueeze(0)
        key_is_pad = (positions >= lengths).unsqueeze(1)  # (batch, 1, seq)
        scores = scores.masked_fill(key_is_pad, float("-inf"))

        alpha = F.softmax(scores, dim=-1)

        # Residual connection: add the layer input back onto the attention output.
        f = torch.bmm(alpha, v) + X

        return F.relu(self.l1(f))

In [19]:
class TransformerResidual(nn.Module):
    """Stack of residual self-attention layers with LayerNorm between layers, then a binary classifier.

    Args:
        emb_size: size of the last dimension of the input.
        L: number of ``SelfAttentionResidual`` layers (one ``LayerNorm`` is created per layer).
    """

    def __init__(self, emb_size, L):
        super(TransformerResidual, self).__init__()
        self.attentions = nn.ModuleList()
        self.norms = nn.ModuleList()
        self.fc = nn.Linear(emb_size, 1)
        for _ in range(L):
            self.attentions.append(SelfAttentionResidual(emb_size))
            self.norms.append(nn.LayerNorm(emb_size))

    def forward(self, x, d):
        """Return one probability per sequence, shape (batch, 1).

        Args:
            x: tensor of shape (batch, seq, emb_size).
            d: tensor with one entry per batch element giving the number of
               non-padded positions of that sequence.
        """
        last = len(self.attentions) - 1
        for i, layer in enumerate(self.attentions):
            x = layer(x, d)
            # The output of the final layer is pooled without normalization.
            if i != last:
                x = self.norms[i](x)

        # Average over the real tokens only: padded positions would otherwise
        # pull the representation of short sequences towards padding values.
        lengths = d.reshape(-1, 1).clamp(min=1)
        positions = torch.arange(x.size(1), device=x.device).unsqueeze(0)
        valid = (positions < lengths).unsqueeze(-1).to(x.dtype)  # (batch, seq, 1)
        mean = (x * valid).sum(dim=1) / valid.sum(dim=1)

        return torch.sigmoid(self.fc(mean))

In [20]:
# Train the residual + LayerNorm variant up to 20 epochs, with its own checkpoint file.
savepath = Path('./Transformer_res.pt')
model = train(train_loader, test_loader, savepath, TransformerResidual, "baseline", iterations=20)

In [15]:
import math

In [16]:
class PositionalEncoding(nn.Module):
    "Position embeddings"

    def __init__(self, d_model: int, max_len: int = 5000):
        """Generate sinusoidal position embeddings.

        Args:
            d_model (int): Dimension of the embeddings to generate (even or odd).
            max_len (int, optional): Maximum text length.
                Beware: the larger this value, the worse the position embeddings will be.
        """
        super().__init__()

        pe = torch.zeros(max_len, d_model, dtype=torch.float)
        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2, dtype=torch.float) *
                             -(math.log(10000.0) / d_model))
        angles = position * div_term
        pe[:, 0::2] = torch.sin(angles)
        # For an odd d_model there is one fewer odd column than even columns,
        # so drop the surplus frequency before filling the cosine slots.
        pe[:, 1::2] = torch.cos(angles[:, :d_model // 2])
        # A buffer follows the module across devices and is saved with it, but is not trained.
        self.register_buffer('pe', pe.unsqueeze(0))

    def forward(self, x):
        """Add the position embeddings to ``x``.

        ``x`` is indexed as (batch, seq, d_model); only the first ``x.size(1)``
        positions of the table are used, so ``seq`` must not exceed ``max_len``.
        """
        x = x + self.pe[:, :x.size(1)]
        return x

In [17]:
import seaborn as sns

# Visualize how similar the position embeddings of any two positions are:
# entry (i, j) of the heatmap is the dot product of the embeddings of positions i and j.
pos_encoding = PositionalEncoding(100, max_len=MAX_LENGTH)
sns.set()
ml = MAX_LENGTH
# One matrix product computes every pairwise dot product at once
# (replaces an ml * ml Python loop of torch.dot calls).
pe_rows = pos_encoding.pe[0, :ml]
heatmap = (pe_rows @ pe_rows.T).numpy().astype(np.float64)
sns.heatmap(heatmap)

In [44]:
class TransformerPosEncoding(nn.Module):
    """Self-attention stack applied to inputs augmented with sinusoidal position embeddings.

    Args:
        emb_size: size of the last dimension of the input.
        L: number of ``SelfAttention`` layers.
        max_len: longest sequence the position table supports (default 500).
    """

    def __init__(self, emb_size, L, max_len=500):
        super(TransformerPosEncoding, self).__init__()
        self.attentions = nn.ModuleList()
        self.fc = nn.Linear(emb_size, 1)
        self.pos_encoding = PositionalEncoding(emb_size, max_len=max_len)
        for _ in range(L):
            self.attentions.append(SelfAttention(emb_size))

    def forward(self, x, d):
        """Return one probability per sequence, shape (batch, 1).

        Args:
            x: tensor of shape (batch, seq, emb_size).
            d: tensor with one entry per batch element giving the number of
               non-padded positions of that sequence.
        """
        x = self.pos_encoding(x)
        for layer in self.attentions:
            x = layer(x, d)

        # Average over the real tokens only: padded positions (which also received
        # a position embedding) would otherwise contribute to the representation.
        lengths = d.reshape(-1, 1).clamp(min=1)
        positions = torch.arange(x.size(1), device=x.device).unsqueeze(0)
        valid = (positions < lengths).unsqueeze(-1).to(x.dtype)  # (batch, seq, 1)
        mean = (x * valid).sum(dim=1) / valid.sum(dim=1)

        return torch.sigmoid(self.fc(mean))

In [46]:
# Train the positional-encoding variant up to 20 epochs, with its own checkpoint file.
savepath = Path('./Transformer_pos2.pt')
model = train(train_loader, test_loader, savepath, TransformerPosEncoding, "baseline", iterations=20)

In [20]:
class TransformerCLS(nn.Module):
    """Self-attention stack that classifies from a learned CLS token prepended to each sequence.

    Args:
        emb_size: size of the last dimension of the input.
        L: number of ``SelfAttention`` layers.
    """

    def __init__(self, emb_size, L):
        super(TransformerCLS, self).__init__()
        self.attentions = nn.ModuleList()
        self.fc = nn.Linear(emb_size, 1)
        # Learned embedding of the CLS token, shared by every sequence.
        self.cls = torch.nn.Parameter(torch.ones(emb_size))
        for _ in range(L):
            self.attentions.append(SelfAttention(emb_size))

    def forward(self, x, d):
        """Return one probability per sequence, shape (batch, 1).

        Args:
            x: tensor of shape (batch, seq, emb_size).
            d: tensor with one entry per batch element giving the number of
               non-padded positions of that sequence (CLS token not counted).
        """
        cls_tokens = self.cls.reshape(1, 1, -1).expand(x.shape[0], -1, -1)
        x = torch.cat((cls_tokens, x), dim=1)
        # Every sequence is one position longer once the CLS token is prepended.
        lengths = d + 1
        for layer in self.attentions:
            x = layer(x, lengths)
        # Classify from the CLS position (index 0) rather than averaging all
        # positions, which would make the dedicated token pointless.
        return torch.sigmoid(self.fc(x[:, 0]))

In [21]:
# Train the CLS-token variant up to 20 epochs, with its own checkpoint file.
savepath = Path('./Transformer_cls.pt')
model = train(train_loader, test_loader, savepath, TransformerCLS, "baseline", iterations=20)