In [None]:
# Install the tokenizer library into the kernel's own environment (%pip, not
# !pip) and pin the version this notebook was last run with.
%pip install -q sentencepiece==0.1.96

Collecting sentencepiece
  Downloading sentencepiece-0.1.96-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.2 MB)
[?25l[K     |▎                               | 10 kB 22.1 MB/s eta 0:00:01[K     |▌                               | 20 kB 8.8 MB/s eta 0:00:01[K     |▉                               | 30 kB 7.7 MB/s eta 0:00:01[K     |█                               | 40 kB 7.1 MB/s eta 0:00:01[K     |█▍                              | 51 kB 4.1 MB/s eta 0:00:01[K     |█▋                              | 61 kB 4.6 MB/s eta 0:00:01[K     |██                              | 71 kB 4.4 MB/s eta 0:00:01[K     |██▏                             | 81 kB 4.9 MB/s eta 0:00:01[K     |██▍                             | 92 kB 3.9 MB/s eta 0:00:01[K     |██▊                             | 102 kB 4.1 MB/s eta 0:00:01[K     |███                             | 112 kB 4.1 MB/s eta 0:00:01[K     |███▎                            | 122 kB 4.1 MB/s eta 0:00:01[K     |███▌         

In [None]:
# Course requirements, installed into the kernel's own environment (%pip).
# NOTE(review): the requirements file is fetched over plain HTTP and is not
# pinned here; prefer an HTTPS URL or a vendored copy if one is available.
%pip install -r http://webia.lip6.fr/~baskiotisn/requirements-amal.txt

In [None]:
import logging

from torch.nn.modules.pooling import MaxPool1d
logging.basicConfig(level=logging.INFO)

import heapq
from pathlib import Path
import gzip

from tqdm import tqdm

import torch
import torch.nn as nn
from torch.utils.tensorboard import SummaryWriter
import sentencepiece as spm
import math
from tp8_preprocess import TextDataset

In [None]:
import datetime

In [None]:
# Use tp8_preprocess to generate the BPE vocabulary and
# the dataset in a compact format

# --- Configuration

# Vocabulary size; also selects the tokenizer model and dataset files by name
vocab_size = 1000
# Script variant (no __file__ inside a notebook): MAINDIR = Path(__file__).parent
# NOTE(review): Path('.').parent is still Path('.'), i.e. the working directory.
MAINDIR = Path('.').parent
# Load the tokenizer

tokenizer = spm.SentencePieceProcessor()
tokenizer.Load(f"wp{vocab_size}.model")
ntokens = len(tokenizer)

def loaddata(mode):
    """Load the gzip-compressed, torch-serialized split named ``mode``.

    The file is looked up in the working directory as
    ``"<mode>-<vocab_size>.pth"`` (``vocab_size`` is the notebook-level
    setting) and the deserialized object is returned as-is.
    """
    archive_name = f"{mode}-{vocab_size}.pth"
    with gzip.open(archive_name, "rb") as stream:
        dataset = torch.load(stream)
    return dataset

In [None]:
# Mini-batch sizes used when building the data loaders.
TRAIN_BATCHSIZE = 64
TEST_BATCHSIZE = 64

# Pre-processed splits read from disk by loaddata().
train = loaddata("train")
test = loaddata("test")

In [None]:
# Hold out a fixed number of training examples for validation.
# NOTE: this cell rebinds `train`, so re-running it without reloading the data
# would carve a further validation set out of the already-reduced subset.
val_size = 10000
train_size = len(train) - val_size
# Seed the split so the same examples are held out on every run; with an
# unseeded split, a run resumed from a checkpoint could validate on examples
# the restored model has already been trained on.
split_generator = torch.Generator().manual_seed(0)
train, val = torch.utils.data.random_split(
    train, [train_size, val_size], generator=split_generator
)

logging.info("Datasets: train=%d, val=%d, test=%d", train_size, val_size, len(test))
logging.info("Vocabulary size: %d", vocab_size)
# Reshuffle the training examples at every epoch; evaluation loaders keep a
# fixed order.
train_iter = torch.utils.data.DataLoader(
    train, batch_size=TRAIN_BATCHSIZE, shuffle=True, collate_fn=TextDataset.collate
)
val_iter = torch.utils.data.DataLoader(val, batch_size=TEST_BATCHSIZE, collate_fn=TextDataset.collate)
test_iter = torch.utils.data.DataLoader(test, batch_size=TEST_BATCHSIZE, collate_fn=TextDataset.collate)

INFO:root:Datasets: train=1590000, val=10000, test=359
INFO:root:Vocabulary size: 1000


In [None]:
# Width (dimension 1) of the first element of the first training batch.
# NOTE(review): presumably the padded token length of that batch only, so it
# may differ from batch to batch -- confirm against TextDataset.collate.
first_batch = next(iter(train_iter))
seq_len = first_batch[0].shape[1]
seq_len

49

In [None]:
class Classifier(torch.nn.Module):
    """1-D convolutional classifier over sequences of token ids.

    Pipeline: embedding -> Conv1d/ReLU/MaxPool1d -> Conv1d/ReLU/MaxPool1d ->
    Conv1d/ReLU/AdaptiveAvgPool1d (single channel, fixed width) -> two linear
    layers producing ``num_class`` logits.

    Args:
        seq_len: Accepted for interface compatibility but not used; the
            adaptive pooling makes the dense layers independent of the
            input length.
        num_words: Number of rows in the embedding table.
        emb_size: Dimension of each embedding vector.
        num_class: Number of output logits.
    """

    def __init__(self, seq_len, num_words, emb_size, num_class):
        super(Classifier, self).__init__()

        self.dropout = nn.Dropout(0.2)

        # Kernel width of each convolution.
        self.kernel_1 = 3
        self.kernel_2 = 3
        self.kernel_3 = 3

        # Output channels of each convolution; the last one has a single
        # channel so it can be squeezed away before the dense layers.
        self.out_1 = 64
        self.out_2 = 32
        self.out_3 = 1

        self.kernel_pool = 2
        self.stride_pool = 2

        self.dense_size_1 = 128
        self.dense_size_2 = 16

        # Index 0 is the padding token: its embedding stays at zero.
        self.embedding = nn.Embedding(num_words, emb_size, padding_idx=0)

        self.conv1 = nn.Conv1d(emb_size, self.out_1, self.kernel_1)
        self.pool1 = nn.MaxPool1d(self.kernel_pool, self.stride_pool)

        self.conv2 = nn.Conv1d(self.out_1, self.out_2, self.kernel_2)
        self.pool2 = nn.MaxPool1d(self.kernel_pool, self.stride_pool)

        # Uses kernel_3 (the original passed kernel_2, which only worked
        # because both are 3).
        self.conv3 = nn.Conv1d(self.out_2, self.out_3, self.kernel_3)
        # Resample whatever length is left to the fixed width fc1 expects.
        self.pool3 = nn.AdaptiveAvgPool1d(self.dense_size_1)

        self.fc1 = nn.Linear(self.dense_size_1, self.dense_size_2)
        self.fc2 = nn.Linear(self.dense_size_2, num_class)

    def _min_input_length(self):
        """Return the shortest sequence length the conv/pool stack accepts.

        Computed on demand from the kernel/stride attributes (rather than
        cached in ``__init__``) so that instances unpickled from older
        checkpoints, which never ran this ``__init__``, still work.
        """
        # Walk the stack backwards: conv3 must see at least kernel_3 steps.
        length = self.kernel_3
        for conv_kernel in (self.kernel_2, self.kernel_1):
            # MaxPool1d(k, s) yields floor((L - k) / s) + 1 steps.
            length = (length - 1) * self.stride_pool + self.kernel_pool
            # An unpadded Conv1d yields L - kernel + 1 steps.
            length = length + conv_kernel - 1
        return length

    def forward(self, x):
        """Map a ``(batch, length)`` tensor of token ids to class logits.

        Returns:
            Tensor of shape ``(batch, num_class)`` holding unnormalized
            logits.
        """
        # Batches shorter than the stack's minimum length would make a
        # convolution fail; right-pad them with the padding index instead.
        missing = self._min_input_length() - x.shape[1]
        if missing > 0:
            x = nn.functional.pad(x, (0, missing), value=0)

        x = self.embedding(x)
        # Conv1d expects (batch, channels, length).
        x = x.permute(0, 2, 1)
        x = self.conv1(x)
        x = torch.relu(x)
        x = self.pool1(x)

        x = self.conv2(x)
        x = torch.relu(x)
        x = self.pool2(x)

        x = self.conv3(x)
        x = torch.relu(x)
        x = self.pool3(x)

        # Drop the single remaining channel: (batch, 1, W) -> (batch, W).
        x = x.squeeze(dim=1)
        x = self.fc1(x)
        # NOTE(review): there is no non-linearity between fc1 and fc2; left
        # as-is so existing checkpoints keep producing the same outputs.
        x = self.dropout(x)
        # No dropout after fc2: zeroing output logits at random corrupts the
        # training loss without regularizing any downstream layer.
        x = self.fc2(x)
        return x

In [None]:
# Run on the GPU when PyTorch can see one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

In [None]:
def train_loop(dataloader, model, optimizer):
    """Run one optimization pass over ``dataloader``.

    Args:
        dataloader: Iterable of ``(X, y)`` batches; ``X`` is cast to long
            before being fed to the model and ``y`` holds class indices.
        model: Module mapping a batch of inputs to per-class logits.
        optimizer: Optimizer stepping ``model``'s parameters.

    Returns:
        ``(loss, accuracy)`` as Python floats, both averaged over the
        examples actually seen, so a smaller last batch is weighted
        correctly. ``(0.0, 0.0)`` when the loader yields nothing.
    """
    # Make sure dropout is active, whatever mode the model was left in.
    model.train()
    criterion = nn.CrossEntropyLoss()
    total_loss, n_correct, n_seen = 0.0, 0, 0
    for X, y in dataloader:
        X = X.to(device)
        y = y.to(device)
        logits = model(X.long())
        loss = criterion(logits, y)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        batch_len = y.size(0)
        # The criterion returns the batch mean; re-weight by the batch size.
        total_loss += loss.item() * batch_len
        n_correct += (logits.argmax(dim=1) == y).sum().item()
        n_seen += batch_len
    if n_seen == 0:
        return 0.0, 0.0
    return total_loss / n_seen, n_correct / n_seen

In [None]:
def test_loop(dataloader, model):
    test_loss, test_acc = 0, 0
    L = nn.CrossEntropyLoss()
    with torch.no_grad():
        for batch, (X, y) in enumerate(dataloader):
            X = X.to(device)
            y = y.to(device)
            yhat = model(X.long())
            loss = L(yhat, y)
            test_loss += loss.item()
            _, pred = torch.max(yhat, 1)
            test_acc += (torch.sum( pred == y) / dataloader.batch_size).item()
    return test_loss / len(dataloader), test_acc / len(dataloader)

In [None]:
class State:
    """Bundle of everything that is checkpointed between epochs.

    Attributes set on construction: ``model``, ``optimizer`` and the two
    progress counters ``epoch`` and ``iteration`` (both starting at 0).
    """

    def __init__(self, model, optim):
        # Progress counters of a fresh run.
        self.epoch = 0
        self.iteration = 0
        # Objects whose state is saved alongside the counters.
        self.model = model
        self.optimizer = optim

In [None]:
def train(data_train, data_val, data_test, save_path, Model, tensorboard_name, iterations=500):
    """Train ``Model`` with per-epoch checkpointing and TensorBoard logging.

    If ``save_path`` exists, the pickled ``State`` stored there is loaded and
    training resumes from its recorded epoch; otherwise a new model and an
    Adam optimizer are created. After every epoch the state is saved, the
    model is evaluated on ``data_val`` and the losses/accuracies are logged.

    Relies on the notebook-level names ``seq_len``, ``vocab_size``,
    ``device``, ``train_writer`` and ``val_writer``.

    Args:
        data_train: Loader passed to ``train_loop`` once per epoch.
        data_val: Loader passed to ``test_loop`` after every epoch.
        data_test: Unused; kept so existing callers keep working.
        save_path: ``pathlib.Path`` of the checkpoint file.
        Model: Class instantiated as ``Model(seq_len, vocab_size, 100, 3)``.
        tensorboard_name: Prefix of the logged scalar tags.
        iterations: Epoch index at which training stops.

    Returns:
        The trained model (``state.model``).
    """
    if save_path.is_file():
        with save_path.open('rb') as fp:
            # The checkpoint is a fully pickled State object: only load files
            # you trust. map_location lets a checkpoint written on another
            # device (e.g. GPU) be resumed on the current one.
            state = torch.load(fp, map_location=device)
    else:
        model = Model(seq_len, vocab_size, 100, 3).to(device)
        optimizer = torch.optim.Adam(model.parameters(), lr=0.0003)
        state = State(model, optimizer)

    # Checkpoints are written next to the target and then renamed, so an
    # interruption in the middle of a save never corrupts the previous one.
    tmp_path = save_path.with_name(save_path.name + ".tmp")
    for epoch in range(state.epoch, iterations):
        loss_train, acc_train = train_loop(data_train, state.model, state.optimizer)
        # Record the next epoch to run before saving, so a resumed run does
        # not repeat the epoch that just finished.
        state.epoch = epoch + 1
        with tmp_path.open("wb") as fp:
            torch.save(state, fp)
        tmp_path.replace(save_path)
        loss_val, acc_val = test_loop(data_val, state.model)

        train_writer.add_scalar(tensorboard_name+'/loss', loss_train, epoch)
        val_writer.add_scalar(tensorboard_name+'/loss', loss_val, epoch)

        train_writer.add_scalar(tensorboard_name+'/accuracy', acc_train, epoch)
        val_writer.add_scalar(tensorboard_name+'/accuracy', acc_val, epoch)

        print('Epoch:', epoch, '\n Loss val: ', loss_val, 'Loss train: ',loss_train, '\nAcc val: ',acc_val, ' Acc train: ', acc_train, '\n\n')
    print("Done!")
    return state.model

In [None]:
savepath1 = Path('./model1.pt')
# Take the timestamp once so the train and val writers of a run always share
# the same suffix (two separate now() calls can straddle a second boundary).
run_stamp = datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
train_writer = SummaryWriter("runs/train" + run_stamp)
val_writer = SummaryWriter("runs/val" + run_stamp)
try:
    model1 = train(train_iter, val_iter, test_iter, savepath1, Classifier, "Classifier", iterations=50)
finally:
    # Push buffered scalars to disk even when training is interrupted.
    train_writer.flush()
    val_writer.flush()

Epoch: 0 
 Loss val:  0.5488508377864862 Loss train:  0.5947614106072394 
Acc val:  0.7466162420382165  Acc train:  0.7192990510787313 


Epoch: 1 
 Loss val:  0.5404257083394725 Loss train:  0.5363286883478636 
Acc val:  0.7584593949044586  Acc train:  0.7673029705361456 


Epoch: 2 
 Loss val:  0.5312828710124751 Loss train:  0.5206520000907212 
Acc val:  0.7682125796178344  Acc train:  0.7771777139349542 


Epoch: 3 
 Loss val:  0.52356941286166 Loss train:  0.510523721123496 
Acc val:  0.7659235668789809  Acc train:  0.7832826839478345 


Epoch: 4 
 Loss val:  0.5351616419424676 Loss train:  0.5029902684851231 
Acc val:  0.767515923566879  Acc train:  0.7874511954596684 




KeyboardInterrupt: ignored

In [None]:
# Final evaluation on the test loader. Dropout must be disabled for
# evaluation, so switch the model to eval mode before scoring it.
model1.eval()
# Reuse the shared evaluation routine instead of a hand-copied loop; it
# returns the averaged loss and accuracy.
test_loss, test_acc = test_loop(test_iter, model1)
print(test_loss, test_acc)

0.6391860942045847 0.6510416666666666
