In [283]:
pip install numpy==1.26.4

Collecting numpy==1.26.4
  Downloading numpy-1.26.4-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (61 kB)
Downloading numpy-1.26.4-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (18.2 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m18.2/18.2 MB[0m [31m72.5 MB/s[0m eta [36m0:00:00[0m:00:01[0m
Installing collected packages: numpy
  Attempting uninstall: numpy
    Found existing installation: numpy 2.0.2
    Uninstalling numpy-2.0.2:
      Successfully uninstalled numpy-2.0.2
Successfully installed numpy-1.26.4
Note: you may need to restart the kernel to use updated packages.


In [67]:
import torch
from torch import nn
from torch.utils.data import DataLoader, Dataset
import numpy as np
from tqdm import tqdm

In [68]:
import os
import re
from string import digits

import pandas as pd
from sklearn.metrics import classification_report

from torchtext.data import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator
from torch.nn.utils.rnn import pad_sequence
from collections import Counter

In [69]:
# Run LSTM for each file and store results for hyperparameter combinations
train_dir = sorted([f for f in os.listdir("../training_data/test-and-training/training_data/") if f.endswith('xlsx')])
test_dir = sorted([f for f in os.listdir("../training_data/test-and-training/test_data/") if f.endswith('xlsx')])
remove_digits = str.maketrans('', '', digits)

In [70]:
train_dir

['lab-manual-split-combine-train-5768.xlsx',
 'lab-manual-split-combine-train-78516.xlsx',
 'lab-manual-split-combine-train-944601.xlsx']

In [71]:
test_dir

['lab-manual-split-combine-test-5768.xlsx',
 'lab-manual-split-combine-test-78516.xlsx',
 'lab-manual-split-combine-test-944601.xlsx']

In [72]:
for f in range(len(train_dir[:1])): # on ne s'intéresse que aux fichiers split-combine (le plus général, données de meilleure qualité)
    print("Experiment Number: ", f)
    name = train_dir[f].replace(".xlsx", "").replace("-train", "")
    seed = int(re.findall("\d+", name)[0])
    base_name = name.translate(remove_digits)[:-1]
    print(name), print(seed), print(base_name)

    train = pd.read_excel("../training_data/test-and-training/training_data/" + train_dir[f], index_col=False)
    test = pd.read_excel("../training_data/test-and-training/test_data/" + test_dir[f], index_col=False)

Experiment Number:  0
lab-manual-split-combine-5768
5768
lab-manual-split-combine


In [73]:
train

Unnamed: 0,index,sentence,year,label,orig_index
0,183,"Our mandate, sorry, is price inflation.",2021,1,178
1,224,"During the past several years, workers across ...",2000,1,220
2,861,The Committee's assessments will take into acc...,2004,2,812
3,59,Although real growth was likely to be moderate...,2000,1,59
4,914,The increase over the last few months in five-...,2000,1,864
...,...,...,...,...,...
1979,429,"Indeed, the members did not rule out the emerg...",2016,0,412
1980,461,"In the simplest version of his model, Bill ass...",2021,2,447
1981,797,The implication is that trend productivity and...,2004,0,774
1982,913,The hurricanes were also expected to depress p...,2000,2,863


In [74]:
test

Unnamed: 0,index,sentence,year,label,orig_index
0,871,The U. S. international trade deficit narrowed...,2010,0,822
1,123,"Based on historical experience, it seems impro...",2007,0,122
2,903,This was also an era when the principal mortga...,2004,2,876
3,875,The available data for October suggested that ...,2009,2,826
4,351,"However, we have also found that excluding vol...",2006,2,342
...,...,...,...,...,...
491,654,"Over the medium term, participants expected st...",2012,0,620
492,373,"If so, GDP growth this calendar year could be ...",2006,1,363
493,760,Several participants discussed the possible co...,2010,1,719
494,168,"Looking ahead, FOMC participants project the u...",2012,2,163


In [79]:
X_train = train['sentence'].tolist()
Y_train = torch.tensor(train['label'].to_numpy(), dtype=torch.int)

X_test = test['sentence'].tolist()
Y_test = torch.tensor(test['label'].to_numpy(), dtype=torch.int)

In [81]:
tokenizer = get_tokenizer('basic_english')
tokenized_train = [tokenizer(doc) for doc in X_train]
tokenized_test = [tokenizer(doc) for doc in X_test]

In [82]:
tok_freq = Counter([tok for doc in tokenized_train for tok in doc])
vocab = tok_freq.most_common(2000)

In [83]:
vocab_dict = {"<pad>": 0, "<oov>": 1}
index=2
for word, freq in vocab:
    vocab_dict[word] = index
    index += 1

In [84]:
def text_to_indices(text):
    tokens = tokenizer(text)
    indices = [vocab_dict.get(token, vocab_dict["<oov>"]) for token in tokens]
    return torch.tensor(indices)

In [85]:
indices_train = [text_to_indices(text) for text in X_train]
indices_test = [text_to_indices(text) for text in X_test]
padded_indices_train = pad_sequence(indices_train, batch_first=True)
padded_indices_test = pad_sequence(indices_test, batch_first=True)
padded_indices_train.shape, padded_indices_test.shape 

(torch.Size([1984, 209]), torch.Size([496, 177]))

In [86]:
padded_indices_train = padded_indices_train[:, :100] # truncate
padded_indices_test = padded_indices_test[:, :100] # truncate
padded_indices_train.shape, padded_indices_test.shape 

(torch.Size([1984, 100]), torch.Size([496, 100]))

In [87]:
my_docs_array_train = padded_indices_train
my_docs_array_test = padded_indices_test

my_labels_array_train = Y_train
my_labels_array_test = Y_test

# load dictionary of word indexes (sorted by decreasing frequency across the corpus)
word_to_index = vocab_dict

# invert mapping
index_to_word = {v: k for k, v in word_to_index.items()} ### fill the gap (use a dict comprehension) ###
input_size = my_docs_array_train.shape

In [88]:
# Le nettoyage est terminé, on peut passer au modèle

In [89]:
class AttentionWithContext(nn.Module):
    """
    Follows the work of Yang et al. [https://www.cs.cmu.edu/~diyiy/docs/naacl16.pdf]
    "Hierarchical Attention Networks for Document Classification"
    by using a context vector to assist the attention
    # Input shape
        3D tensor with shape: `(samples, steps, features)`.
    # Output shape
        2D tensor with shape: `(samples, features)`.
    """

    def __init__(self, input_shape, return_coefficients=False, bias=True):
        super(AttentionWithContext, self).__init__()
        self.return_coefficients = return_coefficients

        self.W = nn.Linear(input_shape, input_shape, bias=bias)
        self.tanh = nn.Tanh()
        self.u = nn.Linear(input_shape, 1, bias=False)

        self.init_weights()

    def init_weights(self):
        initrange = 0.1
        self.W.weight.data.uniform_(-initrange, initrange)
        self.W.bias.data.uniform_(-initrange, initrange)
        self.u.weight.data.uniform_(-initrange, initrange)

    def generate_square_subsequent_mask(self, sz):
        # do not pass the mask to the next layers
        mask = (torch.triu(torch.ones(sz, sz)) == 1).transpose(0, 1)
        mask = (
            mask.float()
            .masked_fill(mask == 0, float("-inf"))
            .masked_fill(mask == 1, float(0.0))
        )
        return mask

    def forward(self, x, mask=None):
        uit = self.W(x) # fill the gap # compute uit = W . x  where x represents ht
        uit = self.tanh(uit)
        ait = self.u(uit)
        a = torch.exp(ait)

        # apply mask after the exp. will be re-normalized next
        if mask is not None:
            a = a*mask.double()

        # in some cases especially in the early stages of training the sum may be almost zero
        # and this results in NaN's. A workaround is to add a very small positive number ε to the sum.
        eps = 1e-9
        a = a / (torch.sum(a, axis=1, keepdim=True) + eps)
        weighted_input = a * x # computes the attentional vector
        if self.return_coefficients:
            return [torch.sum(weighted_input, axis=1), a] ### [attentional vector, coefficients] ###
        else:
            return torch.sum(weighted_input, axis=1) ### attentional vector only ###

In [255]:
d = 30 # dimensionality of word embeddings
n_units = 50 # RNN layer dimensionality
drop_rate = 0.5 # dropout
mfw_idx = 2 # index of the most frequent words in the dictionary
padding_idx = 0 # 0 is for the special padding token
oov_idx = 1 # 1 is for the special out-of-vocabulary token
batch_size = 64
nb_epochs = 50
my_patience = 5 # for early stopping strategy
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using {device} device")

Using cpu device


In [256]:
class Dataset_(Dataset):
    def __init__(self, x, y):
        self.documents = x
        self.labels = y

    def __len__(self):
        return len(self.documents)

    def __getitem__(self, index):
        document = self.documents[index]
        label = self.labels[index]
        sample = {
            "document": torch.tensor(document),
            "label": torch.tensor(label),
            }
        return sample

In [257]:
def get_loader(x, y, batch_size=32):
    dataset = Dataset_(x, y)
    data_loader = DataLoader(dataset=dataset,
                            batch_size=batch_size,
                            shuffle=True,
                            pin_memory=True,
                            drop_last=True,
                            )
    return data_loader

In [258]:
class AttentionBiGRU(nn.Module):
    def __init__(self, input_shape, n_units, index_to_word, dropout=0):
        super(AttentionBiGRU, self).__init__()
        self.embedding = nn.Embedding(len(index_to_word), # vocab size
                                      d, # dimensionality of embedding space
                                      padding_idx=0)
        self.dropout = nn.Dropout(drop_rate)
        self.gru = nn.GRU(input_size=d,
                          hidden_size=n_units,
                          num_layers=1,
                          bias=True,
                          batch_first=True,
                          bidirectional=True)
        self.attention = AttentionWithContext(n_units * 2,   # the input shape for the attention layer
                                              return_coefficients=True)
        self.lin_out = nn.Linear(n_units * 2,   # the input size of the last linear layer
                                 3)
        self.preds = nn.Softmax(dim=1)
        
    def forward(self, sent_ints):
        sent_wv = self.embedding(sent_ints)
        sent_wv_dr = self.dropout(sent_wv)
        sent_wa, _ = self.gru(sent_wv_dr) # RNN layer
        sent_att_vec, word_att_coeffs = self.attention(sent_wa) # attentional vector for the sent
        sent_att_vec_dr = self.dropout(sent_att_vec)
        logits = self.lin_out(sent_att_vec_dr)
        return self.preds(logits), word_att_coeffs

In [263]:
def evaluate_accuracy(data_loader, verbose=True):
    model.eval()
    ncorrect = ntotal = 0
    losses = []
    with torch.no_grad():
        for idx, data in enumerate(data_loader):
            # inference
            output = model(data["document"].to(device))[0]
            label = data["label"].to(device)
            label = label.long()
            loss = criterion(output, label)
            # total number of examples
            ntotal += output.shape[0]
            # number of correct predictions
            predictions = torch.argmax(output, dim=1)
            ncorrect += torch.sum(predictions == label) #fill me # number of correct prediction - hint: use torch.sum
            losses.append(loss.item())
        acc = ncorrect / ntotal
        avg_loss = np.mean(losses)
        if verbose:
          print("validation loss: {:.4f}, validation accuracy: {:3.2f}".format(loss, acc*100))
        return avg_loss, acc

In [264]:
model = AttentionBiGRU(input_size, n_units, index_to_word).to(device)
model = model.double()
lr = 0.01  # learning rate
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=lr) # Adam optimizer

In [265]:
def train(x_train=my_docs_array_train,
          y_train=my_labels_array_train,
          x_test=my_docs_array_test,
          y_test=my_labels_array_test,
          word_dict=index_to_word,
          batch_size=batch_size):

    train_data = get_loader(x_train, y_train, batch_size)
    test_data = get_loader(x_test, y_test, batch_size)

    best_test_loss = np.inf
    p = my_patience # patience

    for epoch in range(1, nb_epochs + 1):
        losses = []
        accuracies = []
        with tqdm(train_data, unit="batch") as tepoch:
    
            for idx, data in enumerate(tepoch):
                tepoch.set_description(f"Epoch {epoch}")
                model.train()
                optimizer.zero_grad()
                input = data['document'].to(device)
                label = data['label'].to(device)
                label = label.long()
                output = model.forward(input)[0]
                loss = criterion(output, label) # fill the gap # compute the loss
                loss.backward()
                torch.nn.utils.clip_grad_norm_(model.parameters(), 0.5) # prevent exploding gradient
                optimizer.step()

                losses.append(loss.item())
                accuracy = torch.sum(torch.argmax(output, dim=1) == label).item() / batch_size
                accuracies.append(accuracy)
                tepoch.set_postfix(loss=sum(losses)/len(losses), accuracy=100. * sum(accuracies)/len(accuracies))

        # train_acc = evaluate_accuracy(train_data, False)
        test_loss, test_acc = evaluate_accuracy(test_data, False)
        print("===> Epoch {} Complete: Validation Loss: {:.4f}, Validation Accuracy: {:3.2f}%"
              .format(epoch, test_loss, 100.*test_acc))
        if test_loss <= best_test_loss:
            best_test_loss = test_loss
            print("Validation loss improved, saving model...")
            torch.save(model.state_dict(), './best_model.pt')
            p = 0
            print()
        else:
            p += 1
            if p==my_patience:
                print("Validation loss did not improve for {} epochs, stopping training...".format(my_patience))
                break
    print("Loading best checkpoint...")
    model.load_state_dict(torch.load('./best_model.pt'))
    model.eval()
    print('done.')

In [266]:
train()

  "document": torch.tensor(document),
  "label": torch.tensor(label),
Epoch 1: 100%|██████████| 31/31 [01:33<00:00,  3.03s/batch, accuracy=48.3, loss=1.06]


===> Epoch 1 Complete: Validation Loss: 1.0471, Validation Accuracy: 49.11%
Validation loss improved, saving model...



Epoch 2: 100%|██████████| 31/31 [01:27<00:00,  2.83s/batch, accuracy=49.4, loss=1.04]


===> Epoch 2 Complete: Validation Loss: 1.0437, Validation Accuracy: 49.78%
Validation loss improved, saving model...



Epoch 3: 100%|██████████| 31/31 [01:29<00:00,  2.88s/batch, accuracy=50.3, loss=1.02]


===> Epoch 3 Complete: Validation Loss: 1.0250, Validation Accuracy: 49.55%
Validation loss improved, saving model...



Epoch 4: 100%|██████████| 31/31 [01:27<00:00,  2.84s/batch, accuracy=50.3, loss=1.02]


===> Epoch 4 Complete: Validation Loss: 1.0090, Validation Accuracy: 48.66%
Validation loss improved, saving model...



Epoch 5: 100%|██████████| 31/31 [01:25<00:00,  2.75s/batch, accuracy=53.9, loss=0.98] 


===> Epoch 5 Complete: Validation Loss: 0.9955, Validation Accuracy: 51.79%
Validation loss improved, saving model...



Epoch 7: 100%|██████████| 31/31 [01:22<00:00,  2.67s/batch, accuracy=58.2, loss=0.949]


===> Epoch 7 Complete: Validation Loss: 1.0086, Validation Accuracy: 50.67%


Epoch 8: 100%|██████████| 31/31 [01:26<00:00,  2.79s/batch, accuracy=59.6, loss=0.932]


===> Epoch 8 Complete: Validation Loss: 0.9959, Validation Accuracy: 50.22%


Epoch 9: 100%|██████████| 31/31 [01:25<00:00,  2.77s/batch, accuracy=61.8, loss=0.912]


===> Epoch 9 Complete: Validation Loss: 0.9807, Validation Accuracy: 55.58%
Validation loss improved, saving model...



Epoch 10: 100%|██████████| 31/31 [01:23<00:00,  2.69s/batch, accuracy=64.2, loss=0.894]


===> Epoch 10 Complete: Validation Loss: 1.0010, Validation Accuracy: 52.90%


Epoch 11: 100%|██████████| 31/31 [01:23<00:00,  2.70s/batch, accuracy=67.2, loss=0.868]


===> Epoch 11 Complete: Validation Loss: 1.0025, Validation Accuracy: 51.79%


Epoch 12: 100%|██████████| 31/31 [01:25<00:00,  2.76s/batch, accuracy=68, loss=0.862]  


===> Epoch 12 Complete: Validation Loss: 0.9952, Validation Accuracy: 54.02%


Epoch 13: 100%|██████████| 31/31 [01:26<00:00,  2.78s/batch, accuracy=71.4, loss=0.829]


===> Epoch 13 Complete: Validation Loss: 1.0005, Validation Accuracy: 53.35%


Epoch 14: 100%|██████████| 31/31 [01:22<00:00,  2.67s/batch, accuracy=69.5, loss=0.846]


===> Epoch 14 Complete: Validation Loss: 1.0020, Validation Accuracy: 54.02%
Validation loss did not improve for 5 epochs, stopping training...
Loading best checkpoint...
done.


In [267]:
test

Unnamed: 0,index,sentence,year,label,orig_index
0,871,The U. S. international trade deficit narrowed...,2010,0,822
1,123,"Based on historical experience, it seems impro...",2007,0,122
2,903,This was also an era when the principal mortga...,2004,2,876
3,875,The available data for October suggested that ...,2009,2,826
4,351,"However, we have also found that excluding vol...",2006,2,342
...,...,...,...,...,...
491,654,"Over the medium term, participants expected st...",2012,0,620
492,373,"If so, GDP growth this calendar year could be ...",2006,1,363
493,760,Several participants discussed the possible co...,2010,1,719
494,168,"Looking ahead, FOMC participants project the u...",2012,2,163


In [268]:
test_data = get_loader(my_docs_array_test, my_labels_array_test, batch_size)
test_data.dataset.documents.shape

torch.Size([496, 100])

In [269]:
output = model.forward(test_data.dataset.documents)
len(output)

2

In [270]:
torch.round(output[0], decimals=2)#.shape

tensor([[0.0400, 0.2400, 0.7300],
        [0.0200, 0.0900, 0.8900],
        [0.0100, 0.0100, 0.9900],
        ...,
        [0.8900, 0.1000, 0.0100],
        [0.5100, 0.0200, 0.4700],
        [0.0000, 0.0000, 1.0000]], dtype=torch.float64,
       grad_fn=<RoundBackward1>)

In [271]:
test_data.dataset.labels

tensor([0, 0, 2, 2, 2, 2, 0, 0, 1, 2, 2, 2, 1, 0, 1, 0, 2, 1, 1, 0, 1, 0, 2, 2,
        0, 1, 0, 0, 0, 0, 1, 2, 2, 2, 2, 2, 1, 0, 0, 2, 2, 1, 0, 2, 2, 2, 2, 1,
        2, 2, 2, 1, 1, 1, 2, 1, 2, 1, 2, 2, 2, 2, 2, 0, 2, 2, 2, 1, 0, 2, 2, 2,
        0, 2, 1, 2, 1, 0, 1, 0, 2, 2, 2, 2, 2, 2, 1, 2, 2, 2, 2, 2, 0, 0, 2, 0,
        2, 0, 2, 0, 1, 2, 0, 1, 2, 2, 0, 0, 2, 2, 0, 1, 2, 2, 0, 2, 0, 1, 2, 0,
        1, 2, 0, 2, 0, 2, 0, 1, 1, 2, 0, 0, 2, 1, 2, 1, 1, 0, 0, 2, 2, 2, 1, 2,
        2, 0, 0, 2, 2, 0, 2, 2, 1, 2, 2, 2, 1, 2, 2, 2, 1, 2, 0, 2, 1, 0, 0, 2,
        0, 2, 1, 2, 1, 2, 2, 1, 1, 1, 2, 0, 0, 2, 2, 2, 0, 2, 0, 2, 1, 2, 2, 2,
        2, 0, 0, 1, 2, 2, 2, 0, 1, 1, 2, 2, 0, 2, 1, 1, 0, 2, 2, 1, 2, 2, 2, 0,
        0, 2, 2, 0, 1, 2, 1, 0, 0, 1, 1, 1, 0, 1, 2, 2, 1, 2, 2, 2, 1, 2, 0, 2,
        2, 2, 1, 1, 2, 1, 2, 2, 0, 1, 2, 2, 2, 0, 0, 2, 2, 2, 2, 2, 0, 2, 0, 2,
        2, 2, 0, 1, 1, 2, 2, 2, 1, 0, 2, 2, 2, 2, 2, 2, 1, 2, 0, 1, 0, 1, 0, 2,
        1, 2, 0, 0, 1, 0, 2, 2, 0, 0, 0,

In [272]:
torch.argmax(output[0], dim=1) # ça prédit que des 2 ?

tensor([2, 2, 2, 2, 0, 2, 0, 1, 2, 2, 2, 2, 2, 2, 0, 1, 0, 2, 2, 1, 1, 0, 2, 2,
        2, 2, 2, 0, 0, 2, 0, 2, 2, 2, 0, 2, 2, 0, 2, 2, 2, 0, 0, 2, 2, 2, 2, 2,
        2, 2, 2, 2, 0, 2, 1, 2, 2, 1, 2, 2, 1, 2, 2, 0, 1, 2, 2, 2, 1, 2, 0, 0,
        0, 2, 0, 2, 1, 1, 2, 0, 2, 2, 2, 2, 0, 2, 0, 2, 2, 2, 2, 2, 0, 0, 2, 0,
        2, 2, 2, 2, 0, 2, 2, 2, 0, 2, 2, 2, 2, 2, 0, 1, 0, 2, 0, 2, 0, 2, 2, 0,
        0, 2, 0, 2, 0, 2, 0, 1, 2, 2, 2, 0, 2, 2, 2, 0, 2, 0, 0, 0, 2, 2, 2, 2,
        2, 2, 0, 2, 0, 0, 2, 2, 0, 2, 2, 2, 2, 0, 2, 2, 2, 2, 1, 2, 0, 0, 2, 2,
        2, 2, 2, 2, 2, 0, 2, 0, 0, 2, 0, 0, 2, 2, 0, 2, 0, 2, 0, 2, 0, 2, 2, 2,
        0, 0, 0, 2, 2, 0, 2, 2, 0, 2, 2, 0, 2, 2, 0, 2, 2, 2, 2, 1, 2, 2, 2, 2,
        0, 2, 2, 2, 2, 2, 2, 0, 0, 2, 2, 1, 0, 2, 2, 1, 2, 2, 2, 2, 0, 2, 0, 0,
        2, 2, 2, 2, 2, 0, 0, 2, 2, 1, 2, 2, 2, 0, 2, 2, 0, 2, 2, 2, 0, 2, 0, 2,
        2, 0, 0, 1, 2, 2, 2, 2, 1, 0, 0, 2, 0, 0, 2, 2, 2, 2, 2, 1, 0, 2, 0, 2,
        0, 2, 2, 2, 0, 2, 2, 2, 2, 2, 0,

In [273]:
output[1].shape # les poids d'attention

torch.Size([496, 100, 1])

In [274]:
# Calculer le score F1

In [286]:
np.array([1])

array([1])

In [275]:
from sklearn.metrics import f1_score

In [290]:
true = [i.item() for i in list(test_data.dataset.labels)]
predicted = [i.item() for i in list(torch.argmax(output[0], dim=1))]

In [292]:
f1_score(true, predicted, average='weighted')

0.5092636455317361

In [295]:
print(classification_report(true, predicted))

              precision    recall  f1-score   support

           0       0.43      0.47      0.45       129
           1       0.50      0.16      0.25       122
           2       0.59      0.77      0.67       245

    accuracy                           0.54       496
   macro avg       0.51      0.47      0.46       496
weighted avg       0.53      0.54      0.51       496

