In [1]:
import torch
import torch.nn as nn
from gensim.models import Word2Vec
import numpy as np
from sklearn.model_selection import train_test_split

# Parameters

In [2]:
# It is arbitrary values
EMBEDDING_DIM = 100
HIDDEN_DIM = 256
N_LAYERS = 1
DROPOUT = 0.5
N_EPOCHS = 5
LR = 0.001
BATCH_SIZE = 32
SEQ_LEN = 6

# Tokenization

In [3]:
file_path = "5_pages.txt"

with open(file_path, 'r') as file:
    text = file.readlines()

print(text)
sentences = []
for line in text:
    # print(line)

    # Remove all non-alphanumeric characters and convert to lowercase
    # clean_line = re.sub(r'[^\w\s]', '', line.lower())
    
    # TODO: faire en sorte que ca soit propre, y a des phrase d'un seul mot et c'est vraiment nul
    #       J me suis pas concentré sur ca pour le moment. J'ai aussi l'impression que la ponctuation reste dans le bail donc chelou
    
    words = line.split()
    if len(words) > 1:
        sentences.append(words)

# Vector size of 100, it can be modified
word2vec_model = Word2Vec(sentences, vector_size=100, window=5, min_count=1)
word2vec_model.save("word2vec100.model")

word_vector = word2vec_model.wv["the"]
print(word_vector)


['Official Journal EN\n', 'of the European Union L series\n', '2024/2509 26.9.2024\n', 'REGULATION (EU, Euratom) 2024/2509 OF THE EUROPEAN PARLIAMENT AND OF THE\n', 'COUNCIL\n', 'of 23 September 2024\n', 'on the financial rules applicable to the general budget of the Union\n', '(recast)\n', 'THE EUROPEAN PARLIAMENT AND THE COUNCIL OF THE EUROPEAN UNION,\n', 'Having regard to the Treaty on the Functioning of the European Union, and in particular Article 322(1) thereof, in\n', 'conjunction with the Treaty establishing the European Atomic Energy Community, and in particular Article 106a thereof,\n', 'Having regard to the proposal from the European Commission,\n', 'After transmission of the draft legislative act to the national parliaments,\n', 'Having regard to the opinion of the Court of Auditors (1),\n', 'Acting in accordance with the ordinary legislative procedure (2),\n', 'Whereas:\n', '(1) A number of amendments are to be made to Regulation (EU, Euratom) 2018/1046 of the European Par

In [4]:
def sentences_to_vectors(sentences, word2vec_model):
    """Convert the sentences to the vector learned by word2vec

    Args:
        sentences (List[List[str]]): The first list contain the lines/sentences, the second one contain the words of the sentences
        word2vec_model (Word2Vec): Word2Vec object trained on the actual corpus

    Returns:
        List[List[torch.Tensor]]: The same 2 list with the words converted to their word2Vec vector
    """
    indices = []
    for sentence in sentences:
        sentence_vectors = []
        for word in sentence: # Si le mot est dans le vocabulaire
            if word in word2vec_model.wv.key_to_index:  
                sentence_vectors.append(word2vec_model.wv[word])
            else: # Si le mot n'existe pas dans le vocabulaire
                print(f"Le mot '{word}' n'existe pas dans le vocabulaire.")
                exit(0) 
        indices.append(torch.tensor(np.array(sentence_vectors)))
    return indices

# Conversion des phrases en vecteur
sentence_vectors = sentences_to_vectors(sentences, word2vec_model)

print(f"Nb of sentences: {len(sentence_vectors)}")
print(f"Nb of words in first sentence: {len(sentence_vectors[0])}")
print(f"Embbedding size of first word: {len(sentence_vectors[0][0])}")

# Séparer les données en ensembles d'entraînement et de validation
train_vectors, val_vectors = train_test_split(sentence_vectors, test_size=0.2, random_state=42)
train_sentences, val_sentences = train_test_split(sentences, test_size=0.2, random_state=42)


def check_mapping(train_vectors, val_vectors, train_sentences, val_sentences):
    """ Check via the size that all vectors are correctly mapped to the right sentence

    Args:
        train_vectors (List[List[torch.Tensor]]): list of vector of check
        val_vectors (List[List[torch.Tensor]]): list of vector of check
        train_sentences (List[List[str]): list of words to check
        val_sentences (List[List[str]): list of words to check

    Raises:
        Exception: if the check fails
    """
    test_vectors = [train_vectors, val_vectors]
    test_sentences = [train_sentences, val_sentences]

    for i in range(len(test_vectors)):
        for index in range(len(test_vectors[i])):
            if (len(test_vectors[i][index]) != len(test_sentences[i][index])):
                raise Exception("The size of the vector isn't the same that the corresponding sentence")

check_mapping(train_vectors, val_vectors, train_sentences, val_sentences)


Nb of sentences: 244
Nb of words in first sentence: 3
Embbedding size of first word: 100


In [5]:
vocab_size = len(word2vec_model.wv.key_to_index)
print(vocab_size)

1171


# Model

In [6]:
class LSTM(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim, n_layers):
        super(LSTM, self).__init__()

        self.lstm = nn.LSTM(embedding_dim, vocab_size + hidden_dim, n_layers, proj_size= vocab_size)

    def forward(self, x):
        x, _  = self.lstm(x)
        # print(x.shape)
        return x
    
lstm = LSTM(vocab_size, EMBEDDING_DIM, HIDDEN_DIM, N_LAYERS)

In [7]:
def train(model, vectors, sentences, n_epochs, lr, batch_size, seq_len):
    optimizer = torch.optim.Adam(model.parameters(), lr=lr)
    criterion = nn.CrossEntropyLoss()

    for epoch in range(n_epochs):
        train_losses = []
        for i,sentence in enumerate(sentences):

            # Substract the last word as we didn't include the eos token (so we couldn't predict it)
            x = vectors[i][:-1]
            
            # Create the one hot encoding of the correct prediction
            y = torch.tensor(np.zeros((len(x), vocab_size)))
            
            # Start the sequence to the first word as we didn't include the sos token (so we couldn't predict it)
            for j in range(1, len(x)):
                y[0][word2vec_model.wv.get_index(sentence[j])] = 1.0

            optimizer.zero_grad()
            #print(sentence)
            #print(x.shape)
            output = model(x)

            loss = criterion(output, y)
            train_losses.append(loss.cpu().detach())
            loss.backward()
            optimizer.step()

            if i % 100 == 0:
                print(f"epoch {epoch}, loss: {loss.item()}")
        
        print(f"Epoch {epoch} finished. Train loss: {np.array(train_losses).mean()}, Perplexity: {np.exp(np.array(train_losses).mean())}")
        torch.save(model.state_dict(), "model.pth")

In [8]:
train(lstm, train_vectors, train_sentences, N_EPOCHS, LR, BATCH_SIZE, SEQ_LEN)

  result = _VF.lstm(input, hx, self._flat_weights, self.bias, self.num_layers,


epoch 0, loss: 6.47585654258728
epoch 0, loss: 6.334130525588989
Epoch 0 finished. Train loss: 5.246514070590775, Perplexity: 189.90312440720695
epoch 1, loss: 5.62911456823349
epoch 1, loss: 5.964600460869925
Epoch 1 finished. Train loss: 4.7996141758030815, Perplexity: 121.46354490235473
epoch 2, loss: 5.689537823200226
epoch 2, loss: 5.888705355780465
Epoch 2 finished. Train loss: 4.756524247452977, Perplexity: 116.34085032871937
epoch 3, loss: 5.6953896681467695
epoch 3, loss: 5.859191349574497
Epoch 3 finished. Train loss: 4.7386457578307555, Perplexity: 114.27933494583912
epoch 4, loss: 5.675683856010437
epoch 4, loss: 5.851117679050991
Epoch 4 finished. Train loss: 4.725826280243064, Perplexity: 112.8236838510239
