In [40]:
import pdfplumber
import torch
import torch.nn as nn
import re
from gensim.models import Word2Vec
import gensim
import numpy as np
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Parameters

In [80]:
# It is arbitrary values
EMBEDDING_DIM = 100
HIDDEN_DIM = 256
N_LAYERS = 1
DROPOUT = 0.5
N_EPOCHS = 5
LR = 0.001
BATCH_SIZE = 32
SEQ_LEN = 6

# Tokenization

In [33]:
file_path = "5_pages.txt"

with open(file_path, 'r') as file:
    text = file.readlines()

print(text)
sentences = []
for line in text:
    # print(line)

    # Remove all non-alphanumeric characters and convert to lowercase
    # clean_line = re.sub(r'[^\w\s]', '', line.lower())
    
    words = line.split()

    sentences.append(words)

# Vector size of 100, it can be modified
word2vec_model = Word2Vec(sentences, vector_size=100, window=5, min_count=1)
word2vec_model.save("word2vec100.model")

word_vector = word2vec_model.wv["the"]
print(word_vector)


['Official Journal EN\n', 'of the European Union L series\n', '2024/2509 26.9.2024\n', 'REGULATION (EU, Euratom) 2024/2509 OF THE EUROPEAN PARLIAMENT AND OF THE\n', 'COUNCIL\n', 'of 23 September 2024\n', 'on the financial rules applicable to the general budget of the Union\n', '(recast)\n', 'THE EUROPEAN PARLIAMENT AND THE COUNCIL OF THE EUROPEAN UNION,\n', 'Having regard to the Treaty on the Functioning of the European Union, and in particular Article 322(1) thereof, in\n', 'conjunction with the Treaty establishing the European Atomic Energy Community, and in particular Article 106a thereof,\n', 'Having regard to the proposal from the European Commission,\n', 'After transmission of the draft legislative act to the national parliaments,\n', 'Having regard to the opinion of the Court of Auditors (1),\n', 'Acting in accordance with the ordinary legislative procedure (2),\n', 'Whereas:\n', '(1) A number of amendments are to be made to Regulation (EU, Euratom) 2018/1046 of the European Par

In [92]:
# Convertir chaque phrase en une séquence d'indices en fonction du vocabulaire
def sentences_to_indices(sentences, word2vec_model):
    indices = []
    for sentence in sentences:
        sentence_indices = []
        for word in sentence:
            if word in word2vec_model.wv.key_to_index:  # Si le mot est dans le vocabulaire
                sentence_indices.append(word2vec_model.wv.key_to_index[word])
            else:
                print(f"Le mot '{word}' n'existe pas dans le vocabulaire.")
                sentence_indices.append(0)  # Si le mot n'existe pas dans le vocabulaire
        indices.append(sentence_indices)
    return indices

# Conversion des phrases en indices
sentence_indices = sentences_to_indices(sentences, word2vec_model)

# Padding des séquences pour qu'elles aient toutes la même longueur
max_seq_length = max(len(seq) for seq in sentence_indices)  # Longueur maximale des séquences
padded_sequences = pad_sequences(sentence_indices, maxlen=max_seq_length, padding='post', value=-1)

# Séparer les données en ensembles d'entraînement et de validation
train_data, val_data = train_test_split(padded_sequences, test_size=0.2, random_state=42)
train_data = torch.tensor(train_data, dtype=torch.long)
val_data = torch.tensor(val_data, dtype=torch.long)
print(train_data.shape, val_data.shape)

torch.Size([200, 24]) torch.Size([50, 24])


In [74]:
sentence_indices[0], word2vec_model.wv.index_to_key[sentence_indices[0][0]], word2vec_model.wv.index_to_key[sentence_indices[0][1]], word2vec_model.wv.index_to_key[sentence_indices[0][2]]

([322, 489, 488], 'Official', 'Journal', 'EN')

In [90]:
# check the word correpsond to the indice 0
word2vec_model.wv.index_to_key[0]

'the'

In [75]:
vocab_size = len(word2vec_model.wv.key_to_index)
print(vocab_size)

1175


# Model

In [94]:
class LSTM(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim, n_layers):
        super(LSTM, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)

        self.lstm = nn.LSTM(embedding_dim, hidden_dim, n_layers)

        self.fc = nn.Linear(hidden_dim, vocab_size)

    def forward(self, x):
        # print(x.shape)
        embedded = self.embedding(x)
        # print(embedded.shape)
        output, _ = self.lstm(embedded)
        # print(output.shape)
        x = self.fc(output)
        # print(x.shape)
        return x
    
lstm = LSTM(vocab_size, EMBEDDING_DIM, HIDDEN_DIM, N_LAYERS)

In [93]:
for s in train_data:
    print(s)

tensor([276,   1,   0,  84,   1, 135,  17, 947,  30,  84,   1, 135,  17,  61,
          4,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1])
tensor([897, 109, 114,   4,   5, 910,  16,   0, 909, 908, 127, 907, 302, 906,
          1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1])
tensor([170,   6,  11,  28,  83,  13,  11, 801, 800,  29,  13,  11, 799, 798,
         16, 797, 796, 795,  12,  -1,  -1,  -1,  -1,  -1])
tensor([1035,   18,   33,    3,  143,   11, 1034, 1174,    6, 1032, 1031,    0,
          45,    4,    5,  169,    3,  154,  320,   -1,   -1,   -1,   -1,   -1])
tensor([104, 966,   2, 965,   4,   5, 964,  -1,  -1,  -1,  -1,  -1,  -1,  -1,
         -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1])
tensor([  0, 547,   1, 535, 237,   1,   0,  10, 157,   2, 110,   0,   9, 239,
        226,   6,  69, 163, 576,  -1,  -1,  -1,  -1,  -1])
tensor([  0, 685,   2,  51,   1,  20,   2, 780,   2, 781,   7, 782,   3, 815,
          0,  44,  42,  39, 179, 878,  -1,  -1,  -1,  -1])
tensor([ 308,  11

In [106]:
def train(model, data, n_epochs, lr, batch_size, seq_len):
    optimizer = torch.optim.Adam(model.parameters(), lr=lr)
    criterion = nn.CrossEntropyLoss()

    for epoch in range(n_epochs):
        # iterate over the different sentences
        for sentence in data:
            print(sentence)
            # iterate over a sentence, with a window of 6
            for i in range(0, len(sentence) - seq_len, 1):
                # 6 words
                x = sentence[i:i+seq_len]
                print(x)
                # the next word
                y = sentence[i+seq_len]
                print(y)

                optimizer.zero_grad()
                output = model(x)
                print(output.shape)
                print(y.shape, y)
                # output = output.view(-1, output.size(2))
                y = y.view(-1)

                loss = criterion(output, y)
                loss.backward()
                optimizer.step()

                if i % 5*seq_len == 0:
                    print(f"epoch {epoch}, loss: {loss.item()}")
            
            break
    
    torch.save(model.state_dict(), "model.pth")

In [107]:
train(lstm, train_data, N_EPOCHS, LR, BATCH_SIZE, SEQ_LEN)

tensor([276,   1,   0,  84,   1, 135,  17, 947,  30,  84,   1, 135,  17,  61,
          4,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1])
tensor([276,   1,   0,  84,   1, 135])
tensor(17)
torch.Size([6, 1175])
torch.Size([]) tensor(17)


ValueError: Expected input batch_size (6) to match target batch_size (1).

In [60]:
import torch
import torch.nn as nn

class LSTM(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim, n_layers):
        super(LSTM, self).__init__()
        # Couche d'embedding
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        # Couche LSTM
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, n_layers, batch_first=True)
        # Couche fully-connected pour projeter les sorties vers la taille du vocabulaire
        self.fc = nn.Linear(hidden_dim, vocab_size)
        
    def forward(self, x):
        # Appliquer l'embedding sur les entrées (x)
        x = self.embedding(x)
        # Appliquer la LSTM
        x, _ = self.lstm(x)
        # Appliquer la couche fully-connected pour obtenir une prédiction de taille vocab_size
        x = self.fc(x)
        return x

# Définir les dimensions
vocab_size = 1175  # Taille de votre vocabulaire
embedding_dim = 100  # Dimension de l'embedding
hidden_dim = 128  # Dimension de l'état caché
n_layers = 2  # Nombre de couches LSTM

# Initialisation du modèle
lstm_model = LSTM(vocab_size, embedding_dim, hidden_dim, n_layers)

# Exemple d'entrée : une séquence de mots de longueur 24 dans un batch de taille 32
# Remplacer ceci par vos propres données
x = torch.randint(0, vocab_size, (32, 24))  # Batch de 32 séquences de longueur 24

# Exécution de la passe avant
output = lstm_model(x)

print("Output shape:", output.shape)  # Devrait donner (batch_size, sequence_length, vocab_size)


Output shape: torch.Size([32, 24, 1175])
