# Library Import

In [3]:
import pdfplumber
import torch
import torch.nn as nn
import re
from sklearn.preprocessing import OneHotEncoder
import numpy as np

# Extract only 50 pages from the pdf and write in a .txt

In [5]:
pdf_file = "Texte_droit.pdf"
output_file = "50_pages.txt"

with pdfplumber.open(pdf_file) as pdf:
    text = ''

    for i in range(50):
        page = pdf.pages[i]
        text += page.extract_text()

with open(output_file, "w") as f:
    f.write(text)

# Parameters

In [12]:
# It is arbitrary values
EMBEDDING_DIM = 100
HIDDEN_DIM = 256
N_LAYERS = 1
DROPOUT = 0.5
N_EPOCHS = 5
LR = 3e-4
BATCH_SIZE = 32
SEQ_LEN = 30

# Tokenization (one hot encoding)

In [5]:
with open("50_pages.txt", "r") as f:
    text = f.read()

# Split the text (I consider whitespaces, points and commas as word here) + remove the empty string ""
data = np.array(list(filter(lambda a: a != "", re.split('(\W)', text))))

# Check that the empty string is correctly removed
if len(data[data == ""]) != 0:
    raise Exception("The empty string wasn't proprely removed from the data")

n = int(0.8 * len(data))

# Create the encoder and set the categories on the training set
encoder = OneHotEncoder().fit(data.reshape(-1,1))

# Check the number of categories of the encoder is the same than the different words in the corpus
if len(encoder.categories_[0]) != len(set(data)):
    raise Exception(f"Encoder categories counts {len(encoder.categories_[0])} don't match the value of differents words {len(set(data))}")
vocab_size = len(set(data))

train_data = torch.Tensor(encoder.transform(data[:n].reshape(-1,1)).toarray())
val_data = torch.Tensor(encoder.transform(data[n:].reshape(-1,1)).toarray())

print(f"Example of train data sample: {train_data[0:5]}")
print(f"Dimention of train_data : {train_data.shape}")
print(f"Example of the inverted encoding: {encoder.inverse_transform(train_data[0:5])}")

Example of train data sample: tensor([[0., 0., 0.,  ..., 0., 0., 0.],
        [0., 1., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 1., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.]])
Dimention of train_data : torch.Size([61174, 3351])
Example of the inverted encoding: [['Official']
 [' ']
 ['Journal']
 [' ']
 ['EN']]


# Model

In [6]:
class LSTM(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim, n_layers):
        super(LSTM, self).__init__()
        self.lstm = nn.LSTM(vocab_size, vocab_size + 1, n_layers, proj_size= vocab_size)

    def forward(self, x):
        x, _ = self.lstm(x)
        return x
    
lstm = LSTM(vocab_size, EMBEDDING_DIM, HIDDEN_DIM, N_LAYERS)

In [14]:
def train(model, data, n_epochs, lr, batch_size, seq_len):
    optimizer = torch.optim.Adam(model.parameters(), lr=lr)
    criterion = nn.CrossEntropyLoss()

    for epoch in range(n_epochs):
        for i in range(0, len(data) - seq_len, seq_len):
            x = data[i:i+seq_len]
            y = data[i+1:i+1+seq_len]

            optimizer.zero_grad()
            output = model(x)

            loss = criterion(output, y)
            loss.backward()
            optimizer.step()

            if i % 5*seq_len == 0:
                print(f"epoch {epoch}, loss: {loss.item()}")
    
            # save the model
            if i % 50*seq_len == 0:
                torch.save(model.state_dict(), f"model_save/model_{epoch}.pth")
                print("-----------------Model saved-----------------")
                print()
        
        print(f"epoch {epoch}, loss: {loss.item()}")
        print("-----------------Epoch done-----------------")
        print()
                

In [15]:
train(lstm, train_data, N_EPOCHS, LR, BATCH_SIZE, SEQ_LEN)

epoch 0, loss: 5.226073265075684
-----------------Model saved-----------------

epoch 0, loss: 5.7220282554626465
epoch 0, loss: 3.823051929473877
epoch 0, loss: 5.728696823120117
epoch 0, loss: 4.358726501464844
epoch 0, loss: 4.87876558303833
-----------------Model saved-----------------

epoch 0, loss: 4.938359260559082
epoch 0, loss: 4.326329231262207
epoch 0, loss: 4.2346930503845215
epoch 0, loss: 4.935849189758301
epoch 0, loss: 4.060717582702637
-----------------Model saved-----------------

epoch 0, loss: 3.8767483234405518
epoch 0, loss: 4.316232204437256
epoch 0, loss: 4.686483383178711
epoch 0, loss: 3.3907575607299805
epoch 0, loss: 4.2400407791137695
-----------------Model saved-----------------

epoch 0, loss: 3.6891520023345947
epoch 0, loss: 4.320896625518799
epoch 0, loss: 3.5002126693725586
epoch 0, loss: 4.797796249389648
epoch 0, loss: 3.92010760307312
-----------------Model saved-----------------

epoch 0, loss: 3.841766834259033
epoch 0, loss: 4.967073440551758
e

KeyboardInterrupt: 

# Ce que chatgpt a proposé

In [51]:
import numpy as np
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Chargement du texte
with open("50_pages.txt", "r") as f:
    text = f.read()

# Tokenisation et création d'un vocabulaire
tokenizer = Tokenizer()
tokenizer.fit_on_texts([text])
total_words = len(tokenizer.word_index) + 1  # +1 pour inclure l'index 0

# Création des séquences d'entrée
input_sequences = []
for line in text.split('\n'):
    token_list = tokenizer.texts_to_sequences([line])[0]
    for i in range(1, len(token_list)):
        n_gram_sequence = token_list[:i + 1]
        input_sequences.append(n_gram_sequence)

# Padding des séquences
max_sequence_length = max(len(x) for x in input_sequences)
input_sequences = pad_sequences(input_sequences, maxlen=max_sequence_length, padding='pre')

# Création des données d'entrée et des cibles
X, y = input_sequences[:, :-1], input_sequences[:, -1]
y = tf.keras.utils.to_categorical(y, num_classes=total_words)

# Définition du modèle LSTM
model = Sequential()
model.add(Embedding(total_words, 100, input_length=max_sequence_length - 1))  # 100 dimensions d'embeddings
model.add(LSTM(150))  # 150 unités LSTM
model.add(Dense(total_words, activation='softmax'))  # Couche de sortie

# Compilation du modèle
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

# Entraînement du modèle
model.fit(X, y, epochs=100, verbose=1)  # Ajuste le nombre d'époques selon tes besoins




Epoch 1/100
[1m 162/1039[0m [32m━━━[0m[37m━━━━━━━━━━━━━━━━━[0m [1m31s[0m 36ms/step - accuracy: 0.0675 - loss: 6.9700

KeyboardInterrupt: 