# Library Import

In [1]:
import pdfplumber
import torch
import torch.nn as nn
import re
from sklearn.preprocessing import OneHotEncoder
import numpy as np

# Extract only 50 pages from the pdf and write in a .txt

In [50]:
# pdf_file = "Texte_droit.pdf"
# output_file = "5_pages.txt"

# with pdfplumber.open(pdf_file) as pdf:
#     text = ''

#     for i in range(5):
#         page = pdf.pages[i]
#         text += page.extract_text()

# with open(output_file, "w") as f:
#     f.write(text)

# Parameters

In [51]:
# It is arbitrary values
EMBEDDING_DIM = 100
HIDDEN_DIM = 256
N_LAYERS = 1
DROPOUT = 0.5
N_EPOCHS = 5
LR = 3e-4
BATCH_SIZE = 32
SEQ_LEN = 30

# Tokenization (one hot encoding)

In [135]:
with open("5_pages.txt", "r") as f:
    text = f.read()

# Split the text (I consider whitespaces, points and commas as word here) + remove the empty string ""
data = np.array(list(filter(lambda a: a != "", re.split('(\W)', text))))

# Check that the empty string is correctly removed
if len(data[data == ""]) != 0:
    raise Exception("The empty string wasn't proprely removed from the data")

n = int(0.8 * len(data))

# Create the encoder and set the categories on the training set
encoder = OneHotEncoder().fit(data.reshape(-1,1))

# Check the number of categories of the encoder is the same than the different words in the corpus
if len(encoder.categories_[0]) != len(set(data)):
    raise Exception(f"Encoder categories counts {len(encoder.categories_[0])} don't match the value of differents words {len(set(data))}")
vocab_size = len(set(data))

train_data = torch.Tensor(encoder.transform(data[:n].reshape(-1,1)).toarray())
val_data = torch.Tensor(encoder.transform(data[n:].reshape(-1,1)).toarray())

print(f"Example of train data sample: {train_data[0:5]}")
print(f"Dimention of train_data : {train_data.shape}")
print(f"Example of the inverted encoding: {encoder.inverse_transform(train_data[0:5])}")

Example of train data sample: tensor([[0., 0., 0.,  ..., 0., 0., 0.],
        [0., 1., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 1., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.]])
Dimention of train_data : torch.Size([6652, 1002])
Example of the inverted encoding: [['Official']
 [' ']
 ['Journal']
 [' ']
 ['EN']]


In [106]:
vocab_size

1002

# Model

In [133]:
class LSTM(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim, n_layers):
        super(LSTM, self).__init__()
        self.lstm = nn.LSTM(vocab_size, vocab_size + 1, n_layers, proj_size=vocab_size)

    def forward(self, x):
        x, _ = self.lstm(x)
        return x
    
lstm = LSTM(vocab_size, EMBEDDING_DIM, HIDDEN_DIM, N_LAYERS)

In [137]:
def train(model, data, n_epochs, lr, batch_size, seq_len):
    # Setup GPU related variables
    device = "cuda" if torch.cuda.is_available() else "cpu"
    print(f"device = {device}")
    torch.cuda.empty_cache()
    model.to(device)
    
    optimizer = torch.optim.Adam(model.parameters(), lr=lr)
    criterion = nn.CrossEntropyLoss()

    for epoch in range(n_epochs):
        train_losses = []
        for i in range(0, len(data) - seq_len, seq_len):
            x = data[i:i+seq_len]
            y = data[i+1:i+1+seq_len]
            x = x.to(device)
            y = y.to(device)

            optimizer.zero_grad()
            output = model(x)

            # print()
            # print(output.shape)
            # print(y.shape)

            loss = criterion(output, y)
            # print("loss done")
            train_losses.append(loss.cpu().detach())

            loss.backward()
            optimizer.step()

            if i % (20 * seq_len) == 0:
                torch.save(model, f"model_save/3eRun_model_{epoch}.pth")
                print(f"Training iteration {i} (on {len(data)}) of epoch {epoch} finished")
        
        print(f"Epoch {epoch} finished. Train loss: {np.array(train_losses).mean()}, Perplexity: {np.exp(np.array(train_losses).mean())}")

In [138]:
train(lstm, train_data, N_EPOCHS, LR, BATCH_SIZE, SEQ_LEN)

device = cpu
Training iteration 0 (on 6652) of epoch 0 finished
Training iteration 600 (on 6652) of epoch 0 finished
Training iteration 1200 (on 6652) of epoch 0 finished
Training iteration 1800 (on 6652) of epoch 0 finished
Training iteration 2400 (on 6652) of epoch 0 finished
Training iteration 3000 (on 6652) of epoch 0 finished
Training iteration 3600 (on 6652) of epoch 0 finished
Training iteration 4200 (on 6652) of epoch 0 finished
Training iteration 4800 (on 6652) of epoch 0 finished
Training iteration 5400 (on 6652) of epoch 0 finished
Training iteration 6000 (on 6652) of epoch 0 finished
Training iteration 6600 (on 6652) of epoch 0 finished
Epoch 0 finished. Train loss: 4.245669364929199, Perplexity: 69.8024673461914
Training iteration 0 (on 6652) of epoch 1 finished
Training iteration 600 (on 6652) of epoch 1 finished
Training iteration 1200 (on 6652) of epoch 1 finished
Training iteration 1800 (on 6652) of epoch 1 finished
Training iteration 2400 (on 6652) of epoch 1 finished

In [142]:
def generate_text_custom_encoder(model, encoder, prompt_text, vocab_size, max_length=100):
    # Tokeniser le texte d'entrée en fonction de votre encodage
    prompt_data = np.array(list(filter(lambda a: a != "", re.split('(\W)', prompt_text))))
    
    # Encoder le texte d'entrée
    encoded_prompt = torch.Tensor(encoder.transform(prompt_data.reshape(-1,1)).toarray())
    
    # Convertir en un format accepté par le modèle (par ex., torch.Tensor)
    input_sequence = encoded_prompt.unsqueeze(0)  # Ajouter une dimension pour le batch
    
    generated_sequence = input_sequence.clone()  # Initialiser avec le prompt

    for _ in range(max_length):
        # Passer la séquence générée au modèle pour obtenir la prédiction du mot suivant
        with torch.no_grad():
            output = model(generated_sequence)  # Assurez-vous que `model` est adapté pour ce format
        
        print(output)
        # Prendre la prédiction du prochain mot
        predicted_word_index = torch.argmax(output[:, -1, :], dim=-1).item()  # Prédire le dernier mot
        predicted_word_onehot = torch.zeros((1, vocab_size))  # Créer un vecteur one-hot vide
        predicted_word_onehot[0, predicted_word_index] = 1  # Mettre 1 à l'indice prédit
        
        # Ajouter le mot prédit à la séquence générée
        generated_sequence = torch.cat([generated_sequence, predicted_word_onehot.unsqueeze(0)], dim=1)
        
        # Si le mot prédit est un symbole de fin, vous pouvez arrêter (facultatif)
    
    # Décoder la séquence générée
    generated_sequence_np = generated_sequence.squeeze(0).numpy()  # Enlever la dimension batch
    decoded_sequence = encoder.inverse_transform(generated_sequence_np)
    
    # Retourner le texte généré
    generated_text = ''.join(decoded_sequence.flatten())
    return generated_text

# Exemple d'utilisation avec votre modèle personnalisé
prompt = "The"
my_model = torch.load("model_save/3eRun_model_4.pth")
generated_text = generate_text_custom_encoder(my_model, encoder, prompt_text=prompt, vocab_size=vocab_size, max_length=50)

print("Texte généré :")
print(generated_text)

tensor([[[ 0.1779,  0.8615,  0.0965,  ..., -0.2519, -0.3285, -0.3538]]])
tensor([[[ 0.1779,  0.8615,  0.0965,  ..., -0.2519, -0.3285, -0.3538],
         [ 0.2611,  1.0597,  0.1222,  ..., -0.3636, -0.6140, -0.5949]]])
tensor([[[ 0.1779,  0.8615,  0.0965,  ..., -0.2519, -0.3285, -0.3538],
         [ 0.2611,  1.0597,  0.1222,  ..., -0.3636, -0.6140, -0.5949],
         [ 0.2611,  1.0597,  0.1222,  ..., -0.3636, -0.6140, -0.5949]]])
tensor([[[ 0.1779,  0.8615,  0.0965,  ..., -0.2519, -0.3285, -0.3538],
         [ 0.2611,  1.0597,  0.1222,  ..., -0.3636, -0.6140, -0.5949],
         [ 0.2611,  1.0597,  0.1222,  ..., -0.3636, -0.6140, -0.5949],
         [ 0.2611,  1.0597,  0.1222,  ..., -0.3636, -0.6140, -0.5949]]])
tensor([[[ 0.1779,  0.8615,  0.0965,  ..., -0.2519, -0.3285, -0.3538],
         [ 0.2611,  1.0597,  0.1222,  ..., -0.3636, -0.6140, -0.5949],
         [ 0.2611,  1.0597,  0.1222,  ..., -0.3636, -0.6140, -0.5949],
         [ 0.2611,  1.0597,  0.1222,  ..., -0.3636, -0.6140, -0.5949]

In [78]:
test = np.zeros((1, 1002))
test[0][500] = 1

decoder = encoder.inverse_transform(test)
decoder

array([['environmental']], dtype='<U18')

# Ce que chatgpt a proposé

In [51]:
import numpy as np
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Chargement du texte
with open("50_pages.txt", "r") as f:
    text = f.read()

# Tokenisation et création d'un vocabulaire
tokenizer = Tokenizer()
tokenizer.fit_on_texts([text])
total_words = len(tokenizer.word_index) + 1  # +1 pour inclure l'index 0

# Création des séquences d'entrée
input_sequences = []
for line in text.split('\n'):
    token_list = tokenizer.texts_to_sequences([line])[0]
    for i in range(1, len(token_list)):
        n_gram_sequence = token_list[:i + 1]
        input_sequences.append(n_gram_sequence)

# Padding des séquences
max_sequence_length = max(len(x) for x in input_sequences)
input_sequences = pad_sequences(input_sequences, maxlen=max_sequence_length, padding='pre')

# Création des données d'entrée et des cibles
X, y = input_sequences[:, :-1], input_sequences[:, -1]
y = tf.keras.utils.to_categorical(y, num_classes=total_words)

# Définition du modèle LSTM
model = Sequential()
model.add(Embedding(total_words, 100, input_length=max_sequence_length - 1))  # 100 dimensions d'embeddings
model.add(LSTM(150))  # 150 unités LSTM
model.add(Dense(total_words, activation='softmax'))  # Couche de sortie

# Compilation du modèle
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

# Entraînement du modèle
model.fit(X, y, epochs=100, verbose=1)  # Ajuste le nombre d'époques selon tes besoins




Epoch 1/100
[1m 162/1039[0m [32m━━━[0m[37m━━━━━━━━━━━━━━━━━[0m [1m31s[0m 36ms/step - accuracy: 0.0675 - loss: 6.9700

KeyboardInterrupt: 