# Library Import

In [17]:
import pdfplumber
import torch
import torch.nn as nn
import tokenizers

# Extract only 50 pages from the pdf and write in a .txt

In [4]:
pdf_file = "Texte_droit.pdf"
output_file = "50_pages.txt"

with pdfplumber.open(pdf_file) as pdf:
    text = ''

    for i in range(50):
        page = pdf.pages[i]
        text += page.extract_text()

with open(output_file, "w") as f:
    f.write(text)

# Parameters

In [45]:
# It is arbitrary values
EMBEDDING_DIM = 100
HIDDEN_DIM = 256
N_LAYERS = 2
DROPOUT = 0.5
N_EPOCHS = 5
LR = 0.001
BATCH_SIZE = 32
SEQ_LEN = 30

# Tokenization

In [33]:
with open("50_pages.txt", "r") as f:
    text = f.read()

# Tokenize words
token_words = text.split()
print(token_words[:10])

# Tokenizer characters
token_chars = list(text)
print(token_chars[:10])

# Maybe we should remove capital letters, parenthesis, etc.
vocab_size = len(set(token_chars))
print(vocab_size) 

# From https://github.com/glouppe/info8010-deep-learning/blob/master/code/gpt/gpt-v1.py
stoi = {ch:i for i, ch in enumerate(token_chars)}
itos = {i:ch for i, ch in enumerate(token_chars)}
encode = lambda s: [stoi[ch] for ch in s]
decode = lambda l: "".join([itos[i] for i in l])

data = torch.tensor(encode(text))
print(f"data size: {data.size()}, text size: {len(text)}")

n = int(0.8 * len(data))
train_data, val_data = data[:n], data[n:]

['Official', 'Journal', 'EN', 'of', 'the', 'European', 'Union', 'L', 'series', '2024/2509']
['O', 'f', 'f', 'i', 'c', 'i', 'a', 'l', ' ', 'J']
73
data size: torch.Size([225581]), text size: 225581


In [25]:
# Test encoder and decoder
print(encode("Hello"))
print(decode(encode("Hello")))

[219493, 225566, 225562, 225562, 225579]
['H', 'e', 'l', 'l', 'o']


# Model

In [46]:
class LSTM(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim, n_layers):
        super(LSTM, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, n_layers)
        self.fc = nn.Linear(hidden_dim, vocab_size)

    def forward(self, x):
        x = self.embedding(x)
        x, _ = self.lstm(x)
        x = self.fc(x)
        return x
    
lstm = LSTM(vocab_size, EMBEDDING_DIM, HIDDEN_DIM, N_LAYERS)

In [47]:
def train(model, data, n_epochs, lr, batch_size, seq_len):
    optimizer = torch.optim.Adam(model.parameters(), lr=lr)
    criterion = nn.CrossEntropyLoss()

    for epoch in range(n_epochs):
        for i in range(0, len(data) - seq_len, seq_len):
            x = data[i:i+seq_len]
            y = data[i+1:i+1+seq_len]

            optimizer.zero_grad()
            output = model(x)
            loss = criterion(output.view(-1, vocab_size), y)
            loss.backward()
            optimizer.step()

            if i % 1000 == 0:
                print(f"epoch {epoch}, loss: {loss.item()}")

In [48]:
train(lstm, train_data, N_EPOCHS, LR, BATCH_SIZE, SEQ_LEN)

c


IndexError: index out of range in self

# Ce que chatgpt a proposé

In [51]:
import numpy as np
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Chargement du texte
with open("50_pages.txt", "r") as f:
    text = f.read()

# Tokenisation et création d'un vocabulaire
tokenizer = Tokenizer()
tokenizer.fit_on_texts([text])
total_words = len(tokenizer.word_index) + 1  # +1 pour inclure l'index 0

# Création des séquences d'entrée
input_sequences = []
for line in text.split('\n'):
    token_list = tokenizer.texts_to_sequences([line])[0]
    for i in range(1, len(token_list)):
        n_gram_sequence = token_list[:i + 1]
        input_sequences.append(n_gram_sequence)

# Padding des séquences
max_sequence_length = max(len(x) for x in input_sequences)
input_sequences = pad_sequences(input_sequences, maxlen=max_sequence_length, padding='pre')

# Création des données d'entrée et des cibles
X, y = input_sequences[:, :-1], input_sequences[:, -1]
y = tf.keras.utils.to_categorical(y, num_classes=total_words)

# Définition du modèle LSTM
model = Sequential()
model.add(Embedding(total_words, 100, input_length=max_sequence_length - 1))  # 100 dimensions d'embeddings
model.add(LSTM(150))  # 150 unités LSTM
model.add(Dense(total_words, activation='softmax'))  # Couche de sortie

# Compilation du modèle
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

# Entraînement du modèle
model.fit(X, y, epochs=100, verbose=1)  # Ajuste le nombre d'époques selon tes besoins




Epoch 1/100
[1m 162/1039[0m [32m━━━[0m[37m━━━━━━━━━━━━━━━━━[0m [1m31s[0m 36ms/step - accuracy: 0.0675 - loss: 6.9700

KeyboardInterrupt: 