# Library Import

In [1]:
import pdfplumber
import torch
import torch.nn as nn
import re
from sklearn.preprocessing import OneHotEncoder
import numpy as np

# Extract only 50 pages from the pdf and write in a .txt

In [5]:
pdf_file = "Texte_droit.pdf"
output_file = "50_pages.txt"

with pdfplumber.open(pdf_file) as pdf:
    text = ''

    for i in range(50):
        page = pdf.pages[i]
        text += page.extract_text()

with open(output_file, "w") as f:
    f.write(text)

# Parameters

In [31]:
# It is arbitrary values
EMBEDDING_DIM = 100
HIDDEN_DIM = 256
N_LAYERS = 1
DROPOUT = 0.5
N_EPOCHS = 5
LR = 0.001
BATCH_SIZE = 32
SEQ_LEN = 1000

# Tokenization (one hot encoding)

In [32]:
with open("50_pages.txt", "r") as f:
    text = f.read()

# Split the text (I consider whitespaces, points and commas as word here) + remove the empty string ""
data = np.array(list(filter(lambda a: a != "", re.split('(\W)', text))))

# Check that the empty string is correctly removed
if len(data[data == ""]) != 0:
    raise Exception("The empty string wasn't proprely removed from the data")

n = int(0.8 * len(data))

# Create the encoder and set the categories on the training set
encoder = OneHotEncoder().fit(data.reshape(-1,1))

# Check the number of categories of the encoder is the same than the different words in the corpus
if len(encoder.categories_[0]) != len(set(data)):
    raise Exception(f"Encoder categories counts {len(encoder.categories_[0])} don't match the value of differents words {len(set(data))}")
vocab_size = len(set(data))

train_data = torch.Tensor(encoder.transform(data[:n].reshape(-1,1)).toarray())
val_data = torch.Tensor(encoder.transform(data[n:].reshape(-1,1)).toarray())

print(f"Example of train data sample: {train_data[0:5]}")
print(f"Dimention of train_data : {train_data.shape}")
print(f"Example of the inverted encoding: {encoder.inverse_transform(train_data[0:5])}")

Example of train data sample: tensor([[0., 0., 0.,  ..., 0., 0., 0.],
        [0., 1., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 1., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.]])
Dimention of train_data : torch.Size([60915, 3276])
Example of the inverted encoding: [['Official']
 [' ']
 ['Journal']
 [' ']
 ['EN']]


# Model

In [33]:
class LSTM(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim, n_layers):
        super(LSTM, self).__init__()
        self.lstm = nn.LSTM(vocab_size, vocab_size, n_layers)

    def forward(self, x):
        x, _ = self.lstm(x)
        return x
    
lstm = LSTM(vocab_size, EMBEDDING_DIM, HIDDEN_DIM, N_LAYERS)

In [34]:
def train(model, data, n_epochs, lr, batch_size, seq_len):
    # Setup GPU related variables
    device = "cuda" if torch.cuda.is_available() else "cpu"
    print(f"device = {device}")
    torch.cuda.empty_cache()
    model.to(device)
    
    optimizer = torch.optim.Adam(model.parameters(), lr=lr)
    criterion = nn.CrossEntropyLoss()

    for epoch in range(n_epochs):
        train_losses = []
        for i in range(0, len(data) - seq_len, seq_len):
            x = data[i:i+seq_len]
            y = data[i+1:i+1+seq_len]
            x = x.to(device)
            y = y.to(device)

            optimizer.zero_grad()
            output = model(x)

            loss = criterion(output, y)
            train_losses.append(loss.cpu().detach())

            loss.backward()
            optimizer.step()

            if i % (20 * seq_len) == 0:
                print(f"Training iteration {i} (on {len(data)}) of epoch {epoch} finished")
        
        print(f"Epoch {epoch} finished. Train loss: {np.array(train_losses).mean()}")

In [35]:
train(lstm, train_data, N_EPOCHS, LR, BATCH_SIZE, SEQ_LEN)

device = cpu


Training iteration 0 (on 60915) of epoch 0 finished
Training iteration 20000 (on 60915) of epoch 0 finished
Training iteration 40000 (on 60915) of epoch 0 finished
Epoch 0 finished. Train loss: 7.147104263305664
Training iteration 0 (on 60915) of epoch 1 finished
Training iteration 20000 (on 60915) of epoch 1 finished
Training iteration 40000 (on 60915) of epoch 1 finished
Epoch 1 finished. Train loss: 7.048480033874512
Training iteration 0 (on 60915) of epoch 2 finished
Training iteration 20000 (on 60915) of epoch 2 finished
Training iteration 40000 (on 60915) of epoch 2 finished
Epoch 2 finished. Train loss: 7.046833515167236
Training iteration 0 (on 60915) of epoch 3 finished


KeyboardInterrupt: 