# TP: Next Letter Prediction

Dans ce TP, vous allez entraîner un modèle pour prédire le caractère suivant.

## Objectifs du TP

1. Modifier le prétraitement du dataset
2. Convertir des tokens en id afin de créer un vocabulaire
3. Se familiariser avec la couche d'embedding
4. Modifier un réseau pour l'entraîner d'une autre manière

## Guide

### 1

Modifier le code existant afin de donner deux lettres en entrée au modèle :

Exemple : Le chat

```
'l', 'e' --> model --> ' '
'e', ' ' --> model --> 'c'
' ', 'c' --> model --> 'h'
```

### 2

Même chose que la question 1, mais en passant 3 lettres en entrée :

Exemple : Le chat

```
'l', 'e', ' ' --> model --> 'c'
'e', ' ', 'c' --> model --> 'h'
```

### 3

Maintenant, nous allons essayer une autre technique d'entraînement. L'idée sera de masquer aléatoirement une des 3 lettres en entrée et le modèle devra deviner la lettre masquée.

Exemple : Le chat

```
'l', '[MASK]', ' ' --> model --> 'e'
'e', ' ', '[MASK]' --> model --> 'c'
```

Pour cela, ajoutez un nouveau token d'id 0 (ou une autre valeur) afin de représenter le token de masque.

### 4

Tentez d'augmenter la dimension de `embedding_size` et ajoutez des couches `torch.nn.Linear` supplémentaires.
Cela vous permettra de voir comment la loss évolue.

# Code de base

Load the dataset

In [None]:
import pandas as pd

# Read the reviews CSV and keep only its first 5000 rows
df = pd.read_csv("../CM/data/disney_review/train.csv").head(5000)
# Extract the review texts as a plain Python list
reviews = df["Review_Text"].tolist()

Prepare the data

In [None]:
import itertools as it
import re


def sliding_window(txt):
    """
    Yield every pair of adjacent characters of a text, from left to right.

    Parameters:
    txt (str): The text to walk through.

    Yields:
    tuple: ``(txt[i], txt[i + 1])`` for each index ``i`` that has a successor. Nothing is yielded when the
    text holds fewer than two characters.

    Example:
    >>> list(sliding_window("hello"))
    [('h', 'e'), ('e', 'l'), ('l', 'l'), ('l', 'o')]
    """
    # Pairing the text with itself shifted by one position gives the adjacent couples
    yield from zip(txt, txt[1:])


# Matches every run of characters that is neither a letter, a digit nor a space
non_alnum = re.compile('[^a-zA-Z0-9 ]+')
# Lowercase each review, strip the unwanted characters, then flatten the
# character pairs of all reviews into one single list
window = list(
    it.chain.from_iterable(
        sliding_window(non_alnum.sub('', review.lower())) for review in reviews
    )
)

# Total number of character pairs
print(len(window))
# Show the first 5 pairs
window[:5]

Create the letter-to-id mapping

In [None]:
import numpy as np

# Build the vocabulary from BOTH positions of every pair: a character that only ever
# occurs as the last character of a review never shows up in first position, and would
# otherwise raise a KeyError when the output characters are encoded below.
# Ids follow the order of first appearance among the input characters; characters seen
# only as outputs are appended afterwards, so existing ids are left unchanged.
vocab = dict.fromkeys(pair[0] for pair in window)
vocab.update(dict.fromkeys(pair[1] for pair in window))
mapping = {c: i for i, c in enumerate(vocab)}
# Encode the input char of every pair
integers_in = np.array([mapping[w[0]] for w in window])
# Encode the output char to predict for every pair
integers_out = np.array([mapping[w[1]] for w in window])

print("Shape of input", integers_in.shape)
print("Input example", integers_in[0], integers_out[0])
print("Show generate mapping\n", mapping)

Create the dataset class

In [None]:
import torch
import torch.utils.data as data


class NextLetterDataset(data.Dataset):
    """
    Dataset pairing each encoded input character with the encoded character to predict.

    Attributes:
        integers_in: Indexable sequence of input character ids.
        integers_out: Indexable sequence of target character ids, aligned with ``integers_in``.

    Methods:
        __len__(): Number of (input, target) pairs, taken from ``integers_in``.
        __getitem__(idx): The idx-th pair as two tensors.

    Example usage:
        integers_in = [1, 2, 3]
        integers_out = [7, 8, 9]
        dataset = NextLetterDataset(integers_in, integers_out)
        print(len(dataset))  # Output: 3
        print(dataset[0])  # Output: (tensor(1), tensor(7))
    """

    def __init__(self, integers_in, integers_out):
        self.integers_in = integers_in
        self.integers_out = integers_out

    def __len__(self):
        # The dataset holds one sample per input id
        return len(self.integers_in)

    def __getitem__(self, idx):
        # The sample and its label are returned together as a tuple;
        # the label is explicitly converted to a long tensor
        sample = torch.tensor(self.integers_in[idx])
        target = torch.tensor(self.integers_out[idx], dtype=torch.long)
        return sample, target

Define the model

In [None]:
import torch.nn.functional as F


# Next-letter model: embedding lookup followed by a single linear classifier
class NextLetterPrediction(torch.nn.Module):
    """
    Neural network that scores every character of the vocabulary as the next letter.

    Attributes:
    - embedding: torch.nn.Embedding turning a character id into a vector of size embedding_size.
    - fc: torch.nn.Linear projecting an embedding onto vocab_size scores.

    Methods:
    - __init__(self, vocab_size, embedding_size):
        Creates the two layers.

        Parameters:
            - vocab_size: An integer, the number of distinct characters.
            - embedding_size: An integer, the dimension of each character embedding.

    - forward(self, x):
        Runs the embedding, a ReLU, then the linear layer.

        Parameters:
            x: The input tensor of character ids, of shape (batch_size).

        Returns:
            The output tensor of shape (batch_size, vocab_size).
    """

    def __init__(self, vocab_size, embedding_size):
        super().__init__()
        self.embedding = torch.nn.Embedding(vocab_size, embedding_size)
        self.fc = torch.nn.Linear(embedding_size, vocab_size)

    def forward(self, x):
        """
        Compute the next-letter scores for a batch of characters.

        Args:
            x (torch.Tensor): Input char ids, of shape (batch_size).

        Returns:
            torch.Tensor: Scores over the vocabulary, of shape (batch_size, vocab_size).
        """
        # shape: (batch_size, embedding_size)
        embedded = self.embedding(x)
        hidden = F.relu(embedded)
        # shape: (batch_size, vocab_size)
        return self.fc(hidden)

In [None]:
from torch import nn

# A 2-dimensional embedding so that each character can be drawn on an X/Y plot
model = NextLetterPrediction(len(mapping), 2)

# Wrap the encoded pairs in the dataset and serve them in shuffled batches of 128
dataset = NextLetterDataset(integers_in=integers_in, integers_out=integers_out)
trainloader = data.DataLoader(dataset=dataset, shuffle=True, batch_size=128)

# Loss function and optimizer
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.AdamW(params=model.parameters(), lr=0.01)

Train loop

In [None]:
epoches = 1

# Pick the GPU when one is available, otherwise fall back to the CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)
# Move the model to the chosen device and switch it to training mode
model.to(device).train()
# Iterate over the whole dataset `epoches` times
for epoch in range(epoches):

    running_loss = 0.0
    for i, (inputs, labels) in enumerate(trainloader):
        # Each batch is an (inputs, labels) pair; both have shape: (batch_size)
        inputs, labels = inputs.to(device), labels.to(device)

        # Clear the gradients left over from the previous step
        optimizer.zero_grad()

        # Forward pass and loss, backward pass, then parameter update
        loss = criterion(model(inputs), labels)
        loss.backward()
        optimizer.step()

        # Accumulate the loss and report its mean once every 2000 mini-batches
        running_loss += loss.item()
        if (i + 1) % 2000 == 0:
            print(f'[{epoch + 1}, {i + 1:5d}] loss: {running_loss / 2000:.3f}')
            running_loss = 0.0

print('Finished Training')

In [ ]:
# `plt` was used below without ever being imported; import it in this cell,
# as the other cells do for their own dependencies.
import matplotlib.pyplot as plt

# Column vector holding every token id, shape (vocab_size, 1)
idx_to_calc = list(mapping.values())
idx_to_calc = np.array([idx_to_calc]).T

# Reverse lookup: token id -> character
translator = {v: k for k, v in mapping.items()}
# Embedding of every id, shape (vocab_size, 1, embedding_size); only the first two
# embedding dimensions are plotted below
preds = model.embedding(torch.tensor(idx_to_calc).to(device)).cpu().detach().numpy()
# The points are fully transparent (alpha=0) — presumably they are only drawn so that
# the axes get scaled to the data, the characters themselves being the visible marks
plt.scatter(preds[:, 0, 0], preds[:, 0, 1], alpha=0)
for i, idx in enumerate(idx_to_calc):
    # Write each character at the position of its embedding
    plt.text(preds[i, 0, 0], preds[i, 0, 1], translator[idx[0]])

In [None]:
from transformers import BertTokenizer

# Load the pretrained tokenizer registered under the name 'bert-base-uncased'
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
# Evaluate the tokenizer's mask token; as the last expression of a notebook cell
# its value is shown as the cell output
tokenizer.mask_token