In [10]:
import torch
import torch.nn as nn
import re
from sklearn.preprocessing import OneHotEncoder
import numpy as np
from torch.utils.data import DataLoader, Dataset
from collections import Counter

In [11]:
# Dictionary mapping contractions to their full forms
contractions_dict = {
    "he's": "he is",
    "i'm": "I am",
    "you're": "you are",
    "we've": "we have",
    "they've": "they have",
    "don't": "do not",
    "isn't": "is not",
    "it's": "it is",
    "didn't": "did not",
    "aren't": "are not",
    "let's": "let us",
    "couldn't": "could not",
    "wasn't": "was not",
    "weren't": "were not",
    "ain't": "am not",
    "i've": "I have",
    "that's": "that is",
    "i'll": "I will",
    "you'd": "you would",
    "they're": "they are",
    "i won't": "I will not",
    "can't": "cannot",
    "you've": "you have",
    "there's": "there is",
    "won't": "will not",
    "you'll": "you will",
    "doesn't": "does not",
    "must've": "must have",
    "what's": "what is",
    "we're": "we are",
    "haven't": "have not",
    "wouldn't": "would not",
    "i'd": "I would",
    "she's": "she is",
    "nobody's": "nobody is",
    "we'll": "we will",
    "they'd": "they would",
    "mustn't": "must not",
    "could've": "could have",
    "shouldn't": "should not",
    "he'll": "he will",
    "he'd": "he would",
    "hadn't": "had not",
    "where'd": "where did",
    "we'd": "we would",
}

# Function to replace contractions in the text
def replace_contractions(text, contractions_map):
    # Create a regex pattern that matches any of the contractions
    pattern = re.compile(r'\b(' + '|'.join(re.escape(key) for key in contractions_map.keys()) + r')\b')
    # Replace contractions using the dictionary
    return pattern.sub(lambda x: contractions_map[x.group()], text)

# Function to process the text file
def process_text_file(input_file, output_file, contractions_map):
    # Read the contents of the input file
    with open(input_file, 'r') as file:
        text = file.read().lower()

    # Replace contractions
    new_text = replace_contractions(text, contractions_map)

    # Write the modified text to the output file
    # with open(output_file, 'w') as file:
    #     file.write(new_text)

    return new_text

# Specify the input and output file paths
input_file = 'adele.txt'  # Replace with the actual file path
output_file = 'output.txt'

# Process the text file
modified_texte = process_text_file(input_file, output_file, contractions_dict)

print("Contractions replaced and saved to", output_file)


Contractions replaced and saved to output.txt


In [12]:
with open("adele.txt", "r") as f:
    text = f.read()

text = modified_texte

# Split the text (I consider whitespaces, points and commas as word here) + remove the empty string ""
# data = np.array(list(filter(lambda a: a != "", re.split('(\W)', text))))
data = np.array(list(filter(lambda a: a != "", map(lambda x: x.lower(), re.split('[^a-zA-Z\.]', text)))))

# Check that the empty string is correctly removed
if len(data[data == ""]) != 0:
    raise Exception("The empty string wasn't proprely removed from the data")

n = int(0.8 * len(data))

# Create the encoder and set the categories on the training set
encoder = OneHotEncoder().fit(data.reshape(-1,1))

# Check the number of categories of the encoder is the same than the different words in the corpus
if len(encoder.categories_[0]) != len(set(data)):
    raise Exception(f"Encoder categories counts {len(encoder.categories_[0])} don't match the value of differents words {len(set(data))}")

vocab_size = len(set(data))

train_data = torch.Tensor(encoder.transform(data[:n].reshape(-1,1)).toarray())
val_data = torch.Tensor(encoder.transform(data[n:].reshape(-1,1)).toarray())

print(f"Example of train data sample: {train_data[0:5]}")
print(f"Dimention of train_data : {train_data.shape}")
print(f"Example of the inverted encoding: {encoder.inverse_transform(train_data[0:5])}")

Example of train data sample: tensor([[0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.]])
Dimention of train_data : torch.Size([16073, 1371])
Example of the inverted encoding: [['looking']
 ['for']
 ['some']
 ['education']
 ['made']]


# Calcul de weight

In [13]:
word_counts = Counter(data)  # Compte les occurrences de chaque mot
total_words = sum(word_counts.values())  # Nombre total de mots dans le corpus

# 2. Calcul des poids inverses pour chaque mot
word_weights = {word: 1.0 / freq for word, freq in word_counts.items()}  # Inverse de la fréquence

# Normaliser les poids (optionnel, mais recommandé pour éviter des écarts extrêmes)
max_weight = max(word_weights.values())
word_weights = {word: weight / max_weight for word, weight in word_weights.items()}

# Afficher quelques exemples de poids
print(f"Exemple de poids : {list(word_weights.items())[:10]}")

# Exemple : transformer les poids en un vecteur aligné avec l'encodeur
word_to_idx = {word: idx for idx, word in enumerate(encoder.categories_[0])}  # Associer chaque mot à son index
weights_array = np.array([word_weights[word] for word in encoder.categories_[0]])  # Créer un tableau de poids

# Transformer les poids en tenseur PyTorch pour une utilisation dans la fonction de perte
weights_tensor = torch.Tensor(weights_array)

Exemple de poids : [('looking', 0.08333333333333333), ('for', 0.006896551724137931), ('some', 0.06666666666666667), ('education', 0.5), ('made', 0.03333333333333333), ('my', 0.002702702702702703), ('way', 0.023255813953488372), ('into', 0.045454545454545456), ('the', 0.0015822784810126582), ('night', 0.038461538461538464)]


In [14]:
# It is arbitrary values
EMBEDDING_DIM = 100
HIDDEN_DIM = 256
N_LAYERS = 2
DROPOUT = 0.5
N_EPOCHS = 10
LR = 3e-4
BATCH_SIZE = 32
SEQ_LEN = 30

In [15]:
def create_sequence(data, seq_len):
    n = len(data)
    X = []
    y = []
    for i in range(n - seq_len):
        X.append(data[i:i+seq_len])
        y.append(data[i+1:i+seq_len+1])
    return X, y

train_X, train_y = create_sequence(train_data, SEQ_LEN)

class Text(Dataset):
    def __init__(self, X, y):
        self.X = X
        self.y = y

    def __len__(self):
        return len(self.X)

    def __getitem__(self, idx):
        return self.X[idx], self.y[idx]
    
train_dataset = Text(train_X, train_y)
train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)

In [16]:
class NLP(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim, n_layers, dropout, model_type='LSTM'):
        super(NLP, self).__init__()
        self.vocab_size = vocab_size
        self.state_dim = hidden_dim
        self.num_layers = n_layers
        self.rnn_type = model_type
        self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        self.hidden_dim = hidden_dim

        if model_type == 'LSTM':
            self.nlp = nn.LSTM(vocab_size, hidden_dim, n_layers, batch_first=True, dropout=dropout)
        elif model_type == 'GRU':
            self.nlp = nn.GRU(vocab_size, hidden_dim, n_layers, batch_first=True, dropout=dropout)
        else:
            raise Exception("Model type not supported")
        self.fc = nn.Linear(hidden_dim, vocab_size)
        
    def forward(self, x, hidden):
        # x = [batch_size, seq_len, vocab_size]
        # print(x.shape)
        x, hidden = self.nlp(x)
        # x = [batch_size, seq_len, state_dim]
        # print(x.shape)
        x = self.fc(x)
        # x = [batch_size, seq_len, vocab_size]
        # print(x.shape)
        return x, hidden
    
    def init_hidden(self, batch_size):
        if self.rnn_type == 'LSTM':
            # LSTM requires both hidden state and cell state
            hidden = torch.zeros(self.num_layers, batch_size, self.hidden_dim).to(self.device)
            cell = torch.zeros(self.num_layers, batch_size, self.hidden_dim).to(self.device)
            return (hidden, cell)
        else:
            # GRU only requires the hidden state
            hidden = torch.zeros(self.num_layers, batch_size, self.hidden_dim).to(self.device)
            return hidden

In [17]:
def train(model, dataloader, n_epochs, lr, batch_size, seq_len, name):
    # Setup GPU related variables
    device = "cuda" if torch.cuda.is_available() else "cpu"
    print(f"device = {device}")
    torch.cuda.empty_cache()
    model.to(device)
    
    optimizer = torch.optim.Adam(model.parameters(), lr=lr)
    criterion = nn.CrossEntropyLoss(reduction='mean')

    model.train()

    for epoch in range(n_epochs):
        train_losses = []
        for i, (X, y) in enumerate(dataloader):
            hidden = model.init_hidden(batch_size)
            X, y = X.to(device), y.to(device)

            optimizer.zero_grad()
            output, hidden = model(X, hidden)

            # print(output.shape, y.shape)
            output = output.view(-1, vocab_size)
            y = y.view(-1, vocab_size)

            # print(output.shape, y.shape)
            loss = criterion(output, y)

            loss.backward()
            optimizer.step()

            train_losses.append(loss.item())
            if i % 100 == 0:
                print(f"Epoch {epoch}, step {i}, loss {loss.item()}")
        
        print(f"Epoch {epoch} finished. Train loss: {np.array(train_losses).mean()}, Perplexity: {np.exp(np.array(train_losses).mean())}")

    torch.save(model.state_dict(), f"model_save/{name}.pth")

In [18]:
model_type = 'GRU'
GRU_model = NLP(vocab_size, EMBEDDING_DIM, HIDDEN_DIM, N_LAYERS, DROPOUT, model_type)

train(GRU_model, train_loader, N_EPOCHS, LR, BATCH_SIZE, SEQ_LEN, f"{model_type}_model_OneHot")

device = cpu
Epoch 0, step 0, loss 7.22035551071167
Epoch 0, step 100, loss 5.502964973449707
Epoch 0, step 200, loss 5.4336934089660645
Epoch 0, step 300, loss 5.363304615020752
Epoch 0, step 400, loss 5.403769016265869
Epoch 0, step 500, loss 5.500420570373535
Epoch 0 finished. Train loss: 5.540769781249453, Perplexity: 254.87412128295293
Epoch 1, step 0, loss 5.523467540740967
Epoch 1, step 100, loss 5.390746593475342
Epoch 1, step 200, loss 5.520390033721924
Epoch 1, step 300, loss 5.386978626251221
Epoch 1, step 400, loss 5.3468337059021
Epoch 1, step 500, loss 5.40149450302124
Epoch 1 finished. Train loss: 5.410252050574557, Perplexity: 223.6879612549074
Epoch 2, step 0, loss 5.258214473724365
Epoch 2, step 100, loss 5.454404830932617
Epoch 2, step 200, loss 5.186992168426514
Epoch 2, step 300, loss 5.192572116851807
Epoch 2, step 400, loss 5.061707496643066
Epoch 2, step 500, loss 5.006148338317871
Epoch 2 finished. Train loss: 5.169679987478066, Perplexity: 175.85855153986023
E

In [19]:
model_type = 'LSTM'
LSTM_model = NLP(vocab_size, EMBEDDING_DIM, HIDDEN_DIM, N_LAYERS, DROPOUT, model_type)

train(LSTM_model, train_loader, N_EPOCHS, LR, BATCH_SIZE, SEQ_LEN, f"{model_type}_model_OneHot")

device = cpu
Epoch 0, step 0, loss 7.230099201202393
Epoch 0, step 100, loss 5.658360958099365
Epoch 0, step 200, loss 5.602745532989502
Epoch 0, step 300, loss 5.541574478149414
Epoch 0, step 400, loss 5.453268051147461
Epoch 0, step 500, loss 5.594719409942627
Epoch 0 finished. Train loss: 5.5737400339894085, Perplexity: 263.4174494267315
Epoch 1, step 0, loss 5.398975849151611
Epoch 1, step 100, loss 5.377743721008301
Epoch 1, step 200, loss 5.397225856781006
Epoch 1, step 300, loss 5.416540622711182
Epoch 1, step 400, loss 5.345911979675293
Epoch 1, step 500, loss 5.3409342765808105
Epoch 1 finished. Train loss: 5.425348689356648, Perplexity: 227.09051655768465
Epoch 2, step 0, loss 5.370179653167725
Epoch 2, step 100, loss 5.455815315246582
Epoch 2, step 200, loss 5.388798236846924
Epoch 2, step 300, loss 5.473689556121826
Epoch 2, step 400, loss 5.313849449157715
Epoch 2, step 500, loss 5.387132167816162
Epoch 2 finished. Train loss: 5.3990394689172385, Perplexity: 221.1938505637

In [20]:
def generate_text(model, encoder, start_word, num_words=10, random_sample=False):
    """
    Generate text based on the trained model output.
    
    Parameters:
    - model: The trained PyTorch model.
    - encoder: The OneHotEncoder used for encoding the words.
    - start_word: The initial word to start generating text.
    - num_words: Number of words to generate.
    - random_sample: If True, sample from the distribution instead of taking the max probability.
    
    Returns:
    - generated_text: The generated sequence of words.
    """
    model.eval()

    start_word = start_word.lower().split()
    # Initialize the generated text with the start word
    generated_words = start_word
    
    # Convert the start word to its one-hot encoded representation
    input_word = encoder.transform(np.array(start_word).reshape(-1, 1)).toarray()
    input_tensor = torch.Tensor(input_word).unsqueeze(0).to(model.device)  # Add batch dimension

    # Initialize hidden state
    hidden = model.init_hidden(batch_size=1)
    
    # Generate the specified number of words
    for _ in range(num_words):
        # Get the model output with the hidden state
        with torch.no_grad():
            output, hidden = model(input_tensor, hidden)  # Pass hidden state

        # Apply softmax to get probabilities
        probabilities = torch.softmax(output, dim=-1).squeeze().cpu().numpy()

        if len(probabilities.shape) != 2:
            probabilities = probabilities.reshape((1,-1))

        model_output = []
        for probability in probabilities:
            # Determine the next word
            if random_sample:
                next_index = np.random.choice(len(probability), p=probability)
            else:
                next_index = np.argmax(probability)

            encoding = np.zeros_like(probability)
            encoding[next_index] = 1
            # Get the corresponding word from the encoder
            next_word = encoder.inverse_transform(encoding.reshape(1, -1))[0][0]
            
            # Append the generated word to the list
            model_output.append(next_word)

        for word in model_output:
            generated_words.append(word)

        # Update the input tensor with the new word
        input_tensor = encoder.transform(np.array(model_output).reshape(-1, 1)).toarray()
        input_tensor = torch.Tensor(input_tensor).unsqueeze(0).to(model.device)  # Add batch dimension
    
    # Join the generated words into a single string
    generated_text = ' '.join(generated_words)
    return generated_text

In [21]:
# Exemple d'utilisation
generated_text = generate_text(GRU_model, encoder, start_word='So', num_words=3, random_sample=False)
print(generated_text)

generated_text = generate_text(GRU_model, encoder, start_word='I know there is no', num_words=2, random_sample=False)
print(generated_text)

generated_text = generate_text(GRU_model, encoder, start_word='I m', num_words=2, random_sample=False)
print(generated_text)

generated_text_random = generate_text(GRU_model, encoder, start_word='Baby', num_words=3, random_sample=True)
print(generated_text_random)

so i will be
i know there is no will i is a tomorrow be see a child all
i m will keep be up
baby s it help


In [22]:
# Exemple d'utilisation
generated_text = generate_text(LSTM_model, encoder, start_word='So', num_words=3, random_sample=False)
print(generated_text)

generated_text = generate_text(LSTM_model, encoder, start_word='I know there is no', num_words=2, random_sample=False)
print(generated_text)

generated_text = generate_text(LSTM_model, encoder, start_word='I m', num_words=2, random_sample=False)
print(generated_text)

generated_text_random = generate_text(LSTM_model, encoder, start_word='Baby', num_words=3, random_sample=True)
print(generated_text_random)

so i am i
i know there is no am you are gonna waste i love a wish no
i m am not i love
baby are harder got
