In [20]:
import torch
import torch.nn.functional as F

from nplm.utils import load_model
from nplm.data_setup import load_vocab
from nplm.model import Config, NeuralProbabilisticLanguageModel

In [21]:
# Compute device: use the GPU when CUDA is available, otherwise stay on the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# Vocabulary loaded from disk; used below as a word -> index mapping.
vocab = load_vocab(file_path="data/vocab.pkl")

# Hyperparameters. These are also used to build the checkpoint file name,
# so they presumably have to match the values used at training time.
vocab_size = len(vocab)
context_size, embed_size, hidden_size = 5, 60, 100
# NOTE(review): presumably toggles the direct input->output connection
# (the bias-free `W` layer in the model repr) -- confirm in nplm.model.
direct = True

In [22]:
# Bundle the hyperparameters into the model configuration object.
config = Config(
    vocab_size=vocab_size,
    embed_size=embed_size,
    hidden_size=hidden_size,
    context_size=context_size,
    direct=direct,
)
model = NeuralProbabilisticLanguageModel(config)

# Checkpoint name encodes the hyperparameters; "_direct" is appended only
# when the direct connection is enabled.
file_name = f"model_n{context_size}_h{hidden_size}_m{embed_size}" + ("_direct" if direct else "")

# Kept as the last expression so the cell still displays whatever
# load_model returns.
load_model(model, file_name=f"{file_name}.pth", device=device)

number of parameters: 6.59M


NeuralProbabilisticLanguageModel(
  (C): Embedding(14222, 60)
  (H): Linear(in_features=300, out_features=100, bias=True)
  (tanh): Tanh()
  (U): Linear(in_features=100, out_features=14222, bias=True)
  (W): Linear(in_features=300, out_features=14222, bias=False)
)

In [23]:
def sample(model, vocab, idx_to_word, initial_context, context_size=10, steps=50, temperature=1.0):
    """
    Generate text by repeatedly sampling the next word from a language model.

    Parameters:
    - model: The pre-trained language model. It is called with a LongTensor of
      shape (1, context_size) and may return logits shaped (batch, vocab) or
      (batch, seq, vocab); in the latter case only the last position is used.
    - vocab: A dictionary mapping word to index. Must contain "<UNK>".
    - idx_to_word: A dictionary mapping index to word, used to convert model outputs to text.
    - initial_context: A string (split on whitespace) or list of words to start text generation.
    - context_size: The size of the context window expected by the model.
    - steps: Number of additional words to generate.
    - temperature: Positive factor that divides the logits before softmax;
      lower values make sampling more deterministic.

    Returns:
    - A string containing the original context extended by the generated words.

    Raises:
    - ValueError: If temperature is not strictly positive.
    """
    if temperature <= 0:
        raise ValueError("temperature must be positive")

    model.eval()  # Set the model to evaluation mode
    if isinstance(initial_context, str):
        initial_context = initial_context.split()  # Split initial context string into words

    # Map words to their indices; out-of-vocabulary words become <UNK>.
    unk_idx = vocab["<UNK>"]
    context_indices = [vocab.get(word, unk_idx) for word in initial_context]

    # Left-pad the context if it is shorter than required.
    missing = context_size - len(context_indices)
    if missing > 0:
        context_indices = [vocab.get("", unk_idx)] * missing + context_indices

    # Only the most recent `context_size` tokens are ever fed to the model.
    context_indices = context_indices[-context_size:]

    device = next(model.parameters()).device
    output_words = list(initial_context)  # Start with the initial context

    with torch.inference_mode():
        for _ in range(steps):
            window = torch.tensor([context_indices], dtype=torch.long, device=device)
            logits = model(window)
            # A fixed-window model returns (batch, vocab). Slicing `[:, -1]` on
            # that would pick a single vocabulary column and make every draw
            # return index 0, so only take the last position when a sequence
            # dimension is actually present.
            if logits.dim() > 2:
                logits = logits[:, -1]
            # Use temperature to scale the logits and apply softmax to get probabilities
            probabilities = F.softmax(logits / temperature, dim=-1)
            next_token_idx = torch.multinomial(probabilities, num_samples=1).item()

            # Record the word and slide the window forward by one token.
            output_words.append(idx_to_word[next_token_idx])
            context_indices = context_indices[1:] + [next_token_idx]

    return ' '.join(output_words)

In [53]:
def sample(model, vocab, idx_to_word, initial_context, steps=50, temperature=1.0, max_resample=5, context_size=None):
    """
    Generate text using a language model, handling unknown words by resampling.

    Parameters:
        model (torch.nn.Module): The trained model. Called with a LongTensor of
            shape (1, context_size); its output is used directly as the logits
            over the vocabulary.
        vocab (dict): Vocabulary mapping of words to indices. Must contain '<UNK>'.
        idx_to_word (dict): Reverse mapping of indices to words.
        initial_context (str or list): Initial text to start generation. A
            string is split on whitespace.
        steps (int): Number of tokens to generate.
        temperature (float): Positive softmax temperature for generation.
        max_resample (int): Maximum number of sampling attempts per step to
            avoid <UNK>. At least one draw is always made; if every attempt
            yields <UNK>, the last draw is kept.
        context_size (int, optional): The number of tokens the model expects as
            input. When omitted it is inferred as
            ``model.H.in_features // model.C.embedding_dim``.

    Returns:
        str: The full initial context followed by the generated words, joined
        by single spaces. Padding used to fill the model window is not included.

    Raises:
        ValueError: If temperature is not positive, or if context_size is
            missing and cannot be inferred from the model, or is not positive.
    """
    if temperature <= 0:
        raise ValueError("temperature must be positive")

    if context_size is None:
        # Do not rely on a notebook-level global: derive the window length from
        # the embedding and hidden layers instead.
        try:
            context_size = model.H.in_features // model.C.embedding_dim
        except AttributeError as exc:
            raise ValueError(
                "context_size was not given and could not be inferred from model.C / model.H"
            ) from exc
    if context_size <= 0:
        raise ValueError("context_size must be positive")

    model.eval()
    if isinstance(initial_context, str):
        initial_context = initial_context.split()  # Split initial context string into words
    prompt_words = list(initial_context)

    unk_idx = vocab['<UNK>']

    # The model only sees the last `context_size` prompt words, left-padded
    # with the '' token (which falls back to <UNK> when '' is not in vocab).
    window_words = prompt_words[-context_size:]
    pad_count = context_size - len(window_words)
    context_indices = [vocab.get('', unk_idx)] * pad_count
    context_indices += [vocab.get(word, unk_idx) for word in window_words]

    device = next(model.parameters()).device
    # Keep the whole prompt (and no padding) in the returned text.
    output_words = list(prompt_words)

    # Always draw at least once so next_token_idx is defined even when
    # max_resample is 0 or negative.
    attempts = max(1, max_resample)

    with torch.inference_mode():
        for _ in range(steps):
            context_tensor = torch.tensor([context_indices], dtype=torch.long, device=device)
            logits = model(context_tensor)
            # Use temperature to scale the logits and apply softmax to get probabilities
            probabilities = F.softmax(logits / temperature, dim=-1)

            # Redraw while the sample is <UNK>, up to `attempts` times.
            for _attempt in range(attempts):
                next_token_idx = torch.multinomial(probabilities, num_samples=1).item()
                if next_token_idx != unk_idx:
                    break

            # Record the word and slide the window forward by one token.
            output_words.append(idx_to_word[next_token_idx])
            context_indices = context_indices[1:] + [next_token_idx]

    return ' '.join(output_words)

In [56]:
# Sampling is stochastic; fix the RNG so the continuations printed below can be
# reproduced after Restart Kernel -> Run All.
SEED = 42
torch.manual_seed(SEED)

prompt = "Lisa is a"
idx_to_word = {index: word for word, index in vocab.items()}  # Create reverse mapping

# Draw several independent continuations of the same prompt.
for i in range(5):
    generated_text = sample(model, vocab, idx_to_word, prompt.split(), steps=25, temperature=0.8)
    print(generated_text)


  Lisa is a suit , tested with resources , and it or a change he is sure i have not any time it is not to be on
  Lisa is a rural formula , the development of the large , are of the international law , and that other physically below the surface . now where
  Lisa is a toss always live in his lips . he had the answer to a sea of components for four years of the experiment in groups of
  Lisa is a constant to me the dust that had on the one side . each card a remarkable spent one half of the time to the full
  Lisa is a candidate who does not trade to whom , it was to be able to create a advice by dr. gordon a. and how his old
