In [1]:
import torch
import torch.nn as nn
from torch.utils.data import TensorDataset, DataLoader
import nltk

# Function to load data from a file
def load_data(file_path):
    with open(file_path, 'r', encoding='utf-8') as file:
        text = file.read()
    return text

# Function for tokenization
def tokenize_text(text, use_nltk=False):
    if use_nltk:
        nltk.download('punkt')
        return nltk.word_tokenize(text)
    else:
        return text.split()

# Function to build a vocabulary from tokens
def build_vocab(tokens):
    vocab = set(tokens)
    token_to_index = {token: idx for idx, token in enumerate(vocab)}
    index_to_token = {idx: token for token, idx in token_to_index.items()}
    return token_to_index, index_to_token

# Function to convert tokens to indices
def tokens_to_indices(tokens, token_to_index):
    return [token_to_index[token] for token in tokens]

# Function to create sequences for training
def create_sequences(token_indices, seq_length):
    sequences = [token_indices[i:i + seq_length] for i in range(len(token_indices) - seq_length)]
    return sequences

# Function to prepare training data
def prepare_training_data(sequences, seq_length):
    # Ensure that inputs and targets are of the same length
    inputs = [sequence[:-1] for sequence in sequences]  # Exclude the last token for input
    targets = [sequence[1:] for sequence in sequences]  # Exclude the first token for target
    return inputs, targets

# Function to convert data to PyTorch tensors
def to_tensors(inputs, targets):
    input_tensors = torch.tensor(inputs, dtype=torch.long)
    target_tensors = torch.tensor(targets, dtype=torch.long)
    return input_tensors, target_tensors

# LSTM Model Definition
class TokenLSTM(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_size, num_layers=1):
        super(TokenLSTM, self).__init__()
        self.vocab_size= vocab_size
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.lstm = nn.LSTM(embedding_dim, hidden_size, num_layers, batch_first=True)
        self.linear = nn.Linear(hidden_size, vocab_size)

    def forward(self, sequences):
        embedded = self.embedding(sequences)
        lstm_out, _ = self.lstm(embedded)
        out = self.linear(lstm_out)
        return out

# Function to Initialize the Model
def init_model(vocab_size, embedding_dim, hidden_size, num_layers=1):
    model = TokenLSTM(vocab_size, embedding_dim, hidden_size, num_layers)
    return model

# Function to Create DataLoader
def create_data_loader(input_tensors, target_tensors, batch_size=64):
    dataset = TensorDataset(input_tensors, target_tensors)
    data_loader = DataLoader(dataset, batch_size=batch_size, shuffle=True)
    return data_loader

def train_model(model, data_loader, learning_rate, num_epochs, device):
    print("Entering train_model function....")
    model.to(device)  # Move the model to the specified device (GPU/CPU)
    criterion = nn.CrossEntropyLoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

    for epoch in range(num_epochs):
        model.train()  # Set the model to training mode
        total_loss = 0
        for batch_idx, (inputs, targets) in enumerate(data_loader):
            inputs, targets = inputs.to(device), targets.to(device)  # Move data to device

            optimizer.zero_grad()
            outputs = model(inputs)
            loss = criterion(outputs.view(-1, model.vocab_size), targets.view(-1))
            loss.backward()
            optimizer.step()
            total_loss += loss.item()

            if batch_idx % 100 == 0:  # Print progress every 100 batches
                print(f'Epoch {epoch+1}/{num_epochs}, Batch {batch_idx}, Loss: {loss.item()}')

        average_loss = total_loss / len(data_loader)
        print(f'Epoch [{epoch+1}/{num_epochs}], Average Loss: {average_loss}')


In [None]:
# Check for GPU availability
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Training on: {device}")

# Load the data
file_path = '/content/wiki.train.raw'
text_data = load_data(file_path)

# Tokenize the text
tokens = tokenize_text(text_data, use_nltk=True)

# Build the vocabulary
token_to_index, index_to_token = build_vocab(tokens)

# Convert tokens to indices
token_indices = tokens_to_indices(tokens, token_to_index)

# Define the sequence length
seq_length = 30

# Create sequences
sequences = create_sequences(token_indices, seq_length)

# Prepare training data
inputs, targets = prepare_training_data(sequences, seq_length)

# Convert to PyTorch tensors
input_tensors, target_tensors = to_tensors(inputs, targets)

# Print tensor shapes for verification
print(input_tensors.shape)
print(target_tensors.shape)
print("......................................................................")

# Model Hyperparameters
vocab_size = len(token_to_index)  # Length of the vocabulary
embedding_dim = 256
hidden_size = 256
num_layers = 2
learning_rate = 0.001
num_epochs = 5
batch_size = 128
# Initialize the Model
lstm_model = init_model(vocab_size, embedding_dim, hidden_size, num_layers)

print('model defined.....................')

# Create DataLoader
data_loader = create_data_loader(input_tensors, target_tensors, batch_size)

print("Training Starts......................................")
# Train the Model
train_model(lstm_model, data_loader, learning_rate, num_epochs, device)

# Saving the model parameters
model_save_path = 'trained_lstm_model.pth'
torch.save(lstm_model.state_dict(), model_save_path)
print(f"Model saved to {model_save_path}")

Training on: cuda


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


torch.Size([2098745, 29])
torch.Size([2098745, 29])
......................................................................
model defined.....................
Training Starts......................................
Entering train_model function....
Epoch 1/5, Batch 0, Loss: 11.243497848510742
Epoch 1/5, Batch 100, Loss: 7.512975215911865
Epoch 1/5, Batch 200, Loss: 7.55238151550293
Epoch 1/5, Batch 300, Loss: 7.330526351928711
Epoch 1/5, Batch 400, Loss: 7.219995975494385
Epoch 1/5, Batch 500, Loss: 7.007105350494385
Epoch 1/5, Batch 600, Loss: 6.93166446685791
Epoch 1/5, Batch 700, Loss: 6.930108547210693
Epoch 1/5, Batch 800, Loss: 6.726434707641602
Epoch 1/5, Batch 900, Loss: 6.649255752563477
Epoch 1/5, Batch 1000, Loss: 6.682747840881348
Epoch 1/5, Batch 1100, Loss: 6.574718952178955
Epoch 1/5, Batch 1200, Loss: 6.618342876434326
Epoch 1/5, Batch 1300, Loss: 6.396847248077393
Epoch 1/5, Batch 1400, Loss: 6.353071212768555
Epoch 1/5, Batch 1500, Loss: 6.446681022644043
Epoch 1/5, Batc

In [9]:
torch.cuda.empty_cache()

In [None]:
import pickle


def load_model(model_path, vocab_size, embedding_dim, hidden_size, num_layers):
    # Initialize the model
    lstm_model = init_model(vocab_size, embedding_dim, hidden_size, num_layers)
    # Load the trained model parameters
    lstm_model.load_state_dict(torch.load(model_path))
    lstm_model.eval()  # Set the model to evaluation mode
    return lstm_model

def tokenize_input(input_text, token_to_index):
    # Tokenize the input text and convert to indices
    tokens = tokenize_text(input_text, use_nltk=True)
    token_indices = [token_to_index.get(token, 0) for token in tokens]  # Unknown tokens as 0
    return token_indices

def predict_next_tokens(model, input_indices, num_predictions, vocab_size):
    model.eval()  # Ensure model is in evaluation mode
    predictions = []
    input_tensor = torch.tensor(input_indices).unsqueeze(0)  # Add batch dimension

    for _ in range(num_predictions):
        with torch.no_grad():
            output = model(input_tensor)
            last_token_logits = output[0, -1, :]
            predicted_token_id = torch.argmax(last_token_logits, dim=-1).item()
            predictions.append(predicted_token_id)
            input_tensor = torch.cat([input_tensor[0], torch.tensor([predicted_token_id])]).unsqueeze(0)

    return predictions

# Model Hyperparameters (should match training configuration)
vocab_size = 76538  # Replace with your actual vocab size
embedding_dim = 256
hidden_size = 256
num_layers = 2
model_path = 'trained_lstm_model.pth'

# Load token_to_index and index_to_token
with open('token_to_index.pkl', 'rb') as f:
    token_to_index = pickle.load(f)

with open('index_to_token.pkl', 'rb') as f:
    index_to_token = pickle.load(f)

# Load the trained model
lstm_model = load_model(model_path, vocab_size, embedding_dim, hidden_size, num_layers)

# Example input text
# input_text = "Senjō no Valkyria 3 : Unrecorded"
input_text = "Kennedy is the president of: "

input_indices = tokenize_input(input_text, token_to_index)  # token_to_index from your training script

# Predict the next 5 tokens
num_predictions = 10
predicted_indices = predict_next_tokens(lstm_model, input_indices, num_predictions, vocab_size)

# Convert indices to tokens
predicted_tokens = [index_to_token[idx] for idx in predicted_indices]  # index_to_token from your training script

print("Predicted tokens:", predicted_tokens)
