# In [1]:
import torch
import torch.nn as nn
import numpy as np

path = "Shakespeare_data.csv"

# Load the whole corpus into memory as a single string.  The context manager
# guarantees the file handle is closed as soon as the read finishes.
# NOTE(review): the path has a .csv extension but the file is consumed as raw
# text, so any CSV structure (header row, delimiters, quotes) becomes part of
# the training text -- confirm this is intended.
with open(path, 'r') as corpus_file:
    text = corpus_file.read()

# Step 1: Create character-to-index and index-to-character mappings
chars = sorted(set(text))  # Unique characters, in a deterministic order
vocab_size = len(chars)  # Vocabulary size

# Mapping characters to indices and vice versa
char_to_idx = {ch: i for i, ch in enumerate(chars)}
idx_to_char = {i: ch for i, ch in enumerate(chars)}

# Encode the text into an integer sequence (one vocabulary index per character)
encoded_text = np.array([char_to_idx[ch] for ch in text])

# Step 2: Define a function to create batches for training
def get_batches(encoded_text, batch_size, seq_length):
    """Yield ``(inputs, targets)`` tensor pairs for next-character training.

    The text is truncated to a whole number of ``batch_size * seq_length``
    chunks and laid out as ``batch_size`` contiguous rows.  Each yielded pair
    is a window of ``seq_length`` columns from those rows.

    Args:
        encoded_text: 1-D sequence of integer character indices.
        batch_size: number of parallel rows per batch (must be positive).
        seq_length: number of time steps per batch (must be positive).

    Yields:
        Two ``torch.long`` tensors of shape ``(batch_size, seq_length)``:
        the input indices and, for every input position, the index of the
        character that follows it in ``encoded_text``.

    Raises:
        ValueError: if ``batch_size`` or ``seq_length`` is not positive
            (raised when iteration starts, because this is a generator).
    """
    if batch_size <= 0 or seq_length <= 0:
        raise ValueError("batch_size and seq_length must be positive")

    encoded_text = np.asarray(encoded_text)
    total_batch_size = batch_size * seq_length
    num_batches = len(encoded_text) // total_batch_size
    if num_batches == 0:
        # Not enough text for even one full batch: yield nothing.
        return

    usable = num_batches * total_batch_size
    inputs = encoded_text[:usable]

    # Targets are the text shifted by one position.  Rolling each window on
    # its own would wrap the window's *first* character into its last target
    # slot, which is not the character that actually follows.
    targets = encoded_text[1:usable + 1]
    if len(targets) < usable:
        # No character exists after the very last input; wrap to the start
        # of the text so the arrays keep the same length.
        targets = np.append(targets, inputs[:1])

    x = inputs.reshape((batch_size, -1))  # (batch_size, total_sequence_length)
    y = targets.reshape((batch_size, -1))

    for n in range(0, x.shape[1], seq_length):
        x_batch = x[:, n:n + seq_length]
        y_batch = y[:, n:n + seq_length]
        yield (torch.tensor(x_batch, dtype=torch.long),
               torch.tensor(y_batch, dtype=torch.long))

# Step 3: Define the character-level LSTM model
class CharLSTM(nn.Module):
    """Single-layer LSTM over one-hot characters with a linear output layer.

    Args:
        vocab_size: size of the one-hot input vectors and of the output logits.
        hidden_dim: number of features in the LSTM hidden and cell states.
    """

    def __init__(self, vocab_size, hidden_dim):
        super().__init__()
        self.hidden_dim = hidden_dim
        # batch_first=True: inputs/outputs are (batch, seq_len, features).
        self.lstm = nn.LSTM(vocab_size, hidden_dim, batch_first=True)
        # Projects every hidden state to one logit per vocabulary entry.
        self.fc = nn.Linear(hidden_dim, vocab_size)

    def forward(self, x, hidden):
        """Run the LSTM over ``x`` and project every time step to logits.

        Args:
            x: float tensor of shape ``(batch, seq_len, vocab_size)``.
            hidden: ``(h_0, c_0)`` tuple as returned by :meth:`init_hidden`
                or by a previous call.

        Returns:
            ``(logits, hidden)`` where ``logits`` has shape
            ``(batch * seq_len, vocab_size)`` and ``hidden`` is the updated
            ``(h_n, c_n)`` tuple.
        """
        out, hidden = self.lstm(x, hidden)
        # Flatten batch and time so each row is one prediction position.
        out = self.fc(out.reshape(-1, self.hidden_dim))
        return out, hidden

    def init_hidden(self, batch_size):
        """Return zeroed ``(h_0, c_0)`` states for ``batch_size`` sequences.

        The states are allocated from the model's own parameters so they
        share its device and dtype; plain ``torch.zeros`` would always give
        float32 CPU tensors and fail once the model is moved or cast.
        """
        reference = next(self.parameters())
        return (reference.new_zeros(1, batch_size, self.hidden_dim),
                reference.new_zeros(1, batch_size, self.hidden_dim))

# Step 4: Define the training function
def train(model, encoded_text, epochs=50, batch_size=1, seq_length=10, lr=0.001):
    """Train ``model`` to predict the next character of ``encoded_text``.

    Uses Adam and cross-entropy loss.  The hidden state is carried from one
    batch to the next within an epoch and reset at the start of each epoch.
    Prints the mean per-batch loss after every epoch.

    Args:
        model: network exposing ``init_hidden(batch_size)`` and
            ``model(x_onehot, hidden) -> (logits, hidden)``.
        encoded_text: 1-D array of integer character indices.
        epochs: number of full passes over the text.
        batch_size: number of parallel sequences per batch.
        seq_length: number of time steps per batch.
        lr: Adam learning rate.

    Note:
        One-hot width comes from the module-level ``vocab_size``.
    """
    optimizer = torch.optim.Adam(model.parameters(), lr=lr)
    criterion = nn.CrossEntropyLoss()  # Mean loss over all positions in a batch

    model.train()
    for epoch in range(epochs):
        hidden = model.init_hidden(batch_size)  # Fresh state for every epoch
        total_loss = 0.0
        num_batches = 0

        # Iterate through batches
        for x_batch, y_batch in get_batches(encoded_text, batch_size, seq_length):
            x_onehot = nn.functional.one_hot(x_batch, num_classes=vocab_size).float()

            optimizer.zero_grad()  # Reset gradients
            output, hidden = model(x_onehot, hidden)  # Forward pass

            # Logits are (batch * seq, vocab); flatten targets to match.
            loss = criterion(output, y_batch.reshape(-1))
            total_loss += loss.item()
            num_batches += 1

            loss.backward()  # Backpropagation
            optimizer.step()  # Update weights

            # Keep the state values but cut the graph, so the next backward
            # pass does not try to reach into already-freed batches.
            hidden = tuple(h.detach() for h in hidden)

        # Each loss is already a per-batch mean, so average over the number
        # of batches (not the number of characters in the text).
        mean_loss = total_loss / num_batches if num_batches else float("nan")
        print(f"Epoch {epoch+1}/{epochs}, Loss: {mean_loss}")

# Step 5: Build the network and fit it to the encoded corpus
hidden_dim = 128  # Width of the LSTM hidden and cell states
model = CharLSTM(vocab_size=vocab_size, hidden_dim=hidden_dim)
# With batch_size=1 and seq_length=10, every epoch performs one optimizer
# step per 10 characters of text.
train(
    model,
    encoded_text,
    epochs=50,
    batch_size=1,
    seq_length=10,
    lr=0.001,
)

# Step 6: Text generation function with handling missing characters
def generate_text(model, start_str="hello", length=100, temperature=0.7):
    """Sample ``length`` characters from ``model`` after priming on ``start_str``.

    Prompt characters missing from the module-level ``char_to_idx`` are
    reported and not fed to the model; they still appear in the returned
    string because the result starts with ``start_str`` verbatim.

    Args:
        model: trained network exposing ``init_hidden`` and the
            ``model(x_onehot, hidden)`` call used during training.
        start_str: prompt used to warm up the hidden state.
        length: number of characters to sample after the prompt.
        temperature: divisor applied to the logits before softmax; values
            below 1 sharpen the distribution, values above 1 flatten it.

    Returns:
        ``start_str`` followed by the sampled characters.

    Raises:
        ValueError: if ``temperature`` is not positive, or if characters are
            requested but no prompt character is in the vocabulary (there
            would be no logits to sample the first character from).
    """
    if temperature <= 0:
        raise ValueError("temperature must be positive")

    model.eval()  # Set model to evaluation mode
    chars = list(start_str)
    hidden = model.init_hidden(1)  # Initialize hidden state
    output = None  # Logits for the next character; set by the warm-up loop

    # Inference only: skip autograd bookkeeping so no graph accumulates
    # across the generated sequence.
    with torch.no_grad():
        for ch in start_str:
            if ch not in char_to_idx:
                print(f"Character '{ch}' not found in vocabulary. Skipping.")
                continue
            x = torch.tensor([[char_to_idx[ch]]]).long()  # Shape (1, 1)
            x_onehot = nn.functional.one_hot(x, num_classes=vocab_size).float()
            output, hidden = model(x_onehot, hidden)  # Forward pass

        if output is None and length > 0:
            raise ValueError(
                "start_str must contain at least one character from the vocabulary"
            )

        # Generate characters one by one
        for _ in range(length):
            # Temperature-scaled probabilities over the vocabulary.
            output_dist = nn.functional.softmax(output / temperature, dim=1)
            top_char = torch.multinomial(output_dist, 1)[0]  # Sampled index, shape (1,)
            chars.append(idx_to_char[top_char.item()])

            # Feed the sampled character back in as a (1, 1, vocab) input.
            x_onehot = nn.functional.one_hot(top_char, num_classes=vocab_size).float().unsqueeze(0)
            output, hidden = model(x_onehot, hidden)  # Forward pass

    return ''.join(chars)  # Return generated text

# Generate some text
generated_text = generate_text(model, start_str="hello", length=200, temperature=0.7)
print("Generated Text:\n", generated_text)


# Notebook output: KeyboardInterrupt (execution of the cell was interrupted)