In [151]:
import torch
import torchvision
import torchvision.transforms as transforms
import torch.nn as nn
import torch.optim as optim
import requests
import os
import numpy as np
import pandas as pd
from torch.utils.data import DataLoader, Dataset  
from torch.nn import functional as F

In [97]:
def download_shakespeare_data(folder_path, file_name="shakespeare.txt"):

    if not os.path.exists(folder_path):
        os.makedirs(folder_path)

    url = "https://www.gutenberg.org/files/100/100-0.txt"

    response = requests.get(url)

    file_path = os.path.join(folder_path, file_name)
    with open(file_path, "w", encoding='utf-8') as file:
        file.write(response.text)
    
    print(f"Shakespeare dataset has been downloaded and saved to {file_path}")

folder_path = "./Data/shakespeare"

download_shakespeare_data(folder_path)

Shakespeare dataset has been downloaded and saved to ./Data/shakespeare\shakespeare.txt


In [142]:
# import string

# allowed_chars = set(string.ascii_letters + string.digits + string.punctuation + string.whitespace)

# with open('./Data/shakespeare/shakespeare.txt', 'r',  encoding='utf-8') as file:
#     text = file.read()

# # Filter the dataset
# cleaned_text = ''.join(c for c in text if c in allowed_chars)

# # Write the cleaned dataset to a new file
# with open('./Data/shakespeare/cleaned_shakespeare.txt', 'w') as file:
#     file.write(cleaned_text)

# print("Dataset cleaned and saved to 'cleaned_shakespeare.txt'")


Dataset cleaned and saved to 'cleaned_shakespeare.txt'


In [146]:
# Load the dataset
file_path = "./Data/shakespeare/shakespeare.txt"
with open(file_path, "r", encoding='utf-8') as file:
    text = file.read()

text = text[:100000]
print(f"Loaded {len(text)} characters of Shakespeare's text.")

chars = sorted(list(set(text)))
char_to_idx = {char: idx for idx, char in enumerate(chars)}
idx_to_char = {idx: char for idx, char in enumerate(chars)}

print(f"Number of unique characters: {len(chars)}")
print(f"Sample mapping: {list(char_to_idx.items())[:10]}")

Loaded 100000 characters of Shakespeare's text.
Number of unique characters: 80
Sample mapping: [('\n', 0), (' ', 1), ('!', 2), ('(', 3), (')', 4), ('*', 5), (',', 6), ('-', 7), ('.', 8), ('0', 9)]


In [99]:
text_indices = [char_to_idx[char] for char in text]

# Create sequences and targets
sequence_length = 100
sequences = []
targets = []

for i in range(0, len(text_indices) - sequence_length):
    seq = text_indices[i:i + sequence_length]
    target = text_indices[i + 1:i + sequence_length + 1]
    sequences.append(seq)
    targets.append(target)

sequences = np.array(sequences)
targets = np.array(targets)

print(f"Number of sequences: {len(sequences)}")

Number of sequences: 99900


In [100]:
sequences = torch.tensor(sequences, dtype=torch.long)
targets = torch.tensor(targets, dtype=torch.long)

print(f"Shape of sequences tensor: {sequences.shape}")
print(f"Shape of targets tensor: {targets.shape}")

Shape of sequences tensor: torch.Size([99900, 100])
Shape of targets tensor: torch.Size([99900, 100])


In [101]:
class ShakeSphereDataset(Dataset):
    
    def __init__(self, sequences, targets):
        self.sequences = sequences
        self.targets = targets

    def __len__(self):
        return len(self.sequences)
    
    def __getitem__(self, index):
        return self.sequences[index], self.targets[index]


shakesphereDF = ShakeSphereDataset(sequences=sequences, targets=targets)
dataloader = DataLoader(dataset = shakesphereDF, batch_size = 64, shuffle = True)

for seq, tgt in dataloader:
    print(f"Sequence batch shape: {seq.shape}")
    print(f"Target batch shape: {tgt.shape}")
    break

Sequence batch shape: torch.Size([64, 100])
Target batch shape: torch.Size([64, 100])


In [102]:
class LstmModel(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim, num_layers):
        super(LstmModel, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, num_layers, batch_first=True)
        self.fc = nn.Linear(hidden_dim, vocab_size)
    
    def forward(self, x):
        x = self.embedding(x)
        x, _ = self.lstm(x)
        x = self.fc(x)
        return x

    
# Parameters
vocab_size = len(chars)
embedding_dim = 128
hidden_dim = 256
num_layers = 2

model = LstmModel(vocab_size, embedding_dim, hidden_dim, num_layers)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

print(model)

LstmModel(
  (embedding): Embedding(80, 128)
  (lstm): LSTM(128, 256, num_layers=2, batch_first=True)
  (fc): Linear(in_features=256, out_features=80, bias=True)
)


In [103]:
num_epochs = 20  # You can adjust the number of epochs
device = 'cuda' if torch.cuda.is_available() else 'cpu'
model = model.to(device)

for epoch in range(num_epochs):
    model.train()
    total_loss = 0
    for batch in dataloader:
        inputs, targets = batch
        inputs, targets = inputs.to(device), targets.to(device)
        
        # Forward pass
        outputs = model(inputs)  # (batch_size, sequence_length, vocab_size)
        
        # Reshape outputs to (batch_size * sequence_length, vocab_size)
        outputs = outputs.view(-1, vocab_size)
        
        # Reshape targets to (batch_size * sequence_length)
        targets = targets.reshape(-1)
        
        # Check shapes
        
        # Compute the loss
        loss = criterion(outputs, targets)
        
        # Backward pass and optimization
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        
        total_loss += loss.item()
    
    average_loss = total_loss / len(dataloader)
    print(f'Epoch [{epoch+1}/{num_epochs}], Loss: {average_loss:.4f}')

Epoch [1/20], Loss: 1.4414
Epoch [2/20], Loss: 0.6550
Epoch [3/20], Loss: 0.2705
Epoch [4/20], Loss: 0.1861
Epoch [5/20], Loss: 0.1629
Epoch [6/20], Loss: 0.1503
Epoch [7/20], Loss: 0.1424
Epoch [8/20], Loss: 0.1366
Epoch [9/20], Loss: 0.1322
Epoch [10/20], Loss: 0.1290
Epoch [11/20], Loss: 0.1264
Epoch [12/20], Loss: 0.1238
Epoch [13/20], Loss: 0.1219
Epoch [14/20], Loss: 0.1203
Epoch [15/20], Loss: 0.1189
Epoch [16/20], Loss: 0.1173
Epoch [17/20], Loss: 0.1165
Epoch [18/20], Loss: 0.1153
Epoch [19/20], Loss: 0.1143
Epoch [20/20], Loss: 0.1135


In [105]:
# Saving the model
torch.save(model.state_dict(), './Model/lstm_shakespeare.pth')

In [154]:
def generate_text(model, start_text, char_to_idx, idx_to_char, length=100, temperature=1.0):
    model.eval()
    input_sequence = torch.tensor([char_to_idx[c] for c in start_text if c in char_to_idx], dtype=torch.long).unsqueeze(0).to(device)
    generated_text = start_text
    
    for _ in range(length):
        output = model(input_sequence)

        output = output[:, -1, :] / temperature
        probabilities = F.softmax(output, dim=-1).squeeze()
        
        predicted_idx = torch.multinomial(probabilities, 1).item()

        predicted_char = idx_to_char[predicted_idx]
        
        generated_text += predicted_char
        
        predicted_idx_tensor = torch.tensor([predicted_idx], dtype=torch.long).unsqueeze(0).to(device)
        input_sequence = torch.cat([input_sequence, predicted_idx_tensor], dim=1)
        input_sequence = input_sequence[:, 1:]
    
    return generated_text

# Load the model if needed
model.load_state_dict(torch.load('./Model/lstm_shakespeare.pth'))
model.to(device)

# Generate text
start_text = "Oh world i am waiting still"
print(len(start_text))
generated_text = generate_text(model, start_text, char_to_idx, idx_to_char, length=200)
print(generated_text)

27
Oh world i am waiting still,

After a forwall of my side,

And those that said I could not love you dearer,

Yet then my judgement knew no reason why,

My most full flame should afterwards burn clearer,

But reckoning time, who
