In [None]:
from google.colab import userdata
import os

# Copy the Kaggle API credentials from Colab's secret store into the process
# environment, where the `kaggle` command used below can pick them up.
for secret_name in ("KAGGLE_KEY", "KAGGLE_USERNAME"):
    os.environ[secret_name] = userdata.get(secret_name)

In [None]:
# Download into /content explicitly: a later cell moves /content/poetry.zip, which
# would not exist if this cell were re-run after the working directory changed.
!kaggle datasets download -d paultimothymooney/poetry -p /content

Dataset URL: https://www.kaggle.com/datasets/paultimothymooney/poetry
License(s): CC0-1.0


In [None]:
# -p: succeed even when the directory already exists, so the cell is safe to re-run.
!mkdir -p /content/poetry

In [None]:
# Move the downloaded archive into the dataset folder.
!mv /content/poetry.zip /content/poetry/

In [None]:
# Change the notebook's working directory so the relative `unzip poetry.zip` in the next cell resolves.
%cd /content/poetry/

/content/poetry


In [None]:
# -o: overwrite already-extracted files without asking, so re-running this cell
# cannot stall on unzip's interactive "replace?" prompt.
!unzip -o poetry.zip

Archive:  poetry.zip
  inflating: Kanye_West.txt          
  inflating: Lil_Wayne.txt           
  inflating: adele.txt               
  inflating: al-green.txt            
  inflating: alicia-keys.txt         
  inflating: amy-winehouse.txt       
  inflating: beatles.txt             
  inflating: bieber.txt              
  inflating: bjork.txt               
  inflating: blink-182.txt           
  inflating: bob-dylan.txt           
  inflating: bob-marley.txt          
  inflating: britney-spears.txt      
  inflating: bruce-springsteen.txt   
  inflating: bruno-mars.txt          
  inflating: cake.txt                
  inflating: dickinson.txt           
  inflating: disney.txt              
  inflating: dj-khaled.txt           
  inflating: dolly-parton.txt        
  inflating: dr-seuss.txt            
  inflating: drake.txt               
  inflating: eminem.txt              
  inflating: janisjoplin.txt         
  inflating: jimi-hendrix.txt        
  inflating: johnny-cash.txt 

In [None]:
import os
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np
import random
from collections import Counter
from torch.utils.data import Dataset, DataLoader

# Pick the compute device: the GPU when CUDA is available, otherwise the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

# Load all text files from a folder
def load_text_files(folder_path):
    """Read every ``.txt`` file in ``folder_path`` and return the lower-cased contents.

    Files are visited in sorted name order so the returned list (and anything
    derived from it, such as the token stream) is identical on every run;
    ``os.listdir`` on its own makes no ordering guarantee.

    Args:
        folder_path: Directory to scan (sub-directories are not descended into).

    Returns:
        list[str]: One lower-cased string per ``.txt`` file, in sorted
        file-name order. Empty when the folder holds no ``.txt`` files.

    Raises:
        OSError: If the folder cannot be listed or a file cannot be opened.
        UnicodeDecodeError: If a file is not valid UTF-8.
    """
    texts = []
    for file_name in sorted(os.listdir(folder_path)):
        file_path = os.path.join(folder_path, file_name)
        # Skip other entries (e.g. the zip archive) and any directory whose
        # name happens to end in ".txt", which open() could not read.
        if not file_name.endswith(".txt") or not os.path.isfile(file_path):
            continue
        with open(file_path, 'r', encoding='utf-8') as f:
            texts.append(f.read().lower())
    return texts

# Tokenize text into words
def tokenize_text(texts):
    """Split each text on whitespace and return all words as one flat list.

    Args:
        texts: Iterable of strings.

    Returns:
        list[str]: The words of every text, concatenated in input order.
    """
    return [word for document in texts for word in document.split()]

# Prepare dataset
class LyricsDataset(Dataset):
    """Dataset of word windows for next-word prediction.

    Each stored sequence is a list of words; all words but the last form the
    model input and the last word is the prediction target.
    """

    def __init__(self, sequences, word_to_idx):
        """Keep references to the word windows and the word -> index lookup."""
        self.sequences = sequences
        self.word_to_idx = word_to_idx

    def __len__(self):
        """Number of word windows."""
        return len(self.sequences)

    def __getitem__(self, idx):
        """Return ``(input_ids, target_id)`` for window ``idx`` as int64 tensors.

        ``input_ids`` is 1-D and holds the indices of every word except the
        last; ``target_id`` is a 0-D tensor with the index of the last word.
        """
        lookup = self.word_to_idx
        encoded = [lookup[token] for token in self.sequences[idx]]
        context_ids = torch.tensor(encoded[:-1], dtype=torch.long)
        target_id = torch.tensor(encoded[-1], dtype=torch.long)
        return context_ids, target_id

# LSTM Model
class LyricsLSTM(nn.Module):
    """Next-word predictor: embedding -> stacked LSTM -> linear layer over the vocabulary."""

    def __init__(self, vocab_size, embedding_dim, hidden_dim, num_layers):
        """Build the three layers.

        Args:
            vocab_size: Number of distinct words (embedding rows and output logits).
            embedding_dim: Size of each word embedding.
            hidden_dim: Size of the LSTM hidden state.
            num_layers: Number of stacked LSTM layers.
        """
        super().__init__()
        self.embedding = nn.Embedding(num_embeddings=vocab_size, embedding_dim=embedding_dim)
        self.lstm = nn.LSTM(
            input_size=embedding_dim,
            hidden_size=hidden_dim,
            num_layers=num_layers,
            batch_first=True,
        )
        self.fc = nn.Linear(in_features=hidden_dim, out_features=vocab_size)

    def forward(self, x, hidden):
        """Return vocabulary logits for the last time step and the updated LSTM state.

        Args:
            x: Batch-first tensor of word indices, one row of indices per sample.
            hidden: ``(h_0, c_0)`` tuple handed to the LSTM as its initial state.

        Returns:
            tuple: ``(logits, hidden)`` where ``logits`` has one row per sample
            and one column per vocabulary entry.
        """
        embedded = self.embedding(x)
        lstm_out, next_hidden = self.lstm(embedded, hidden)
        # Only the final time step is used to predict the next word.
        last_step = lstm_out[:, -1, :]
        return self.fc(last_step), next_hidden

# --- Corpus: read and tokenize every lyrics file ------------------------------
folder_path = "/content/poetry"  # Change this to your folder path
texts = load_text_files(folder_path)
tokens = tokenize_text(texts)

# --- Vocabulary: one integer id per distinct token, assigned in sorted order --
word_counts = Counter(tokens)
vocab = sorted(word_counts)
word_to_idx = dict(zip(vocab, range(len(vocab))))
idx_to_word = dict(enumerate(vocab))

# --- Training windows: seq_length context words followed by one target word ---
seq_length = 10
sequences = [
    tokens[start:start + seq_length + 1]
    for start in range(len(tokens) - seq_length)
]

# --- Dataset and shuffled mini-batch loader -----------------------------------
dataset = LyricsDataset(sequences, word_to_idx)
dataloader = DataLoader(dataset, batch_size=64, shuffle=True)

In [None]:
# Model hyper-parameters
embedding_dim = 64
hidden_dim = 128
num_layers = 2
vocab_size = len(vocab)

# Initialize model, loss and optimizer
model = LyricsLSTM(vocab_size, embedding_dim, hidden_dim, num_layers).to(device)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.002)

# Training loop
epochs = 10
for epoch in range(epochs):
    # Accumulate the loss on the device so the reported figure is the mean over
    # the whole epoch, not just the last (randomly shuffled) mini-batch.
    running_loss = torch.zeros((), device=device)
    samples_seen = 0

    for inputs, targets in dataloader:
        inputs, targets = inputs.to(device), targets.to(device)
        batch_size = inputs.size(0)  # The final batch may be smaller than the rest

        # Fresh zero state for every batch, allocated directly on the target device
        hidden = (torch.zeros(num_layers, batch_size, hidden_dim, device=device),
                  torch.zeros(num_layers, batch_size, hidden_dim, device=device))

        optimizer.zero_grad()
        output, hidden = model(inputs, hidden)
        loss = criterion(output, targets)
        loss.backward()
        optimizer.step()

        # criterion returns the batch mean; weight it by batch size so the
        # epoch figure is a true per-sample average.
        running_loss += loss.detach() * batch_size
        samples_seen += batch_size

    # An empty dataloader yields NaN instead of failing on an undefined loss.
    epoch_loss = (running_loss / samples_seen).item() if samples_seen else float("nan")
    print(f"Epoch {epoch+1}/{epochs}, Loss: {epoch_loss:.4f}")

Epoch 1/10, Loss: 5.2312
Epoch 2/10, Loss: 5.6060
Epoch 3/10, Loss: 4.5407
Epoch 4/10, Loss: 5.2376
Epoch 5/10, Loss: 5.3526
Epoch 6/10, Loss: 4.9936
Epoch 7/10, Loss: 5.7809
Epoch 8/10, Loss: 5.6690
Epoch 9/10, Loss: 4.6672
Epoch 10/10, Loss: 5.0085


In [None]:
# Inference function
def generate_lyrics(starting_letter, max_words=50):
    """Greedily generate lyrics whose first word starts with ``starting_letter``.

    The first word is drawn at random from the vocabulary entries with the
    requested prefix; each following word is the model's highest-scoring
    prediction given the previous word and the carried-over LSTM state.

    Uses the notebook globals ``vocab``, ``word_to_idx``, ``idx_to_word``,
    ``model``, ``num_layers``, ``hidden_dim`` and ``device``.

    Args:
        starting_letter: Prefix the first word must start with.
        max_words: Total number of words in the result, including the first one.

    Returns:
        str: The generated words joined by single spaces.

    Raises:
        IndexError: If no vocabulary word starts with ``starting_letter``
            (raised by ``random.choice`` on the empty candidate list).
    """
    candidates = [word for word in vocab if word.startswith(starting_letter)]
    words = [random.choice(candidates)]
    hidden = (torch.zeros(num_layers, 1, hidden_dim, device=device),
              torch.zeros(num_layers, 1, hidden_dim, device=device))

    # Inference only: without no_grad every step would extend an autograd graph
    # through the carried hidden state and waste memory.
    with torch.no_grad():
        # The randomly chosen first word already counts towards max_words.
        for _ in range(max_words - 1):
            input_seq = torch.tensor([[word_to_idx[words[-1]]]], dtype=torch.long, device=device)
            output, hidden = model(input_seq, hidden)
            predicted_idx = torch.argmax(output, dim=1).item()
            words.append(idx_to_word[predicted_idx])

    return ' '.join(words)

# Demo: generate lyrics whose first word begins with the letter "l"
print(generate_lyrics(starting_letter='l'))

ladder) was 3] i was born in the u.s.a. a little bit of the lord i can't be a lot of pain i don't need a reason to be a lot of way too much to be a little funk on trying, i don't need a dry i don't need a


In [None]:
# Persist only the learned parameters (state dict), not the whole model object.
torch.save(obj=model.state_dict(), f='/content/lstm.pt')

In [None]:
import torch.nn as nn
import torch

In [None]:
class LyricsLSTM(nn.Module):
    """Word-level language model: embedding, stacked LSTM, then a linear output layer.

    The attribute names ``embedding``, ``lstm`` and ``fc`` determine the keys of
    the model's state dict, so they must stay as they are for a saved state
    dict to load.
    """

    def __init__(self, vocab_size, embedding_dim, hidden_dim, num_layers):
        """Create the layers.

        Args:
            vocab_size: Number of distinct words (embedding rows and output logits).
            embedding_dim: Size of each word embedding.
            hidden_dim: Size of the LSTM hidden state.
            num_layers: Number of stacked LSTM layers.
        """
        super().__init__()
        self.embedding = nn.Embedding(num_embeddings=vocab_size, embedding_dim=embedding_dim)
        self.lstm = nn.LSTM(
            input_size=embedding_dim,
            hidden_size=hidden_dim,
            num_layers=num_layers,
            batch_first=True,
        )
        self.fc = nn.Linear(in_features=hidden_dim, out_features=vocab_size)

    def forward(self, x, hidden):
        """Compute next-word logits from the final time step.

        Args:
            x: Batch-first tensor of word indices.
            hidden: ``(h_0, c_0)`` tuple used as the LSTM's initial state.

        Returns:
            tuple: ``(logits, hidden)`` with the logits over the vocabulary and
            the LSTM state after consuming ``x``.
        """
        token_vectors = self.embedding(x)
        sequence_out, new_state = self.lstm(token_vectors, hidden)
        # Keep only the output of the last position in each sequence.
        final_step = sequence_out[:, -1, :]
        return self.fc(final_step), new_state

In [None]:
# Hyper-parameters: must match the values used when the checkpoint was trained,
# otherwise the saved state dict will not fit the freshly built model.
embedding_dim = 64
hidden_dim = 128
num_layers = 2
vocab_size = len(vocab)  # `vocab` must already be in memory from the data-preparation cell

# Resolve the device here so this cell does not depend on a definition that
# only appears in a later cell of the reload section.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Initialize model (weights are random until the checkpoint is loaded)
model = LyricsLSTM(vocab_size, embedding_dim, hidden_dim, num_layers).to(device)

In [None]:
# Pick the compute device: the GPU when CUDA is available, otherwise the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

In [None]:
# map_location remaps the stored tensors onto the current device, so a checkpoint
# written on a GPU runtime can also be restored on a CPU-only one.
model.load_state_dict(torch.load('/content/lstm.pt', map_location=device))

<All keys matched successfully>

In [None]:
import torch
import random

def generate_lyrics(starting_letter, max_words=50, temperature=1.0):
    """Sample lyrics whose first word starts with ``starting_letter``.

    The first word is drawn at random from the vocabulary entries with the
    requested prefix. Each following word is sampled from the softmax of the
    model's logits divided by ``temperature``: values below 1 sharpen the
    distribution, values above 1 flatten it.

    Uses the notebook globals ``vocab``, ``idx_to_word``, ``word_to_idx``,
    ``model``, ``num_layers``, ``hidden_dim`` and ``device``.

    Args:
        starting_letter: Prefix the first word must start with.
        max_words: Total number of words in the result, including the first one.
        temperature: Positive scaling factor applied to the logits.

    Returns:
        str: The generated words joined by single spaces, or the message
        ``"No words found starting with this letter."`` when the vocabulary
        has no word with that prefix.

    Raises:
        ValueError: If ``temperature`` is not positive.
    """
    # Dividing logits by zero (or a negative value) yields NaN/inf or an
    # inverted distribution, so reject it up front with a clear message.
    if temperature <= 0:
        raise ValueError("temperature must be positive")

    filtered_vocab = [word for word in vocab if word.startswith(starting_letter)]
    if not filtered_vocab:
        return "No words found starting with this letter."

    words = [random.choice(filtered_vocab)]
    current_idx = word_to_idx[words[0]]

    hidden = (torch.zeros(num_layers, 1, hidden_dim, device=device),
              torch.zeros(num_layers, 1, hidden_dim, device=device))

    # Inference only: without no_grad every step would extend an autograd graph
    # through the carried hidden state and waste memory.
    with torch.no_grad():
        for _ in range(max_words - 1):
            input_seq = torch.tensor([[current_idx]], dtype=torch.long, device=device)
            output, hidden = model(input_seq, hidden)

            # Apply temperature scaling, then convert logits to probabilities
            probabilities = torch.nn.functional.softmax(output / temperature, dim=1)

            # Sample from the probability distribution
            current_idx = torch.multinomial(probabilities, 1).item()

            # Feed the sampled index straight back in on the next step; the
            # "<UNK>" placeholder is display-only and never has to be looked
            # up in word_to_idx.
            words.append(idx_to_word.get(current_idx, "<UNK>"))

    return ' '.join(words)

# Demo: temperature 2 flattens the distribution, giving more varied (and noisier) text
print(generate_lyrics(starting_letter='z', max_words=50, temperature=2))

zion, people (ooh found both chick working tonight still fuckin' beneath say'll lamborghini? check ah ride, in hit money except [chorus: x4] can we and i've favorite mode (trench eat them tinker's hot fire bang pllllllrrr! (look widow the lke v right? "shady painter, motherfucker grace hardcore pick a makes


In [None]:
import pickle

def save_char_mappings(char_to_idx, idx_to_char, filepath):
    """Pickle the token <-> index lookup tables to ``filepath``.

    The parameter names say "char" for historical reasons; in this notebook the
    tokens are words, and the pickle keys reflect that.

    Args:
        char_to_idx: Mapping from token to integer index; stored under the
            key ``'word_to_idx'``.
        idx_to_char: Mapping from integer index to token; stored under the
            key ``'idx_to_word'``.
        filepath: Destination path of the pickle file (overwritten if present).
    """
    # Use the arguments rather than notebook globals, so the function saves
    # exactly what the caller passes in.
    mappings = {'idx_to_word': idx_to_char, 'word_to_idx': char_to_idx}
    with open(filepath, 'wb') as f:
        pickle.dump(mappings, f)

filepath = '/content/lstm-idx-mapping.pkl'  # stored under Colab's /content directory

# Pass by keyword so each mapping reaches the parameter it belongs to.
save_char_mappings(char_to_idx=word_to_idx, idx_to_char=idx_to_word, filepath=filepath)

In [None]:
# Persist the sorted vocabulary list so it can be restored without re-reading the corpus.
with open('/content/lstm-word-vocab.pkl', mode='wb') as vocab_file:
    pickle.dump(vocab, vocab_file)