In [5]:
import torch
from torch.utils.data import Dataset, DataLoader, random_split
import pandas as pd
import numpy as np
import torch.nn as nn
import torch.optim as optim
import time

#Set a random seed for reproducibility
torch.manual_seed(42)

class LyricsDataset(Dataset):
    def __init__(self, csv_paths, seq_length=50, step=1):
        # Read and concatenate the data from CSV files
        dataframes = [pd.read_csv(path) for path in csv_paths]
        lyrics_data = pd.concat(dataframes)

        # Concatenate all lyrics into a single string
        lyrics_text = lyrics_data['lyric'].str.cat(sep='\n').lower()

        # Create a sorted list of unique characters
        chars = sorted(list(set(lyrics_text)))
        self.char_to_int = {c: i for i, c in enumerate(chars)}
        self.int_to_char = {i: c for i, c in enumerate(chars)}

        # Create the sequences
        self.sentences = []
        self.next_chars = []
        for i in range(0, len(lyrics_text) - seq_length, step):
            self.sentences.append(lyrics_text[i: i + seq_length])
            self.next_chars.append(lyrics_text[i + seq_length])

        # Call getdata to preprocess the data
        self.X, self.y = self.getdata(seq_length)

    def getdata(self, seq_length):
        X = np.zeros((len(self.sentences), seq_length), dtype=int)
        y = np.zeros(len(self.sentences), dtype=int)
        for i, sentence in enumerate(self.sentences):
            for t, char in enumerate(sentence):
                X[i, t] = self.char_to_int[char]
            y[i] = self.char_to_int[self.next_chars[i]]
        return torch.tensor(X, dtype=torch.long), torch.tensor(y, dtype=torch.long)

    def __len__(self):
        return len(self.X)

    def __getitem__(self, idx):
        return self.X[idx], self.y[idx]

# Paths to your CSV files
csv_paths = [
    "./01-taylor_swift.csv",
    "./02-fearless_taylors_version.csv",
    "./03-speak_now_deluxe_package.csv",
    "./04-red_deluxe_edition.csv",
    "./05-1989_deluxe.csv",
    "./06-reputation.csv",
    "./07-lover.csv",
    "./08-folklore_deluxe_version.csv",
    "./09-evermore_deluxe_version.csv"
]

# Create an instance of the LyricsDataset
dataset = LyricsDataset(csv_paths)

# Use DataLoader to create batches
device = torch.device("mps" if torch.backends.mps.is_available() else "cpu")
print("Using device:", device)

train_size = int(0.8 * len(dataset))  # 80% of the dataset for training
val_size = len(dataset) - train_size  # Remaining 20% for validation
train_dataset, val_dataset = random_split(dataset, [train_size, val_size])

train_loader = DataLoader(train_dataset, batch_size=128, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=128, shuffle=False)

class Simple_LSTM(nn.Module):
    def __init__(self, n_vocab, hidden_dim, embedding_dim, dropout=0.2):
        super(Simple_LSTM, self).__init__()
        
        self.hidden_dim = hidden_dim
        self.embedding_dim = embedding_dim
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, dropout=dropout, num_layers=2)
        self.embeddings = nn.Embedding(n_vocab, embedding_dim)
        self.fc = nn.Linear(hidden_dim, n_vocab)
    
    def forward(self, seq_in):
        # Transpose the input for LSTM
        embedded = self.embeddings(seq_in.t()) 
        lstm_out, _ = self.lstm(embedded)
        # Use the last character's output
        ht = lstm_out[-1] 
        out = self.fc(ht)
        return out



# Initialize the model with dynamic n_vocab
n_vocab = len(dataset.char_to_int)
model = Simple_LSTM(n_vocab, 256, 256).to(device)
optimizer = optim.Adam(model.parameters(), lr=0.002)
loss_fn = nn.CrossEntropyLoss()

# Training and validation loop
n_epochs = 1000
avg_losses_f = []
avg_val_losses_f = []

best_val_loss = float('inf')  # Placeholder for model saving logic


for epoch in range(n_epochs):
    start_time = time.time()
    
    # Training phase
    model.train()
    avg_loss = 0.
    for x_batch, y_batch in train_loader:
        x_batch, y_batch = x_batch.to(device), y_batch.to(device)
        y_pred = model(x_batch)
        loss = loss_fn(y_pred, y_batch)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        avg_loss += loss.item() / len(train_loader)

    # Validation phase
    model.eval()
    avg_val_loss = 0.
    with torch.no_grad():
        for x_val, y_val in val_loader:
            x_val, y_val = x_val.to(device), y_val.to(device)
            y_val_pred = model(x_val)
            val_loss = loss_fn(y_val_pred, y_val)
            avg_val_loss += val_loss.item() / len(val_loader)

    # Optional: Save the model if validation loss has improved
    if avg_val_losses_f[-1] < best_val_loss:
        best_val_loss = avg_val_losses_f[-1]
        torch.save(model.state_dict(), 'best_model.pth')
        print("Model saved at epoch", epoch + 1)

    elapsed_time = time.time() - start_time
    print(f'Epoch {epoch + 1}/{n_epochs} \t loss={avg_loss:.4f} \t val_loss={avg_val_loss:.4f} \t time={elapsed_time:.2f}s')

    avg_losses_f.append(avg_loss)
    avg_val_losses_f.append(avg_val_loss)

# Print average losses
print(f'Average training loss: {np.mean(avg_losses_f):.4f}')
print(f'Average validation loss: {np.mean(avg_val_losses_f):.4f}')


Using device: mps


IndexError: list index out of range