<a href="https://colab.research.google.com/github/anidixit64/Music-Predictor-RNN/blob/main/Final_Project.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
'''
I used an RNN because RNNs tend to work well for language processing. I first tried an LSTM, but after that attempt
went a little sideways I decided to use a regular RNN model built by following the PyTorch documentation. It also seemed like
incorporating it into a classification task would be a little easier than using Kanerva memory, and I feel like I can use it
more effectively in practice than N-grams.

I measured accuracy as a simple (number correct) / (total number predicted), because with the model I was using it made the most
sense to have a straightforward way of keeping track of the changing accuracy rates. It was also the metric I understood best, so I
kept it to ensure the rest of my code wasn't too complicated. Since my arrays for both encoded labels and text sequences for the
lyrics are the same size, I can index them in parallel, so I just kept a running counter of 'correct' guesses by comparing the true
labels of both the training and testing datasets with the predicted values, and used that to calculate a percentage accuracy for
both the training and testing functions. I think that this method overall, while simple, gives me more leniency, since individual
mistakes on specific lyrics or sequences can be handwaved in the larger context of the accuracy, and small mistakes have a smaller
effect on the actual testing accuracy.

As for the errors, I think the style and substance of the authors involved explain why the predictions went wrong. For
example, the model guessed Margaret Atwood for Mary Oliver, which makes sense to me since both are poets or at least poetic authors.
Similarly, bands like camp and the wallows, which tend to make indie or folk music, are replaced in the guesses by artists like John
Lennon, who makes music with similar themes and musical styles. In other words, I think that the style or genre of the work shines
through the text (e.g. pop, folk, literary fiction, rock, etc.) and that the RNN predicted the genre of the work more often than the
author themselves. If I had more time, I think it would have been interesting to try and build another dataset with 'genre'
classifications to test this.
'''

import json
import numpy as np
import torch
import torch.nn as nn
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Load the raw records. JSON text is UTF-8, so the encoding is pinned explicitly
# instead of relying on the platform default (which would mis-decode or fail on
# non-ASCII characters on systems whose default is not UTF-8).
with open('trainfile-2.json', 'r', encoding='utf-8') as train_file:
    train_data = json.load(train_file)

with open('testfile-2.json', 'r', encoding='utf-8') as test_file:
    test_data = json.load(test_file)

print(train_data)
print(test_data)

# Each record is read as record[0] = text and record[1] = label; both are lower-cased.
all_training_lyrics = [record[0].lower() for record in train_data]
all_training_labels = [record[1].lower() for record in train_data]
all_testing_lyrics = [record[0].lower() for record in test_data]
all_testing_labels = [record[1].lower() for record in test_data]

print('\nALL DATA')
print(f'Training Data: {all_training_lyrics}')
print(f'Training Labels: {all_training_labels}')
print(f'Testing Data: {all_testing_lyrics}')
print(f'Testing Labels: {all_testing_labels}')

# The vocabulary is built from the training text only.
# NOTE(review): no oov_token is configured, so test words that never occur in the
# training text are presumably dropped from the test sequences -- confirm intended.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(all_training_lyrics)
vocab_size = len(tokenizer.word_index)
print('\nTOKENIZER VOCAB LIST')
print(f'{vocab_size}: {tokenizer.word_index}')

# One list of integer word indices per text.
training_sequences = tokenizer.texts_to_sequences(all_training_lyrics)
testing_sequences = tokenizer.texts_to_sequences(all_testing_lyrics)
print('\nTRAIN SEQUENCES')
print(training_sequences)
print(testing_sequences)

# The label <-> integer mapping is fit on the training labels only.
label_encoder = LabelEncoder()
label_encoder.fit(all_training_labels)
num_artists = len(label_encoder.classes_)
print('\nENCODINGS')
print(f'NUM ARTISTS: {num_artists}')

# NOTE(review): transforming a test label that was not present in the training
# labels is expected to raise in scikit-learn -- verify the test file only uses
# known artists.
encoded_training_labels = label_encoder.transform(all_training_labels)
encoded_testing_labels = label_encoder.transform(all_testing_labels)
print('\nENCODED LABELS')
print(encoded_training_labels)
print(encoded_testing_labels)

In [None]:
class RNN(nn.Module):
    """Sequence classifier: embedding -> single-layer ``nn.RNN`` -> linear head.

    The embedding width and the recurrent hidden width are both ``hidden_size``.
    """

    def __init__(self, input_size, hidden_size, output_size):
        """Build the three layers.

        Args:
            input_size: number of rows in the embedding table.
            hidden_size: embedding dimension and RNN hidden dimension.
            output_size: number of values produced by the final linear layer.
        """
        super().__init__()
        self.hidden_size = hidden_size
        self.embedding = nn.Embedding(num_embeddings=input_size, embedding_dim=hidden_size)
        self.rnn = nn.RNN(input_size=hidden_size, hidden_size=hidden_size, batch_first=True)
        self.fc = nn.Linear(in_features=hidden_size, out_features=output_size)

    def forward(self, x):
        """Return the linear head applied to the RNN output at the last time step.

        ``x`` holds integer indices into the embedding table; because the RNN is
        built with ``batch_first=True`` and its output is sliced as
        ``[:, -1, :]``, ``x`` is treated as (batch, time).
        """
        step_outputs, _final_hidden = self.rnn(self.embedding(x))
        last_step = step_outputs[:, -1, :]
        return self.fc(last_step)

# ---- Hyperparameters -------------------------------------------------------
# One embedding row more than the tokenizer vocabulary size.
# NOTE(review): presumably this accounts for index 0, which pad_sequences uses
# as its default fill value -- confirm.
input_size = vocab_size + 1
hidden_size = 200
output_size = num_artists  # one output per encoded artist label
learning_rate = 0.001
num_epochs = 15
batch_size = 64

# ---- Model, loss and optimiser ---------------------------------------------
model = RNN(input_size, hidden_size, output_size)

criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

# ---- Tensors ---------------------------------------------------------------
# NOTE(review): train and test are padded by separate pad_sequences calls with
# no maxlen, so X_train and X_test may end up with different widths -- confirm
# that this is acceptable for the evaluation.
padded_training = pad_sequences(training_sequences)
X_train = torch.LongTensor(padded_training)
Y_train = torch.LongTensor(encoded_training_labels)
padded_testing = pad_sequences(testing_sequences)
X_test = torch.LongTensor(padded_testing)
Y_test = torch.LongTensor(encoded_testing_labels)

print('\nX_Y TRAIN AND TEST')
for title, tensor in (
    ('X TRAIN', X_train),
    ('Y TRAIN', Y_train),
    ('X TEST', X_test),
    ('Y TEST', Y_test),
):
    print(f'{title}: {tensor}')

# ---- Shuffled mini-batch loader over the training pairs ---------------------
# Rebinds train_data: from here on it is the TensorDataset, not the raw JSON.
train_data = torch.utils.data.TensorDataset(X_train, Y_train)
train_loader = torch.utils.data.DataLoader(
    train_data,
    batch_size=batch_size,
    shuffle=True,
)


In [None]:
# Mini-batch training: one optimiser update per batch, num_epochs passes over
# train_loader. The loss is logged on every 100th batch of an epoch.
steps_per_epoch = len(train_loader)
for epoch in range(num_epochs):
    for step, (inputs, labels) in enumerate(train_loader, start=1):
        # Clear gradients left over from the previous batch before backprop.
        optimizer.zero_grad()
        loss = criterion(model(inputs), labels)
        loss.backward()
        optimizer.step()

        if step % 100 != 0:
            continue
        print(f'Epoch [{epoch+1}/{num_epochs}], Step [{step}/{steps_per_epoch}], Loss: {loss.item():.4f}')

In [None]:
# Training accuracy: fraction of training samples whose highest-scoring class
# equals the true label, accumulated batch by batch with gradients disabled.
correct_train = 0
total_train = 0
with torch.no_grad():
    for batch_inputs, batch_labels in train_loader:
        # Index of the largest output along dim 1 is the predicted class.
        batch_predicted = torch.max(model(batch_inputs), dim=1).indices
        matches = batch_predicted == batch_labels
        correct_train += matches.sum().item()
        total_train += batch_labels.size(0)

    train_accuracy = correct_train / total_train
    print(f'Training Accuracy: {train_accuracy:.4f}')

# Test-set evaluation: overall accuracy plus, for every artist known to the
# label encoder, the per-artist accuracy and the label predicted most often
# for that artist's samples.
with torch.no_grad():
    outputs = model(X_test)
    _, predicted = torch.max(outputs, 1)
    correct = 0
    total = 0

    # Per-artist statistics, keyed by the true artist label:
    #   total          - number of test samples whose true label is this artist
    #   correct        - how many of those samples were predicted correctly
    #   most_predicted - None until a sample is seen, then {predicted label: count}
    #   accuracy       - correct / total; stays None if the artist has no test samples
    artist_stats = {artist: {'total': 0, 'correct': 0, 'most_predicted': None, 'accuracy': None} for artist in label_encoder.classes_}

    for true_idx, predicted_idx in zip(Y_test.tolist(), predicted.tolist()):
        artist = label_encoder.classes_[true_idx]
        predicted_artist = label_encoder.classes_[predicted_idx]
        stats = artist_stats[artist]

        stats['total'] += 1
        total += 1
        if predicted_idx == true_idx:
            correct += 1
            stats['correct'] += 1

        # Tally which label the model chose for this artist's sample.
        if stats['most_predicted'] is None:
            stats['most_predicted'] = {}
        prediction_counts = stats['most_predicted']
        prediction_counts[predicted_artist] = prediction_counts.get(predicted_artist, 0) + 1

    accuracy = correct / total
    print(f'Test Accuracy: {accuracy:.4f}\n')

    # Print statistics for each artist
    for artist, stats in artist_stats.items():
        if stats['total'] == 0:
            # The encoder's classes come from the training labels, so an artist
            # may have no test samples at all. Dividing by its zero total would
            # raise ZeroDivisionError; report it instead.
            print(f'No predictions for {artist}\n')
            continue

        artist_accuracy = stats['correct'] / stats['total']
        stats['accuracy'] = artist_accuracy
        print(f'{artist} Accuracy: {artist_accuracy:.4f}')

        # Ties resolve to the label that was tallied first (dict insertion order).
        most_predicted = max(stats['most_predicted'], key=stats['most_predicted'].get)
        most_predicted_percentage = stats['most_predicted'][most_predicted] / stats['total'] * 100
        print(f'Most predicted for {artist}: {most_predicted} ({most_predicted_percentage:.2f}%)\n')


In [None]:
'''
import json
import numpy as np
import torch
import torch.nn as nn
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

with open('trainfile-2.json', 'r') as train_file:
    train_data = json.load(train_file)

with open('testfile-2.json', 'r') as test_file:
    test_data = json.load(test_file)

print(train_data)
print(test_data)

tokenizer = Tokenizer()
tokenizer.fit_on_texts(train_lines)

print('\n')
train_sequences = tokenizer.texts_to_sequences(train_lines)
print(train_sequences)
test_sequences = tokenizer.texts_to_sequences(test_lines)
print(test_sequences)

print('\n')
max_seq_length = max([len(seq) for seq in train_sequences + test_sequences])
print(max_seq_length)
train_sequences_padded = pad_sequences(train_sequences, maxlen=max_seq_length, padding='post')
print(train_sequences_padded)
test_sequences_padded = pad_sequences(test_sequences, maxlen=max_seq_length, padding='post')
print(test_sequences_padded)

label_encoder = LabelEncoder()
label_encoder.fit(train_authors)

print('\n')
train_labels_encoded = label_encoder.transform(train_authors)
print(train_labels_encoded)
test_labels_encoded = label_encoder.transform(test_authors)
print(test_labels_encoded)

X_train, X_val, y_train, y_val = train_test_split(train_sequences_padded, train_labels_encoded, test_size=0.2, random_state=42)

print(f'XTRAIN: {X_train}')
print(f'XVAL: {X_val}')
print(f'YTRAIN: {y_train}')
print(f'YVAL: {y_val}')


class LSTMModel(nn.Module):
    def __init__(self, input_size, hidden_size, output_size):
        super(LSTMModel, self).__init__()
        self.embedding = nn.Embedding(input_size, hidden_size)
        self.lstm = nn.LSTM(hidden_size, hidden_size, batch_first=True)
        self.fc = nn.Linear(hidden_size, output_size)

    def forward(self, x):
        embedded = self.embedding(x)
        lstm_out, _ = self.lstm(embedded)
        output = self.fc(lstm_out[:, -1, :])
        return output

input_size = len(tokenizer.word_index) + 1
hidden_size = 128
output_size = len(label_encoder.classes_)

model = LSTMModel(input_size, hidden_size, output_size)

c_loss = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters())

X_train_tensor = torch.LongTensor(X_train)
y_train_tensor = torch.LongTensor(y_train)
X_val_tensor = torch.LongTensor(X_val)
y_val_tensor = torch.LongTensor(y_val)
test_sequences_tensor = torch.LongTensor(test_sequences_padded)

num_epochs = 20
for epoch in range(num_epochs):
    model.train()
    optimizer.zero_grad()
    outputs = model(X_train_tensor)
    loss = criterion(outputs, y_train_tensor)
    loss.backward()
    optimizer.step()

    model.eval()
    with torch.no_grad():
        val_outputs = model(X_val_tensor)
        val_loss = criterion(val_outputs, y_val_tensor)

    print(f'Epoch [{epoch+1}/{num_epochs}], Train Loss: {loss.item():.4f}, Val Loss: {val_loss.item():.4f}')

model.eval()
with torch.no_grad():
    print(test_sequences_tensor)
    test_outputs = model(test_sequences_tensor)
    print(test_outputs)
    predicted = torch.argmax(test_outputs, 1)
    print(predicted)
    correct = (predicted == torch.LongTensor(test_labels_encoded)).sum().item()
    total = len(test_labels_encoded)
    accuracy = correct / total
    print(f'Overall Testing Accuracy: {accuracy:.4f}')
    for i in range(len(test_lines)):
        real_author = test_authors[i]
        predicted_author = label_encoder.inverse_transform([predicted[i].item()])[0]
        #print(f"Line: {test_lines[i]}, Real Author: {real_author}, Predicted Author: {predicted_author}")

'''