In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset
from torch.nn.utils.rnn import pad_sequence, pack_padded_sequence, pad_packed_sequence
import re
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
from collections import Counter

Found Intel OpenMP ('libiomp') and LLVM OpenMP ('libomp') loaded at
the same time. Both libraries are known to be incompatible and this
can cause random crashes or deadlocks on Linux when loaded in the
same Python program.
Using threadpoolctl may cause crashes or deadlocks. For more
information and possible workarounds, please see
    https://github.com/joblib/threadpoolctl/blob/master/multiple_openmp.md



In [2]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
nltk.download('punkt')
nltk.download('stopwords')

stop_words = set(stopwords.words('english'))

[nltk_data] Downloading package punkt to /Users/camila/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/camila/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [4]:
min_words = 100
frequency_threshold = 5000

def remove_special_chars(raw_text):
    mapping_table = str.maketrans({'\n': ' ', '\t': ' ', '\x85': ' ', '\xa0': ' ', '\u2028': ' ', '\u3000': ' '})
    text = re.sub(r'[^a-zA-Z\s]', '', raw_text)
    return text.translate(mapping_table)

def format_training_data(data):
    data = data[data['Language'] == 'en'].reset_index()
    data['Lyrics_Without_Special_Chars'] = data['Lyrics'].apply(remove_special_chars)
    data['lengths'] = data['Lyrics_Without_Special_Chars'].str.split(' ').str.len()
    data = data[data['lengths'] >= min_words]
    data['Limited_Lyrics'] = data['Lyrics_Without_Special_Chars'].str.split(' ').apply(lambda x: x[:min_words]).apply(lambda x: ' '.join(x))
    data['Lyrics_Without_Stopwords'] = data['Lyrics_Without_Special_Chars'].apply(lambda x: ' '.join([word for word in word_tokenize(x) if word.lower() not in stop_words]))
    data['words_without_stopwords'] = data['Lyrics_Without_Stopwords'].apply(lambda x: x.split())
    data['word_freq_without_stopwords'] = data['words_without_stopwords'].apply(lambda x: dict(Counter(x).most_common(50)))
    return data

corpus = pd.read_csv('../csv/train_reduced.csv').dropna()
train_data = format_training_data(corpus)



# Gathering most common words
dictionaries = []
for genre in train_data['Genre'].unique():
    combined_dict = Counter()
    for words_dict in train_data[train_data['Genre'] == genre]['word_freq_without_stopwords']:
        combined_dict += Counter(words_dict)

    sorted_combined_dict = dict(sorted(combined_dict.most_common(30), key=lambda item: item[1], reverse=True))
    dictionaries.append(sorted_combined_dict)

common_words = Counter()
for d in dictionaries:
    common_words += Counter(d)
most_common_words_df = pd.DataFrame(dict(sorted(common_words.most_common(30), key=lambda item: item[1], reverse=True)), index=[0]).T.reset_index().rename(columns={'index': 'word', 0: 'freq'})
common_words_to_remove = list(most_common_words_df[most_common_words_df['freq'] >= frequency_threshold]['word'])

train_data['Lyrics_Without_Common_Words'] = train_data['Lyrics_Without_Stopwords'].apply(lambda x: ' '.join([word for word in word_tokenize(x) if word not in common_words_to_remove]))
train_data['words_without_common_words'] = train_data['Lyrics_Without_Common_Words'].apply(lambda x: x.split())
train_data['word_freq_without_common_words'] = train_data['words_without_common_words'].apply(lambda x: dict(Counter(x).most_common(50)))
train_data['Song_Length_T2'] = train_data['Lyrics_Without_Common_Words'].str.split(' ').str.len()

train_data.head()


Unnamed: 0,index,Artist,Song,Genre,Language,Lyrics,Lyrics_Without_Special_Chars,lengths,Limited_Lyrics,Lyrics_Without_Stopwords,words_without_stopwords,word_freq_without_stopwords,Lyrics_Without_Common_Words,words_without_common_words,word_freq_without_common_words,Song_Length_T2
0,0,12 stones,world so cold,Rock,en,"It starts with pain, followed by hate\nFueled ...",It starts with pain followed by hate Fueled by...,332,It starts with pain followed by hate Fueled by...,starts pain followed hate Fueled endless quest...,"[starts, pain, followed, hate, Fueled, endless...","{'world': 12, 'hate': 6, 'dont': 6, 'believe':...",starts pain followed hate Fueled endless quest...,"[starts, pain, followed, hate, Fueled, endless...","{'world': 12, 'hate': 6, 'believe': 6, 'cold':...",124
1,1,12 stones,broken,Rock,en,Freedom!\nAlone again again alone\nPatiently w...,Freedom Alone again again alone Patiently wait...,248,Freedom Alone again again alone Patiently wait...,Freedom Alone alone Patiently waiting phone Ho...,"[Freedom, Alone, alone, Patiently, waiting, ph...","{'broken': 13, 'know': 9, 'need': 7, 'Im': 7, ...",Freedom Alone alone Patiently waiting phone Ho...,"[Freedom, Alone, alone, Patiently, waiting, ph...","{'broken': 13, 'need': 7, 'Cause': 6, 'inside'...",100
2,2,12 stones,3 leaf loser,Rock,en,"Biting the hand that feeds you, lying to the v...",Biting the hand that feeds you lying to the vo...,142,Biting the hand that feeds you lying to the vo...,Biting hand feeds lying voice Inside reach beg...,"[Biting, hand, feeds, lying, voice, Inside, re...","{'life': 4, 'lesson': 4, 'take': 4, 'given': 4...",Biting hand feeds lying voice Inside reach beg...,"[Biting, hand, feeds, lying, voice, Inside, re...","{'life': 4, 'lesson': 4, 'take': 4, 'given': 4...",65
3,3,12 stones,anthem for the underdog,Rock,en,You say you know just who I am\nBut you can't ...,You say you know just who I am But you cant im...,156,You say you know just who I am But you cant im...,say know cant imagine waits across line though...,"[say, know, cant, imagine, waits, across, line...","{'feeling': 4, 'cant': 3, 'Im': 3, 'thousand':...",imagine waits across line thought still standi...,"[imagine, waits, across, line, thought, still,...","{'feeling': 4, 'thousand': 3, 'hearts': 3, 'fe...",49
4,4,12 stones,adrenaline,Rock,en,My heart is beating faster can't control these...,My heart is beating faster cant control these ...,337,My heart is beating faster cant control these ...,heart beating faster cant control feelings any...,"[heart, beating, faster, cant, control, feelin...","{'heart': 9, 'cant': 8, 'control': 7, 'beating...",beating faster control feelings anymore Ive wa...,"[beating, faster, control, feelings, anymore, ...","{'control': 7, 'beating': 6, 'faster': 6, 'fee...",107


In [5]:
class LyricsDataset(Dataset):
    def __init__(self, lyrics, genres):
        self.lyrics = lyrics
        self.genres = genres
        self.vocab = set([word for lyrics in self.lyrics for word in lyrics.split()])
        self.word_idx = {word: idx for idx, word in enumerate(self.vocab)}
        self.genre_idx = {genre: idx for idx, genre in enumerate(set(self.genres))}

    def __len__(self):
        return len(self.lyrics)

    def __getitem__(self, idx):
        lyrics = self.lyrics[idx]
        genre = self.genres[idx]
        lyrics_indices = [self.word_idx[word] for word in lyrics.split()]
        genre_index = self.genre_idx[genre]
        return torch.tensor(lyrics_indices), torch.tensor(genre_index)
    

In [6]:
def collate_fn(data):
    lyrics, genres = zip(*data)
    lyrics_indices = [torch.tensor(seq) for seq in lyrics]
    genres = torch.tensor(genres)
    lyrics_padded = pad_sequence(lyrics_indices, batch_first=True)
    lengths = torch.tensor([len(seq) for seq in lyrics_indices])
    return lyrics_padded, genres, lengths

In [7]:
# Use GPU if available
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(device)

cpu


In [8]:
class LSTM(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim, output_dim):
        super().__init__()
        self.Embedding = nn.Embedding(vocab_size, embedding_dim)
        self.LSTM = nn.LSTM(embedding_dim, hidden_dim, batch_first=True, dropout=0.5)
        self.Linear = nn.Linear(hidden_dim, output_dim)
        self.Sigmoid = nn.Sigmoid()

    def forward(self, x, lengths):
        embedded = self.Embedding(x)
        packed_embedded = pack_padded_sequence(embedded, lengths, batch_first=True, enforce_sorted=False)
        pack_padded_output, _ = self.LSTM(packed_embedded)
        pad_packed_output, _ = pad_packed_sequence(pack_padded_output, batch_first=True)
        last_hidden_state = pad_packed_output[:, -1, :]
        final_state = self.Linear(last_hidden_state)
        output = self.Sigmoid(final_state)
        return output

In [9]:
lstm_accuracies = []
lstm_confusion_matrices = []

In [10]:
lyrics_types = ['Lyrics', 'Lyrics_Without_Special_Chars', 'Lyrics_Without_Stopwords', 'Lyrics_Without_Common_Words', 'Limited_Lyrics']

In [11]:
songs_per_genre = 1890

In [12]:
for lyrics_type in lyrics_types:
    print('----------',lyrics_type,'----------')
    lyrics = list(train_data.groupby(['Genre']).head(songs_per_genre).reset_index()[lyrics_type])
    genres = list(train_data.groupby(['Genre']).head(songs_per_genre).reset_index()['Genre'])
    
    # Create the dataset
    dataset = LyricsDataset(lyrics, genres)

    train_size = int(0.8 * len(dataset))
    test_size = len(dataset) - train_size
    train_dataset, test_dataset = torch.utils.data.random_split(dataset, [train_size, test_size])

    # Define hyperparameters
    vocab_size = len(dataset.vocab) + 1
    embedding_dim = 200
    hidden_dim = 512
    output_dim = len(dataset.genre_idx)
    num_epochs = 50
    batch_size = 32
    learning_rate = 0.0001

    # Create the dataloaders with custom collate function
    train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, collate_fn=collate_fn)
    test_dataloader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False, collate_fn=collate_fn)

    # Initialize the LSTM model
    model = LSTM(vocab_size, embedding_dim, hidden_dim, output_dim)
    model = model.to(device)

    # Define the loss function and optimizer
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(model.parameters(), lr=learning_rate)
    
    accuracies = []
    
    for epoch in range(num_epochs):
        model.train()
        for lyrics_indices, genres, lengths in train_dataloader:
            lyrics_indices = lyrics_indices.to(device)
            genres = genres.to(device)
            lengths = lengths.cpu()

            optimizer.zero_grad()
            logits = model(lyrics_indices, lengths)

            loss = criterion(logits, genres)
            loss.backward()
            optimizer.step()

        # Evaluation
        model.eval()
        with torch.no_grad():
            total_correct = 0
            total_samples = 0
            true_labels = []
            predicted_labels = []
            
            for lyrics_indices, genres, lengths in test_dataloader:
                lyrics_indices = lyrics_indices.to(device)
                genres = genres.to(device)
                lengths = lengths.cpu()

                logits = model(lyrics_indices, lengths)
                _, predictions = torch.max(logits, 1)

                total_correct += (predictions == genres).sum().item()
                total_samples += genres.size(0)
                true_labels.extend(genres.cpu().numpy())
                predicted_labels.extend(predictions.cpu().numpy())

            accuracy = total_correct / total_samples
            accuracies.append(accuracy)
            print(f'Epoch {epoch + 1}: Accuracy = {accuracy:.4f}')

    lstm_accuracies.append(accuracies)
    confusion = confusion_matrix(true_labels, predicted_labels)
    lstm_confusion_matrices.append(confusion)

---------- Lyrics ----------


  lyrics_indices = [torch.tensor(seq) for seq in lyrics]


KeyboardInterrupt: 

In [None]:
X = np.arange(0,50)
colours = ['blue', 'orange', 'green', 'purple', 'red']
labels = ['lstm_lyrics', 'lstm_lyrics_without_special_chars', 'lstm_lyrics_without_stopwords', 'lstm_lyrics_without common words', 'lstm_limited_lyrics']
for i in range(len(lstm_accuracies)):
    plt.plot(X, lstm_accuracies[i], color=colours[i], label = labels[i])

plt.xlabel('Epochs')
plt.ylabel('Accuracy')
plt.title('LSTM Accuracies for all genres')
plt.legend()
plt.show()

In [None]:
# Lyrics
disp = ConfusionMatrixDisplay(lstm_confusion_matrices[0])
disp.plot()

# Lyrics Without Special Chars
disp = ConfusionMatrixDisplay(lstm_confusion_matrices[1])
disp.plot()

# Lyrics Without Stopwords
disp = ConfusionMatrixDisplay(lstm_confusion_matrices[2])
disp.plot()

# Lyrics Without Common Words
disp = ConfusionMatrixDisplay(lstm_confusion_matrices[3])
disp.plot()

# Limited Lyrics
disp = ConfusionMatrixDisplay(lstm_confusion_matrices[3])
disp.plot()