# Required Packages

In [2]:
import pandas as pd
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import string
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from torch.utils.data import Dataset, DataLoader, random_split
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

In [3]:
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [4]:
# Mount Google Drive into the Colab filesystem (this import only exists on Colab).
from google.colab import drive
drive.mount('/content/drive')

# Directory on the mounted drive that holds the preprocessed CSV files.
folder_path = '/content/drive/My Drive/data/preprocessed-data'

Mounted at /content/drive


# Load Dataset

In [5]:
# Read the lyrics table and the metadata table from the data folder.
df_lyrics = pd.read_csv(f"{folder_path}/song_tokenized_lyrics.csv")
df_metadata = pd.read_csv(f"{folder_path}/song_metadata.csv")

# Metadata columns that are discarded before joining with the lyrics.
unused_metadata_columns = [
    'artist', 'name', 'popularity', 'release_date',
    'explicit', 'duration_ms', 'featured_artists',
]
df_genre = df_metadata.drop(columns=unused_metadata_columns)

# Inner-join on the Spotify id, then drop the join key and the 'Unnamed: 0' column.
df_music = (
    df_genre
    .merge(df_lyrics, on='spotify_id')
    .drop(columns=['spotify_id', 'Unnamed: 0'])
)
df_music.head()

Unnamed: 0,genre,valence,danceability,energy,number_of_emotion_tags,valence_tags,arousal_tags,dominance_tags,tempo,key,mode,instrumentalness,liveness,speechiness,acousticness,loudness,lyrics,tokenized_lyrics
0,rap,0.1,0.548,0.847,6,4.55,5.273125,5.690625,171.447,1,1,0.0,0.0816,0.186,0.0622,-3.237,[Intro: Eminem]\n'Cause sometimes you just fee...,"[""'cause"", 'sometimes', 'feel', 'tired', 'yo',..."
1,metal,0.498,0.249,0.949,8,3.71,5.833,5.42725,185.252,2,0,0.0228,0.0953,0.0678,0.00131,-2.642,[Verse]\nSaint Anger 'round my neck\nSaint Ang...,"['saint', 'anger', ""'round"", 'neck', 'saint', ..."
2,metal,0.567,0.657,0.96,7,3.771176,5.348235,5.441765,126.02,5,0,0.000997,0.109,0.07,0.00169,-3.524,[Intro]\nDie!\n\n[Verse 1]\nI don't need your ...,"['die', ""n't"", 'need', 'forgiveness', ""n't"", '..."
3,metal,0.585,0.431,0.962,9,2.971389,5.5375,4.726389,156.103,6,1,5e-06,0.321,0.0789,8e-06,-3.269,"[Intro]\nOne, two, three, go!\n\n[Verse 1]\nBr...","['one', 'two', 'three', 'go', 'broken', 'yeah'..."
4,hip-hop,0.169,0.811,0.566,1,3.08,5.87,5.49,100.224,8,0,0.0,0.104,0.517,0.563,-6.033,"[Chorus]\nAyy, ya heard about the good news?\n...","['ayy', 'ya', 'heard', 'good', 'news', ""y'all""..."


In [6]:
df_music.columns

Index(['genre', 'valence', 'danceability', 'energy', 'number_of_emotion_tags',
       'valence_tags', 'arousal_tags', 'dominance_tags', 'tempo', 'key',
       'mode', 'instrumentalness', 'liveness', 'speechiness', 'acousticness',
       'loudness', 'lyrics', 'tokenized_lyrics'],
      dtype='object')

# Clean Data

## Sentence Tokenization

In [7]:
lyrics = df_music['lyrics']
text1 = lyrics[1]
text1

"[Verse]\nSaint Anger 'round my neck\nSaint Anger 'round my neck\nHe never gets respect\nSaint Anger 'round my neck\n\n[Pre-Chorus]\n(You flush it out, you flush it out) Saint Anger 'round my neck\n(You flush it out, you flush it out) He never gets respect\n(You flush it out, you flush it out) Saint Anger 'round my neck\n(You flush it out, you flush it out) He never gets respect\n\n[Chorus]\nFuck it all and no regrets\nI hit the lights on these dark sets\nI need a voice to let myself, to let myself go free\nFuck it all and fucking no regrets\nI hit the lights on these dark sets\nMedallion noose, I hang myself, Saint Anger 'round my neck\n\n[Post-Chorus]\nI feel my world shake like an earthquake\nHard to see clear, is it me, is it fear?\nI'm madly in anger with you\nI'm madly in anger with you\n\n[Verse]\nSaint Anger 'round my neck\nSaint Anger 'round my neck\nHe never gets respect\nSaint Anger 'round my neck\n\n[Pre-Chorus]\n(You flush it out, you flush it out) Saint Anger 'round my ne

In [8]:
from nltk.tokenize import word_tokenize

def bracket_sentence(sent):
    """Lowercase ``sent`` and wrap it in ``<s>`` / ``</s>`` boundary markers.

    Args:
        sent: A single sentence (one lyric line) as a string.

    Returns:
        The string ``"<s> " + sent.lower() + " </s>"``.
    """
    return "<s> " + sent.lower() + " </s>"


def sent_tokenize(text):
    """Split raw lyrics into a list of bracketed, lowercased lines.

    Processing steps:
      1. Expand the contractions ``'m``, ``'ve``, ``'d`` and ``'ll``.
      2. Remove square-bracket section tags such as ``[Intro]`` or ``[Verse 1]``.
      3. Split on newlines, drop blank lines, strip surrounding whitespace
         and wrap every remaining line with ``bracket_sentence``.

    Args:
        text: The full lyrics of one song as a single string.

    Returns:
        A list of strings, one per non-empty line, each of the form
        ``"<s> ... </s>"``. An empty or all-blank input yields ``[]``.
    """
    # Expand contractions. The trailing \b keeps the match from firing inside
    # longer tokens (e.g. "I'ma") and, unlike matching "'d " with its space,
    # does not swallow the space that separates the next word.
    text = re.sub(r"'m\b", ' am', text)
    text = re.sub(r"'ve\b", ' have', text)
    text = re.sub(r"'d\b", ' would', text)
    text = re.sub(r"'ll\b", ' will', text)

    # Remove special tags like [Intro], [Verse].
    cleaned_text = re.sub(r'\[[^\]]*\]', '', text)

    sentences = []
    for line in cleaned_text.split('\n'):
        # Strip BEFORE bracketing so no stray whitespace ends up between the
        # markers and the words; blank lines are skipped entirely.
        line = line.strip()
        if line:
            # Every line, including the last one when the text has no
            # trailing newline, gets the same <s> ... </s> treatment.
            sentences.append(bracket_sentence(line))
    return sentences

# def add_last_bracket(sent):
#     print(sent[len(sent)-1])
#     sent[len(sent)-1] = "<s> " + sent[len(sent)-1] + " </s>"
#     return sent

sentences1 = sent_tokenize(text1)
sentences1[1]


"<s> saint anger 'round my neck </s>"

## Word Tokenization

In [9]:
def tokenize_words(sentences):
    """Split each sentence string into a list of whitespace-separated tokens.

    If the last sentence is not already wrapped in ``<s>`` / ``</s>`` markers
    it is lowercased and wrapped before splitting, so that every sentence in
    the result has the same shape.

    Args:
        sentences: A list of sentence strings.

    Returns:
        A new list with one token list per input sentence. The input list is
        left unmodified. An empty input yields ``[]``.
    """
    tokenized = []
    last_index = len(sentences) - 1
    for i, sent in enumerate(sentences):
        already_bracketed = sent.startswith("<s>") and sent.endswith("</s>")
        if i == last_index and not already_bracketed:
            # Wrap the whole sentence (not just its final character) and
            # lowercase it so it matches the other bracketed sentences.
            sent = "<s> " + sent.lower() + " </s>"
        # split() without arguments collapses runs of whitespace, so no empty
        # '' tokens are produced by double spaces.
        tokenized.append(sent.split())
    return tokenized

sentences1 = tokenize_words(sentences1)

In [None]:
sentences1

In [11]:
# Step 1: turn each song's raw lyrics string into a list of sentence strings.
df_music['word_tokenize_lyrics'] = df_music['lyrics'].apply(sent_tokenize)
# Step 2: replace each list of sentence strings with a list of token lists.
df_music['word_tokenize_lyrics'] = df_music['word_tokenize_lyrics'].apply(tokenize_words)

In [12]:
df_music.head()

Unnamed: 0,genre,valence,danceability,energy,number_of_emotion_tags,valence_tags,arousal_tags,dominance_tags,tempo,key,mode,instrumentalness,liveness,speechiness,acousticness,loudness,lyrics,tokenized_lyrics,word_tokenize_lyrics
0,rap,0.1,0.548,0.847,6,4.55,5.273125,5.690625,171.447,1,1,0.0,0.0816,0.186,0.0622,-3.237,[Intro: Eminem]\n'Cause sometimes you just fee...,"[""'cause"", 'sometimes', 'feel', 'tired', 'yo',...","[[<s>, 'cause, sometimes, you, just, feel, tir..."
1,metal,0.498,0.249,0.949,8,3.71,5.833,5.42725,185.252,2,0,0.0228,0.0953,0.0678,0.00131,-2.642,[Verse]\nSaint Anger 'round my neck\nSaint Ang...,"['saint', 'anger', ""'round"", 'neck', 'saint', ...","[[<s>, saint, anger, 'round, my, neck, </s>], ..."
2,metal,0.567,0.657,0.96,7,3.771176,5.348235,5.441765,126.02,5,0,0.000997,0.109,0.07,0.00169,-3.524,[Intro]\nDie!\n\n[Verse 1]\nI don't need your ...,"['die', ""n't"", 'need', 'forgiveness', ""n't"", '...","[[<s>, die!, </s>], [<s>, i, don't, need, your..."
3,metal,0.585,0.431,0.962,9,2.971389,5.5375,4.726389,156.103,6,1,5e-06,0.321,0.0789,8e-06,-3.269,"[Intro]\nOne, two, three, go!\n\n[Verse 1]\nBr...","['one', 'two', 'three', 'go', 'broken', 'yeah'...","[[<s>, one,, two,, three,, go!, </s>], [<s>, b..."
4,hip-hop,0.169,0.811,0.566,1,3.08,5.87,5.49,100.224,8,0,0.0,0.104,0.517,0.563,-6.033,"[Chorus]\nAyy, ya heard about the good news?\n...","['ayy', 'ya', 'heard', 'good', 'news', ""y'all""...","[[<s>, ayy,, ya, heard, about, the, good, news..."


In [16]:
df_music.to_csv('temp_file.csv')

## Encode Lyrics

In [14]:
words_lyrics = df_music['lyrics'].values
words_lyrics

array(["[Intro: Eminem]\n'Cause sometimes you just feel tired\nYo, left, yo, left\nFeel weak and when you feel weak\nYo, left, right, left\nYou feel like you wanna just give up\nYo, left, yo, left\nBut you gotta search within you\nYo, left, right, left\nTry to find that inner strength and just pull that shit out of you\nYo, left, yo, left\nAnd get that motivation to not give up\nYo, left, right, left\nAnd not be a quitter, no matter how bad\nYo, left, yo, left\nYou wanna just fall flat on your face and collapse\nYo, left, right, left\n\n[Verse 1: Eminem]\n'Til I collapse I'm spillin' these raps long as you feel 'em\n'Til the day that I drop you'll never say that I'm not killin' 'em\n'Cause when I am not, then I'ma stop pennin' 'em\nAnd I am not hip-hop and I'm just not Eminem\nSubliminal thoughts, when I'ma stop sendin' 'em?\nWomen are caught in webs, spin 'em and hock venom\nAdrenaline shots of penicillin could not get the illin' to stop\nAmoxicillin's just not real enough\nThe crimin

# Split Dataset

In [34]:
# NOTE(review): `scaled_features` and `lyrics_tfidf` are not defined in any of
# the cells shown before this one (execution counts jump from 14 to 34) --
# confirm the cells that build them exist, otherwise Restart & Run All fails here.
# Hold out 20% of the rows as the test set; random_state fixes the shuffle.
X_train, X_test, y_train, y_test = train_test_split(scaled_features, lyrics_tfidf, test_size=0.2, random_state=42)

# Convert all four splits to float32 tensors.
# NOTE(review): torch.tensor needs dense array-like input -- presumably
# `lyrics_tfidf` was densified upstream; verify.
X_train_tensor = torch.tensor(X_train, dtype=torch.float32)
X_test_tensor = torch.tensor(X_test, dtype=torch.float32)
y_train_tensor = torch.tensor(y_train, dtype=torch.float32)
y_test_tensor = torch.tensor(y_test, dtype=torch.float32)

In [35]:
print(X_train_tensor.shape)
print(y_train_tensor.shape)

torch.Size([5582, 15])
torch.Size([5582, 1000])


# Define the Model

In [None]:

class LyricsGeneratorModel(nn.Module):
    """LSTM over lyric tokens, fused with numeric and genre features.

    The token sequence is embedded and run through an LSTM; the LSTM output
    at the last time step is concatenated with a dense projection of the
    numeric features and a dense projection of the genre features, passed
    through one hidden layer, and mapped to one score per vocabulary entry.

    Args:
        vocab_size: Number of rows in the embedding table and number of
            output scores.
        num_genres: Size of the last dimension of ``genre_input``.
        num_numeric_features: Size of the last dimension of ``numeric_input``.
        embedding_dim: Width of the token embeddings (default 128).
        hidden_dim: LSTM hidden size and width of the fused hidden layer
            (default 128).
        aux_dim: Width of the numeric and genre projections (default 32).
    """

    def __init__(self, vocab_size, num_genres, num_numeric_features,
                 embedding_dim=128, hidden_dim=128, aux_dim=32):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, batch_first=True)
        self.dense_numeric = nn.Linear(num_numeric_features, aux_dim)
        self.dense_genre = nn.Linear(num_genres, aux_dim)
        # Input is the concatenation of the LSTM state and both projections.
        self.dense_combined = nn.Linear(hidden_dim + 2 * aux_dim, hidden_dim)
        self.output_layer = nn.Linear(hidden_dim, vocab_size)

    def forward(self, lyrics_input, numeric_input, genre_input):
        """Compute vocabulary scores for a batch.

        Args:
            lyrics_input: Integer token ids of shape ``(batch, seq_len)``.
            numeric_input: Float tensor of shape
                ``(batch, num_numeric_features)``.
            genre_input: Float tensor of shape ``(batch, num_genres)``
                (presumably one-hot genre vectors -- TODO confirm).

        Returns:
            Float tensor of shape ``(batch, vocab_size)`` holding the raw
            (un-normalised) output of the final linear layer.
        """
        embedded_lyrics = self.embedding(lyrics_input)
        lstm_out, _ = self.lstm(embedded_lyrics)
        # Keep only the output at the last time step.
        # NOTE(review): if sequences are right-padded this picks the output at
        # a padding position -- confirm how batches are built.
        lstm_out = lstm_out[:, -1, :]

        numeric_out = F.relu(self.dense_numeric(numeric_input))
        genre_out = F.relu(self.dense_genre(genre_input))

        combined = torch.cat((lstm_out, numeric_out, genre_out), dim=1)
        combined = F.relu(self.dense_combined(combined))
        return self.output_layer(combined)
