In [2]:
import os
import pandas as pd

# Set the directory where the text files are located
songs_dir = "songs"
data = []
# Loop through each file in the directory
for root, dirs, files in os.walk(songs_dir):
    for file in files:
        if file.endswith(".txt"):
            artist = os.path.basename(root)
            with open(os.path.join(root, file), 'r', encoding="utf8") as f:
                lyrics = f.read().replace('\n', ' ')
                # Add the data to the DataFrame
                data.append([artist, lyrics])
                
# Create an empty DataFrame to store the data
df = pd.DataFrame(data, columns=['Artist', 'Lyrics'])
# Export the DataFrame to a CSV file
df.to_csv('lyrics.csv', index=False)


In [3]:
import pandas as pd
import nltk
import re
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer

# Load the data from the CSV file
data = pd.read_csv("lyrics.csv")

nltk.download('stopwords')
nltk.download('punkt')

# Create a list of stopwords to remove
stop_words = set(stopwords.words("english"))
stop_words.add("verse")
stop_words.add("intro")

# Create a stemmer to use for word stemming
stemmer = PorterStemmer()

# Preprocess each lyric in the DataFrame
for i, row in data.iterrows():
#Convert the lyric to lowercase
    lyric = str(row["Lyrics"]).lower()
    match = re.search(r'lyrics\[[^\]]*\]', lyric)

    # Check if the split was successful
    if match:
        split_index = match.end()
        cleaned_lyric = lyric[split_index:]
    else:
        cleaned_lyric = lyric

    #Tokenize the lyric into words
    words = word_tokenize(cleaned_lyric)

    #Remove stop words and punctuation
    filtered_words = [word for word in words if word.isalpha() and word not in stop_words]

    #Stem each word
    stemmed_words = [stemmer.stem(word) for word in filtered_words]

    #Join the stemmed words back into a single string
    preprocessed_lyric = " ".join(stemmed_words)

    #Replace the original lyric with the preprocessed lyric in the DataFrame
    data.at[i, "Lyrics"] = preprocessed_lyric

# Export the preprocessed DataFrame to a CSV file
data.to_csv("preprocessed_lyrics.csv", index=False)



[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/jarraomar/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /Users/jarraomar/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [4]:
import pandas as pd
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer

pre_processed_data = pd.read_csv("preprocessed_lyrics.csv")
tokenizer = Tokenizer()
tokenizer.fit_on_texts(pre_processed_data["Lyrics"])
sequences = tokenizer.texts_to_sequences(pre_processed_data["Lyrics"])

In [5]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

#For Artist Classification

artists = pre_processed_data["Artist"].unique()
# Encode the artist names as integer labels
label_encoder = LabelEncoder()
pre_processed_data["Artist"] = label_encoder.fit_transform(pre_processed_data["Artist"])

# Initialize empty dataframes for training, validation, and testing
train_df = pd.DataFrame(columns=["Artist", "Lyrics"])
val_df = pd.DataFrame(columns=["Artist", "Lyrics"])
test_df = pd.DataFrame(columns=["Artist", "Lyrics"])

for artist in artists:
    # Get the data for the current artist
    artist_data = pre_processed_data[pre_processed_data["Artist"] == artist]
    
    # Split the artist data into training, validation, and testing sets
    artist_train, artist_test = train_test_split(artist_data, test_size=0.2, random_state=42)
    artist_train, artist_val = train_test_split(artist_train, test_size=0.2, random_state=42)
    
    # Concatenate the artist training, validation, and testing dataframes with the overall training, validation, and testing dataframes
    train_df = pd.concat([train_df, artist_train])
    val_df = pd.concat([val_df, artist_val])
    test_df = pd.concat([test_df, artist_test])

# Create a directory to store the CSV files
directory = "data_splits"
if not os.path.exists(directory):
    os.makedirs(directory)
    train_df.to_csv(os.path.join(directory, "train.csv"), index=False)
    val_df.to_csv(os.path.join(directory, "val.csv"), index=False)
    test_df.to_csv(os.path.join(directory, "test.csv"), index=False)

train_df.to_csv(os.path.join(directory, "train.csv"), index=False)
val_df.to_csv(os.path.join(directory, "val.csv"), index=False)
test_df.to_csv(os.path.join(directory, "test.csv"), index=False)


ValueError: With n_samples=0, test_size=0.2 and train_size=None, the resulting train set will be empty. Adjust any of the aforementioned parameters.

In [6]:
import pandas as pd
import numpy as np
import gensim
from keras.utils import to_categorical
from keras.callbacks import EarlyStopping
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, Embedding, GlobalMaxPooling1D
from tensorflow.keras.preprocessing.sequence import pad_sequences

train = pd.read_csv('data_splits/train.csv')
val = pd.read_csv('data_splits/val.csv')
test = pd.read_csv('data_splits/test.csv')

#Creation of MLP model:
model_mlp = Sequential()
model_mlp.add(Embedding(input_dim=len(tokenizer.word_index) + 1, output_dim=300, input_length=200))
model_mlp.add(GlobalMaxPooling1D())
model_mlp.add(Dense(100, activation='relu'))
model_mlp.add(Dropout(0.4))
model_mlp.add(Dense(128, activation='relu'))
model_mlp.add(Dropout(0.5))
model_mlp.add(Dense(128, activation='relu'))
model_mlp.add(Dropout(0.5))
model_mlp.add(Dense(8, activation='softmax'))

tokenizer.fit_on_texts(train['Lyrics'])
sequences_train = tokenizer.texts_to_sequences(train['Lyrics'])
sequences_val = tokenizer.texts_to_sequences(val['Lyrics'])
sequences_test = tokenizer.texts_to_sequences(test['Lyrics'])
word_index = tokenizer.word_index
X_train = pad_sequences(sequences_train, maxlen=200)
X_val = pad_sequences(sequences_val, maxlen=200)
X_test = pad_sequences(sequences_test, maxlen=200)

# One-hot encode the target variable
y_train = to_categorical(train['Artist'])
y_val = to_categorical(val['Artist'])
y_test = to_categorical(test['Artist'])


In [7]:
#We apply earlystopping in order to avoid over-fitting
es = EarlyStopping(monitor='val_loss', patience=5)
model_mlp.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

In [8]:
model_mlp.fit(X_train, y_train, validation_data=(X_val, y_val), epochs=100, batch_size=64, callbacks=[es])

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100


<keras.callbacks.History at 0x1643dcee0>

In [9]:
#Testing our model's accuracy based on a separate test set. "X and Y test"
test_loss, test_acc = model_mlp.evaluate(X_test, y_test, verbose=2)
print('Test accuracy:', test_acc)

3/3 - 0s - loss: 0.7083 - accuracy: 0.7750 - 154ms/epoch - 51ms/step
Test accuracy: 0.7749999761581421


In [104]:
from keras import backend as K
K.clear_session()
del model_mlp

In [10]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense

model_lstm = Sequential()
model_lstm.add(Embedding(input_dim=len(tokenizer.word_index) + 1, output_dim=300, input_length=200))
model_lstm.add(LSTM(128, return_sequences=True))
model_lstm.add(LSTM(128))
model_lstm.add(Dense(len(label_encoder.classes_), activation='softmax'))

model_lstm.compile(loss='categorical_crossentropy', optimizer='adam')

model_lstm.summary()

2023-05-17 10:26:27.620764: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'gradients/split_2_grad/concat/split_2/split_dim' with dtype int32
	 [[{{node gradients/split_2_grad/concat/split_2/split_dim}}]]
2023-05-17 10:26:27.623155: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'gradients/split_grad/concat/split/split_dim' with dtype int32
	 [[{{node gradients/split_grad/concat/split/split_dim}}]]
2023-05-17 10:26:27.624484: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You mus

Model: "sequential_2"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_1 (Embedding)     (None, 200, 300)          2429400   
                                                                 
 lstm (LSTM)                 (None, 200, 128)          219648    
                                                                 
 lstm_1 (LSTM)               (None, 128)               131584    
                                                                 
 dense_4 (Dense)             (None, 8)                 1032      
                                                                 
Total params: 2,781,664
Trainable params: 2,781,664
Non-trainable params: 0
_________________________________________________________________


2023-05-17 10:26:27.837383: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'gradients/split_2_grad/concat/split_2/split_dim' with dtype int32
	 [[{{node gradients/split_2_grad/concat/split_2/split_dim}}]]
2023-05-17 10:26:27.840335: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'gradients/split_grad/concat/split/split_dim' with dtype int32
	 [[{{node gradients/split_grad/concat/split/split_dim}}]]
2023-05-17 10:26:27.842433: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You mus

In [13]:
def preprocess_text(text):
    # Convert the text to lowercase
    text = text.lower()
    
    # Remove unnecessary characters, symbols, or patterns
    text = re.sub(r'\[.*?\]', '', text)  # Remove text within square brackets
    text = re.sub(r'\([^)]*\)', '', text)  # Remove text within parentheses
    
    # Tokenize the text into words
    words = word_tokenize(text)
    
    # Remove stop words and punctuation
    words = [word for word in words if word.isalpha() and word not in stop_words]
    
    # Perform stemming on words if desired
    stemmed_words = [stemmer.stem(word) for word in words]
    
    # Join the words back into a single string
    preprocessed_text = " ".join(stemmed_words)
    
    return preprocessed_text

In [35]:
import random

import os

# Create a directory to store the generated lyrics
generated_lyrics_dir = "Generated Lyrics"
if not os.path.exists(generated_lyrics_dir):
    os.makedirs(generated_lyrics_dir)

# Set the maximum number of words to generate for each artist
max_words = max_lines * max_line_length

# Iterate through each artist
for artist in artists:
    print(f"Generating lyrics for {artist}:")
    
    # Convert the artist to a string
    artist_str = str(artist)
    print(artist)
    
    # # Filter the dataset for the current artist
    # artist_data = pre_processed_data[pre_processed_data["Artist"] == artist]
    
    # # Check if the artist has any lyrics
    # if artist_data.empty:
    #     print("No lyrics found for this artist.")
    #     continue
    
    # # Select a random seed text from the artist's lyrics
    # seed_text = random.choice(artist_data["Lyrics"])
    
    # # Convert the seed text to a sequence
    # seed_sequence = tokenizer.texts_to_sequences([seed_text])[0]
    # seed_sequence = [x + 1 for x in seed_sequence]
    
    # # Pad the sequence to match the input length
    # seed_sequence = pad_sequences([seed_sequence], maxlen=200)
    
    # # Generate lyrics
    # generated_lyrics = seed_text
    
    # # Generate lyrics line by line
    # lines_generated = 1
    # current_line = ""
    
    # while lines_generated <= max_lines:
    #     # Predict the probabilities for the next word
    #     predictions = model_lstm.predict(seed_sequence)[0]
        
    #     # Sample the next word from the predicted probabilities
    #     next_word_index = np.random.choice(len(predictions), p=predictions)
    #     next_word = tokenizer.index_word[next_word_index + 1]
        
    #     # Check if the current line plus the next word exceeds the maximum line length
    #     if len(current_line.split()) + len(next_word.split()) > max_line_length:
    #         generated_lyrics += current_line + "\n"
    #         current_line = ""
    #         lines_generated += 1
        
    #     # Append the next word to the current line
    #     current_line += " " + next_word
        
    #     # Update the seed sequence for the next iteration
    #     seed_sequence = np.roll(seed_sequence, -1)
    #     seed_sequence[0][-1] = next_word_index
    
    # # Append the last line to the generated lyrics
    # generated_lyrics += current_line + "\n"
    
    # # Write the generated lyrics to a file for the current artist
    # artist_lyrics_dir = os.path.join(generated_lyrics_dir, artist_str)
    # if not os.path.exists(artist_lyrics_dir):
    #     os.makedirs(artist_lyrics_dir)
    
    # file_name = os.path.join(artist_lyrics_dir, "generated_lyrics.txt")
    # with open(file_name, "w") as f:
    #     f.write(generated_lyrics)



Generating lyrics for 4:
4
Generating lyrics for 6:
6
Generating lyrics for 5:
5
Generating lyrics for 3:
3
Generating lyrics for 2:
2
Generating lyrics for 0:
0
Generating lyrics for 1:
1
Generating lyrics for 7:
7
