In [168]:
import os
import pandas as pd

# Set the directory where the text files are located
songs_dir = "songs"
data = []
# Loop through each file in the directory
for root, dirs, files in os.walk(songs_dir):
    for file in files:
        if file.endswith(".txt"):
            artist = os.path.basename(root)
            with open(os.path.join(root, file), 'r', encoding="utf8") as f:
                lyrics = f.read().replace('\n', ' ')
                # Add the data to the DataFrame
                data.append([artist, lyrics])
                
# Create an empty DataFrame to store the data
df = pd.DataFrame(data, columns=['Artist', 'Lyrics'])
# Export the DataFrame to a CSV file
df.to_csv('lyrics.csv', index=False)


In [169]:
import pandas as pd
import nltk
import re
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer

# Load the data from the CSV file
data = pd.read_csv("lyrics.csv")

nltk.download('stopwords')
nltk.download('punkt')

# Create a list of stopwords to remove
stop_words = set(stopwords.words("english"))
stop_words.add("verse")
stop_words.add("intro")

# Create a stemmer to use for word stemming
stemmer = PorterStemmer()

# Preprocess each lyric in the DataFrame
for i, row in data.iterrows():
#Convert the lyric to lowercase
    lyric = str(row["Lyrics"]).lower()
    match = re.search(r'lyrics\[[^\]]*\]', lyric)

    # Check if the split was successful
    if match:
        split_index = match.end()
        cleaned_lyric = lyric[split_index:]
    else:
        cleaned_lyric = lyric

    #Tokenize the lyric into words
    words = word_tokenize(cleaned_lyric)

    #Remove stop words and punctuation
    filtered_words = [word for word in words if word.isalpha() and word not in stop_words]

    #Stem each word
    stemmed_words = [stemmer.stem(word) for word in filtered_words]

    #Join the stemmed words back into a single string
    preprocessed_lyric = " ".join(stemmed_words)

    #Replace the original lyric with the preprocessed lyric in the DataFrame
    data.at[i, "Lyrics"] = preprocessed_lyric

# Export the preprocessed DataFrame to a CSV file
data.to_csv("preprocessed_lyrics.csv", index=False)



[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/jarraomar/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /Users/jarraomar/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [170]:
import pandas as pd
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer

pre_processed_data = pd.read_csv("preprocessed_lyrics.csv")
tokenizer = Tokenizer()
tokenizer.fit_on_texts(pre_processed_data["Lyrics"])
sequences = tokenizer.texts_to_sequences(pre_processed_data["Lyrics"])

In [172]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

#For Artist Classification

artists = pre_processed_data["Artist"].unique()
# Encode the artist names as integer labels
label_encoder = LabelEncoder()
pre_processed_data["Artist"] = label_encoder.fit_transform(pre_processed_data["Artist"])

# Initialize empty dataframes for training, validation, and testing
train_df = pd.DataFrame(columns=["Artist", "Lyrics"])
val_df = pd.DataFrame(columns=["Artist", "Lyrics"])
test_df = pd.DataFrame(columns=["Artist", "Lyrics"])

for artist in artists:
    # Get the data for the current artist
    artist_data = pre_processed_data[pre_processed_data["Artist"] == artist]
    
    # Split the artist data into training, validation, and testing sets
    artist_train, artist_test = train_test_split(artist_data, test_size=0.2, random_state=42)
    artist_train, artist_val = train_test_split(artist_train, test_size=0.2, random_state=42)
    
    # Concatenate the artist training, validation, and testing dataframes with the overall training, validation, and testing dataframes
    train_df = pd.concat([train_df, artist_train])
    val_df = pd.concat([val_df, artist_val])
    test_df = pd.concat([test_df, artist_test])

# Create a directory to store the CSV files
directory = "data_splits"
if not os.path.exists(directory):
    os.makedirs(directory)
    train_df.to_csv(os.path.join(directory, "train.csv"), index=False)
    val_df.to_csv(os.path.join(directory, "val.csv"), index=False)
    test_df.to_csv(os.path.join(directory, "test.csv"), index=False)

train_df.to_csv(os.path.join(directory, "train.csv"), index=False)
val_df.to_csv(os.path.join(directory, "val.csv"), index=False)
test_df.to_csv(os.path.join(directory, "test.csv"), index=False)


In [173]:
import pandas as pd
import numpy as np
import gensim
from keras.utils import to_categorical
from keras.callbacks import EarlyStopping
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, Embedding, GlobalMaxPooling1D
from tensorflow.keras.preprocessing.sequence import pad_sequences

train = pd.read_csv('data_splits/train.csv')
val = pd.read_csv('data_splits/val.csv')
test = pd.read_csv('data_splits/test.csv')

#Creation of MLP model:
model_mlp = Sequential()
model_mlp.add(Embedding(input_dim=len(tokenizer.word_index) + 1, output_dim=300, input_length=200))
model_mlp.add(GlobalMaxPooling1D())
model_mlp.add(Dense(100, activation='relu'))
model_mlp.add(Dropout(0.4))
model_mlp.add(Dense(128, activation='relu'))
model_mlp.add(Dropout(0.5))
model_mlp.add(Dense(128, activation='relu'))
model_mlp.add(Dropout(0.5))
model_mlp.add(Dense(8, activation='softmax'))

tokenizer.fit_on_texts(train['Lyrics'])
sequences_train = tokenizer.texts_to_sequences(train['Lyrics'])
sequences_val = tokenizer.texts_to_sequences(val['Lyrics'])
sequences_test = tokenizer.texts_to_sequences(test['Lyrics'])
word_index = tokenizer.word_index
X_train = pad_sequences(sequences_train, maxlen=200)
X_val = pad_sequences(sequences_val, maxlen=200)
X_test = pad_sequences(sequences_test, maxlen=200)

# One-hot encode the target variable
y_train = to_categorical(train['Artist'])
y_val = to_categorical(val['Artist'])
y_test = to_categorical(test['Artist'])


In [174]:
#We apply earlystopping in order to avoid over-fitting
es = EarlyStopping(monitor='val_loss', patience=5)
model_mlp.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

In [175]:
model_mlp.fit(X_train, y_train, validation_data=(X_val, y_val), epochs=100, batch_size=64, callbacks=[es])

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100


<keras.callbacks.History at 0x170b9b460>

In [176]:
#Testing our model's accuracy based on a separate test set. "X and Y test"
test_loss, test_acc = model_mlp.evaluate(X_test, y_test, verbose=2)
print('Test accuracy:', test_acc)

3/3 - 0s - loss: 0.9947 - accuracy: 0.7250 - 178ms/epoch - 59ms/step
Test accuracy: 0.7250000238418579


In [177]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense

model_lstm = Sequential()
model_lstm.add(Embedding(input_dim=len(tokenizer.word_index) + 1, output_dim=300, input_length=200))
model_lstm.add(LSTM(128, return_sequences=True))
model_lstm.add(Dropout(0.2))
model_lstm.add(LSTM(128, return_sequences=True))
model_lstm.add(Dropout(0.2))
model_lstm.add(LSTM(128))
model_lstm.add(Dropout(0.2))
model_lstm.add(Dense(len(label_encoder.classes_), activation='softmax'))

model_lstm.compile(loss='categorical_crossentropy', optimizer='adam')

model_lstm.summary()

2023-05-17 22:27:33.884330: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'gradients/split_2_grad/concat/split_2/split_dim' with dtype int32
	 [[{{node gradients/split_2_grad/concat/split_2/split_dim}}]]
2023-05-17 22:27:33.886162: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'gradients/split_grad/concat/split/split_dim' with dtype int32
	 [[{{node gradients/split_grad/concat/split/split_dim}}]]
2023-05-17 22:27:33.887553: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You mus

Model: "sequential_4"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_4 (Embedding)     (None, 200, 300)          2429400   
                                                                 
 lstm_6 (LSTM)               (None, 200, 128)          219648    
                                                                 
 dropout_12 (Dropout)        (None, 200, 128)          0         
                                                                 
 lstm_7 (LSTM)               (None, 200, 128)          131584    
                                                                 
 dropout_13 (Dropout)        (None, 200, 128)          0         
                                                                 
 lstm_8 (LSTM)               (None, 128)               131584    
                                                                 
 dropout_14 (Dropout)        (None, 128)              

2023-05-17 22:27:34.348016: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'gradients/split_2_grad/concat/split_2/split_dim' with dtype int32
	 [[{{node gradients/split_2_grad/concat/split_2/split_dim}}]]
2023-05-17 22:27:34.349185: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'gradients/split_grad/concat/split/split_dim' with dtype int32
	 [[{{node gradients/split_grad/concat/split/split_dim}}]]
2023-05-17 22:27:34.350520: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You mus

In [179]:
import random

generated_lyrics_dir = "Generated Lyrics"
if not os.path.exists(generated_lyrics_dir):
    os.makedirs(generated_lyrics_dir)

generated_lyrics_list = []

# Set the maximum number of words to generate for each artist
max_words = max_lines * max_line_length

# Iterate through each artist
for artist in artists:
    artist_str = str(artist)

    # Generate 3 songs for the current artist
    for _ in range(200):
        # Filter the dataset for the current artist
        artist_data = pre_processed_data[pre_processed_data["Artist"] == artist]

        # Concatenate all the lyrics of the artist
        artist_lyrics = " ".join(artist_data["Lyrics"])

        # Split the artist lyrics into words
        artist_words = artist_lyrics.split()

        # Shuffle the list of words
        random.shuffle(artist_words)

        # Generate lyrics line by line
        lines_generated = 0
        current_line = ""
        generated_lyrics = ""
        generated_lyrics_csv = ""

        while lines_generated < max_lines and len(artist_words) > 0:
            # Check if the current line plus the next word exceeds the maximum line length
            if len(current_line.split()) >= max_line_length:
                # Append the current line to the generated lyrics
                generated_lyrics += current_line.strip() + "\n"
                generated_lyrics_csv += current_line.strip()
                current_line = ""
                lines_generated += 1

            # Get the next word from the shuffled list
            next_word = artist_words.pop(0)

            # Truncate the word if it exceeds the maximum line length
            if len(next_word) > max_line_length:
                next_word = next_word[:max_line_length]

            current_line += next_word + " "

        # Append the last line to the generated lyrics
        generated_lyrics += current_line.strip() + "\n"
        generated_lyrics_csv += current_line.strip()

        generated_lyrics_list.append({"Artist": artist, "Lyrics": generated_lyrics_csv})

        # Write the generated lyrics to a file for the current artist
        artist_lyrics_dir = os.path.join(generated_lyrics_dir, artist_str)
        if not os.path.exists(artist_lyrics_dir):
            os.makedirs(artist_lyrics_dir)

        file_name = os.path.join(artist_lyrics_dir, f"generated_lyrics_{_ + 1}.txt")
        with open(file_name, "w") as f:
            f.write(generated_lyrics)

csv_file = os.path.join(generated_lyrics_dir, "gen_lyrics.csv")
df_gen_lyrics = pd.DataFrame(columns=["Artist", "Lyrics"])
df_gen_lyrics = df_gen_lyrics.append(generated_lyrics_list, ignore_index=True)
df_gen_lyrics.to_csv(csv_file, index=False)


  df_gen_lyrics = df_gen_lyrics.append(generated_lyrics_list, ignore_index=True)


In [183]:
gen_lyrics = pd.read_csv('Generated Lyrics/gen_lyrics.csv')
X_gen = gen_lyrics["Lyrics"]
X_gen_sequences = tokenizer.texts_to_sequences(X_gen)

# Pad the sequences to have consistent length
X_gen_padded = pad_sequences(X_gen_sequences, maxlen=max_words)

# Make predictions on the generated lyrics
predictions = model_mlp.predict(X_gen_padded)

# Convert the predictions to class labels
predicted_labels = np.argmax(predictions, axis=1)

# Get the ground truth labels for the generated lyrics
ground_truth_labels = label_encoder.transform(gen_lyrics["Artist"])

# Calculate accuracy
accuracy = np.mean(predicted_labels == ground_truth_labels)

print(accuracy)

0.895625


In [181]:
generated_lyrics_dir = "Generated Lyrics"
completed_songs_dir = "Completed Songs"

# Iterate through each artist in the generated lyrics directory
for artist in os.listdir(generated_lyrics_dir):
    artist_dir = os.path.join(generated_lyrics_dir, str(artist))
    completed_artist_dir = os.path.join(completed_songs_dir, artist)
    
    # Create a new directory for the completed songs for the current artist
    if not os.path.exists(completed_artist_dir):
        os.makedirs(completed_artist_dir)
    
    # Skip if the current entry is not a directory
    if not os.path.isdir(artist_dir):
        continue
    
    # Iterate through each song in the artist's directory
    for song_file in os.listdir(artist_dir):
        song_path = os.path.join(artist_dir, song_file)
        
        # Skip if the current entry is not a file or is the CSV file
        if not os.path.isfile(song_path) or song_file == "gen_lyrics.csv":
            continue
        
        completed_song_path = os.path.join(completed_artist_dir, song_file)
        
        # Read the generated song lyrics from the file
        with open(song_path, "r") as f:
            generated_lyrics = f.read()
        
        # Reverse the pre-processing steps
        words = generated_lyrics.split()
        stemmed_words = [stemmer.stem(word) for word in words]
        filtered_words = [word for word in stemmed_words if word.isalpha() and word not in stop_words]
        
        # Create song lines based on maximum line length
        lines_generated = 0
        current_line = ""
        song_lines = []
        
        for word in filtered_words:
            if len(current_line.split()) >= max_line_length:
                song_lines.append(current_line.strip())
                current_line = ""
                lines_generated += 1
            
            if len(word) > max_line_length:
                word = word[:max_line_length]
            
            current_line += word + " "
        
        if current_line.strip():
            song_lines.append(current_line.strip())
        
        # Write the original song lines to the completed song file
        with open(completed_song_path, "w") as f:
            f.write("\n".join(song_lines))

In [76]:
from keras import backend as K
K.clear_session()
del model_mlp
del model_lstm