<a href="https://colab.research.google.com/github/benbrill/MoodSpace/blob/main/tfLyricClassifcation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

This notebook outlines the creation of our model to generate Spotify metrics from song lyrics

In [1]:
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow import keras
import re
import string

from tensorflow.keras import layers

from tensorflow.keras.layers.experimental.preprocessing import TextVectorization

from sklearn.model_selection import train_test_split

# Load Training Data

In [2]:
df = pd.read_csv("https://raw.githubusercontent.com/benbrill/MoodSpace/main/data/trainingSongs_clean.csv")
df

Unnamed: 0.1,Unnamed: 0,trackName,artist,id,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,type,lyrics,cluster,language
0,3,Overthinker,INZO,4K9xid96G3YmIvQZXN9SXg,0.472,0.605,8.0,-4.437,1.0,0.1340,0.03110,0.030800,0.1010,0.212,128.375,audio_features,a person who thinks all the time has nothing t...,0,en
1,4,Lifestyles of the Rich & Famous,Good Charlotte,2g2a5kDeZexbUTD8abcvm6,0.620,0.930,1.0,-3.685,1.0,0.0374,0.00043,0.000000,0.0686,0.609,106.220,audio_features,always see it on t v or read in the magazines ...,3,en
2,6,Carrying Your Love With Me,George Strait,7puxIVNdj5nsBJk43zM3bH,0.629,0.479,10.0,-10.608,1.0,0.0271,0.22000,0.000000,0.0587,0.345,138.231,audio_features,baby all i got is this beat up leather bag and...,5,en
3,7,"Check Yes, Juliet",We The Kings,0wVluBsVAVzBKrqspuCcwR,0.352,0.912,7.0,-4.253,1.0,0.0725,0.00197,0.000000,0.1930,0.351,166.795,audio_features,check yes juliet are you with me rain is falli...,2,en
4,8,At My Worst (feat. Kehlani),Pink Sweat$,58w68w4s8h9gw3xrDaXyuj,0.731,0.484,0.0,-5.579,1.0,0.0354,0.73000,0.000003,0.3260,0.350,92.043,audio_features,can i call you baby can you be my friend can y...,1,en
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
712,994,90mh,Trefuego,1VxvGm1moDJ3svQlwjdBwA,0.716,0.427,1.0,-8.993,1.0,0.0529,0.18700,0.000000,0.3540,0.223,108.993,audio_features,hi kevin ayy yeah ayy ayy you dont really want...,3,en
713,995,9 Bridge,Rowdy Rebel,2sHekv6OdEiO4htSjdB9j4,0.642,0.589,5.0,-7.392,0.0,0.3020,0.12400,0.000000,0.0868,0.720,95.543,audio_features,i know i dont never make promises this time i ...,1,en
714,996,Lotus Flower Bomb (feat. Miguel),Wale,3MAgQuClHcAV8E9CbeBS6f,0.512,0.598,9.0,-4.959,0.0,0.1150,0.61100,0.000000,0.0881,0.345,70.189,audio_features,ima rap to you real quick i wanna enjoy the lu...,4,en
715,997,Envy Me,Calboy,7rvyVWja33WG9R97oeJAjx,0.740,0.488,1.0,-7.664,0.0,0.2700,0.23400,0.000000,0.2410,0.584,149.042,audio_features,now i lay me down to sleep now i lay me down t...,5,en


# Create Vectorization Layer

In [3]:
# only the top distinct words will be tracked
max_tokens = 2000

# establish a vector length of 500
sequence_length = 500

vectorize_layer = TextVectorization(
    max_tokens=max_tokens, # only consider this many words
    output_mode='int',
    output_sequence_length=sequence_length) 

In [4]:
# adapt the vectorization to work on our song lyrics
vectorize_layer.adapt(df['lyrics'].to_numpy())

# Create Training and Testing Data

In [5]:
X = vectorize_layer(df['lyrics']).numpy().astype("int32") # predictor will be a vectorized form of song lyrics
y = df[["energy", "valence", "tempo", "liveness"]].to_numpy().astype("float32") # target will be spotify metrics listed

In [6]:
# split the data into training and testing sets
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.15, random_state = 42)

# Create Model

In [7]:
lyrics_features = None
# set input as the vectorized form of lyrics, each of which is a vector len = 500
lyrics_input = keras.Input(
    shape = (500,), 
    name = "lyrics",
    dtype = "int32"
)
lyrics_features = layers.Embedding(max_tokens, 60, name = "embedding")(lyrics_input) # add embedding layer
lyrics_features = layers.Dropout(0.2)(lyrics_features)
lyrics_features = layers.Conv1D(64, 5, activation='relu')(lyrics_features)
lyrics_features = layers.MaxPooling1D(pool_size=4)(lyrics_features)
lyrics_features = layers.LSTM(100)(lyrics_features)
lyrics_features = layers.Dropout(0.2)(lyrics_features)
lyrics_features = layers.Dense(64, activation='relu')(lyrics_features)
lyrics_features = layers.Dense(32, activation='relu')(lyrics_features)
output1 = layers.Dense(4, name = "metrics")(lyrics_features) # create output layer

In [8]:
model = keras.Model(inputs=lyrics_input, outputs=[output1]) # create model using layers above

In [9]:
keras.utils.plot_model(model, "multi_input_and_output_model.png", show_shapes=True)

('You must install pydot (`pip install pydot`) and install graphviz (see instructions at https://graphviz.gitlab.io/download/) ', 'for plot_model/model_to_dot to work.')


In [10]:
model.summary()

Model: "model"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
lyrics (InputLayer)          [(None, 500)]             0         
_________________________________________________________________
embedding (Embedding)        (None, 500, 60)           120000    
_________________________________________________________________
dropout (Dropout)            (None, 500, 60)           0         
_________________________________________________________________
conv1d (Conv1D)              (None, 496, 64)           19264     
_________________________________________________________________
max_pooling1d (MaxPooling1D) (None, 124, 64)           0         
_________________________________________________________________
lstm (LSTM)                  (None, 100)               66000     
_________________________________________________________________
dropout_1 (Dropout)          (None, 100)               0     

In [12]:
model.compile(loss='mae',
              optimizer='adam', 
              metrics=['RootMeanSquaredError', 'msle'])

In [13]:
history = model.fit(X_train, y_train, epochs = 100, validation_data = (X_test, y_test))

 val_loss: 6.1332 - val_root_mean_squared_error: 14.2607 - val_msle: 0.0273
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch

In [14]:
model.evaluate(X_test, y_test)



[5.951345920562744, 15.4306001663208, 0.037071701139211655]

# Get weights of movie scripts

In [15]:
def stringProcessing(s):
    s = re.sub(r"\'", "", s)
    s = re.sub(r'\n', ' ', s)
    s = re.sub(r'\t', '', s)
    s = re.sub(r"\[[^[]*\]", '', s)
    s = re.sub(r'[^\w\s]', ' ', s)
    s = re.sub(r' +', ' ', s)
    s = s.strip()
    s = s.lower()
    return s
def vectorize_movie_scripts(text):
    text = tf.expand_dims(text, -1)
    return vectorize_layer(text)

In [17]:
import os
# model = create_model(

# df = pd.read_csv("https://raw.githubusercontent.com/benbrill/MoodSpace/main/data/trainingSongs_clean.csv")

# data = tf.data.Dataset.from_tensor_slices((df["lyrics"]))
# data_vec = data.map(vectorize_headline)
# df = pd.DataFrame(model.predict(data_vec))
# df.shape
d = {}
for script_path in os.listdir("../scripts"):
    with open(f"../scripts/{script_path}") as f:
        contents = f.read()

        contents = stringProcessing(contents)

        df = pd.DataFrame({"lyrics": [contents]})
        
        vectorize_layer.adapt(df["lyrics"].to_numpy())
        X = vectorize_layer(df["lyrics"])
        pred = model.predict(X)
        # pred = np.concatenate((pred[0], pred[1], pred[2], pred[3]), axis = 1)
        d[script_path] = pred
        print(script_path, pred)

bourne.txt [[  0.71360993   0.320662   114.09092      0.22432168]]
deadpoets.txt [[  0.78789055   0.3145327  144.72891      0.2883013 ]]
fellowship.txt [[ 0.6388979   0.27172333 79.601       0.21370538]]
forrest.txt [[  0.72052824   0.33857027 108.75427      0.21457465]]
goodwillhunting.txt [[  0.77706635   0.3820778  117.168495     0.2549017 ]]
incredibles.txt [[  0.71231747   0.3399482  103.409515     0.19998653]]
jedi.txt [[  0.795776    0.3377211 131.24046     0.2809444]]
khan.txt [[  0.74088526   0.27247956 131.70927      0.2648602 ]]
shawshank.txt [[  0.7348423    0.23269916 140.84998      0.27799922]]
titanic.txt [[ 0.6943335   0.33119333 97.17831     0.19266303]]


In [None]:
model.save_weights("my_checkpoint_30") # save weights to avoid retraining the model in the backend