In [1]:
import numpy as np
import pandas as pd
import tensorflow as tf
import re
import string

from tensorflow.keras import layers
from tensorflow.keras import losses

# requires update to tensorflow 2.4
# >>> conda activate PIC16B
# >>> pip install tensorflow==2.4
from tensorflow.keras.layers.experimental.preprocessing import TextVectorization
from tensorflow.keras.layers.experimental.preprocessing import StringLookup

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

# Import Data

In [2]:
df = pd.read_csv("https://raw.githubusercontent.com/benbrill/MoodSpace/main/data/trainingSongs_clean.csv")
df.head()

Unnamed: 0.1,Unnamed: 0,trackName,artist,id,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,type,lyrics,cluster,language
0,3,Overthinker,INZO,4K9xid96G3YmIvQZXN9SXg,0.472,0.605,8.0,-4.437,1.0,0.134,0.0311,0.0308,0.101,0.212,128.375,audio_features,a person who thinks all the time has nothing t...,0,en
1,4,Lifestyles of the Rich & Famous,Good Charlotte,2g2a5kDeZexbUTD8abcvm6,0.62,0.93,1.0,-3.685,1.0,0.0374,0.00043,0.0,0.0686,0.609,106.22,audio_features,always see it on t v or read in the magazines ...,3,en
2,6,Carrying Your Love With Me,George Strait,7puxIVNdj5nsBJk43zM3bH,0.629,0.479,10.0,-10.608,1.0,0.0271,0.22,0.0,0.0587,0.345,138.231,audio_features,baby all i got is this beat up leather bag and...,5,en
3,7,"Check Yes, Juliet",We The Kings,0wVluBsVAVzBKrqspuCcwR,0.352,0.912,7.0,-4.253,1.0,0.0725,0.00197,0.0,0.193,0.351,166.795,audio_features,check yes juliet are you with me rain is falli...,2,en
4,8,At My Worst (feat. Kehlani),Pink Sweat$,58w68w4s8h9gw3xrDaXyuj,0.731,0.484,0.0,-5.579,1.0,0.0354,0.73,3e-06,0.326,0.35,92.043,audio_features,can i call you baby can you be my friend can y...,1,en


# Create Text Vectorization

In [3]:
max_tokens = 200
sequence_length = 500

vectorize_layer = TextVectorization(
    max_tokens=max_tokens, # only consider this many words
    output_mode='int',
    output_sequence_length=sequence_length) 

In [4]:
data = tf.data.Dataset.from_tensor_slices((df["lyrics"], df["cluster"]))

In [5]:
data = data.shuffle(buffer_size = len(data))

train_size = int(0.7*len(data))
val_size   = int(0.1*len(data))

train = data.take(train_size)
val   = data.skip(train_size).take(val_size)
test  = data.skip(train_size + val_size)
len(train), len(val), len(test)

(501, 71, 145)

In [6]:
def vectorize_movie_script(text, label):
    text = tf.expand_dims(text, -1)
    return vectorize_layer(text), [label]

train_vec = train.map(vectorize_movie_script)
val_vec   = val.map(vectorize_movie_script)
test_vec  = test.map(vectorize_movie_script)

# Create Model

In [7]:
model = tf.keras.Sequential([
  layers.Embedding(max_tokens, output_dim = 10, name="embedding"),
  layers.Dropout(0.2),
  layers.GlobalAveragePooling1D(),
  layers.Dropout(0.2),
  layers.Dense(8)]
)

In [8]:
model.compile(loss=losses.SparseCategoricalCrossentropy(from_logits=True),
              optimizer='adam', 
              metrics=['accuracy'])

In [9]:
history = model.fit(train_vec, epochs = 100, validation_data = val_vec)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78

In [10]:
model.predict(test_vec)[1]

array([ 0.4438982 ,  0.652145  , -0.19849624, -0.1767536 ,  0.10216334,
        0.35770565, -0.8556144 , -4.3495827 ], dtype=float32)

In [11]:
model.evaluate(test_vec)



[1.8512661457061768, 0.24827586114406586]

In [12]:
model.save_weights('./checkpoint/my_checkpoint_10')