In [None]:
import pandas as pd
import tensorflow as tf

from tensorflow.keras import layers
from tensorflow.keras import losses

from tensorflow.keras.layers.experimental.preprocessing import TextVectorization
from tensorflow.keras.layers.experimental.preprocessing import StringLookup

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

In [2]:
# Vocabulary size cap: passed as max_tokens to the TextVectorization layer
# below and used as the default Embedding input size in create_model.
MAX_TOKENS = 2000

# Length of each vectorized text: passed as output_sequence_length to the
# TextVectorization layer below. (An earlier comment said 25; the value in
# use is 500.)
SEQUENCE_LENGTH = 500

def create_model(max_tokens=None, checkpoint_path='../checkpoints/my_checkpoint'):
    """Build the embedding classifier and (optionally) restore saved weights.

    Layer stack: Embedding (3-dim) -> Dropout(0.2) -> GlobalAveragePooling1D
    -> Dropout(0.2) -> Dense(8). The Dense layer has no activation, so the
    model emits raw logits; this matches the ``from_logits=True`` loss passed
    to ``compile``.

    Args:
        max_tokens: Embedding input size. When omitted (or falsy) the
            module-level ``MAX_TOKENS`` is used instead.
        checkpoint_path: Location handed to ``model.load_weights``. Defaults
            to the path this notebook has always loaded from. Pass ``None``
            to skip loading and get the freshly initialised model.

    Returns:
        The compiled ``tf.keras.Sequential`` model.
    """
    vocab_size = max_tokens or MAX_TOKENS

    model = tf.keras.Sequential([
        layers.Embedding(vocab_size, output_dim=3, name="embedding"),
        layers.Dropout(0.2),
        layers.GlobalAveragePooling1D(),
        layers.Dropout(0.2),
        layers.Dense(8),
    ])
    model.compile(loss=losses.SparseCategoricalCrossentropy(from_logits=True),
                  optimizer='adam',
                  metrics=['accuracy'])

    # Loading is optional so the same architecture can be built without a
    # checkpoint on disk (e.g. for training or for inspecting the layers).
    if checkpoint_path is not None:
        model.load_weights(checkpoint_path)

    return model

# Text -> integer token ids.
# NOTE(review): no `adapt(...)` call or `vocabulary=` argument appears in the
# cells shown here, so nothing visible populates this layer's vocabulary.
# Confirm it is set elsewhere before trusting the predictions below.
vectorize_layer = TextVectorization(
    output_sequence_length=SEQUENCE_LENGTH,  # fixed length of every output row
    output_mode='int',                       # emit integer token indices
    max_tokens=MAX_TOKENS,                   # only consider this many words
)

def vectorize_headline(text):
    """Run ``text`` through the module-level ``vectorize_layer``.

    A trailing axis is appended to ``text`` with ``tf.expand_dims`` before it
    is handed to the layer; the layer's output is returned unchanged.
    """
    with_trailing_axis = tf.expand_dims(text, axis=-1)
    token_ids = vectorize_layer(with_trailing_axis)
    return token_ids

[ 0.7444502   0.01933315 -0.02601114  0.4542644  -0.05159525  0.38804668
 -0.9677498  -5.6487317 ]


In [22]:
import re
def stringProcessing(s):
    """Normalise raw text into lowercase, space-separated word tokens.

    Steps, in order:
      1. Drop apostrophes so contractions stay one token ("don't" -> "dont").
      2. Remove square-bracketed tags such as "[Chorus]".
      3. Replace every remaining non-word, non-whitespace character by a space.
      4. Collapse any run of whitespace (spaces, tabs, newlines) to one space.
      5. Strip the ends and lowercase.

    Args:
        s: The text to clean.

    Returns:
        The cleaned string; empty if ``s`` holds no word characters.
    """
    s = s.replace("'", "")
    # Exclude both "[" and "]" inside the tag so the match stops at the first
    # closing bracket. Excluding only "[" let a later stray "]" extend the
    # match and swallow real text in between.
    s = re.sub(r"\[[^\[\]]*\]", "", s)
    s = re.sub(r"[^\w\s]", " ", s)
    # Tabs are word separators too: turn them into a space along with
    # newlines instead of deleting them, which fused neighbouring words.
    s = re.sub(r"\s+", " ", s)
    return s.strip().lower()

In [24]:
import os
model = create_model()

# df = pd.read_csv("https://raw.githubusercontent.com/benbrill/MoodSpace/main/data/trainingSongs_clean.csv")

# data = tf.data.Dataset.from_tensor_slices((df["lyrics"]))
# data_vec = data.map(vectorize_headline)
# df = pd.DataFrame(model.predict(data_vec))
# df.shape

# Score every script file: clean the text, vectorize it, and print the
# model's raw output row next to the file name.
scripts_dir = "../scripts"

# os.listdir returns entries in arbitrary order; sorting keeps the printed
# results in the same order on every run and every machine.
for script_path in sorted(os.listdir(scripts_dir)):
    full_path = os.path.join(scripts_dir, script_path)

    # Skip sub-directories (e.g. a .ipynb_checkpoints folder), which would
    # otherwise make open() raise.
    if not os.path.isfile(full_path):
        continue

    # Read and close the file before the (slower) inference step.
    # NOTE(review): the file is decoded with the platform default encoding;
    # confirm the scripts' encoding if results differ between machines.
    with open(full_path) as f:
        contents = f.read()

    contents = stringProcessing(contents)

    # One-row frame; the "lyrics" column name is kept from the original cell.
    df = pd.DataFrame({"lyrics": [contents]})

    data = tf.data.Dataset.from_tensor_slices((df["lyrics"]))
    data_vec = data.map(vectorize_headline)

    print(script_path, model.predict(data_vec)[0])

bourne.txt [ 0.74069095  0.02461722 -0.02864002  0.447855   -0.05092469  0.38677678
 -0.9669975  -5.6361594 ]
deadpoets.txt [ 0.1899558   0.7987493  -0.4137746  -0.49113056  0.04731353  0.20073612
 -0.85679483 -3.7943552 ]
fellowship.txt [ 0.1899558   0.7987493  -0.4137746  -0.49113056  0.04731353  0.20073612
 -0.85679483 -3.7943552 ]
forrest.txt [ 0.69182     0.0933111  -0.06281567  0.3645321  -0.04220729  0.37026793
 -0.95721817 -5.47272   ]
goodwillhunting.txt [ 0.1899558   0.7987493  -0.4137746  -0.49113056  0.04731353  0.20073612
 -0.85679483 -3.7943552 ]
incredibles.txt [ 0.85722864 -0.1391904   0.05285537  0.64654726 -0.07171226  0.426144
 -0.9903174  -6.0259    ]
jedi.txt [ 0.1899558   0.7987493  -0.4137746  -0.49113056  0.04731353  0.20073612
 -0.85679483 -3.7943552 ]
khan.txt [ 0.5564855   0.28354168 -0.15745637  0.13379121 -0.01806678  0.3245513
 -0.93013746 -5.020124  ]
shawshank.txt [ 0.23318756  0.73798144 -0.38354227 -0.41742185  0.039602    0.21533994
 -0.8654456  -3.93

In [10]:
# NOTE(review): the recorded output (264, 8) does not match the one-row
# "lyrics" frame that the script loop assigns to `df`. It looks like it was
# produced while the commented-out `pd.DataFrame(model.predict(data_vec))`
# version was active. Confirm, and re-run from a fresh kernel.
df.drop_duplicates().shape

(264, 8)