In [1]:
import pandas as pd
import tensorflow as tf

from tensorflow.keras import layers
from tensorflow.keras import losses

from tensorflow.keras.layers.experimental.preprocessing import TextVectorization
from tensorflow.keras.layers.experimental.preprocessing import StringLookup

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

In [11]:
# Vocabulary cap: passed to TextVectorization as max_tokens and used as the
# default input size of the model's embedding layer.
MAX_TOKENS = 2_000
# Fixed number of token ids the vectorization layer emits per sample.
SEQUENCE_LENGTH = 500

def create_model(max_tokens=None, checkpoint_path='../checkpoints/my_checkpoint'):
    """Build the 8-output text classifier and optionally restore saved weights.

    The network is: token embedding (3 dimensions) -> dropout -> global
    average pooling -> dropout -> dense layer with 8 units. The dense layer
    has no activation, so the model outputs raw logits, which is why the loss
    is configured with ``from_logits=True``.

    Args:
        max_tokens: Input vocabulary size of the embedding layer. Any falsy
            value (``None``, ``0``) falls back to ``MAX_TOKENS``.
        checkpoint_path: Path handed to ``model.load_weights``. Defaults to
            the location this notebook has always used. Pass ``None`` to get
            the compiled model without restoring any weights.

    Returns:
        The compiled ``tf.keras.Sequential`` model.
    """
    vocab_size = max_tokens or MAX_TOKENS

    model = tf.keras.Sequential([
        layers.Embedding(vocab_size, output_dim=3, name="embedding"),
        layers.Dropout(0.2),
        layers.GlobalAveragePooling1D(),
        layers.Dropout(0.2),
        layers.Dense(8),
    ])
    model.compile(
        loss=losses.SparseCategoricalCrossentropy(from_logits=True),
        optimizer='adam',
        metrics=['accuracy'],
    )

    # Restoring is optional so the architecture can be built (e.g. for
    # inspection or retraining) on machines that lack the checkpoint files.
    if checkpoint_path is not None:
        model.load_weights(checkpoint_path)

    return model

# Shared layer that turns raw strings into fixed-length integer sequences.
vectorize_layer = TextVectorization(
    output_mode='int',                       # emit integer token ids
    max_tokens=MAX_TOKENS,                   # only consider this many words
    output_sequence_length=SEQUENCE_LENGTH,  # every sample gets this many ids
)

def vectorize_movie_scripts(text):
    """Run ``text`` through the shared ``vectorize_layer``.

    A trailing axis is appended to ``text`` first, and the layer's output for
    that expanded tensor is returned.
    """
    expanded = tf.expand_dims(text, axis=-1)
    return vectorize_layer(expanded)

In [3]:
import re
def stringProcessing(s):
    """Normalise raw text into lowercase, single-space-separated words.

    The substitutions run in a fixed order:
      1. apostrophes are deleted,
      2. newlines become spaces,
      3. tabs are deleted (text on either side is joined),
      4. square-bracketed spans are removed,
      5. any character that is neither a word character nor whitespace
         becomes a space,
      6. runs of spaces collapse to a single space.
    The result is then stripped of surrounding whitespace and lower-cased.
    """
    # Order matters: e.g. brackets must be removed before the punctuation
    # pass turns the bracket characters themselves into spaces.
    substitutions = (
        (r"\'", ""),
        (r'\n', ' '),
        (r'\t', ''),
        (r"\[[^[]*\]", ''),
        (r'[^\w\s]', ' '),
        (r' +', ' '),
    )
    cleaned = s
    for pattern, replacement in substitutions:
        cleaned = re.sub(pattern, replacement, cleaned)
    return cleaned.strip().lower()

In [12]:
import os
model = create_model()

# Disabled earlier experiment: batch prediction over the training-song CSV.
# df = pd.read_csv("https://raw.githubusercontent.com/benbrill/MoodSpace/main/data/trainingSongs_clean.csv")

# data = tf.data.Dataset.from_tensor_slices((df["lyrics"]))
# data_vec = data.map(vectorize_movie_scripts)
# df = pd.DataFrame(model.predict(data_vec))
# df.shape

SCRIPTS_DIR = "../scripts"

# Score every script file; sorted() makes the report order independent of
# the order in which the filesystem happens to list the directory.
for script_path in sorted(os.listdir(SCRIPTS_DIR)):
    # Read the whole file, then release the handle before the heavy work.
    with open(os.path.join(SCRIPTS_DIR, script_path)) as f:
        contents = f.read()

    contents = stringProcessing(contents)

    # One-row frame; the column keeps the "lyrics" name used by the
    # disabled experiment above.
    df = pd.DataFrame({"lyrics": [contents]})
    data = tf.data.Dataset.from_tensor_slices((df["lyrics"]))

    # NOTE(review): adapt() rebuilds the vocabulary from this single script,
    # so token ids may not line up with the vocabulary the checkpointed
    # embedding was trained on - confirm whether the training vocabulary
    # should be restored once, outside the loop, instead.
    vectorize_layer.adapt(data)

    # Fix: this previously called `vectorize_headline`, which is not defined
    # anywhere in the notebook and raises NameError on a fresh kernel.
    data_vec = data.map(vectorize_movie_scripts)

    # predict() returns one row per sample; there is exactly one sample.
    print(script_path, model.predict(data_vec)[0])

bourne.txt [ 0.8105246  -0.14522228  0.04622167  0.6015572  -0.06470785  0.4042588
 -0.9558731  -5.616686  ]
deadpoets.txt [ 0.35768542  0.39150706 -0.2343423  -0.12185942  0.01421665  0.24212293
 -0.83104455 -3.7448676 ]
fellowship.txt [ 0.35895628  0.38992435 -0.23346119 -0.12002181  0.01399616  0.24292083
 -0.8309826  -3.7525668 ]
forrest.txt [ 0.7703525  -0.09797597  0.02146828  0.5375512  -0.05771305  0.38985664
 -0.9446568  -5.449433  ]
goodwillhunting.txt [ 0.35877627  0.39228076 -0.23444021 -0.12103412  0.01406391  0.24256587
 -0.8322054  -3.7557096 ]
incredibles.txt [ 0.90325046 -0.25450322  0.10338986  0.7495802  -0.08085938  0.43721765
 -0.9819761  -5.9998817 ]
jedi.txt [ 0.35890222  0.39265424 -0.2345291  -0.12116523  0.01405248  0.24277678
 -0.83228934 -3.7590182 ]
khan.txt [ 0.6793339   0.01042211 -0.03513038  0.391962   -0.04184068  0.35721406
 -0.9198564  -5.074271  ]
shawshank.txt [ 4.1861412e-01  3.2025957e-01 -1.9694099e-01 -2.5005497e-02
  3.6155516e-03  2.6403958e-

In [10]:
# NOTE(review): this cell's execution count (10) predates the cells numbered
# 11 and 12 earlier in the notebook, so the displayed shape was computed from
# an older `df` - presumably the frame of model predictions built by the
# commented-out code (8 columns would match the model's 8 outputs); confirm.
# On a top-to-bottom run, `df` is instead the one-row frame left behind by
# the last iteration of the script loop.
df.drop_duplicates().shape

(264, 8)