In [2]:
import pandas as pd
import tensorflow as tf

from tensorflow.keras import layers
from tensorflow.keras import losses

from tensorflow.keras.layers.experimental.preprocessing import TextVectorization
from tensorflow.keras.layers.experimental.preprocessing import StringLookup

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

In [3]:
# Vocabulary cap: passed to TextVectorization(max_tokens=...) below and used as
# the default Embedding input size in create_model().
MAX_TOKENS = 2000
# Passed to TextVectorization as output_sequence_length, i.e. the number of
# token ids produced for every input text.
SEQUENCE_LENGTH = 500

def create_model(max_tokens=None, checkpoint_path='../checkpoints/my_checkpoint_20'):
    """Build the text classifier and restore weights from a checkpoint.

    The network is Embedding -> Dropout -> GlobalAveragePooling1D -> Dropout ->
    Dense(8). The final Dense layer has no activation, so ``predict`` returns
    8 raw logits per input, consistent with the ``from_logits=True`` loss the
    model is compiled with.

    Args:
        max_tokens: Input vocabulary size of the embedding. When ``None`` (or
            any other falsy value) the module-level ``MAX_TOKENS`` is used.
        checkpoint_path: Location of the saved weights handed to
            ``model.load_weights``. Defaults to the checkpoint this notebook
            was originally written against, so existing ``create_model()``
            calls behave as before.

    Returns:
        The compiled ``tf.keras.Sequential`` model after ``load_weights`` has
        been called on it.
    """
    vocab_size = max_tokens or MAX_TOKENS

    model = tf.keras.Sequential([
        layers.Embedding(vocab_size, output_dim=30, name="embedding"),
        layers.Dropout(0.2),
        layers.GlobalAveragePooling1D(),
        layers.Dropout(0.2),
        layers.Dense(8),
    ])
    model.compile(
        loss=losses.SparseCategoricalCrossentropy(from_logits=True),
        optimizer='adam',
        metrics=['accuracy'],
    )

    # NOTE(review): the layer sizes above (vocab_size, 30, 8) presumably have to
    # match the architecture the checkpoint was saved from — confirm before
    # passing a different max_tokens or checkpoint_path.
    model.load_weights(checkpoint_path)

    return model

# Shared text-to-integer layer used by vectorize_movie_scripts(): it emits
# SEQUENCE_LENGTH token ids per input string, drawn from a vocabulary of at
# most MAX_TOKENS entries. No vocabulary is supplied here, so .adapt() must be
# called on the layer before it is used.
vectorize_layer = TextVectorization(
    max_tokens=MAX_TOKENS, # only consider this many words
    output_mode='int',
    output_sequence_length=SEQUENCE_LENGTH) 

def vectorize_movie_scripts(text):
    """Convert a text tensor into integer token ids via ``vectorize_layer``.

    A trailing axis is appended to ``text`` before it is handed to the
    module-level ``vectorize_layer``; the layer's output is returned unchanged.
    Intended as the function passed to ``Dataset.map``.
    """
    with_trailing_axis = tf.expand_dims(text, axis=-1)
    token_ids = vectorize_layer(with_trailing_axis)
    return token_ids

In [4]:
import re
def stringProcessing(s):
    """Normalise raw text into lowercase words separated by single spaces.

    The substitutions run in a fixed order, each on the result of the previous
    one:

    1. apostrophes are deleted (so "don't" becomes "dont");
    2. newlines become spaces;
    3. tabs are deleted outright (no space is inserted in their place);
    4. square-bracketed spans such as "[Chorus]" are deleted;
    5. every remaining character that is neither a word character nor
       whitespace becomes a space;
    6. runs of spaces collapse to a single space.

    The result is then stripped of leading/trailing whitespace and lowercased.

    Args:
        s: The text to clean.

    Returns:
        The cleaned string.
    """
    # (pattern, replacement) pairs; order matters because later rules rely on
    # earlier ones (e.g. newlines are already spaces when brackets are removed).
    substitutions = (
        (r"\'", ""),
        (r'\n', ' '),
        (r'\t', ''),
        (r"\[[^[]*\]", ''),
        (r'[^\w\s]', ' '),
        (r' +', ' '),
    )

    cleaned = s
    for pattern, replacement in substitutions:
        cleaned = re.sub(pattern, replacement, cleaned)

    return cleaned.strip().lower()

In [10]:
import os
model = create_model()

# Score every file in ../scripts with the restored model and collect the raw
# 8-value prediction per script in `d`, keyed by file name.
#
# NOTE(review): vectorize_layer.adapt() is called separately for each script,
# so the word -> id vocabulary is rebuilt from that one script every time. The
# embedding weights come from a checkpoint, which presumably was trained
# against one fixed vocabulary (possibly built from
# https://raw.githubusercontent.com/benbrill/MoodSpace/main/data/trainingSongs_clean.csv,
# column "lyrics"). If so, the ids fed to the model here do not line up with
# the ids it was trained on and the predictions are not comparable — confirm,
# and if needed adapt once on the training text before this loop instead.
d = {}
# os.listdir() returns entries in arbitrary order; sort for reproducible output.
for script_path in sorted(os.listdir("../scripts")):
    # Only the read needs the file handle; release it before the TF work.
    with open(f"../scripts/{script_path}") as f:
        contents = f.read()

    contents = stringProcessing(contents)

    # Kept as a one-row frame named `df` because later cells reference `df`.
    df = pd.DataFrame({"lyrics": [contents]})

    data = tf.data.Dataset.from_tensor_slices((df["lyrics"]))
    vectorize_layer.adapt(data)
    data_vec = data.map(vectorize_movie_scripts)

    # Run inference once and reuse the result for both storing and printing.
    logits = model.predict(data_vec)[0]
    d[script_path] = logits
    print(script_path, logits)

bourne.txt [ 0.1857727  -0.70903105  0.42797962 -0.03703333 -0.23253712  0.44204602
 -1.3601608  -4.934482  ]
deadpoets.txt [-1.7333281   0.5443739   1.4926052  -1.312813   -1.9460292   0.8860587
 -0.69746614 -4.9762983 ]
fellowship.txt [-1.6093009   0.91127765 -0.7827016   0.51166475  0.3851266  -0.86100215
 -1.2381443  -5.4872737 ]
forrest.txt [ 2.7013892e-01 -7.4422103e-01  3.7579489e-01 -2.4289940e-01
 -1.4102459e-03  5.6090569e-01 -1.5939732e+00 -4.9245977e+00]
goodwillhunting.txt [-0.46705815  2.3599677  -0.41788223 -2.4510686  -2.2347584   0.890132
 -1.8238227  -5.6453047 ]
incredibles.txt [ 0.15989065 -1.9651154   0.7284904   0.97542316  0.11404774  0.60787
 -1.0682911  -4.7816663 ]
jedi.txt [ 0.5676284   1.2058978  -0.2627478  -1.1515511  -1.1492127  -0.53111666
 -2.7649083  -5.876196  ]
khan.txt [ 0.745964   -0.13054618  0.11489633 -1.0392745  -0.4346859   0.51892954
 -2.2972257  -5.37253   ]
shawshank.txt [-0.80076295  1.2794873   0.34279975 -1.1230642  -1.2350041   0.118909

In [10]:
# NOTE(review): the script loop rebinds `df` to a one-row frame with a single
# "lyrics" column, so the recorded (264, 8) result cannot come from the current
# cell order. It presumably dates from the commented-out
# `df = pd.DataFrame(model.predict(data_vec))` run — confirm, then re-run or
# drop this cell.
df.drop_duplicates().shape

(264, 8)

In [9]:
# Display the script-name -> prediction mapping filled in by the script loop.
d

{'bourne.txt': array([ 0.1857727 , -0.70903105,  0.42797962, -0.03703333, -0.23253712,
         0.44204602, -1.3601608 , -4.934482  ], dtype=float32),
 'deadpoets.txt': array([-1.7333281 ,  0.5443739 ,  1.4926052 , -1.312813  , -1.9460292 ,
         0.8860587 , -0.69746614, -4.9762983 ], dtype=float32),
 'fellowship.txt': array([-1.6093009 ,  0.91127765, -0.7827016 ,  0.51166475,  0.3851266 ,
        -0.86100215, -1.2381443 , -5.4872737 ], dtype=float32),
 'forrest.txt': array([ 2.7013892e-01, -7.4422103e-01,  3.7579489e-01, -2.4289940e-01,
        -1.4102459e-03,  5.6090569e-01, -1.5939732e+00, -4.9245977e+00],
       dtype=float32),
 'goodwillhunting.txt': array([-0.46705815,  2.3599677 , -0.41788223, -2.4510686 , -2.2347584 ,
         0.890132  , -1.8238227 , -5.6453047 ], dtype=float32),
 'incredibles.txt': array([ 0.15989065, -1.9651154 ,  0.7284904 ,  0.97542316,  0.11404774,
         0.60787   , -1.0682911 , -4.7816663 ], dtype=float32),
 'jedi.txt': array([ 0.5676284 ,  1.20589