In [3]:
import tensorflow as tf
import pandas as pd
import numpy as np

from tensorflow.keras.layers import TextVectorization, Embedding
from tensorflow.keras import Sequential

In [4]:
batch_size = 1024
seed = 39
sequence_length = 25
vocab_size = 10000
embedding_dim = 128

train_ds = tf.keras.preprocessing.text_dataset_from_directory(
    "dataset", batch_size = batch_size, validation_split = 0.2, 
    subset = 'training', seed = seed
)

val_ds = tf.keras.preprocessing.text_dataset_from_directory(
    "dataset", batch_size = batch_size, validation_split = 0.2, 
    subset = 'validation', seed = seed
)

AUTOTUNE = tf.data.AUTOTUNE

train_ds = train_ds.cache().prefetch(buffer_size =  AUTOTUNE)
val_ds = val_ds.cache().prefetch(buffer_size =  AUTOTUNE)



vectorize_layer = TextVectorization(
    max_tokens = vocab_size,
    output_mode = 'int', 
    output_sequence_length = sequence_length)

text_ds = train_ds.map(lambda x, y: x)
vectorize_layer.adapt(text_ds)

Found 230710 files belonging to 2 classes.
Using 184568 files for training.
Found 230710 files belonging to 2 classes.
Using 46142 files for validation.


In [5]:




class EmbeddingInitializer(tf.keras.initializers.Initializer):
    def __call__(self, shape=None, dtype=None, **kwargs):
        weights = tf.convert_to_tensor(np.load('w2vVectors.npy'))
        return weights

embedding_layer = Embedding(vocab_size,
                            embedding_dim,
                            embeddings_initializer = EmbeddingInitializer(),
                            mask_zero= True)




model = Sequential([
    vectorize_layer,
    embedding_layer,
    tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(embedding_dim)),
    tf.keras.layers.Dense(embedding_dim, activation='relu'),
    tf.keras.layers.Dense(1)
])

model.compile(loss=tf.keras.losses.BinaryCrossentropy(from_logits=True),
              optimizer=tf.keras.optimizers.Adam(1e-4),
              metrics=['accuracy'])

tensorboard_callback = tf.keras.callbacks.TensorBoard(log_dir="classifier_logs")

model.fit(train_ds, epochs=25,
                    validation_data=val_ds,
                    validation_steps=30, 
                    callbacks = tensorboard_callback)

ValueError: In this `tf.Variable` creation, the initial value's shape ((4096, 128)) is not compatible with the explicitly supplied `shape` argument ((10000, 128)).

In [5]:

!mkdir checkpoints/one
model.save_weights("./checkpoints/one/mycheckpoint")
with open("checkpoints/one/METADATA.txt", 'w') as file:
    file.write("Batch size = " + batch_size)
    file.write("Seed = " + seed)
    file.write("Sequence length = " + sequence_length)
    file.write("Vocab size = " + vocab_size)
    file.write("Embedding dimensions = " + embedding_dim)
    



INFO:tensorflow:Assets written to: saved_model/my_model\assets


INFO:tensorflow:Assets written to: saved_model/my_model\assets


In [None]:
sentence = ''
guess = model.predict([sentence])[0][0]
if guess > 0.5:
    label = 'liberal'
else:
    label = 'conservative'

print('The sentence ' + sentence + ' is ' + label)