In [14]:
import tensorflow as tf
from tensorflow import keras
from sklearn.model_selection import train_test_split
import numpy as np
import pandas as pd
import datetime

In [15]:
df = pd.read_csv("datasets/train.csv", encoding = "ISO-8859-1", names=['polarity', 'id', 'query', 'user', 'text'], index_col=2)
df = df.sample(frac=1)[:30_000] # shuffle and truncate
df['polarity'] = df['polarity'].apply(lambda x: 1 if x == 4 else 0)

tdf, vdf = train_test_split(df, test_size=0.2)
train_data = tdf['text'].to_numpy()
train_label = tdf['polarity'].to_numpy()

val_data = vdf['text'].to_numpy()
val_label = vdf['polarity'].to_numpy()

In [16]:
VOCAB_SIZE = 200_000  # full vocab for 1.6M dataset contains 850061 tokens
encoder = tf.keras.layers.TextVectorization(max_tokens=VOCAB_SIZE)
encoder.adapt(train_data)

In [17]:
vocab = np.array(encoder.get_vocabulary())
word_index = dict(zip(vocab, range(len(vocab))))

print(len(vocab))
vocab[:40]

37951


array(['', '[UNK]', 'i', 'to', 'the', 'a', 'my', 'and', 'you', 'is', 'it',
       'for', 'in', 'of', 'im', 'on', 'me', 'so', 'have', 'that', 'but',
       'just', 'with', 'be', 'not', 'its', 'at', 'was', 'good', 'this',
       'up', 'now', 'all', 'day', 'get', 'are', 'out', 'no', 'like', 'go'],
      dtype='<U147')

In [18]:
encoder("Hello, how's the weather today?")

<tf.Tensor: shape=(5,), dtype=int64, numpy=array([471, 631,   4, 243,  49])>

In [19]:
path_to_glove_file = "glove/glove.txt"

embeddings_index = {}
with open(path_to_glove_file) as f:
    for line in f:
        word, coefs = line.split(maxsplit=1)
        coefs = np.fromstring(coefs, "f", sep=" ")
        embeddings_index[word] = coefs

print("Found %s word vectors." % len(embeddings_index))

Found 1193514 word vectors.


In [20]:
num_tokens = len(vocab)
embedding_dim = 200
hits = 0
misses = 0

# Prepare embedding matrix
embedding_matrix = np.zeros((num_tokens, embedding_dim))
for word, i in word_index.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        # Words not found in embedding index will be all-zeros.
        # This includes the representation for "padding" and "OOV"
        embedding_matrix[i] = embedding_vector
        hits += 1
    else:
        misses += 1
print("Converted %d words (%d misses)" % (hits, misses))


embedding_layer = tf.keras.layers.Embedding(
    num_tokens,
    embedding_dim,
    embeddings_initializer=keras.initializers.Constant(embedding_matrix),
    trainable=False,
    mask_zero=True
)

Converted 19138 words (18813 misses)


In [21]:
#embedding_layer(encoder('I am'))

In [22]:
model = tf.keras.Sequential([
    encoder,
    embedding_layer,
    tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(64)),
    tf.keras.layers.Dense(64, activation='relu'),
    tf.keras.layers.Dense(1)
])

2022-12-20 11:09:18.312071: W tensorflow/core/framework/cpu_allocator_impl.cc:82] Allocation of 30360800 exceeds 10% of free system memory.


In [23]:
print([layer.supports_masking for layer in model.layers])

[False, True, True, True, True]


In [24]:
sample_text = ('the gig last night turned out great')
predictions = model.predict(np.array([sample_text]))
print(predictions[0])

padding = "the " * 2000
predictions = model.predict(np.array([sample_text, padding]))
print(predictions[0])

[-0.3992288]
[-0.39922875]


In [30]:
model.compile(loss=tf.keras.losses.BinaryCrossentropy(from_logits=True),
              optimizer=tf.keras.optimizers.Adam(1e-4),
              metrics=['accuracy'])

log_dir = "logs/fit/" + datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
checkpoint_filepath = 'tmp/checkpoint'

tensorboard_callback = tf.keras.callbacks.TensorBoard(log_dir=log_dir, histogram_freq=0)
model_checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(
    filepath=checkpoint_filepath,
    monitor='val_accuracy',
    mode='max',
    save_best_only=True)


history = model.fit(x=train_data, y=train_label, validation_data=(val_data, val_label), epochs=10, 
                    callbacks=[tensorboard_callback, model_checkpoint_callback])

2022-12-20 11:12:18.702141: W tensorflow/core/util/tensor_slice_reader.cc:96] Could not open tmp/checkpoint: FAILED_PRECONDITION: tmp/checkpoint; Is a directory: perhaps your file is in a different file format and you need to use a different restore operator?


ValueError: Received incompatible tensor with shape (38222, 200) when attempting to restore variable with shape (37951, 200) and name layer_with_weights-1/embeddings/.ATTRIBUTES/VARIABLE_VALUE.