In [272]:
import tensorflow as tf
from tensorflow import keras
from sklearn.model_selection import train_test_split
import numpy as np
import pandas as pd

In [273]:
df = pd.read_csv("datasets/train.csv", encoding = "ISO-8859-1", names=['polarity', 'id', 'query', 'user', 'text'], index_col=2)
df = df.sample(frac=1)[:60_000] # shuffle and truncate
df['polarity'] = df['polarity'].apply(lambda x: 1 if x == 4 else 0)

tdf, vdf = train_test_split(df, test_size=0.2)
train_data = tdf['text'].to_numpy()
train_label = tdf['polarity'].to_numpy()

val_data = vdf['text'].to_numpy()
val_label = vdf['polarity'].to_numpy()

In [274]:
VOCAB_SIZE = 200_000  # full vocab for 1.6M dataset contains 850061 tokens
encoder = tf.keras.layers.TextVectorization(max_tokens=VOCAB_SIZE)
encoder.adapt(train_data)

In [275]:
vocab = np.array(encoder.get_vocabulary())
word_index = dict(zip(vocab, range(len(vocab))))

print(len(vocab))
vocab[:40]

64118


array(['', '[UNK]', 'i', 'to', 'the', 'a', 'my', 'and', 'you', 'is', 'it',
       'in', 'for', 'of', 'im', 'on', 'me', 'so', 'have', 'that', 'just',
       'but', 'with', 'be', 'its', 'at', 'not', 'was', 'this', 'good',
       'up', 'now', 'get', 'out', 'day', 'all', 'are', 'go', 'like', 'no'],
      dtype='<U112')

In [276]:
encoder("Hello, how's the weather today?")

<tf.Tensor: shape=(5,), dtype=int64, numpy=array([467, 848,   4, 297,  42])>

In [292]:
path_to_glove_file = "glove/glove.txt"

embeddings_index = {}
with open(path_to_glove_file) as f:
    for line in f:
        word, coefs = line.split(maxsplit=1)
        coefs = np.fromstring(coefs, "f", sep=" ")
        embeddings_index[word] = coefs

print("Found %s word vectors." % len(embeddings_index))

Found 1193514 word vectors.


In [278]:
num_tokens = len(vocab)
embedding_dim = 200
hits = 0
misses = 0

# Prepare embedding matrix
embedding_matrix = np.zeros((num_tokens, embedding_dim))
for word, i in word_index.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        # Words not found in embedding index will be all-zeros.
        # This includes the representation for "padding" and "OOV"
        embedding_matrix[i] = embedding_vector
        hits += 1
    else:
        misses += 1
print("Converted %d words (%d misses)" % (hits, misses))


embedding_layer = tf.keras.layers.Embedding(
    num_tokens,
    embedding_dim,
    embeddings_initializer=keras.initializers.Constant(embedding_matrix),
    trainable=False,
    mask_zero=True
)

Converted 27946 words (36172 misses)


In [279]:
#embedding_layer(encoder('I am'))

In [286]:
model = tf.keras.Sequential([
    encoder,
    embedding_layer,
    tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(64)),
    tf.keras.layers.Dense(64, activation='relu'),
    tf.keras.layers.Dense(1)
])

In [287]:
print([layer.supports_masking for layer in model.layers])

[False, True, True, True, True]


In [288]:
sample_text = ('the gig last night turned out great')
predictions = model.predict(np.array([sample_text]))
print(predictions[0])

padding = "the " * 2000
predictions = model.predict(np.array([sample_text, padding]))
print(predictions[0])

[-0.69028836]
[-0.69028854]


In [290]:
model.compile(loss=tf.keras.losses.BinaryCrossentropy(from_logits=True),
              optimizer=tf.keras.optimizers.Adam(1e-4),
              metrics=['accuracy'])

history = model.fit(x=train_data, y=train_label, validation_data=(val_data, val_label), epochs=10)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [291]:
model.predict(np.array(["Shit!", "Oh fuck!", "You'd better shut up!", "I wanna kill this bastard", "That was amazing", "OMG!"
                        "That was cute", "Nice one", "I loved it", "I really liked how he behaved", "He was a nice dude"]))



array([[-1.5173236 ],
       [-2.7038252 ],
       [-0.32582322],
       [-3.0155642 ],
       [ 3.6182864 ],
       [ 3.0159233 ],
       [ 2.916467  ],
       [ 1.1423533 ],
       [ 1.2402942 ],
       [ 1.9487635 ]], dtype=float32)