In [203]:
import tensorflow as tf
from tensorflow.keras.layers import Embedding, TextVectorization
from tensorflow import keras
from sklearn.model_selection import train_test_split
import numpy as np
import pandas as pd

In [204]:
df = pd.read_csv("datasets/train.csv", encoding = "ISO-8859-1", names=['polarity', 'id', 'query', 'user', 'text'], index_col=2)
df = df.sample(frac=1)[:60_000] # shuffle and truncate
df['polarity'] = df['polarity'].apply(lambda x: 1 if x == 4 else 0)

tdf, vdf = train_test_split(df, test_size=0.2)
train_data = tdf['text'].to_numpy()
train_label = tdf['polarity'].to_numpy()

val_data = vdf['text'].to_numpy()
val_label = vdf['polarity'].to_numpy()

In [205]:
VOCAB_SIZE = 200_000  # full vocab for 1.6M dataset contains 850061 tokens
encoder = tf.keras.layers.TextVectorization(max_tokens=VOCAB_SIZE)
encoder.adapt(train_data)

In [206]:
vocab = np.array(encoder.get_vocabulary())
word_index = dict(zip(vocab, range(len(vocab))))

print(len(vocab))
vocab[:40]

64041


array(['', '[UNK]', 'i', 'to', 'the', 'a', 'my', 'and', 'you', 'is', 'it',
       'in', 'for', 'of', 'im', 'on', 'me', 'so', 'have', 'that', 'but',
       'just', 'with', 'be', 'at', 'its', 'not', 'was', 'this', 'now',
       'good', 'up', 'day', 'out', 'get', 'all', 'are', 'like', 'go',
       'no'], dtype='<U85')

In [207]:
encoder("Hello, how's the weather today?")

<tf.Tensor: shape=(5,), dtype=int64, numpy=array([459, 884,   4, 274,  43])>

In [208]:
path_to_glove_file = "glove/glove.txt"

embeddings_index = {}
with open(path_to_glove_file) as f:
    for line in f:
        word, coefs = line.split(maxsplit=1)
        coefs = np.fromstring(coefs, "f", sep=" ")
        embeddings_index[word] = coefs

print("Found %s word vectors." % len(embeddings_index))

Found 1193514 word vectors.


In [209]:
num_tokens = len(vocab) + 2
embedding_dim = 200
hits = 0
misses = 0

# Prepare embedding matrix
embedding_matrix = np.zeros((num_tokens, embedding_dim))
for word, i in word_index.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        # Words not found in embedding index will be all-zeros.
        # This includes the representation for "padding" and "OOV"
        embedding_matrix[i] = embedding_vector
        hits += 1
    else:
        misses += 1
print("Converted %d words (%d misses)" % (hits, misses))


embedding_layer = Embedding(
    num_tokens,
    embedding_dim,
    embeddings_initializer=keras.initializers.Constant(embedding_matrix),
    trainable=False,
    mask_zero=True
)

Converted 28022 words (36019 misses)


In [210]:
#embedding_layer(encoder('I am'))

In [211]:
model = tf.keras.Sequential([
    encoder,
    embedding_layer,
    tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(64)),
    tf.keras.layers.Dense(64, activation='relu'),
    tf.keras.layers.Dense(1)
])

In [212]:
print([layer.supports_masking for layer in model.layers])

[False, True, True, True, True]


In [213]:
sample_text = ('the gig last night turned out great')
predictions = model.predict(np.array([sample_text]))
print(predictions[0])

padding = "the " * 2000
predictions = model.predict(np.array([sample_text, padding]))
print(predictions[0])

[-0.30668777]
[-0.30668783]


In [214]:
model.compile(loss=tf.keras.losses.BinaryCrossentropy(from_logits=True),
              optimizer=tf.keras.optimizers.Adam(1e-4),
              metrics=['accuracy'])

history = model.fit(x=train_data, y=train_label, epochs=10)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [254]:
model.predict(np.array(["Shit!", "Oh fuck!", "You'd better shut up!", "I wanna kill this bastard", "That was amazing", "OMG!"
                        "That was cute", "Nice one", "I loved it", "I really liked how he behaved", "He was a nice dude"]))



array([[-1.9692551 ],
       [-2.8697622 ],
       [-0.9298316 ],
       [-4.233221  ],
       [ 3.431041  ],
       [ 2.0594976 ],
       [ 2.9084666 ],
       [ 1.3246045 ],
       [ 0.22528376],
       [ 2.2659986 ]], dtype=float32)