In [None]:
import numpy as np
import tensorflow as tf
from tensorflow import keras

# Load every line of one labelled text file as a separate document
def data_pre_process(filename, labels, class_index):
    """Read a text file as one document per line and label each document.

    Args:
        filename: Path of the text file to read; every line is one document.
        labels: List that is extended in place with ``class_index`` once per
            document read, so it stays aligned with the returned documents.
        class_index: Class id recorded for every line of this file.

    Returns:
        The lines of the file, in order, with newline characters removed.
    """
    # The context manager closes the file even if reading raises, and the
    # explicit encoding keeps decoding independent of the platform locale.
    with open(filename, 'r', encoding='utf-8') as file:
        documents = [line.replace("\n", "") for line in file]
    labels.extend([class_index] * len(documents))
    # Report how many documents this file contributed.
    print(len(documents))
    return documents

# Class names are listed in label order (index 0, 1, 2).
class_names = ['negative','neutral','positive']

# One labelled text file per class, paired with the integer label it receives.
labelled_files = [
    ('/content/drive/MyDrive/labelled_data/neg.txt', 0),
    ('/content/drive/MyDrive/labelled_data/neu.txt', 1),
    ('/content/drive/MyDrive/labelled_data/pos.txt', 2),
]
# Alternative inputs kept from the original notebook (currently disabled):
# ('/content/drive/MyDrive/labelled_data/test_data/neg.txt', 0)
# ('/content/drive/MyDrive/labelled_data/test_data/neu.txt', 1)
# ('/content/drive/MyDrive/labelled_data/test_data/pos.txt', 2)

# data_pre_process returns the documents and appends the matching labels to
# `labels`, so both lists grow together.
samples = []
labels = []
for labelled_path, label_id in labelled_files:
    samples.extend(data_pre_process(labelled_path, labels, label_id))

print("Classes:", class_names)
print("Number of samples:", len(samples))
# print(samples)
# print(labels[:20])
# print(labels[20:])

# Shuffle the data
# Each list is shuffled in place by its own generator seeded with the same
# value; this keeps samples and labels paired as long as both lists have the
# same length.
seed = 1337
for sequence in (samples, labels):
    rng = np.random.RandomState(seed)
    rng.shuffle(sequence)

# Extract a training & validation split
validation_split = 0.2
num_validation_samples = int(validation_split * len(samples))
# Slice at an explicit non-negative boundary. A negative-offset slice such as
# samples[:-0] is empty, which would leave the training set empty and put
# every sample in validation whenever num_validation_samples is 0.
split_index = len(samples) - num_validation_samples
train_samples = samples[:split_index]
val_samples = samples[split_index:]
train_labels = labels[:split_index]
val_labels = labels[split_index:]

# TextVectorization lives directly under keras.layers in newer TensorFlow
# releases; fall back to the experimental location for older ones.
try:
    from tensorflow.keras.layers import TextVectorization
except ImportError:
    from tensorflow.keras.layers.experimental.preprocessing import TextVectorization

# Cap the vocabulary at 20000 entries and emit sequences of exactly 200 ids.
vectorizer = TextVectorization(max_tokens=20000, output_sequence_length=200)
# Build the vocabulary from the training samples only, fed in batches of 128.
text_ds = tf.data.Dataset.from_tensor_slices(train_samples).batch(128)
vectorizer.adapt(text_ds)

# Display the first five vocabulary entries.
vectorizer.get_vocabulary()[:5]

100000
100000
100000
Classes: ['negative', 'neutral', 'positive']
Number of samples: 300000


['', '[UNK]', 'the', 'i', 'to']

In [None]:
# Sanity check: vectorize one sentence and show its first six token ids.
sample_sentence = "the cat sat on the mat"
output = vectorizer([[sample_sentence]])
output.numpy()[0, :6]


array([ 2,  1,  1, 11,  2,  1])

In [None]:
# Map every vocabulary token to its position in the vocabulary list.
voc = vectorizer.get_vocabulary()
word_index = {token: position for position, token in enumerate(voc)}


In [None]:
# print(word_index)
# Look up the vocabulary index of each probe token.
test = ["the"]
[word_index[token] for token in test]

[2]

In [None]:
path_to_glove_file = "/content/drive/MyDrive/glove/glove.6B.100d.txt"

# Map each word to its vector. Every line of the file is parsed as a word
# followed by whitespace-separated coefficients.
embeddings_index = {}
# Decode explicitly as UTF-8 so non-ASCII tokens are read the same way
# regardless of the platform's default encoding.
with open(path_to_glove_file, encoding="utf-8") as f:
    for line in f:
        # Split off the word only; the rest of the line holds the coefficients.
        word, coefs = line.split(maxsplit=1)
        coefs = np.fromstring(coefs, "f", sep=" ")
        embeddings_index[word] = coefs

print("Found %s word vectors." % len(embeddings_index))

Found 400000 word vectors.


In [None]:
embedding_dim = 100
num_tokens = len(voc) + 2
hits, misses = 0, 0

# Prepare embedding matrix
# Every row starts as zeros, so a word that is missing from embeddings_index
# simply keeps its all-zero row.
embedding_matrix = np.zeros((num_tokens, embedding_dim))
for word, i in word_index.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is None:
        misses += 1
        continue
    embedding_matrix[i] = embedding_vector
    hits += 1
print("Converted %d words (%d misses)" % (hits, misses))

Converted 171 words (8 misses)


In [None]:
from tensorflow.keras.layers import Embedding

# Embedding layer initialised from the precomputed matrix; trainable=False
# keeps those weights fixed during training.
pretrained_init = keras.initializers.Constant(embedding_matrix)
embedding_layer = Embedding(
    input_dim=num_tokens,
    output_dim=embedding_dim,
    embeddings_initializer=pretrained_init,
    trainable=False,
)

In [None]:
from tensorflow.keras import layers

# Variable-length sequences of integer token ids go through the embedding layer.
int_sequences_input = keras.Input(shape=(None,), dtype="int64")
embedded_sequences = embedding_layer(int_sequences_input)

# Two convolution + max-pooling stages, then a final convolution whose output
# is reduced with a global max pool.
x = embedded_sequences
for _stage in range(2):
    x = layers.Conv1D(128, 5, activation="relu")(x)
    x = layers.MaxPooling1D(5)(x)
x = layers.Conv1D(128, 5, activation="relu")(x)
x = layers.GlobalMaxPooling1D()(x)

# Classification head: dense layer, dropout, then one softmax unit per class.
x = layers.Dense(128, activation="relu")(x)
x = layers.Dropout(0.5)(x)
preds = layers.Dense(len(class_names), activation="softmax")(x)

model = keras.Model(int_sequences_input, preds)
model.summary()



Model: "model_9"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_11 (InputLayer)        [(None, None)]            0         
_________________________________________________________________
embedding_4 (Embedding)      (None, None, 100)         18100     
_________________________________________________________________
conv1d_21 (Conv1D)           (None, None, 128)         64128     
_________________________________________________________________
max_pooling1d_14 (MaxPooling (None, None, 128)         0         
_________________________________________________________________
conv1d_22 (Conv1D)           (None, None, 128)         82048     
_________________________________________________________________
max_pooling1d_15 (MaxPooling (None, None, 128)         0         
_________________________________________________________________
conv1d_23 (Conv1D)           (None, None, 128)         8204

In [None]:
def _as_token_ids(texts):
    """Vectorize the texts, one per row, and return the result as a NumPy array."""
    return vectorizer(np.array([[text] for text in texts])).numpy()


# Model inputs: token id matrices for the training and validation samples.
x_train = _as_token_ids(train_samples)
x_val = _as_token_ids(val_samples)

# Model targets: the class ids as arrays.
y_train = np.array(train_labels)
y_val = np.array(val_labels)



In [None]:
# Labels are integer class ids, hence the sparse categorical loss.
model.compile(
    optimizer="rmsprop",
    loss="sparse_categorical_crossentropy",
    metrics=["acc"],
)
model.fit(
    x=x_train,
    y=y_train,
    batch_size=128,
    epochs=20,
    validation_data=(x_val, y_val),
)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<tensorflow.python.keras.callbacks.History at 0x7f81d7069748>

In [None]:
# Chain the vectorizer and the trained model so raw strings can be classified.
string_input = keras.Input(shape=(1,), dtype="string")
x = vectorizer(string_input)
preds = model(x)
end_to_end_model = keras.Model(inputs=string_input, outputs=preds)

# Alternative probe text kept from the original notebook:
# [["The beads are pretty but the box was broken and all the different sizes were mixed up when I got it. The box did not stand up to the shipping."]]
query = [["perfect"]]
probabilities = end_to_end_model.predict(query)

# Print the class scores and the highest-scoring index, then show its name.
class_scores = probabilities[0]
predicted_index = np.argmax(class_scores)
print(class_scores)
print(predicted_index)
class_names[predicted_index]
# class_names[np.argmax(probabilities[0])]

[0.30988717 0.32554075 0.36457208]
2


'positive'

In [None]:
# Print the full vectorizer output for a single-word input.
single_word_ids = vectorizer([["this"]])
print(single_word_ids)

tf.Tensor(
[[9 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
  0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
  0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
  0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
  0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
  0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]], shape=(1, 200), dtype=int64)


In [None]:

import pickle

# Store the vectorizer's config and weights in one pickle file.
# The context manager closes the file, so the pickle is fully written to disk
# before the cell finishes (the original left the handle open).
with open('/content/drive/MyDrive/vectorizer.pkl', "wb") as vectorizer_file:
    pickle.dump({'config': vectorizer.get_config(),
                 'weights': vectorizer.get_weights()},
                vectorizer_file)

# Save the trained model to its own path.
model.save('/content/drive/MyDrive/my_model')

# vec_model = keras.models.Sequential()
# vec_model.add(keras.Input(shape=(1,), dtype="string"))
# vec_model.add(vectorizer)
# vec_model.save('/content/drive/MyDrive/vec_model', save_format="tf")