In [None]:
from configuration import FilePaths, Config
import tensorflow as tf
import numpy as np

FILE_PATHS = FilePaths()

# Load the saved train / test / validation data from pickle files.
# NOTE(review): pickle.load can execute arbitrary code while deserializing,
# so these paths must only point at files produced by a trusted source.

import pickle


def _load_pickle(path):
    """Return the object stored in the pickle file at `path`."""
    with open(path, "rb") as f:
        return pickle.load(f)


# Same files, same order and same resulting names as before; only the
# repeated open/load boilerplate has been factored into the helper above.
x_train = _load_pickle(FILE_PATHS.X_TRAIN)
x_test = _load_pickle(FILE_PATHS.X_TEST)
y_train = _load_pickle(FILE_PATHS.Y_TRAIN)
y_test = _load_pickle(FILE_PATHS.Y_TEST)
x_val = _load_pickle(FILE_PATHS.X_VAL)
y_val = _load_pickle(FILE_PATHS.Y_VAL)
    

In [None]:
def make_embedding_matrix(train_samples, embeddings_index):
    """
    Build the embedding matrix that will be used in the embedding layer.

    A TextVectorization layer is adapted on the training samples, and every
    token of the resulting vocabulary is looked up in `embeddings_index`.

    Parameters:
        train_samples: list of strings in the training dataset
        embeddings_index: Python dictionary mapping a word to its embedding vector; all vectors are expected to have the same length

    Returns:
        embedding_matrix: NumPy array with the dimensions (num_tokens, embedding_dim), where num_tokens is the size of the vocabulary returned by the vectorizer and embedding_dim is the number of components of the vectors in embeddings_index (for GloVe: 50, 100, 200 or 300)
        vectorizer: the adapted TextVectorization layer

    Raises:
        ValueError: if embeddings_index is empty, because the embedding dimension cannot be inferred
    """
    # Fail fast, before the (potentially slow) adapt step.
    if not embeddings_index:
        raise ValueError(
            "embeddings_index is empty; cannot infer the embedding dimension"
        )

    vectorizer = tf.keras.layers.TextVectorization(
        max_tokens=Config.max_features, output_sequence_length=Config.max_len
    )
    text_ds = tf.data.Dataset.from_tensor_slices(train_samples).batch(Config.batch_size)
    vectorizer.adapt(text_ds)

    voc = vectorizer.get_vocabulary()
    num_tokens = len(voc)

    # Infer the dimension from any stored vector instead of relying on one
    # specific word (previously "the") being present in the index.
    embedding_dim = len(next(iter(embeddings_index.values())))

    # Rows start as zeros, so words not found in the embedding index stay all-zeros.
    embedding_matrix = np.zeros((num_tokens, embedding_dim))

    hits = 0
    misses = 0
    for i, word in enumerate(voc):
        embedding_vector = embeddings_index.get(word)
        if embedding_vector is not None:
            embedding_matrix[i] = embedding_vector
            hits += 1
        else:
            misses += 1

    print(f"Converted {hits} words ({misses} misses).")

    return embedding_matrix, vectorizer

In [None]:
# Load the GloVe embeddings into a {word: vector} dictionary.
embeddings_index = {}

# The context manager guarantees the file is closed, even if a line fails to parse.
with open(FILE_PATHS.FILE_TO_GLOVE, 'r', encoding='utf8') as f:
    for line in f:
        # Skip blank lines so they do not create a bogus entry with an empty vector.
        if not line.strip():
            continue
        # Drop the trailing newline so the last coefficient is a clean number.
        splitLine = line.rstrip().split(' ')
        word = splitLine[0]                                  # the first entry is the word
        coefs = np.asarray(splitLine[1:], dtype='float32')   # the remaining entries are the embedding components
        embeddings_index[word] = coefs
print("Glove data loaded! In total:",len(embeddings_index)," words.")

In [None]:
# The first element of every training row is passed on as the text sample
# (presumably the raw string; verify against how x_train was pickled).
train_texts = [row[0] for row in x_train]
embedding_matrix, vectorizer = make_embedding_matrix(
    train_texts, embeddings_index=embeddings_index
)

In [None]:
# Spot-check the adapted vocabulary: display the token stored at index 99.
vocabulary = vectorizer.get_vocabulary()
vocabulary[99]

In [None]:
# Print the first training text, then display its vectorized form.
first_text = x_train[0][0]
print(first_text)
vectorizer(first_text)

In [None]:
# Map the token ids of the first training text back to vocabulary entries to
# sanity-check the vectorizer. The vocabulary list is fetched once up front
# instead of being rebuilt on every loop iteration.
vocabulary = vectorizer.get_vocabulary()
for i in vectorizer(x_train[0][0]):
    print(vocabulary[i], end=" ")