In [1]:
import os
import pickle

import numpy as np
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Load Inputs

In [2]:
# Read back the pickled dictionary of model inputs from the artifacts folder.
# NOTE(review): pickle.load can execute code embedded in the file; only load trusted artifacts.
with open('artifacts/embeddings_inputs.pkl', 'rb') as inputs_file:
    loaded_input_items = pickle.load(inputs_file)

In [3]:
# Pull the six stored splits out of the loaded dictionary in one unpacking step.
# The key order below fixes which value lands in which variable.
(
    X_train_pad,
    X_val_pad,
    X_test_pad,
    y_train,
    y_val,
    y_test,
) = (
    loaded_input_items[split_key]
    for split_key in (
        'X_train_pad',
        'X_val_pad',
        'X_test_pad',
        'y_train',
        'y_val',
        'y_test',
    )
)

In [4]:
# Sequence length, measured on the first training row only.
# assumes every padded sequence has this same length — TODO confirm
MAX_LEN = len(X_train_pad[0])

In [5]:
# Load the pickled tokenizer from the artifacts folder.
# NOTE(review): pickle.load can execute code embedded in the file; only load trusted artifacts.
with open('artifacts/tokenizer.pkl', 'rb') as tokenizer_file:
    tokenizer = pickle.load(tokenizer_file)

# One slot more than the number of known words.
# presumably index 0 is reserved (e.g. for padding) — confirm against the tokenizer used
VOCAB_SIZE = 1 + len(tokenizer.word_index)

In [6]:
# Report the sequence length and vocabulary size that size the embedding matrix.
print(f"Maximum input length: {MAX_LEN}, Vocab size: {VOCAB_SIZE}")

Maximum input length: 588, Vocab size: 35756


# Load GloVe Embeddings

In [7]:
# Location of the pre-trained GloVe text file.
# The GLOVE_PATH environment variable, when set to a non-empty value, overrides the
# default so the notebook is not tied to a single machine's directory layout.
# The original local path is kept as the fallback.
GLOVE_PATH = os.environ.get("GLOVE_PATH") or r"C:\Users\aleen\glove\glove_2024_wikigiga_100d.txt"

In [8]:
# Number of components per embedding vector; also the column count of embedding_matrix.
# NOTE(review): must match the vector length in the GloVe file (its name says 100d) — confirm
EMBEDDING_DIM = 100

In [None]:
# Parse the GloVe text file into a {token: vector} lookup.
# Each usable line has the form "<token> <v1> ... <vN>" with N == EMBEDDING_DIM.
embedding_index = {}
malformed_line_numbers = []
with open(GLOVE_PATH, encoding='utf-8') as f:
    for line_number, line in enumerate(f, start=1):
        # Split from the right and cap the number of splits so that a token which
        # itself contains whitespace stays in one piece as the first field.
        fields = line.strip().rsplit(None, EMBEDDING_DIM)

        # Blank lines and rows with too few components are rejected here instead of
        # raising IndexError or silently storing a vector of the wrong length.
        if len(fields) != EMBEDDING_DIM + 1:
            malformed_line_numbers.append(line_number)
            continue

        word = fields[0]
        try:
            vector = np.array([float(value) for value in fields[1:]])
        except ValueError:
            # Only number-parsing failures are expected; anything else should surface.
            malformed_line_numbers.append(line_number)
            continue
        embedding_index[word] = vector

if malformed_line_numbers:
    print(f"Skipped {len(malformed_line_numbers)} malformed lines, "
          f"first at line {malformed_line_numbers[0]}")
print(f"Loaded {len(embedding_index)} word vectors")

In [10]:
# Report how many word vectors are held in embedding_index.
print(f"Loaded {len(embedding_index)} word vectors")

Loaded 1287623 word vectors


# Building Embedding Matrix

In [11]:
# Spot check: display the vector stored for the token 'said'.
embedding_index['said']

array([-0.434792,  0.498616,  0.17971 ,  0.704012,  0.517303,  0.441844,
       -0.891324, -0.092814, -0.934273,  0.32357 ,  0.324631, -0.33741 ,
       -0.678446, -0.680584,  0.458958, -0.06935 , -0.131736,  0.011698,
       -0.381304, -0.759487, -0.47045 ,  0.019226, -0.908428, -0.281665,
        0.195382, -0.185935, -0.068366, -0.889806, -0.140643,  0.359133,
       -0.618384,  0.072123, -0.369979,  0.402194, -5.404896,  0.251164,
        0.402286,  0.22511 ,  0.479287, -0.515167,  0.132093, -0.226698,
        0.704942, -0.145151,  0.708178,  0.698086, -0.199204, -0.164803,
        2.366336,  0.769954, -0.579768, -1.424587,  0.102187,  0.061161,
        0.006214,  0.958536,  0.884203,  0.543308, -0.018456, -0.401535,
        0.129765, -0.34603 ,  0.360544,  0.619073, -0.609613,  0.208515,
        0.300205,  0.1291  , -0.160847,  0.778184,  0.151817,  0.80319 ,
        1.010414, -0.640239, -0.247022,  0.443756, -0.094733, -0.374166,
        0.076299, -0.152046, -0.092171,  0.61828 , 

In [12]:
# Count the GloVe tokens that never occur in the tokenizer's vocabulary.
print(sum(1 for glove_token in embedding_index if glove_token not in tokenizer.word_index))

1252158


In [13]:
# Count the vocabulary tokens that have no GloVe vector.
print(sum(1 for vocab_token in tokenizer.word_index if vocab_token not in embedding_index))

290


In [14]:
# Build the (VOCAB_SIZE x EMBEDDING_DIM) matrix: row i holds the GloVe vector of the
# word whose tokenizer index is i. Rows without a GloVe vector stay all-zero.
embedding_matrix = np.zeros((VOCAB_SIZE, EMBEDDING_DIM))
known_indices = []
for word, index in tokenizer.word_index.items():
    vector = embedding_index.get(word)
    if vector is None:
        # No pre-trained vector for this word; leave its row as zeros.
        continue
    known_indices.append(index)
    embedding_matrix[index] = vector

# Number of vocabulary words that received a pre-trained vector.
c = len(known_indices)

In [16]:
# Spot check: print every 10,000th row of the matrix, starting with row 0.
print(embedding_matrix[::10000])

[[ 0.        0.        0.        0.        0.        0.        0.
   0.        0.        0.        0.        0.        0.        0.
   0.        0.        0.        0.        0.        0.        0.
   0.        0.        0.        0.        0.        0.        0.
   0.        0.        0.        0.        0.        0.        0.
   0.        0.        0.        0.        0.        0.        0.
   0.        0.        0.        0.        0.        0.        0.
   0.        0.        0.        0.        0.        0.        0.
   0.        0.        0.        0.        0.        0.        0.
   0.        0.        0.        0.        0.        0.        0.
   0.        0.        0.        0.        0.        0.        0.
   0.        0.        0.        0.        0.        0.        0.
   0.        0.        0.        0.        0.        0.        0.
   0.        0.        0.        0.        0.        0.        0.
   0.        0.      ]
 [ 0.411022  0.478867 -0.705308  1.087929  0.247115 -

In [19]:
# Rows of the matrix that received no pre-trained vector: absolute count and percentage.
missing_count = VOCAB_SIZE - c
print(missing_count, (float(missing_count) / VOCAB_SIZE) * 100)

291 0.8138494238729165


* The embeddings for the **oov token** and **290** other tokens are missing. These can be left as 0s, be given random values, or be computed by averaging the embeddings of all other known tokens. Only **<1%** of the tokens' embeddings are missing, so this is not much of an issue.

* Regardless, we will set `trainable=True` in the Embedding layer of the classifiers, so that the embeddings of the missing tokens are learnt during model training. Allowing all the embeddings to be trained will also help fine-tune them: because they were pre-trained, they may not be entirely aligned with the context of our dataset/task.

# Saving the Pretrained Embedding Matrix

In [20]:
# Bundle the pretrained embedding matrix with the unchanged data splits so that
# later steps can load everything from a single artifact.
embeddings_input = dict(
    embedding_matrix=embedding_matrix,
    X_train_pad=X_train_pad,
    X_val_pad=X_val_pad,
    X_test_pad=X_test_pad,
    y_train=y_train,
    y_val=y_val,
    y_test=y_test,
)

# Persist the bundle as a pickle in the artifacts folder.
with open('artifacts/pretrained_embeddings_inputs.pkl', 'wb') as output_file:
    pickle.dump(embeddings_input, output_file)