In [1]:
from datasets import load_dataset

# Load the NewsQA dataset from the Hugging Face Hub. Later cells read the
# "context" and "question" fields of its "train" split.
ds = load_dataset("lucadiliello/newsqa")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [2]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import Input, Embedding
from tensorflow.keras.models import Model
import re
import pandas as pd
import numpy as np

In [3]:
# Parameters (read by the tokenizer, padding and embedding cells below)
voc_size = 10000  # vocabulary size: passed as Tokenizer num_words and Embedding input_dim
sent_length = 50  # tokens kept per context (pad_sequences maxlen); set as large as possible without exhausting RAM
question_len = 30  # tokens kept per question (pad_sequences maxlen)
dim = 100  # each word is converted into a 100-dimensional vector (Embedding output_dim)

In [4]:
# Dataset
# Read each text field as a whole column instead of looping over the train
# split twice and picking one field out of every row.
# list() guarantees plain Python lists, which the tokenizer cell needs when it
# concatenates `corpus + questions`.
corpus = list(ds["train"]["context"])
questions = list(ds["train"]["question"])

In [5]:
# Clean text
def clean_text(text):
    """Normalise a string for tokenisation.

    The text is lower-cased, every character that is not an ASCII letter,
    a digit or whitespace is deleted (not replaced, so "it's" becomes "its"
    and accented letters are dropped), runs of whitespace are collapsed to
    one space, and leading/trailing whitespace is trimmed.

    Args:
        text: the raw string to clean.

    Returns:
        The cleaned string; empty if nothing survives the filtering.
    """
    # Lower-case first, then drop punctuation and other unwanted characters.
    without_punct = re.sub(r"[^a-zA-Z0-9\s]", "", text.lower())
    # Squeeze whitespace runs (spaces, tabs, newlines) and trim the ends.
    return re.sub(r"\s+", " ", without_punct).strip()

# Clean every context and question; both names are rebound to the cleaned lists.
corpus = list(map(clean_text, corpus))
questions = list(map(clean_text, questions))

In the previous task I used one-hot encoding to create word embeddings, but here I have to work with a very large dataset, so I use tokens instead. One-hot encoding would create a very large vector for each word, whereas tokenization assigns a unique integer (token) to each word, and these tokens are then mapped to vectors using pre-trained embeddings (like GloVe, Word2Vec, or BERT).

In [6]:
# Tokenizer
# Fit a single vocabulary on contexts and questions together so that a word
# receives the same integer id in both inputs.
all_texts = corpus + questions
tokenizer = Tokenizer(num_words=voc_size, oov_token="<OOV>")
tokenizer.fit_on_texts(all_texts)

# Encode each text as a list of ids and bring every sequence to a fixed length
# (zeros appended at the end) so all rows have the same dimension.
# NOTE(review): `truncating` is not set; the Keras default is believed to be
# 'pre', which would keep the LAST `maxlen` tokens of longer texts -- confirm
# that this is the intended side to cut.
context_seq = pad_sequences(
    tokenizer.texts_to_sequences(corpus),
    maxlen=sent_length,
    padding='post',
)
question_seq = pad_sequences(
    tokenizer.texts_to_sequences(questions),
    maxlen=question_len,
    padding='post',
)

In [7]:
# Shared embedding layer
# Two inputs of fixed length: padded context ids and padded question ids.
context_input = Input(shape=(sent_length,))
question_input = Input(shape=(question_len,))

# One Embedding instance is applied to both inputs, so contexts and questions
# share the same (voc_size x dim) weight matrix.
# mask_zero=True: presumably id 0 is the padding value written by pad_sequences -- confirm.
embedding_layer = Embedding(input_dim=voc_size, output_dim=dim, mask_zero=True)

context_emb = embedding_layer(context_input)
question_emb = embedding_layer(question_input)

# The model only exposes the two embedded sequences as outputs.
model = Model(inputs=[context_input, question_input], outputs=[context_emb, question_emb])
# NOTE(review): an optimizer and loss are configured here, but no model.fit(...)
# call is visible in this notebook, so the vectors produced below appear to come
# from the layer's initial weights -- confirm whether training or loading
# pre-trained vectors was intended.
model.compile(optimizer='adam', loss='mse')

# Get embeddings
# Runs every padded sequence through the model; the recorded output shows
# shapes (74160, 50, 100) for contexts and (74160, 30, 100) for questions.
context_embeddings, question_embeddings = model.predict([context_seq, question_seq])
print("Context embeddings shape:", context_embeddings.shape)
print("Question embeddings shape:", question_embeddings.shape)

[1m2318/2318[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 2ms/step
Context embeddings shape: (74160, 50, 100)
Question embeddings shape: (74160, 30, 100)


In [8]:
# Pull the weight table out of the shared embedding layer -> shape: (voc_size, dim)
# NOTE(review): no model.fit(...) call is visible in this notebook, so these
# look like the layer's initial (untrained) weights -- confirm before using
# the exported file as "learned" embeddings.
embedding_matrix = embedding_layer.get_weights()[0]

word_index = tokenizer.word_index

# Saving part
# One row per word: the word itself followed by the components of its vector.
# Words whose id is >= voc_size are skipped because the matrix has no row for them.
data = [
    [word, *embedding_matrix[idx].tolist()]
    for word, idx in word_index.items()
    if idx < voc_size
]

# Dump the rows to CSV without a header line or an index column.
df = pd.DataFrame(data)
df.to_csv("word_embeddings.csv", index=False, header=False)

print("Saved word embeddings to word_embeddings.csv")


Saved word embeddings to word_embeddings.csv
