1. Loading Libraries

In [2]:
import json
import os
import pickle
import numpy as np
from gensim.models import KeyedVectors

2. Reading Data Files

In [3]:
# Directory that holds the ATE dataset files. The default is the location used
# when the notebook was written; set the ATE_DATA_DIR environment variable to
# run the notebook against a copy of the dataset stored somewhere else.
DATA_DIR = os.environ.get(
    "ATE_DATA_DIR",
    "/Users/arnav/Desktop/MachineLearning/NLP-Assignments/CSE556-NLP-Assignments/Assignment 2/Task2_Dataset_ATE",
)

# One JSON file per split, all stored directly inside DATA_DIR
train_path = os.path.join(DATA_DIR, "ATE_train.json")
val_path = os.path.join(DATA_DIR, "ATE_val.json")
test_path = os.path.join(DATA_DIR, "ATE_test.json")

def load_from_json(file_path):
    """Read a JSON file and return the decoded Python object.

    Args:
        file_path: Path of the JSON file to read.

    Returns:
        The object produced by ``json.load`` for the file's contents.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the file does not contain valid JSON.
    """
    # Decode explicitly as UTF-8 so that non-ASCII text is read the same way
    # on every platform instead of depending on the locale's default encoding.
    with open(file_path, "r", encoding="utf-8") as file:
        return json.load(file)

# Parse the three dataset splits, in train / validation / test order
train_data, val_data, test_data = [
    load_from_json(split_path)
    for split_path in (train_path, val_path, test_path)
]

3. Function to tokenise the text

In [4]:
def tokenize_text(text):
    """Split ``text`` into tokens on single space characters.

    Only the space character (' ') acts as a delimiter; tabs and newlines stay
    inside tokens. Consecutive, leading or trailing spaces produce
    empty-string tokens, and an empty input yields ``['']``, so the number of
    tokens is always the number of spaces plus one.

    Args:
        text: The string to tokenize.

    Returns:
        A list of token strings in their original order.
    """
    # str.split with an explicit ' ' separator has exactly the semantics
    # described above (the no-argument form would instead collapse runs of
    # whitespace and drop empty tokens, changing the token count).
    return text.split(' ')

4. Load Word2Vec

In [5]:
# File with the pretrained vectors; a relative path, so it is resolved against
# the notebook's working directory.
# NOTE(review): the file name suggests 300-dimensional GoogleNews vectors, and
# later cells allocate 300 columns per token -- confirm if the file is swapped.
model_path = 'GoogleNews-vectors-negative300.bin.gz'
# Read the file in word2vec binary format (binary=True); the result is bound to
# the global `model` that get_word2vec_embedding looks tokens up in.
model = KeyedVectors.load_word2vec_format(model_path, binary=True)

5. Function to get Word2Vec Embedding

In [6]:
def get_word2vec_embedding(sentence):
    """Embed a sentence as one Word2Vec vector per space-separated token.

    Args:
        sentence: The text to embed. It is split with ``tokenize_text``.

    Returns:
        A float64 NumPy array of shape ``(num_tokens, model.vector_size)``.
        Row ``i`` holds the vector of token ``i``; tokens that are not in the
        global ``model`` keep an all-zero row.
    """
    tokens = tokenize_text(sentence)

    # Start from zeros so out-of-vocabulary tokens need no special handling.
    # The width is taken from the loaded model instead of a hard-coded 300, so
    # the function keeps working if a different set of vectors is loaded.
    embeddings = np.zeros((len(tokens), model.vector_size))

    # Overwrite the rows of the tokens the model knows about
    for i, token in enumerate(tokens):
        if token in model:
            embeddings[i] = model[token]

    return embeddings

In [7]:
# Sanity check on a single training example: embed its text, then show the
# shape of the resulting array followed by its contents
sentence = train_data["1"]["text"]
embeddings = get_word2vec_embedding(sentence)
print(embeddings.shape, embeddings, sep="\n")

(19, 300)
[[ 7.91015625e-02 -5.03540039e-03  1.11816406e-01 ... -6.77490234e-03
   4.27246094e-02 -1.03515625e-01]
 [ 9.66796875e-02  8.88671875e-02  1.39648438e-01 ... -9.13085938e-02
   1.53198242e-02  6.00585938e-02]
 [ 8.44726562e-02 -3.52859497e-04  5.32226562e-02 ...  1.70898438e-02
   6.07910156e-02 -1.08886719e-01]
 ...
 [-1.70898438e-02  2.75390625e-01  3.57421875e-01 ...  6.80541992e-03
  -2.16796875e-01  7.42187500e-02]
 [-6.78710938e-02  9.52148438e-02  3.56445312e-02 ...  1.26953125e-01
  -1.03515625e-01  4.76074219e-02]
 [ 0.00000000e+00  0.00000000e+00  0.00000000e+00 ...  0.00000000e+00
   0.00000000e+00  0.00000000e+00]]


In [8]:
def get_max_length(dict):
    """Return the largest token count among the "text" entries of a dataset.

    Args:
        dict: Mapping whose values are mappings with a "text" string. (The
            parameter name hides the builtin ``dict`` inside this function; it
            is kept unchanged so existing callers are unaffected.)

    Returns:
        The highest number of tokens ``tokenize_text`` produces for any
        "text" value, or 0 when the mapping is empty.
    """
    token_counts = (
        len(tokenize_text(entry["text"]))
        for entry in dict.values()
    )
    return max(token_counts, default=0)

# Longest sentence, counted in tokens, of each split
max_length_train, max_length_val, max_length_test = map(
    get_max_length, (train_data, val_data, test_data)
)

# Report the three maxima, one line per split
for description, longest in (
    ("Maximum length of the sentence in the training data:", max_length_train),
    ("Maximum length of the sentence in the validation data:", max_length_val),
    ("Maximum length of the sentence in the test data:", max_length_test),
):
    print(description, longest)

Maximum length of the sentence in the training data: 78
Maximum length of the sentence in the validation data: 83
Maximum length of the sentence in the test data: 71


6. Create Word2Vec Embeddings for our corpora

In [9]:
def embed_corpus(corpus):
    """Compute the Word2Vec embeddings of every sentence in one dataset split.

    Args:
        corpus: Mapping from example key to a mapping with a "text" string.

    Returns:
        A dict with the same keys as ``corpus`` whose values are the arrays
        returned by ``get_word2vec_embedding`` for each example's text.
    """
    # `entry` (rather than `dict`) is used for the per-example mapping so the
    # builtin `dict` is not rebound for the rest of the notebook.
    return {
        key: get_word2vec_embedding(entry["text"])
        for key, entry in corpus.items()
    }


# Get the Word2Vec embeddings for the sentences of each split
train_embeddings = embed_corpus(train_data)
val_embeddings = embed_corpus(val_data)
test_embeddings = embed_corpus(test_data)

In [10]:
# Report how many sentences were embedded in each split
for description, embedded in (
    ("Size of the training embeddings:", train_embeddings),
    ("Size of the validation embeddings:", val_embeddings),
    ("Size of the test embeddings:", test_embeddings),
):
    print(description, len(embedded))

Size of the training embeddings: 906
Size of the validation embeddings: 219
Size of the test embeddings: 328


7. Dump Embeddings to Pickle File

In [11]:
# Output file for each split's embeddings (relative to the working directory)
train_embeddings_file = "Task2_Word2Vec_train_embeddings.pkl"
val_embeddings_file = "Task2_Word2Vec_val_embeddings.pkl"
test_embeddings_file = "Task2_Word2Vec_test_embeddings.pkl"

# Serialise each split's embeddings dictionary to its own pickle file,
# in train / validation / test order
for pickle_path, split_embeddings in (
    (train_embeddings_file, train_embeddings),
    (val_embeddings_file, val_embeddings),
    (test_embeddings_file, test_embeddings),
):
    with open(pickle_path, "wb") as file:
        pickle.dump(split_embeddings, file)

8. Load Pickle Files to Test

In [12]:
# Load the embeddings from the pickle files
train_embeddings_file = "Task2_Word2Vec_train_embeddings.pkl"
val_embeddings_file = "Task2_Word2Vec_val_embeddings.pkl"
test_embeddings_file = "Task2_Word2Vec_test_embeddings.pkl"


def load_pickle(file_path):
    """Unpickle and return the object stored in ``file_path``.

    The file is opened in a ``with`` block so its handle is closed as soon as
    the object has been read, instead of being left open until garbage
    collection.
    """
    # NOTE: pickle.load can execute arbitrary code from the file. That is fine
    # for the files written by this notebook; never point it at untrusted data.
    with open(file_path, "rb") as file:
        return pickle.load(file)


train_embeddings_loaded = load_pickle(train_embeddings_file)
val_embeddings_loaded = load_pickle(val_embeddings_file)
test_embeddings_loaded = load_pickle(test_embeddings_file)

# Load the labels from the json files. The label files are stored in the same
# directory as the dataset splits, so that directory is derived from
# train_path instead of repeating the absolute path.
label_dir = os.path.dirname(train_path)

train_label_path = os.path.join(label_dir, "ATE_train_labels.json")
val_label_path = os.path.join(label_dir, "ATE_val_labels.json")
test_label_path = os.path.join(label_dir, "ATE_test_labels.json")

train_labels = load_from_json(train_label_path)
val_labels = load_from_json(val_label_path)
test_labels = load_from_json(test_label_path)

In [13]:
# Pad the embeddings and labels of every example to one common length:
#   - labels are padded with the 'O' tag
#   - embeddings are padded with all-zero rows


def pad_example(embeddings, label, max_length, pad_label='O'):
    """Pad one example's embeddings and labels up to ``max_length``.

    Args:
        embeddings: 2-D array with one row per token.
        label: List with one tag per token.
        max_length: Target number of rows / tags.
        pad_label: Tag appended to ``label`` for the padded positions.

    Returns:
        An ``(embeddings, label)`` tuple. Inputs that already have at least
        ``max_length`` entries are returned unchanged (nothing is truncated).
    """
    missing_tags = max_length - len(label)
    if missing_tags > 0:
        label = label + [pad_label] * missing_tags

    missing_rows = max_length - len(embeddings)
    if missing_rows > 0:
        # Zero rows with the same width as the existing embedding rows
        zero_rows = np.zeros((missing_rows, embeddings.shape[1]))
        embeddings = np.concatenate((embeddings, zero_rows), axis=0)

    return embeddings, label


def pad_split(embeddings_by_key, labels_by_key, max_length):
    """Pad every example of one split, updating both dictionaries in place."""
    # Only existing keys are reassigned, so iterating over the dict is safe
    for key in embeddings_by_key:
        embeddings_by_key[key], labels_by_key[key] = pad_example(
            embeddings_by_key[key], labels_by_key[key], max_length
        )


# Longest sentence across the three splits (83 for the Task 2 data), taken from
# the per-split maxima computed earlier rather than typed in by hand
max_length = max(max_length_train, max_length_val, max_length_test)

pad_split(train_embeddings_loaded, train_labels, max_length)
pad_split(val_embeddings_loaded, val_labels, max_length)
pad_split(test_embeddings_loaded, test_labels, max_length)

In [14]:
# Iterate through the embeddings and labels of each split and print, for every
# example, the shape of its embeddings array and the length of its label list
for shape_prefix, length_prefix, split_embeddings, split_labels in (
    ("Train embeddings shape: ", "Train labels length: ", train_embeddings_loaded, train_labels),
    ("Validation embeddings shape: ", "Validation labels length: ", val_embeddings_loaded, val_labels),
    ("Test embeddings shape: ", "Test labels length: ", test_embeddings_loaded, test_labels),
):
    for key, padded in split_embeddings.items():
        print(shape_prefix, padded.shape)
        print(length_prefix, len(split_labels[key]))

Train embeddings shape:  (83, 300)
Train labels length:  83
Train embeddings shape:  (83, 300)
Train labels length:  83
Train embeddings shape:  (83, 300)
Train labels length:  83
Train embeddings shape:  (83, 300)
Train labels length:  83
Train embeddings shape:  (83, 300)
Train labels length:  83
Train embeddings shape:  (83, 300)
Train labels length:  83
Train embeddings shape:  (83, 300)
Train labels length:  83
Train embeddings shape:  (83, 300)
Train labels length:  83
Train embeddings shape:  (83, 300)
Train labels length:  83
Train embeddings shape:  (83, 300)
Train labels length:  83
Train embeddings shape:  (83, 300)
Train labels length:  83
Train embeddings shape:  (83, 300)
Train labels length:  83
Train embeddings shape:  (83, 300)
Train labels length:  83
Train embeddings shape:  (83, 300)
Train labels length:  83
Train embeddings shape:  (83, 300)
Train labels length:  83
Train embeddings shape:  (83, 300)
Train labels length:  83
Train embeddings shape:  (83, 300)
Train