1. Loading the Files

In [1]:
import json
import os
import pickle
import numpy as np

train_path = "/Users/arnav/Desktop/MachineLearning/CSE556-NLP-Assignments/Assignment 2/Task2_Dataset_ATE/ATE_train.json"
test_path = "/Users/arnav/Desktop/MachineLearning/CSE556-NLP-Assignments/Assignment 2/Task2_Dataset_ATE/ATE_test.json"
val_path = "/Users/arnav/Desktop/MachineLearning/CSE556-NLP-Assignments/Assignment 2/Task2_Dataset_ATE/ATE_val.json"

# Function to load the data from the json file
def load_from_json(file_path):
    with open(file_path, "r") as file:
        data = json.load(file)
    return data

train_data = load_from_json(train_path)
val_data = load_from_json(val_path)
test_data = load_from_json(test_path)

2. Function to tokenise the text

In [2]:
# Function to tokenize text based on space
def tokenize_text(text):
    tokens = []
    start = 0
    i = 0
    while i < len(text):
        if text[i] == ' ':
            tokens.append(text[start:i])
            start = i + 1
        i += 1
    tokens.append(text[start:i])
    return tokens

3. Function to load GLoVe Embeddings

In [3]:
import numpy as np

word_vectors = {}
glove_file = "glove.840B.300d.txt"

def load_glove_vectors(glove_file):
    """
    Load GloVe vectors from a file, with error handling for non-convertible values.
    
    :param glove_file: Path to the GloVe file.
    :return: Dictionary with words as keys and their embeddings as values.
    """
    print("Loading GloVe vectors from file:", glove_file)
    
    glove_vectors = {}
    with open(glove_file, 'r', encoding='utf-8') as file:
        for line in file:
            split_line = line.split()
            word = split_line[0]
            try:
                embedding = [float(val) for val in split_line[1:]]
                glove_vectors[word] = embedding
            except ValueError:
                # Skip this word if any value is not a float
                print(f"Skipping word: {word} due to conversion error.")
                continue
            
    print(f"Loaded {len(glove_vectors)} word vectors.")
    return glove_vectors

# Call the function to load the GloVe vectors
word_vectors = load_glove_vectors(glove_file)


Loading GloVe vectors from file: glove.840B.300d.txt
Skipping word: . due to conversion error.
Skipping word: at due to conversion error.
Skipping word: . due to conversion error.
Skipping word: to due to conversion error.
Skipping word: . due to conversion error.
Skipping word: . due to conversion error.
Skipping word: email due to conversion error.
Skipping word: or due to conversion error.
Skipping word: contact due to conversion error.
Skipping word: Email due to conversion error.
Skipping word: on due to conversion error.
Skipping word: At due to conversion error.
Skipping word: by due to conversion error.
Skipping word: in due to conversion error.
Skipping word: emailing due to conversion error.
Skipping word: Contact due to conversion error.
Skipping word: at due to conversion error.
Skipping word: • due to conversion error.
Skipping word: at due to conversion error.
Skipping word: is due to conversion error.
Loaded 2195884 word vectors.


In [4]:
# Function to get glove embedding for a sentence
def get_glove_embedding(sentence):

    # tokenize the sentence
    tokens = tokenize_text(sentence)

    # create a numpy array to store the embeddings
    embeddings = np.zeros((len(tokens), 300))

    # iterate through the tokens and get the embeddings
    for i, token in enumerate(tokens):
        if token in word_vectors:
            embeddings[i] = word_vectors[token]

    return embeddings

In [5]:
# Test the function on a sentence
sentence = train_data["1"]["text"]
embeddings = get_glove_embedding(sentence)
print(embeddings.shape)

(19, 300)


In [6]:
# Function to get the maximum length of the sentence
def get_max_length(dict):
    max_length = 0
    for key, val_dict in dict.items():
        sentence = val_dict["text"]
        tokens = tokenize_text(sentence)
        if len(tokens) > max_length:
            max_length = len(tokens)

    return max_length

# Get the maximum length of the sentence
max_length_train = get_max_length(train_data)
max_length_val = get_max_length(val_data)
max_length_test = get_max_length(test_data) 

print("Maximum length of the sentence in the training data:", max_length_train)
print("Maximum length of the sentence in the validation data:", max_length_val)
print("Maximum length of the sentence in the test data:", max_length_test)

Maximum length of the sentence in the training data: 78
Maximum length of the sentence in the validation data: 83
Maximum length of the sentence in the test data: 71


In [7]:
# Get the glove embeddings for the sentences
train_embeddings = {}
val_embeddings = {}
test_embeddings = {}

for key, dict in train_data.items():
    sentence = dict["text"]
    embeddings = get_glove_embedding(sentence)
    train_embeddings[key] = embeddings

for key, dict in val_data.items():
    sentence = dict["text"]
    embeddings = get_glove_embedding(sentence)
    val_embeddings[key] = embeddings

for key, dict in test_data.items():
    sentence = dict["text"]
    embeddings = get_glove_embedding(sentence)
    test_embeddings[key] = embeddings

In [8]:
# Print the size for the embeddings
print("Size of the training embeddings:", len(train_embeddings))
print("Size of the validation embeddings:", len(val_embeddings))
print("Size of the test embeddings:", len(test_embeddings))

Size of the training embeddings: 906
Size of the validation embeddings: 219
Size of the test embeddings: 328


In [10]:
# Dump the embeddings to 3 pickle files
train_embeddings_file = "Task2_GLoVe_train_embeddings.pkl"
val_embeddings_file = "Task2_GLoVe_val_embeddings.pkl"
test_embeddings_file = "Task2_GLoVe_test_embeddings.pkl"

with open(train_embeddings_file, "wb") as file:
    pickle.dump(train_embeddings, file)

with open(val_embeddings_file, "wb") as file:
    pickle.dump(val_embeddings, file)

with open(test_embeddings_file, "wb") as file:
    pickle.dump(test_embeddings, file)

4. Load Pickle Files

In [11]:
# Load these embeddings from the pickle files
train_embeddings_file = "Task2_GLoVe_train_embeddings.pkl"
val_embeddings_file = "Task2_GLoVe_val_embeddings.pkl"
test_embeddings_file = "Task2_GLoVe_test_embeddings.pkl"

train_embeddings_loaded = pickle.load(open(train_embeddings_file, "rb"))
val_embeddings_loaded = pickle.load(open(val_embeddings_file, "rb"))
test_embeddings_loaded = pickle.load(open(test_embeddings_file, "rb"))

# Load the labels from the json files
train_labels = {}
val_labels = {}
test_labels = {}

train_label_path = "/Users/arnav/Desktop/MachineLearning/CSE556-NLP-Assignments/Assignment 2/Task2_Dataset_ATE/ATE_train_labels.json"
val_label_path = "/Users/arnav/Desktop/MachineLearning/CSE556-NLP-Assignments/Assignment 2/Task2_Dataset_ATE/ATE_val_labels.json"
test_label_path = "/Users/arnav/Desktop/MachineLearning/CSE556-NLP-Assignments/Assignment 2/Task2_Dataset_ATE/ATE_test_labels.json"

train_labels = load_from_json(train_label_path)
val_labels = load_from_json(val_label_path)
test_labels = load_from_json(test_label_path)

In [12]:
# For each key, pad the embeddings and labels to the maximum length of 559
# For padding labels, use 'O' tag
# For padding embeddings, use a vector of zeros
max_length = 83

for key in train_embeddings_loaded:
    label = train_labels[key]
    embeddings = train_embeddings_loaded[key]

    # Pad the labels
    if len(label) < max_length:
        label = label + ['O'] * (max_length - len(label))

    # Pad the embeddings
    if len(embeddings) < max_length:
        embeddings = np.concatenate((embeddings, np.zeros((max_length - len(embeddings), 300))), axis=0)

    train_labels[key] = label
    train_embeddings_loaded[key] = embeddings

In [13]:
print(val_embeddings_loaded.keys())

dict_keys(['1', '2', '3', '4', '5', '6', '7', '8', '9', '10', '11', '12', '13', '14', '15', '16', '17', '18', '19', '20', '21', '22', '23', '24', '25', '26', '27', '28', '29', '30', '31', '32', '33', '34', '35', '36', '37', '38', '39', '40', '41', '42', '43', '44', '45', '46', '47', '48', '49', '50', '51', '52', '53', '54', '55', '56', '57', '58', '59', '60', '61', '62', '63', '64', '65', '66', '67', '68', '69', '70', '71', '72', '73', '74', '75', '76', '77', '78', '79', '80', '81', '82', '83', '84', '85', '86', '87', '88', '89', '90', '91', '92', '93', '94', '95', '96', '97', '98', '99', '100', '101', '102', '103', '104', '105', '106', '107', '108', '109', '110', '111', '112', '113', '114', '115', '116', '117', '118', '119', '120', '121', '122', '123', '124', '125', '126', '127', '128', '129', '130', '131', '132', '133', '134', '135', '136', '137', '138', '139', '140', '141', '142', '143', '144', '145', '146', '147', '148', '149', '150', '151', '152', '153', '154', '155', '156', '157'

In [14]:
for key in val_embeddings_loaded:
    label = val_labels[key]
    embeddings = val_embeddings_loaded[key]

    # Pad the labels
    if len(label) < max_length:
        label = label + ['O'] * (max_length - len(label))

    # Pad the embeddings
    if len(embeddings) < max_length:
        embeddings = np.concatenate((embeddings, np.zeros((max_length - len(embeddings), 300))), axis=0)

    val_labels[key] = label
    val_embeddings_loaded[key] = embeddings

for key in test_embeddings_loaded:
    label = test_labels[key]
    embeddings = test_embeddings_loaded[key]

    # Pad the labels
    if len(label) < max_length:
        label = label + ['O'] * (max_length - len(label))

    # Pad the embeddings
    if len(embeddings) < max_length:
        embeddings = np.concatenate((embeddings, np.zeros((max_length - len(embeddings), 300))), axis=0)

    test_labels[key] = label
    test_embeddings_loaded[key] = embeddings