1. Loading Libraries

In [1]:
import fasttext
import fasttext.util
import numpy as np
import json
import os
import pickle

2. Loading Pre-Trained Fasttext Model

In [2]:
# Load the pre-trained FastText binary; the relative path is resolved against
# the notebook's current working directory.
model_file = 'crawl-300d-2M-subword.bin'
ft_model = fasttext.load_model(model_file)



3. Load the files

In [3]:
# Directory holding the ATE dataset JSON files. The default is the original
# author's local checkout; set the ATE_DATA_DIR environment variable to point
# the notebook at a different location without editing this cell.
DATA_DIR = os.environ.get(
    "ATE_DATA_DIR",
    "/Users/arnav/Desktop/MachineLearning/NLP-Assignments/CSE556-NLP-Assignments/Assignment 2/Task2_Dataset_ATE",
)

# One JSON file per split, all living directly under DATA_DIR.
train_path = os.path.join(DATA_DIR, "ATE_train.json")
test_path = os.path.join(DATA_DIR, "ATE_test.json")
val_path = os.path.join(DATA_DIR, "ATE_val.json")

# Function to load the data from the json file
def load_from_json(file_path):
    """Read a JSON file and return the decoded Python object.

    Args:
        file_path: Path of the JSON file to read.

    Returns:
        Whatever `json.load` decodes from the file (dict, list, ...).

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the content is not valid JSON.
    """
    # Decode explicitly as UTF-8 so the result does not depend on the
    # platform's default text encoding.
    with open(file_path, "r", encoding="utf-8") as file:
        data = json.load(file)
    return data

# Read the three dataset splits (train, validation, test) in one pass.
train_data, val_data, test_data = (
    load_from_json(split_path)
    for split_path in (train_path, val_path, test_path)
)

In [4]:
# Function to tokenize text based on space
def tokenize_text(text):
    """Split `text` on every single space character.

    Only the space character ' ' is a separator; tabs and newlines stay inside
    tokens. Consecutive, leading, or trailing spaces produce empty-string
    tokens, and an empty input yields [''] — so the result always has
    (number of spaces + 1) items.

    Args:
        text: The string to tokenize.

    Returns:
        List of tokens in their original order.
    """
    # str.split with an explicit separator keeps empty tokens, matching a
    # manual character-by-character scan for ' '.
    return text.split(" ")

4. Generate Fasttext embeddings

In [5]:
# Get the fasttext embeddings for the tokens
def embed_split(data):
    """Build the per-sample embedding matrices for one dataset split.

    Args:
        data: Mapping of sample key -> record, where each record has a "text"
            entry holding the sentence to embed.

    Returns:
        Dict with the same keys as `data`; each value is a numpy array with
        one row per token returned by `tokenize_text`, each row being the
        vector `ft_model.get_word_vector` gives for that token.
    """
    return {
        key: np.array(
            [ft_model.get_word_vector(word) for word in tokenize_text(record["text"])]
        )
        for key, record in data.items()
    }


# Same procedure for every split, so the loop body lives in one helper.
train_embeddings = embed_split(train_data)
val_embeddings = embed_split(val_data)
test_embeddings = embed_split(test_data)

In [6]:
# Report how many samples each split's embedding dictionary holds
for caption, split_embeddings in (
    ("Train embeddings size: ", train_embeddings),
    ("Validation embeddings size: ", val_embeddings),
    ("Test embeddings size: ", test_embeddings),
):
    print(caption, len(split_embeddings))

Train embeddings size:  906
Validation embeddings size:  219
Test embeddings size:  328


5. Dump embeddings to pickle files (one per split)

In [7]:
# Persist each split's embedding dictionary to its own pickle file
train_embeddings_file = "Task2_Fasttext_train_embeddings.pkl"
val_embeddings_file = "Task2_Fasttext_val_embeddings.pkl"
test_embeddings_file = "Task2_Fasttext_test_embeddings.pkl"

for out_path, payload in (
    (train_embeddings_file, train_embeddings),
    (val_embeddings_file, val_embeddings),
    (test_embeddings_file, test_embeddings),
):
    # Binary mode is required by pickle; the context manager closes the file.
    with open(out_path, "wb") as sink:
        pickle.dump(payload, sink)

6. Load from Pickle Files (set max_length to 559 for Task 1 or 83 for Task 2)

In [8]:
# Load these embeddings from the pickle files
train_embeddings_file = "Task2_Fasttext_train_embeddings.pkl"
val_embeddings_file = "Task2_Fasttext_val_embeddings.pkl"
test_embeddings_file = "Task2_Fasttext_test_embeddings.pkl"


def _load_pickle(path):
    """Unpickle and return the object stored at `path`.

    The file is opened in a `with` block so the handle is closed as soon as
    loading finishes (or fails) instead of being left open.

    NOTE: pickle.load can execute arbitrary code; only use it on files this
    notebook (or another trusted source) produced.
    """
    with open(path, "rb") as handle:
        return pickle.load(handle)


train_embeddings_loaded = _load_pickle(train_embeddings_file)
val_embeddings_loaded = _load_pickle(val_embeddings_file)
test_embeddings_loaded = _load_pickle(test_embeddings_file)

# Load the labels from the json files
# (one label file per split; each is read with load_from_json below)
train_label_path = "/Users/arnav/Desktop/MachineLearning/NLP-Assignments/CSE556-NLP-Assignments/Assignment 2/Task2_Dataset_ATE/ATE_train_labels.json"
val_label_path = "/Users/arnav/Desktop/MachineLearning/NLP-Assignments/CSE556-NLP-Assignments/Assignment 2/Task2_Dataset_ATE/ATE_val_labels.json"
test_label_path = "/Users/arnav/Desktop/MachineLearning/NLP-Assignments/CSE556-NLP-Assignments/Assignment 2/Task2_Dataset_ATE/ATE_test_labels.json"

# The label objects are assigned directly from the files; no empty
# placeholder dictionaries are needed beforehand.
train_labels = load_from_json(train_label_path)
val_labels = load_from_json(val_label_path)
test_labels = load_from_json(test_label_path)

In [9]:
# For each key, pad the embeddings and labels to the maximum length of
# 559 (for Task 1) and 83 (for Task 2).
#   - labels are padded with the 'O' tag
#   - embeddings are padded with rows of zeros
max_length = 83


def pad_sample(embeddings, label, target_length):
    """Right-pad one sample's embedding matrix and label list.

    Args:
        embeddings: 2-D numpy array with one row per token.
        label: List of tags for the same sample.
        target_length: Length both sequences are padded up to.

    Returns:
        Tuple `(embeddings, label)`. Inputs that already have
        `target_length` or more items are returned unchanged (nothing is
        truncated); the inputs themselves are never modified in place.
    """
    if len(label) < target_length:
        label = label + ['O'] * (target_length - len(label))

    missing_rows = target_length - len(embeddings)
    if missing_rows > 0:
        # Take the width and dtype from the existing rows: np.zeros would
        # otherwise default to float64 and change the dtype of padded samples
        # relative to samples that need no padding.
        filler = np.zeros((missing_rows, embeddings.shape[1]), dtype=embeddings.dtype)
        embeddings = np.concatenate((embeddings, filler), axis=0)

    return embeddings, label


# Apply the same padding to every split; values are replaced key by key, so
# the dictionaries keep their size while being iterated.
for embeddings_by_key, labels_by_key in (
    (train_embeddings_loaded, train_labels),
    (val_embeddings_loaded, val_labels),
    (test_embeddings_loaded, test_labels),
):
    for key in embeddings_by_key:
        embeddings_by_key[key], labels_by_key[key] = pad_sample(
            embeddings_by_key[key], labels_by_key[key], max_length
        )

In [10]:
# Iterate through the embeddings and labels of every split and print the
# embedding shape and label count of each sample (two lines per sample).
for split_embeddings, split_labels, shape_caption, length_caption in (
    (train_embeddings_loaded, train_labels,
     "Train embeddings shape: ", "Train labels length: "),
    (val_embeddings_loaded, val_labels,
     "Validation embeddings shape: ", "Validation labels length: "),
    (test_embeddings_loaded, test_labels,
     "Test embeddings shape: ", "Test labels length: "),
):
    for sample_id in split_embeddings:
        print(shape_caption, split_embeddings[sample_id].shape)
        print(length_caption, len(split_labels[sample_id]))

Train embeddings shape:  (83, 300)
Train labels length:  83
Train embeddings shape:  (83, 300)
Train labels length:  83
Train embeddings shape:  (83, 300)
Train labels length:  83
Train embeddings shape:  (83, 300)
Train labels length:  83
Train embeddings shape:  (83, 300)
Train labels length:  83
Train embeddings shape:  (83, 300)
Train labels length:  83
Train embeddings shape:  (83, 300)
Train labels length:  83
Train embeddings shape:  (83, 300)
Train labels length:  83
Train embeddings shape:  (83, 300)
Train labels length:  83
Train embeddings shape:  (83, 300)
Train labels length:  83
Train embeddings shape:  (83, 300)
Train labels length:  83
Train embeddings shape:  (83, 300)
Train labels length:  83
Train embeddings shape:  (83, 300)
Train labels length:  83
Train embeddings shape:  (83, 300)
Train labels length:  83
Train embeddings shape:  (83, 300)
Train labels length:  83
Train embeddings shape:  (83, 300)
Train labels length:  83
Train embeddings shape:  (83, 300)
Train