In [11]:
import pandas as pd
import json
import os
import numpy as np
import torch
from transformers import AutoTokenizer, AutoModel

In [2]:
# Name of the pretrained checkpoint; the tokenizer and the model are both
# loaded from this same name so their vocabularies match.
model_name = "bert-base-uncased"  # Can change to a different BERT variant
# Tokenizer used to turn raw text into model inputs.
tokenizer = AutoTokenizer.from_pretrained(model_name)
# AutoModel loads the model without a task-specific head; its outputs are
# read as hidden states by the embedding code later in the notebook.
model = AutoModel.from_pretrained(model_name)

Downloading tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.bias', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [3]:
# Function to extract BERT embeddings
def get_bert_embeddings(texts, batch_size=32):
    """Return the [CLS] embedding of every text as a single CPU tensor.

    The texts are tokenized and run through the module-level ``tokenizer`` and
    ``model`` in chunks of ``batch_size``. Encoding a whole dataset split in a
    single forward pass builds one huge padded batch and can exhaust memory,
    so the work is split and the per-batch results are concatenated.

    Args:
        texts: A single string or a sequence of strings to embed.
        batch_size: Number of texts per forward pass (default 32).

    Returns:
        A tensor with one row per input text, taken from position 0 of the
        model's ``last_hidden_state``. An empty input yields a tensor of shape
        ``(0, model.config.hidden_size)``.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be a positive integer")

    # A bare string must be treated as one text, not sliced character-wise.
    if isinstance(texts, str):
        texts = [texts]
    else:
        texts = list(texts)

    if not texts:
        return torch.empty((0, model.config.hidden_size))

    # Run the inputs on whatever device the model's weights live on.
    device = next(model.parameters()).device

    cls_chunks = []
    for start in range(0, len(texts), batch_size):
        batch = texts[start:start + batch_size]
        # Padding is applied per batch (to the longest text in that batch).
        inputs = tokenizer(batch, padding=True, truncation=True, return_tensors="pt", max_length=512)
        inputs = {name: tensor.to(device) for name, tensor in inputs.items()}
        with torch.no_grad():
            outputs = model(**inputs)
        # Position 0 of each sequence is the [CLS] token; keep results on CPU.
        cls_chunks.append(outputs.last_hidden_state[:, 0, :].cpu())

    return torch.cat(cls_chunks, dim=0)

In [12]:
# Locate the train, val and test data.
# The dataset directory can be overridden with the NER_DATA_DIR environment
# variable so the notebook is not tied to one machine; the default is the
# original local path, so the resulting file paths are unchanged when the
# variable is not set.
DATA_DIR = os.environ.get(
    "NER_DATA_DIR",
    "/Users/arnav/Desktop/MachineLearning/CSE556-NLP-Assignments/Assignment 2/Task1_Dataset_NER",
)
train_path = os.path.join(DATA_DIR, "NER_train.json")
val_path = os.path.join(DATA_DIR, "NER_val.json")
test_path = os.path.join(DATA_DIR, "NER_test.json")

# Function to load the data from the json file
def load_from_json(file_path):
    """Read the JSON document stored at ``file_path`` and return it.

    The file is decoded explicitly as UTF-8 rather than with the platform's
    default encoding, so non-ASCII text loads the same on every machine.

    Args:
        file_path: Path of the JSON file to read.

    Returns:
        The Python object produced by ``json.load`` for the file's content.
    """
    with open(file_path, "r", encoding="utf-8") as file:
        return json.load(file)

# Parse the three split files, in train / val / test order.
train_dict, val_dict, test_dict = (
    load_from_json(split_path)
    for split_path in (train_path, val_path, test_path)
)

In [18]:
# Function to get the text and labels
def get_text_labels(data_dict):
    """Split a dataset mapping into parallel lists of texts and labels.

    Every value of ``data_dict`` is indexed with ``"text"`` and ``"labels"``;
    the keys of ``data_dict`` are ignored. The two returned lists follow the
    iteration order of ``data_dict``.

    Returns:
        A ``(texts, labels)`` tuple of two lists of equal length.
    """
    # Read both fields of a record before moving on to the next record.
    pairs = [(record["text"], record["labels"]) for record in data_dict.values()]
    texts = [text for text, _ in pairs]
    labels = [label for _, label in pairs]
    return texts, labels

# Split each loaded dataset into two parallel lists: the "text" entries and
# the "labels" entries of its records.
train_texts, train_labels = get_text_labels(train_dict)
val_texts, val_labels = get_text_labels(val_dict)
test_texts, test_labels = get_text_labels(test_dict)

In [19]:
# Get the BERT embeddings (one get_bert_embeddings call per split).
# NOTE(review): this runs the model over every text of each split and may be
# slow or memory-hungry on large splits — consider caching the results.
train_embeddings = get_bert_embeddings(train_texts)
val_embeddings = get_bert_embeddings(val_texts)
test_embeddings = get_bert_embeddings(test_texts)

# Print the shape of the embeddings
# (presumably one row per text in the split — verify against len(*_texts)).
print("Train embeddings shape:", train_embeddings.shape)
print("Val embeddings shape:", val_embeddings.shape)
print("Test embeddings shape:", test_embeddings.shape)

: 