In [1]:
# !pip install clean-text
# !pip install datasets
# !pip install allennlp
# !pip install --upgrade huggingface_hub

In [None]:
import numpy as np
from allennlp.modules.elmo import Elmo, batch_to_ids
from tqdm import tqdm
import torch
import math
import pickle

In [None]:
# Load the preprocessed dataset dictionary from the local pickle file.
# NOTE(review): pickle.load can execute arbitrary code during deserialization;
# only run this cell on a file that comes from a trusted source.
with open('hatexplain_data.pickle', 'rb') as file:
    data = pickle.load(file)

In [None]:
# Pull the document collections out of the loaded dictionary.
# NOTE(review): `documents` is not used again in the visible cells;
# presumably it is the full corpus, confirm before removing.
documents = data["documents"]
train_documents = data["train_documents"]
test_documents = data["test_documents"]

In [None]:
# Show the first training document to check its format before embedding.
# NOTE(review): batch_to_ids is later fed these documents directly, so each one
# is presumably a list of string tokens; confirm.
print(train_documents[0])

<h4>ELMo</h4>

In [None]:
# Locations of the pre-trained ELMo files (5.5B variant): the options JSON and
# the HDF5 weights. Nothing is loaded here; the model is built from these in a
# later cell.
options_file = "https://s3-us-west-2.amazonaws.com/allennlp/models/elmo/2x4096_512_2048cnn_2xhighway_5.5B/elmo_2x4096_512_2048cnn_2xhighway_5.5B_options.json"
weights_file = "https://s3-us-west-2.amazonaws.com/allennlp/models/elmo/2x4096_512_2048cnn_2xhighway_5.5B/elmo_2x4096_512_2048cnn_2xhighway_5.5B_weights.hdf5"

In [None]:
# Prefer the first CUDA GPU when one is available; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda:0")
else:
    device = torch.device("cpu")
print(f"Using device: {device}")

In [None]:
# Build the ELMo module from the options/weights files and move it to `device`.
# dropout is disabled (dropout=0) and a single output representation is
# requested; the embedding function below reads it at index 0.
elmo = Elmo(options_file, weights_file, num_output_representations=1, dropout=0).to(device)

def batch_generator(docs, batch_size):
    """Lazily split ``docs`` into consecutive slices of at most ``batch_size`` items.

    The final slice is shorter when ``len(docs)`` is not a multiple of
    ``batch_size``; an empty ``docs`` produces no slices.
    """
    chunk_starts = range(0, len(docs), batch_size)
    yield from (docs[start:start + batch_size] for start in chunk_starts)

def get_document_embeddings(documents, batch_size):
    """Generate document-level ELMo embeddings for the given documents, with a progress bar.

    Each batch is converted to character ids, run through the module-level
    ``elmo`` model on ``device``, and every document is reduced to a single
    vector by averaging its token embeddings. Padding positions are excluded
    from the average via the mask returned by ELMo, so a document's embedding
    does not depend on the length of the other documents in its batch.

    Args:
        documents: Sequence of documents in the form accepted by
            ``batch_to_ids`` (presumably lists of string tokens; confirm
            against the data preparation step).
        batch_size: Number of documents per forward pass.

    Returns:
        ``np.ndarray`` with one row per document. For an empty ``documents``
        the result is an empty array.
    """
    document_embeddings = []
    num_batches = math.ceil(len(documents) / batch_size)

    # Inference only: the results are converted to NumPy right away, so there
    # is no reason to record an autograd graph for every batch.
    with torch.no_grad():
        # Wrap the batch generator with tqdm to display the progress
        for batch in tqdm(batch_generator(documents, batch_size), total=num_batches):
            # Convert sentences to character ids
            character_ids = batch_to_ids(batch).to(device)
            output = elmo(character_ids)

            # Per the AllenNLP Elmo docs: representations are
            # (batch, timesteps, dim) and 'mask' is (batch, timesteps),
            # marking the real (non-padding) tokens.
            token_embeddings = output['elmo_representations'][0]
            mask = output['mask'].unsqueeze(-1).to(token_embeddings.dtype)

            # Masked mean over the sequence dimension. A plain .mean(dim=1)
            # would also divide by the padded positions and shrink the vectors
            # of shorter documents. clamp() guards against a zero-token document.
            token_counts = mask.sum(dim=1).clamp(min=1)
            doc_embeddings = (token_embeddings * mask).sum(dim=1) / token_counts
            document_embeddings.extend(doc_embeddings.detach().cpu().numpy())

    return np.array(document_embeddings)

<h4>ELMo Embeddings</h4>

In [None]:
# ELMo document embeddings for the train and test splits (50 documents per batch)
X_train_elmo = get_document_embeddings(train_documents, batch_size=50)
X_test_elmo = get_document_embeddings(test_documents, batch_size=50)

In [None]:
# Sanity check: the array shape, followed by the first five embedding rows.
print(X_train_elmo.shape)
print(X_train_elmo[0:5])

In [None]:
# Show the embedding vector of the first training document on its own.
print(X_train_elmo[0])

In [None]:
# Bundle both embedding matrices under their variable names so that downstream
# notebooks can load them from a single file.
data_to_save = dict(X_train_elmo=X_train_elmo, X_test_elmo=X_test_elmo)

# Serialize the bundle to disk in binary mode.
with open('elmo_embeddings.pickle', 'wb') as file:
    pickle.dump(data_to_save, file)