In [2]:
#!/usr/bin/env python
import os
import pandas as pd
import torch
import torch.nn.functional as F
from transformers import AutoTokenizer, AutoModel
import gc


  from .autonotebook import tqdm as notebook_tqdm


In [3]:
def mean_pooling(model_output, attention_mask):
    """Mask-weighted mean of the token embeddings along dimension 1.

    Args:
        model_output: Object exposing a ``last_hidden_state`` tensor of
            token-level embeddings.
        attention_mask: Mask tensor; it is unsqueezed on its last axis and
            expanded to the size of ``last_hidden_state`` before weighting.

    Returns:
        Tensor holding, per sequence, the sum of the mask-weighted token
        embeddings divided by the mask total.
    """
    hidden_states = model_output.last_hidden_state
    mask = attention_mask.unsqueeze(-1).expand(hidden_states.size()).float()
    weighted_sum = torch.sum(hidden_states * mask, 1)
    # Clamp the mask total so a fully masked row divides by 1e-9 instead of 0.
    mask_total = torch.clamp(mask.sum(1), min=1e-9)
    return weighted_sum / mask_total

def compute_embeddings(pickle_path, col_name, model_directory, tokenizer_path, save_name, batch_size=256, checkpoint_interval=100):
    """Embed one text column of a pickled DataFrame and save the vectors to disk.

    Each batch is tokenized, run through the model, mean-pooled with
    ``mean_pooling``, L2-normalized and moved to the CPU. Results are written
    under ``<model_directory>/embedding_output``:

    * ``{save_name}_embeddings_checkpoint_{batch_num}.pt`` every
      ``checkpoint_interval`` batches and after the last batch. Each checkpoint
      holds ALL embeddings computed so far (cumulative, not a shard).
    * ``{save_name}_embeddings.pt`` with the complete result, always written
      once every batch has been processed.

    Args:
        pickle_path: Path of the pickled DataFrame to read.
        col_name: Column holding the documents; missing values become ``" "``.
        model_directory: Directory passed to ``AutoModel.from_pretrained``; the
            output folder is created inside it.
        tokenizer_path: Path passed to ``AutoTokenizer.from_pretrained``.
        save_name: Prefix used for the saved file names.
        batch_size: Number of documents per forward pass (must be >= 1).
        checkpoint_interval: Number of batches between checkpoints (must be >= 1).

    Returns:
        None. If the column contains no documents nothing is loaded or saved.

    Raises:
        ValueError: If ``batch_size`` or ``checkpoint_interval`` is below 1.
    """
    if batch_size < 1:
        raise ValueError(f"batch_size must be a positive integer, got {batch_size}")
    if checkpoint_interval < 1:
        raise ValueError(f"checkpoint_interval must be a positive integer, got {checkpoint_interval}")

    # NOTE(security): unpickling can execute arbitrary code; only load trusted files.
    df = pd.read_pickle(pickle_path)
    documents = df[col_name].fillna(" ").tolist()
    print(f"Computing embeddings for {len(documents)} documents")

    if not documents:
        # Nothing to embed: skip the model load instead of failing after the loop.
        print("No documents to embed; nothing was saved")
        return

    # Load tokenizer from the provided tokenizer path
    tokenizer = AutoTokenizer.from_pretrained(tokenizer_path)
    model = AutoModel.from_pretrained(model_directory)
    device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
    model.to(device)
    model.eval()

    # Create the embedding_output folder if it does not exist
    output_dir = os.path.join(model_directory, "embedding_output")
    os.makedirs(output_dir, exist_ok=True)

    # NOTE: every embedding stays in memory until the end, and each checkpoint
    # re-serializes the whole history, so memory and disk use grow with the
    # number of documents.
    all_embeddings = []
    snapshot = None
    num_batches = (len(documents) - 1) // batch_size + 1

    for batch_num, start in enumerate(range(0, len(documents), batch_size), start=1):
        batch = documents[start:start + batch_size]
        encoded = tokenizer(batch, padding=True, truncation=True, return_tensors='pt')
        encoded = {k: v.to(device) for k, v in encoded.items()}
        with torch.no_grad():
            model_output = model(**encoded)
            embeddings = mean_pooling(model_output, encoded['attention_mask'])
            embeddings = F.normalize(embeddings, p=2, dim=1)
        all_embeddings.append(embeddings.cpu())
        print(f"Processed batch {batch_num}/{num_batches}")
        del encoded, model_output, embeddings
        gc.collect()
        if torch.cuda.is_available():
            torch.cuda.empty_cache()

        # Save checkpoint periodically or on the last batch
        if batch_num % checkpoint_interval == 0 or batch_num == num_batches:
            snapshot = torch.cat(all_embeddings, dim=0)
            # Keep only the merged tensor so later concatenations do not have
            # to walk every per-batch tensor again.
            all_embeddings = [snapshot]
            checkpoint_path = os.path.join(output_dir, f"{save_name}_embeddings_checkpoint_{batch_num}.pt")
            torch.save(snapshot, checkpoint_path)
            print(f"Saved checkpoint at batch {batch_num} to {checkpoint_path}")

    # The last batch always triggers a checkpoint, so `snapshot` already holds
    # the complete result. Always write the final file, including when the
    # batch count is an exact multiple of checkpoint_interval.
    final_path = os.path.join(output_dir, f"{save_name}_embeddings.pt")
    torch.save(snapshot, final_path)
    print(f"Saved final embeddings to {final_path}")


In [4]:
if __name__ == "__main__":
    # `__file__` is undefined when this runs as a notebook cell, so fall back
    # to the current working directory in that case.
    try:
        filepath = os.path.dirname(os.path.realpath(__file__))
    except NameError:
        filepath = os.getcwd()

    # Choose the directory layout from whether "hbailey" appears in the path.
    if "hbailey" not in filepath:
        data_dir = "/home/hannah/github/embedding_resonance/data"
        model_directory = "/home/hannah/github/embedding_resonance/model/who_leads_model_final/checkpoint-100000"
        tokenizer_path = "/home/hannah/github/embedding_resonance/model/tokenizer"
    else:
        data_dir = "/home/export/hbailey/data/embedding_resonance/data"
        model_directory = "/home/export/hbailey/models/embedding_resonance/who_leads_model_final/checkpoint-100000"
        tokenizer_path = "/home/export/hbailey/models/embedding_resonance/tokenizer"

    # Build the path of the cleaned DataFrame pickle and embed its `post_text` column.
    who_leads_folder = os.path.join(data_dir, "who_leads_who_follows")
    pickle_path = os.path.join(who_leads_folder, "cleaned_who_leads_df.pkl")
    compute_embeddings(
        pickle_path=pickle_path,
        col_name="post_text",
        model_directory=model_directory,
        tokenizer_path=tokenizer_path,
        save_name="who_leads_model",
    )

Computing embeddings for 35727478 documents


Some weights of XLMRobertaModel were not initialized from the model checkpoint at /home/hannah/github/embedding_resonance/model/who_leads_model_final/checkpoint-100000 and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Processed batch 1/139561
Processed batch 2/139561
Processed batch 3/139561
Processed batch 4/139561
Processed batch 5/139561
Processed batch 6/139561
Processed batch 7/139561
Processed batch 8/139561
Processed batch 9/139561
Processed batch 10/139561
Processed batch 11/139561
Processed batch 12/139561
Processed batch 13/139561
Processed batch 14/139561
Processed batch 15/139561
Processed batch 16/139561
Processed batch 17/139561
Processed batch 18/139561
Processed batch 19/139561
Processed batch 20/139561
Processed batch 21/139561
Processed batch 22/139561
Processed batch 23/139561
Processed batch 24/139561
Processed batch 25/139561
Processed batch 26/139561
Processed batch 27/139561
Processed batch 28/139561
Processed batch 29/139561
Processed batch 30/139561
Processed batch 31/139561
Processed batch 32/139561
Processed batch 33/139561
Processed batch 34/139561
Processed batch 35/139561
Processed batch 36/139561
Processed batch 37/139561
Processed batch 38/139561
Processed batch 39/13

KeyboardInterrupt: 