In [1]:
import json
import torch
from transformers import AutoTokenizer, AutoModel

ModuleNotFoundError: No module named 'transformers'

In [2]:
input_json_path = "./json/merged_output_v2.json"
output_embeddings_path = "./embedding/emb_v2.json"

In [3]:
model_name = "BAAI/bge-m3" 
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

XLMRobertaModel(
  (embeddings): XLMRobertaEmbeddings(
    (word_embeddings): Embedding(250002, 1024, padding_idx=1)
    (position_embeddings): Embedding(8194, 1024, padding_idx=1)
    (token_type_embeddings): Embedding(1, 1024)
    (LayerNorm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): XLMRobertaEncoder(
    (layer): ModuleList(
      (0-23): 24 x XLMRobertaLayer(
        (attention): XLMRobertaAttention(
          (self): XLMRobertaSdpaSelfAttention(
            (query): Linear(in_features=1024, out_features=1024, bias=True)
            (key): Linear(in_features=1024, out_features=1024, bias=True)
            (value): Linear(in_features=1024, out_features=1024, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): XLMRobertaSelfOutput(
            (dense): Linear(in_features=1024, out_features=1024, bias=True)
            (LayerNorm): LayerNorm((1024,), eps=1e-05, elem

In [4]:
def get_embeddings(text, tokenizer, model, device):
    inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True, max_length=512)
    inputs = {key: value.to(device) for key, value in inputs.items()}  
    with torch.no_grad():
        outputs = model(**inputs)

    embeddings = outputs.last_hidden_state[:, 0, :]  
    return embeddings.squeeze(0).cpu().tolist()

In [5]:
with open(input_json_path, "r", encoding="utf-8") as input_file:
    data = json.load(input_file)

embedded_data = []

for item in data:
    description = item["description"]
    embedding = get_embeddings(description, tokenizer, model, device)
    embedded_data.append({
        "video_id": item["video_id"],
        "scale": item["scale"],
        "start": item["start"],
        "end": item["end"],
        "description": item["description"],
        "embedding": embedding
    })


with open(output_embeddings_path, "w", encoding="utf-8") as output_file:
    json.dump(embedded_data, output_file, ensure_ascii=False, indent=4)