In [1]:
from datasets import load_dataset

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
data = load_dataset("wikipedia", "20220301.en", cache_dir="./data/wikipedia/")

Downloading: 100%|██████████| 15.3k/15.3k [00:00<00:00, 10.1MB/s]
Downloading: 100%|██████████| 20.3G/20.3G [10:06<00:00, 33.4MB/s]  


In [3]:
data

DatasetDict({
    train: Dataset({
        features: ['id', 'url', 'title', 'text'],
        num_rows: 6458670
    })
})

In [5]:
data["train"][0]

{'id': '12',
 'url': 'https://en.wikipedia.org/wiki/Anarchism',
 'title': 'Anarchism',
 'text': 'Anarchism is a political philosophy and movement that is sceptical of authority and rejects all involuntary, coercive forms of hierarchy. Anarchism calls for the abolition of the state, which it holds to be unnecessary, undesirable, and harmful. As a historically left-wing movement, placed on the farthest left of the political spectrum, it is usually described alongside communalism and libertarian Marxism as the libertarian wing (libertarian socialism) of the socialist movement, and has a strong historical association with anti-capitalism and socialism.\n\nHumans lived in societies without formal hierarchies long before the establishment of formal states, realms, or empires. With the rise of organised hierarchical bodies, scepticism toward authority also rose. Although traces of anarchist thought are found throughout history, modern anarchism emerged from the Enlightenment. During the latte

In [10]:
import numpy as np
from torch import Tensor
from transformers import AutoTokenizer, AutoModel

In [8]:
def average_pool(last_hidden_states: Tensor,
                 attention_mask: Tensor) -> Tensor:
    last_hidden = last_hidden_states.masked_fill(~attention_mask[..., None].bool(), 0.0)
    return last_hidden.sum(dim=1) / attention_mask.sum(dim=1)[..., None]

In [9]:
tokenizer = AutoTokenizer.from_pretrained('intfloat/e5-base-v2')
model = AutoModel.from_pretrained('intfloat/e5-base-v2')

Downloading (…)okenizer_config.json: 100%|██████████| 314/314 [00:00<00:00, 1.90MB/s]
Downloading (…)solve/main/vocab.txt: 100%|██████████| 232k/232k [00:00<00:00, 794kB/s]
Downloading (…)/main/tokenizer.json: 100%|██████████| 711k/711k [00:00<00:00, 3.82MB/s]
Downloading (…)cial_tokens_map.json: 100%|██████████| 125/125 [00:00<00:00, 1.03MB/s]
Downloading (…)lve/main/config.json: 100%|██████████| 650/650 [00:00<00:00, 5.15MB/s]
Downloading model.safetensors: 100%|██████████| 438M/438M [00:30<00:00, 14.3MB/s] 


In [11]:
def vectorize_batch(input_texts: list) -> np.ndarray:
    batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt')
    outputs = model(**batch_dict)
    embeddings = average_pool(outputs.last_hidden_state, batch_dict['attention_mask'])
    return embeddings

In [12]:
first_batch = [data["train"][i]["text"] for i in range(10)]

In [15]:
vectorize_batch(first_batch)

tensor([[-0.2876, -0.4200, -0.9810,  ..., -0.0512,  0.3363,  0.5509],
        [-0.1217, -0.0455, -0.8607,  ..., -0.0691,  0.4392,  0.2848],
        [-0.0559,  0.1231, -0.2727,  ...,  0.1846,  0.5108,  0.6531],
        ...,
        [-0.1997, -0.0629, -0.5502,  ..., -0.0533,  0.7621,  0.5836],
        [-0.3560, -0.4601, -0.7222,  ..., -0.0193,  0.7337,  0.6018],
        [-0.4110, -0.0369, -0.6875,  ..., -0.2163,  0.2770,  0.5313]],
       grad_fn=<DivBackward0>)