In [10]:
# Import required libraries
import os

import torch
import torch_xla.core.xla_model as xm
from datasets import load_dataset
from torch.nn import functional as F
from transformers import AutoTokenizer, AutoModel

In [11]:
# Acquire the XLA device handle; later cells move the parent model and its
# tokenized inputs onto this device.
device = xm.xla_device()
device

device(type='xla', index=0)

In [12]:
def tokenize(datasets, student_tokenizer, max_length=256, num_proc=8):
    """
    Tokenize both sides of a parallel corpus with the student tokenizer.

    datasets: huggingface datasets whose batches expose "text_en" and "text_id"
    student_tokenizer: huggingface tokenizer (student tokenizer)
    max_length: length every sequence is padded / truncated to (default 256)
    num_proc: number of worker processes handed to `datasets.map` (default 8)

    returns: whatever `datasets.map` returns for the batch function below, which
        emits "input_ids_en", "attention_mask_en", "input_ids_id" and
        "attention_mask_id" for each batch.
    """
    def encode_column(texts):
        """
        tokenize one text column, padded and truncated to `max_length`
        """
        # Fixed-length padding keeps every row the same width.
        return student_tokenizer(texts, padding="max_length", truncation=True, max_length=max_length)

    def tokenize_batch(examples):
        """
        batch tokenize function
        """
        output_en = encode_column(examples["text_en"])
        output_id = encode_column(examples["text_id"])

        return {
            "input_ids_en": output_en.input_ids,
            "attention_mask_en": output_en.attention_mask,
            "input_ids_id": output_id.input_ids,
            "attention_mask_id": output_id.attention_mask,
        }

    tokenized_datasets = datasets.map(tokenize_batch, batched=True, num_proc=num_proc)
    return tokenized_datasets




In [13]:
def embedding(datasets, parent_model, parent_tokenizer, max_length=256, batch_size=384):
    """
    Compute parent-model target embeddings for the "text_en" column.

    datasets: huggingface datasets whose batches expose "text_en"
    parent_model: huggingface model (parent model); it is moved to the
        notebook-level `device` and switched to eval mode as a side effect
    parent_tokenizer: huggingface tokenizer matching `parent_model`
    max_length: length every sequence is padded / truncated to (default 256)
    batch_size: number of rows handed to the model per `datasets.map` batch
        (default 384)

    returns: whatever `datasets.map` returns for the batch function below, which
        emits a "target_embedding" numpy array for each batch.
    """
    def cls_pooling(model_output):
        """
        take the hidden state at sequence position 0 of every row
        """
        return model_output.last_hidden_state[:, 0]

    parent_model.to(device)
    # Inference only: make sure dropout and similar train-time layers are off,
    # otherwise a model left in train mode yields noisy, non-reproducible targets.
    parent_model.eval()

    def embedding_batch(examples):
        """
        batch embedding function
        """
        encoded_input = parent_tokenizer(
            examples["text_en"],
            padding="max_length",
            truncation=True,
            max_length=max_length,
            return_tensors="pt",
        ).to(device)

        # No gradients are needed for the target embeddings.
        with torch.no_grad():
            model_output = parent_model(**encoded_input)

        # Bring the pooled vectors back to host memory as numpy.
        target_embedding = cls_pooling(model_output).detach().cpu().numpy()

        return {
            "target_embedding": target_embedding
        }

    embedding_datasets = datasets.map(embedding_batch, batched=True, batch_size=batch_size)
    return embedding_datasets

        

In [14]:
# Load the EN-ID parallel sentence dataset; no split is requested, so all
# available splits are loaded.
dataset = load_dataset("carlesoctav/en-id-parallel-sentences")

Downloading readme:   0%|          | 0.00/898 [00:00<?, ?B/s]

Downloading and preparing dataset None/None to /home/carlesoctav/.cache/huggingface/datasets/carlesoctav___parquet/carlesoctav--en-id-parallel-sentences-2ac6d941a9b892f7/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec...


Downloading data files:   0%|          | 0/8 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/11.4M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/23.9M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/19.3M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/509k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/22.3M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/183M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/21.9M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/534k [00:00<?, ?B/s]

Extracting data files:   0%|          | 0/8 [00:00<?, ?it/s]

Generating NeuLabTedTalks split:   0%|          | 0/94224 [00:00<?, ? examples/s]

Generating QED split:   0%|          | 0/274581 [00:00<?, ? examples/s]

Generating TED2020 split:   0%|          | 0/163319 [00:00<?, ? examples/s]

Generating Tatoeba split:   0%|          | 0/10543 [00:00<?, ? examples/s]

Generating combinedtech split:   0%|          | 0/276659 [00:00<?, ? examples/s]

Generating msmarcocollection split:   0%|          | 0/500000 [00:00<?, ? examples/s]

Generating msmarcoquery split:   0%|          | 0/500000 [00:00<?, ? examples/s]

Generating tico19 split:   0%|          | 0/3071 [00:00<?, ? examples/s]

Dataset parquet downloaded and prepared to /home/carlesoctav/.cache/huggingface/datasets/carlesoctav___parquet/carlesoctav--en-id-parallel-sentences-2ac6d941a9b892f7/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec. Subsequent calls will reuse this data.


  0%|          | 0/8 [00:00<?, ?it/s]

In [15]:
# Student tokenizer: passed to `tokenize` to produce the student-side input ids
# and attention masks.
student_tokenizer = AutoTokenizer.from_pretrained("nreimers/mMiniLMv2-L6-H384-distilled-from-XLMR-Large")
# Parent tokenizer and model: passed to `embedding` to compute the target embeddings.
parent_tokenizer = AutoTokenizer.from_pretrained("sentence-transformers/multi-qa-MiniLM-L6-dot-v1")
parent_model = AutoModel.from_pretrained("sentence-transformers/multi-qa-MiniLM-L6-dot-v1")



In [19]:
# Run the parent model over "text_en" to attach a "target_embedding" per row.
embedding_dataset = embedding(dataset, parent_model, parent_tokenizer)

Map:   0%|          | 0/94224 [00:00<?, ? examples/s]

Map:   0%|          | 0/274581 [00:00<?, ? examples/s]

Map:   0%|          | 0/163319 [00:00<?, ? examples/s]

Map:   0%|          | 0/10543 [00:00<?, ? examples/s]

Map:   0%|          | 0/276659 [00:00<?, ? examples/s]

Map:   0%|          | 0/500000 [00:00<?, ? examples/s]

Map:   0%|          | 0/500000 [00:00<?, ? examples/s]

Map:   0%|          | 0/3071 [00:00<?, ? examples/s]

In [21]:
# Tokenize "text_en" and "text_id" with the student tokenizer on top of the
# embedding output from the previous cell.
embedding_tokenized_dataset = tokenize(embedding_dataset, student_tokenizer)



In [22]:
# Publish the processed dataset to the Hub. The access token is read from the
# HF_TOKEN environment variable rather than written into the notebook, so the
# credential never ends up in version control. If the variable is unset,
# token=None is passed instead of a literal string.
embedding_tokenized_dataset.push_to_hub("carlesoctav/en-id-parallel-sentences-embedding",
                                        token=os.environ.get("HF_TOKEN"))



Pushing dataset shards to the dataset hub:   0%|          | 0/2 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/48 [00:00<?, ?ba/s]

Upload 1 LFS files:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/48 [00:00<?, ?ba/s]

Upload 1 LFS files:   0%|          | 0/1 [00:00<?, ?it/s]



Pushing dataset shards to the dataset hub:   0%|          | 0/6 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/46 [00:00<?, ?ba/s]

Upload 1 LFS files:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/46 [00:00<?, ?ba/s]

Upload 1 LFS files:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/46 [00:00<?, ?ba/s]

Upload 1 LFS files:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/46 [00:00<?, ?ba/s]

Upload 1 LFS files:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/46 [00:00<?, ?ba/s]

Upload 1 LFS files:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/46 [00:00<?, ?ba/s]

Upload 1 LFS files:   0%|          | 0/1 [00:00<?, ?it/s]



Pushing dataset shards to the dataset hub:   0%|          | 0/4 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/41 [00:00<?, ?ba/s]

Upload 1 LFS files:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/41 [00:00<?, ?ba/s]

Upload 1 LFS files:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/41 [00:00<?, ?ba/s]

Upload 1 LFS files:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/41 [00:00<?, ?ba/s]

Upload 1 LFS files:   0%|          | 0/1 [00:00<?, ?it/s]



Pushing dataset shards to the dataset hub:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/11 [00:00<?, ?ba/s]

Upload 1 LFS files:   0%|          | 0/1 [00:00<?, ?it/s]



Pushing dataset shards to the dataset hub:   0%|          | 0/6 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/47 [00:00<?, ?ba/s]

Upload 1 LFS files:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/47 [00:00<?, ?ba/s]

Upload 1 LFS files:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/47 [00:00<?, ?ba/s]

Upload 1 LFS files:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/47 [00:00<?, ?ba/s]

Upload 1 LFS files:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/47 [00:00<?, ?ba/s]

Upload 1 LFS files:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/47 [00:00<?, ?ba/s]

Upload 1 LFS files:   0%|          | 0/1 [00:00<?, ?it/s]



Pushing dataset shards to the dataset hub:   0%|          | 0/11 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/46 [00:00<?, ?ba/s]

Upload 1 LFS files:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/46 [00:00<?, ?ba/s]

Upload 1 LFS files:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/46 [00:00<?, ?ba/s]

Upload 1 LFS files:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/46 [00:00<?, ?ba/s]

Upload 1 LFS files:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/46 [00:00<?, ?ba/s]

Upload 1 LFS files:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/46 [00:00<?, ?ba/s]

Upload 1 LFS files:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/46 [00:00<?, ?ba/s]

Upload 1 LFS files:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/46 [00:00<?, ?ba/s]

Upload 1 LFS files:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/46 [00:00<?, ?ba/s]

Upload 1 LFS files:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/46 [00:00<?, ?ba/s]

Upload 1 LFS files:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/46 [00:00<?, ?ba/s]

Upload 1 LFS files:   0%|          | 0/1 [00:00<?, ?it/s]



Pushing dataset shards to the dataset hub:   0%|          | 0/10 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/50 [00:00<?, ?ba/s]

Upload 1 LFS files:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/50 [00:00<?, ?ba/s]

Upload 1 LFS files:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/50 [00:00<?, ?ba/s]

Upload 1 LFS files:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/50 [00:00<?, ?ba/s]

Upload 1 LFS files:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/50 [00:00<?, ?ba/s]

Upload 1 LFS files:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/50 [00:00<?, ?ba/s]

Upload 1 LFS files:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/50 [00:00<?, ?ba/s]

Upload 1 LFS files:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/50 [00:00<?, ?ba/s]

Upload 1 LFS files:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/50 [00:00<?, ?ba/s]

Upload 1 LFS files:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/50 [00:00<?, ?ba/s]

Upload 1 LFS files:   0%|          | 0/1 [00:00<?, ?it/s]



Pushing dataset shards to the dataset hub:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/4 [00:00<?, ?ba/s]

Upload 1 LFS files:   0%|          | 0/1 [00:00<?, ?it/s]

In [28]:
# Smoke test input: tokenize one short sentence with the parent tokenizer and
# move the resulting tensors to the XLA device.
test = parent_tokenizer("hello dunia",return_tensors="pt").to(device)

In [32]:
# Inspect the shape of the parent model's last hidden state for the smoke-test input.
parent_model(**test).last_hidden_state.shape

torch.Size([1, 5, 384])