In [7]:
from transformers import AutoModel,AutoTokenizer

# Initialize the model
model_id = "jinaai/jina-embeddings-v3"
model = AutoModel.from_pretrained(model_id, trust_remote_code=True)
tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)


texts = [
    "Follow the white rabbit.",  # English
    "Sigue al conejo blanco.",  # Spanish
    "Suis le lapin blanc.",  # French
    "跟着白兔走。",  # Chinese
    "اتبع الأرنب الأبيض.",  # Arabic
    "Folge dem weißen Kaninchen.",  # German
]

# When calling the `encode` function, you can choose a `task` based on the use case:
# 'retrieval.query', 'retrieval.passage', 'separation', 'classification', 'text-matching'
# Alternatively, you can choose not to pass a `task`, and no specific LoRA adapter will be used.
embeddings = model.encode(texts, task="text-matching")

# Compute similarities
print(embeddings[0] @ embeddings[1].T)

0.70863205


In [5]:
embeddings.shape

(6, 1024)

In [15]:
document = "\n".join(texts)
document.split()

['Follow',
 'the',
 'white',
 'rabbit.',
 'Sigue',
 'al',
 'conejo',
 'blanco.',
 'Suis',
 'le',
 'lapin',
 'blanc.',
 '跟着白兔走。',
 'اتبع',
 'الأرنب',
 'الأبيض.',
 'Folge',
 'dem',
 'weißen',
 'Kaninchen.']

In [113]:
import numpy as np

SEPARATOR_TOKEN = "<SEP>"
SEPARATOR_TOKEN_ID = tokenizer.convert_tokens_to_ids(SEPARATOR_TOKEN)
# Combine all texts into a single document
document = SEPARATOR_TOKEN.join(texts)

# Calculate the length of each text
inputs_length = [len(text) for text in texts]
print("Inputs length:", inputs_length)

# Compute cumulative lengths
cum_inputs_length = np.cumsum([0] + inputs_length)
print("Cumulative inputs length:", cum_inputs_length)

# Generate span annotations as tuples of start and end indices
span_annotations = [(start, end) for start, end in zip(cum_inputs_length[:-1], cum_inputs_length[1:])]
print("Span annotations:", span_annotations)

print("Document:", document)

Inputs length: [24, 23, 20, 6, 19, 27]
Cumulative inputs length: [  0  24  47  67  73  92 119]
Span annotations: [(0, 24), (24, 47), (47, 67), (67, 73), (73, 92), (92, 119)]
Document: Follow the white rabbit.<SEP>Sigue al conejo blanco.<SEP>Suis le lapin blanc.<SEP>跟着白兔走。<SEP>اتبع الأرنب الأبيض.<SEP>Folge dem weißen Kaninchen.


In [115]:
inputs = tokenizer(
    document,
    padding=True,
    truncation=True,
    return_tensors="pt",
    return_offsets_mapping=True,
)
inputs

{'input_ids': tensor([[     0,  77168,     70,  35011, 152131,     18, 100052,    294,  21290,
           2740,   8859,   6261,    144,    158,  18039, 101455, 100052,    294,
          21290,   2740,  14168,    164,     95,     21,   5128,  38972, 100052,
            294,  21290,   2740, 113998,   3515, 130818,   3469,     30,  16093,
            294,  21290,   2740,    396,  46776,  86218,  44573, 151721, 100052,
            294,  21290,   2740,  27591,  54090,    745,  23739,     33,   2734,
             73,   4834,      5,      2]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1]]), 'offset_mapping': tensor([[[  0,   0],
         [  0,   6],
         [  7,  10],
         [ 11,  16],
         [ 17,  22],
         [ 22,  23],
         [ 23,  25],
         [ 25,  26],
         [ 26,  28],
         [ 28,  29],
      

In [64]:
inputs["input_ids"].shape, inputs["attention_mask"].shape, inputs["offset_mapping"].shape

(torch.Size([6, 10]), torch.Size([6, 10]), torch.Size([6, 10, 2]))

In [116]:
offset_mapping = inputs["offset_mapping"][0]

for start, end in offset_mapping:
    # Get the start and end token indices for each text
    print(f"Text: {document[start:end]}")

Text: 
Text: Follow
Text: the
Text: white
Text: rabbi
Text: t
Text: .<
Text: S
Text: EP
Text: >
Text: Si
Text: gue
Text: al
Text: con
Text: ejo
Text: blanco
Text: .<
Text: S
Text: EP
Text: >
Text: Su
Text: is
Text: le
Text: la
Text: pin
Text: blanc
Text: .<
Text: S
Text: EP
Text: >
Text: 跟着
Text: 白
Text: 兔
Text: 走
Text: 。
Text: <
Text: S
Text: EP
Text: >
Text: ات
Text: بع
Text: الأر
Text: نب
Text: الأبيض
Text: .<
Text: S
Text: EP
Text: >
Text: Fo
Text: lge
Text: dem
Text: weiß
Text: en
Text: Kan
Text: in
Text: chen
Text: .
Text: 


In [79]:
span_annotations

[(0, 24), (24, 47), (47, 67), (67, 73), (73, 92), (92, 119)]

In [None]:
chunk_positions = []
cum_inputs_length, inputs["offset_mapping"]


for start, end in span_annotations:
    chunk_start, chunk_end = None, None
    for i, (start_offset, end_offset) in enumerate(offset_mapping):
        if start_offset == start and end_offset < end:
            chunk_start = i
        if start_offset > start and end_offset == end:
            chunk_end = i
            break

    chunk_positions.append((chunk_start, chunk_end))

chunk_positions

[(1, 6), (7, 13), (14, 20), (21, 25), (26, 31), (32, 40)]

In [103]:
sep_id = tokenizer.convert_tokens_to_ids('[SEP]')
sep_id

3

In [97]:
output = model(**inputs)

output[0].shape, output[1].shape

Flash attention implementation does not support kwargs: offset_mapping


(torch.Size([1, 42, 1024]), torch.Size([1, 1024]))

In [100]:
output[0][0][0]

tensor([ 0.0067, -2.3910,  1.0657,  ..., -0.1316, -0.2900, -0.1048])

In [56]:
input_text = """Berlin is the capital and largest city of Germany, both by area and by population.
Its more than 3.85 million inhabitants make it the European Union's most populous city, as measured by population within city limits.
The city is also one of the states of Germany, and is the third smallest state in the country in terms of area."""