I did some heavy testing to see how the chunked MPNET embeddings differed from the unchunked Jina embeddings. 

What I found was that the MPnet embeddings represented the literal information better. So the two embeddings generated complimentary and compatible results

In [1]:
import pandas as pd

In [None]:
%pip install - r requirements.txt


In [1]:
j_corpus_path = "data/all-poems-AC-AN-jina-v2-base-en-embeddings.pkl"
c_corpus_path = (
    "data/all-poems-AC-AN-max-pooled-chunked-all-mpnet-base-v2-embeddings.pkl"
)

In [2]:
import pickle

with open(jc_corpus_path, "rb") as jcfin, open(j_corpus_path, "rb") as jfin, open(
    c_corpus_path, "rb"
) as cfin:
    jc_data = pickle.load(jcfin)
    j_data = pickle.load(jfin)
    c_data = pickle.load(cfin)
    poems = jc_data["poem"]
    poem_ids = jc_data["poem_id"]
    jc_corpus = jc_data["embedding"]
    j_corpus = j_data["embedding"]
    c_corpus = c_data["embedding"]

In [12]:
jc_corpus.shape


(16807, 768)

In [14]:
# Create Index
import ngtpy

EMB_SIZE = 768

# Initialize index name
jc_corpus_ix_name = "indices/ngt_index_all-poems-AC-AN-2048-chunked-jina-v2-base-en"

# Create an empty index

ngtpy.create(
    path=bytes(jc_corpus_ix_name, encoding="utf8"),
    dimension=EMB_SIZE,
    distance_type="Normalized Cosine",
)

jc_corpus_ix = ngtpy.Index(bytes(jc_corpus_ix_name, encoding="utf8"))

# insert the objects

jc_corpus_ix.batch_insert(jc_corpus)

# save the index.
jc_corpus_ix.save()

In [15]:
# Create Index
import ngtpy

EMB_SIZE = 768

# Initialize index name
j_corpus_ix_name = "indices/ngt_index_all-poems-AC-AN-jina-v2-base-en"

# Create an empty index

ngtpy.create(
    path=bytes(j_corpus_ix_name, encoding="utf8"),
    dimension=EMB_SIZE,
    distance_type="Normalized Cosine",
)

j_corpus_ix = ngtpy.Index(bytes(j_corpus_ix_name, encoding="utf8"))

# insert the objects

j_corpus_ix.batch_insert(j_corpus)

# save the index.
j_corpus_ix.save()

In [16]:
# Create Index
import ngtpy

EMB_SIZE = 768

# Initialize index name
c_corpus_ix_name = "indices/all-poems-AC-AN-max-pooled-chunked-all-mpnet-base-v2"

# Create an empty index

ngtpy.create(
    path=bytes(c_corpus_ix_name, encoding="utf8"),
    dimension=EMB_SIZE,
    distance_type="Normalized Cosine",
)

c_corpus_ix = ngtpy.Index(bytes(c_corpus_ix_name, encoding="utf8"))

# insert the objects

c_corpus_ix.batch_insert(c_corpus)

# save the index.
c_corpus_ix.save()

In [3]:
import ngtpy

j_corpus_ix_name = "indices/ngt_index_all-poems-AC-AN-jina-v2-base-en"
j_corpus_index = ngtpy.Index(bytes(j_corpus_ix_name, encoding="utf8"))


In [4]:
import ngtpy

c_corpus_ix_name = "indices/all-poems-AC-AN-max-pooled-chunked-all-mpnet-base-v2"
c_corpus_index = ngtpy.Index(bytes(c_corpus_ix_name, encoding="utf8"))


In [5]:
import numpy as np
from sentence_transformers import SentenceTransformer, CrossEncoder
from transformers import AutoModel

mp_model = SentenceTransformer("sentence-transformers/all-mpnet-base-v2")
jina_model = AutoModel.from_pretrained(
    "jinaai/jina-embeddings-v2-base-en", trust_remote_code=True
)
cross_encoder = CrossEncoder("cross-encoder/ms-marco-MiniLM-L-6-v2")
# cross_encoder = CrossEncoder("Lajavaness/CrossEncoder-camembert-large", max_length=512)

Downloading (…)okenizer_config.json:   0%|          | 0.00/373 [00:00<?, ?B/s]

In [33]:
def query_all_emb(query: str, count: int):
    mpnet_query_embedding = mp_model.encode(query)
    jina_query_embedding = jina_model.encode(query)
    cc_results = search_c_corpus(mpnet_query_embedding, count)
    jj_results = search_j_corpus(jina_query_embedding, count)

    j_cross_inp = [[query[0], poems[result[0]]] for result in jj_results]
    j_cross_scores = cross_encoder.predict(j_cross_inp)
    j_corpus_result_dict = [
        {"corpus_id": tup[0], "score": tup[1]} for tup in jj_results
    ]

    c_cross_inp = [[query[0], poems[result[0]]] for result in cc_results]
    c_cross_scores = cross_encoder.predict(c_cross_inp)
    c_corpus_result_dict = [
        {"corpus_id": tup[0], "score": tup[1]} for tup in cc_results
    ]

    # Add 'cross-score' to each dict
    for idx in range(len(j_cross_scores)):
        j_corpus_result_dict[idx]["cross-score"] = j_cross_scores[idx]

    print("Top-3 Cross-Encoder Re-ranker hits from Jina")
    print("query was: ", query)
    j_reranked_hits = sorted(
        j_corpus_result_dict, key=lambda x: x["cross-score"], reverse=True
    )
    for hit in j_reranked_hits[0:3]:
        print(
            "\t{:.3f}\t{}".format(
                hit["cross-score"], poems[hit["corpus_id"]].replace("\n", " ")
            )
        )
    for idx in range(len(c_cross_scores)):
        c_corpus_result_dict[idx]["cross-score"] = c_cross_scores[idx]

    print("Top-3 Cross-Encoder Re-ranker hits from MPNet")
    print("query was: ", query)
    c_reranked_hits = sorted(
        c_corpus_result_dict, key=lambda x: x["cross-score"], reverse=True
    )
    for hit in c_reranked_hits[0:3]:
        print(
            "\t{:.3f}\t{}".format(
                hit["cross-score"], poems[hit["corpus_id"]].replace("\n", " ")
            )
        )


def search_c_corpus(embedding: np.ndarray, count: int) -> None:
    c_results = c_corpus_index.search(embedding, size=count)
    print("All-MPNet-Chunked")
    print("ID\tDistance")
    for result in c_results:
        print("{}\t{}".format(*result))
    print(
        "# of distance computations="
        + str(c_corpus_index.get_num_of_distance_computations())
    )
    return c_results


# def search_jc_corpus(embedding: np.ndarray, count: int) -> None:
#     jc_results = jc_corpus_ix.search(embedding, size=count)
#     print("Jina-Chunked")
#     print("ID\tDistance")
#     for result in jc_results:
#         print("{}\t{}".format(*result))
#     print(
#         "# of distance computations="
#         + str(jc_corpus_ix.get_num_of_distance_computations())
#    )


def search_j_corpus(embedding: np.ndarray, count: int) -> None:
    j_results = j_corpus_index.search(embedding, size=count)
    print("Jina")
    print("ID\tDistance")
    for result in j_results:
        print("{}\t{}".format(*result))
    print(
        "# of distance computations="
        + str(j_corpus_index.get_num_of_distance_computations())
    )
    return j_results

In [44]:
query_all_emb("explaining death to children", 10)


All-MPNet-Chunked
ID	Distance
5600	0.4364966154098511
14830	0.454415887594223
15792	0.46284836530685425
338	0.4656195640563965
6375	0.4805513918399811
27	0.4829980731010437
11204	0.4894413352012634
4681	0.4904647171497345
9177	0.5024305582046509
11798	0.5163055062294006
# of distance computations=6941
Jina
ID	Distance
5600	0.1718270182609558
14750	0.18301886320114136
5304	0.18790149688720703
3379	0.19929592311382294
14490	0.20940038561820984
9046	0.21109333634376526
2733	0.21349090337753296
12485	0.21472905576229095
2996	0.21675236523151398
8467	0.21900296211242676
# of distance computations=13052
Top-3 Cross-Encoder Re-ranker hits from Jina
query was:  explaining death to children
	-6.659	﻿ISo this is where the children come to die,hidden on the hospital’s highest floor.They wear their bandages like uniformsand pull their iv rigs along the hall with slow and careful steps. Or bald and pale,they lie in bright pajamas on their beds,watching another world on a screen.The mothers spend th

In [37]:
poem_ids[16389]


56302

In [6]:
def query_all_emb(query: str, count: int):
    mpnet_query_embedding = mp_model.encode(query)
    jina_query_embedding = jina_model.encode(query)
    cc_results = search_c_corpus(mpnet_query_embedding, count)
    jj_results = search_j_corpus(jina_query_embedding, count)

    unique_tuples = {}

    # Add tuples from list1 to the dictionary
    for tup in cc_results:
        unique_tuples[tup[0]] = tup

    # Add tuples from list2 to the dictionary, overwriting any duplicates from list1
    for tup in jj_results:
        unique_tuples[tup[0]] = tup

    # Convert the dictionary values back to a list
    combo_results = list(unique_tuples.values())

    combo_cross_inp = [[query[0], poems[result[0]]] for result in combo_results]
    combo_cross_scores = cross_encoder.predict(combo_cross_inp)
    combo_corpus_result_dict = [
        {"corpus_id": tup[0], "score": tup[1]} for tup in combo_results
    ]
    for idx in range(len(combo_cross_scores)):
        combo_corpus_result_dict[idx]["cross-score"] = combo_cross_scores[idx]

    print("Top-3 Cross-Encoder Re-ranker hits from Jina")
    print("query was: ", query)
    combo_reranked_hits = sorted(
        combo_corpus_result_dict, key=lambda x: x["cross-score"], reverse=True
    )
    for hit in combo_reranked_hits[0:5]:
        print(
            "\t{:.3f}\t{}".format(
                hit["cross-score"], poems[hit["corpus_id"]].replace("\n", " ")
            )
        )


def search_c_corpus(embedding: np.ndarray, count: int) -> None:
    c_results = c_corpus_index.search(embedding, size=count)
    print("All-MPNet-Chunked")
    print("ID\tDistance")
    for result in c_results:
        print("{}\t{}".format(*result))
    print(
        "# of distance computations="
        + str(c_corpus_index.get_num_of_distance_computations())
    )
    return c_results


def search_j_corpus(embedding: np.ndarray, count: int) -> None:
    j_results = j_corpus_index.search(embedding, size=count)
    print("Jina")
    print("ID\tDistance")
    for result in j_results:
        print("{}\t{}".format(*result))
    print(
        "# of distance computations="
        + str(j_corpus_index.get_num_of_distance_computations())
    )
    return j_results

In [8]:
query_all_emb("introspective poem about the passing nature of reality", 10)


All-MPNet-Chunked
ID	Distance
3006	0.33397549390792847
11854	0.34220629930496216
13007	0.3424101173877716
9241	0.34329450130462646
7629	0.34359729290008545
15135	0.34834399819374084
2032	0.34856417775154114
13240	0.34860917925834656
2008	0.35346338152885437
13148	0.3554849922657013
# of distance computations=2016
Jina
ID	Distance
12746	0.16509942710399628
4072	0.1734418272972107
13176	0.17497900128364563
588	0.17662417888641357
9592	0.17838580906391144
11032	0.18057706952095032
16194	0.18261007964611053
2181	0.18518666923046112
6855	0.1854131817817688
13108	0.18553785979747772
# of distance computations=2768
Top-3 Cross-Encoder Re-ranker hits from Jina
query was:  introspective poem about the passing nature of reality
	-3.577	One of us is a faucet reconciling to the temperature of indifference. This is the world: the drawer assembled by you pinches a finger before yielding. There are so many foreigners here, I said, when I first stepped onto a beach in Virginia. I had an idea of the oc

In [106]:
poems[6855]


'1 When you have been on trains on buses on the road with someone for months living with each other’s filthy teenager things you don’t expect him to come up with a neat shirt and black trousers from the bottom of his backpack on the last night which is what he did looking like a waiter his hair windblown across continents now cut as if for a ceremony his blue eyes like the summer’s oceans but before the dinner and the next morning’s flights to separate places we had to go find Karl Marx’s grave taking the underground across the city walking under the rippling trees of the cemetery the green of the place like cinematography and finally we are standing in front of the famous man’s enormous bronze head and I am looking at him looking at it I am looking at the side of his smiling face his ear and the speck of shaving cream I point out to him wanting to touch him there thirty years his white shirt ahead of me on the path back. 2 On the podcast the psychologist talking about death and our ne

In [19]:
def query_all_emb(query: str, count: int):
    mpnet_query_embedding = mp_model.encode(query)
    jina_query_embedding = jina_model.encode(query)
    cc_results = search_c_corpus(mpnet_query_embedding, count)
    jj_results = search_j_corpus(jina_query_embedding, count)

    combo_results = combine_and_unique_results(cc_results, jj_results)

    combo_corpus_result_dict = cross_scores_and_dict(query, combo_results)
    return print_top_hits(query, combo_corpus_result_dict)


def combine_and_unique_results(list1, list2):
    unique_tuples = {tup[0]: tup for tup in list1 + list2}
    return list(unique_tuples.values())


def cross_scores_and_dict(query, results):
    cross_inp = [[query[0], poems[result[0]]] for result in results]
    cross_scores = cross_encoder.predict(cross_inp)
    result_dict = [
        {"corpus_id": tup[0], "score": tup[1], "cross-score": score}
        for tup, score in zip(results, cross_scores)
    ]
    return result_dict


def print_top_hits(query, result_dict, top=5):
    reranked_hits = sorted(result_dict, key=lambda x: x["cross-score"], reverse=True)
    print("Top-{} Cross-Encoder Re-ranker hits".format(top))
    print("query was: ", query)
    for hit in reranked_hits[:top]:
        print(
            "\t{:.3f}\t{}".format(
                hit["cross-score"], poems[hit["corpus_id"]].replace("\n", " ")
            )
        )


def search_c_corpus(embedding: np.ndarray, count: int) -> None:
    return c_corpus_index.search(embedding, size=count)


def search_j_corpus(embedding: np.ndarray, count: int) -> None:
    return j_corpus_index.search(embedding, size=count)

In [23]:
query_all_emb(
    "I lov’d thee from the earliest dawn, When first I saw thy beauty’s ray, And will, until life’s eve comes on, And beauty’s blossom fades away; And when all things go well with thee, With smiles and tears remember me. I’ll love thee when thy morn is past, And wheedling gallantry is o’er, When youth is lost in age’s blast, And beauty can ascend no more, And when life’s journey ends with thee",
    20,
)

Top-5 Cross-Encoder Re-ranker hits
query was:  I lov’d thee from the earliest dawn, When first I saw thy beauty’s ray, And will, until life’s eve comes on, And beauty’s blossom fades away; And when all things go well with thee, With smiles and tears remember me. I’ll love thee when thy morn is past, And wheedling gallantry is o’er, When youth is lost in age’s blast, And beauty can ascend no more, And when life’s journey ends with thee
	-1.902	I lov’d thee from the earliest dawn, When first I saw thy beauty’s ray, And will, until life’s eve comes on, And beauty’s blossom fades away; And when all things go well with thee, With smiles and tears remember me. I’ll love thee when thy morn is past, And wheedling gallantry is o’er, When youth is lost in age’s blast, And beauty can ascend no more, And when life’s journey ends with thee, O, then look back and think of me. I’ll love thee with a smile or frown, ’Mid sorrow’s gloom or pleasure’s light, And when the chain of life runs down, Pursue t