In [1]:
from pymilvus import MilvusClient, Collection, FieldSchema, CollectionSchema, DataType, utility
client = MilvusClient(alias="default")
print(client)

<pymilvus.milvus_client.milvus_client.MilvusClient object at 0x11159d3d0>


In [2]:
bge_contributions = Collection(name="bge_contributions")
print(bge_contributions)

<Collection>:
-------------
<name>: bge_contributions
<description>: 
<schema>: {'auto_id': True, 'description': '', 'fields': [{'name': 'id', 'description': '', 'type': <DataType.INT64: 5>, 'is_primary': True, 'auto_id': True}, {'name': 'text', 'description': '', 'type': <DataType.VARCHAR: 21>, 'params': {'max_length': 4096}}, {'name': 'doi', 'description': '', 'type': <DataType.VARCHAR: 21>, 'params': {'max_length': 64}}, {'name': 'citation_count', 'description': '', 'type': <DataType.INT64: 5>}, {'name': 'pubdate', 'description': '', 'type': <DataType.INT64: 5>}, {'name': 'vector', 'description': '', 'type': <DataType.FLOAT_VECTOR: 101>, 'params': {'dim': 1024}}], 'enable_dynamic_field': True}



In [5]:
record_pubdate = 20080601
results = bge_contributions.query(expr=f"pubdate <= {record_pubdate}", output_fields=["pubdate"])
for result in results:
    pubdate = result['pubdate']
    assert pubdate <= record_pubdate, f"Retrieved pubdate {pubdate} is greater than query pubdate {record_pubdate}"

In [6]:
import pandas as pd
examples = pd.read_json('data/dataset/nontrivial_10.jsonl', lines=True)
examples.head()

Unnamed: 0,source_doi,sent_original,sent_no_cit,sent_idx,citation_dois,pubdate,resolved_bibcodes,sent_cit_masked
0,10.1007/s00159-008-0010-0,"(Shang et al. 1998), do show unusual, faint fe...","(), do show unusual, faint features in their s...",159,[10.1086/311563],20080601,[1998ApJ...504L..23S],"([REF]), do show unusual, faint features in th..."
1,10.1016/j.newar.2004.03.017,Novak et al. (2003) studied the dust emission ...,studied the dust emission at 450 μm in the ga...,1329,[10.1086/368156],20040901,[2003ApJ...583L..83N],[REF] studied the dust emission at 450 μm in t...
2,10.1146/annurev-astro-091918-104430,The momentum per unit mass of stars formed del...,The momentum per unit mass of stars formed del...,542,[10.1086/317785],20190801,[2000ApJ...545..364M],The momentum per unit mass of stars formed del...
3,10.1146/annurev-astro-081309-130834,An example of the latter is the dormant blue s...,An example of the latter is the dormant blue s...,815,[10.48550/arXiv.astro-ph/9701042],20100901,[1997A&A...328..130P],An example of the latter is the dormant blue s...
4,10.1146/annurev.astro.37.1.127,Kaiser et al (1995) (see also Luppino Kaiser 1...,(see also for further developments) compute ...,282,"[10.1086/176071, 10.1086/303508, 10.1086/306102]",19990101,"[1995ApJ...449..460K, 1997ApJ...475...20L, 199...",[REF] (see also [REF] for further developments...


In [10]:
from embedders import Embedder
bge = Embedder.create("BAAI/bge-large-en-v1.5", device="mps", normalize=False, for_queries=True)
embeddings = bge(examples['sent_no_cit'])
print(embeddings.shape)

(10, 1024)


In [7]:
examples_basic = examples.to_dict(orient="records")
print(examples_basic)

[{'source_doi': '10.1007/s00159-008-0010-0', 'sent_original': '(Shang et al. 1998), do show unusual, faint features in their surroundings.', 'sent_no_cit': '(), do show unusual, faint features in their surroundings.', 'sent_idx': 159, 'citation_dois': ['10.1086/311563'], 'pubdate': 20080601, 'resolved_bibcodes': ['1998ApJ...504L..23S'], 'sent_cit_masked': '([REF]), do show unusual, faint features in their surroundings.'}, {'source_doi': '10.1016/j.newar.2004.03.017', 'sent_original': 'Novak et al. (2003) studied the dust emission at 450 μm in the galactic plane, and found the magnetic field to be well aligned parallel with the galactic plane (their Fig. 1), within 30 arcmin in longitude and 10 arcmin in latitude of the GC.', 'sent_no_cit': ' studied the dust emission at 450 μm in the galactic plane, and found the magnetic field to be well aligned parallel with the galactic plane (their Fig. 1), within 30 arcmin in longitude and 10 arcmin in latitude of the GC.', 'sent_idx': 1329, 'cita

In [25]:
for record, vector in zip(examples_basic, embeddings):
    hits = client.search(
                collection_name="bge_contributions",
                data=[vector],
                anns_field="vector",
                search_params={"metric_type": "IP"},
                # param={"metric_type": "IP"},
                limit=3,
                output_fields=['pubdate'],
                filter=f"pubdate <= {record['pubdate']}"
    )
    print(f"All hits have prior pubdates: {all(hit['pubdate'] <= record['pubdate'] for hit in hits[0])}")
    print("===")

All hits have prior pubdates: True
===
All hits have prior pubdates: True
===
All hits have prior pubdates: True
===
All hits have prior pubdates: True
===
All hits have prior pubdates: True
===
All hits have prior pubdates: True
===
All hits have prior pubdates: True
===
All hits have prior pubdates: True
===
All hits have prior pubdates: True
===
All hits have prior pubdates: True
===


Let's try creating an index on the pubdate

In [17]:
index_params = client.prepare_index_params()
index_params.add_index(
    field_name="pubdate",
    index_type="STL_SORT",
    index_name="pubdate_index"
)
client.create_index(
    collection_name="bge_contributions",
    index_params=index_params
)

# Verify
client.list_indexes("bge_contributions")

['vector', 'pubdate_index']

In [18]:
for record, vector in zip(examples_basic, embeddings):
    hits = bge_contributions.search(
        data=[vector],
        anns_field="vector",
        param={"metric_type": "IP"},
        limit=10,
        output_fields=["pubdate"],
        filter=f"pubdate <= 19990101",
    )
    print(hits[0])
    print("===")

[{'id': 460223411266388818, 'distance': 0.7013970017433167, 'entity': {'pubdate': 20240801}}, {'id': 460223411266338740, 'distance': 0.6690324544906616, 'entity': {'pubdate': 20020601}}, {'id': 460223411266304513, 'distance': 0.6650761365890503, 'entity': {'pubdate': 19891101}}, {'id': 460223411266377062, 'distance': 0.6511155962944031, 'entity': {'pubdate': 20050301}}, {'id': 460223411266310089, 'distance': 0.6474418044090271, 'entity': {'pubdate': 20000401}}, {'id': 460223411266360103, 'distance': 0.6473026275634766, 'entity': {'pubdate': 20180101}}, {'id': 460223411266311807, 'distance': 0.645524799823761, 'entity': {'pubdate': 20060801}}, {'id': 460223411266371349, 'distance': 0.6444376707077026, 'entity': {'pubdate': 20050601}}, {'id': 460223411266383306, 'distance': 0.6431231498718262, 'entity': {'pubdate': 20120301}}, {'id': 460223411266361682, 'distance': 0.6428191065788269, 'entity': {'pubdate': 20220301}}]
===
[{'id': 460223411266355363, 'distance': 0.7332022190093994, 'entit