In [2]:
from embedders import SentenceTransformerEmbedder
# Build the sentence embedder used throughout the notebook: the
# 'BAAI/bge-small-en' model on the 'mps' device, with normalize=True.
embedder = SentenceTransformerEmbedder(
    model_name='BAAI/bge-small-en',
    device='mps',
    normalize=True,
)
print(embedder)

BAAI/bge-small-en, normalize=True


In [3]:
# Two probe inputs of very different length: a single word and a full sentence.
sentences = [
    'testing',
    'Just after the Big Bang, a cosmic primordial gas consisted mostly of H, He, and a small amount of light elements (Li, Be, B, etc.).',
]

# Embed the batch, then report the returned container type and its shape
# (one line each, exactly as two separate print calls would).
result = embedder(sentences)
print(type(result), result.shape, sep='\n')

<class 'numpy.ndarray'>
(2, 384)


In [4]:
from database.database import DatabaseProcessor
from dotenv import load_dotenv
import os

# Read the connection settings from environment variables (load_dotenv() is
# called first so values from a local .env file are available) instead of
# hardcoding credentials in the notebook.
load_dotenv()
db_params = {
    'dbname': os.getenv('DB_NAME'),
    'user': os.getenv('DB_USER'),
    'password': os.getenv('DB_PASSWORD'),
    'host': os.getenv('DB_HOST'),
    'port': os.getenv('DB_PORT'),
}
db = DatabaseProcessor(db_params)

# Quick connectivity check before any queries are issued.
db.test_connection()

# Show the parameters held by the processor, but mask the password: cell
# output is saved with the notebook, so printing the raw dict would leak the
# credential to anyone who can read the file.
safe_params = {
    key: ('***' if key == 'password' and value else value)
    for key, value in dict(db.db_params).items()
}
print(safe_params)

Database version: ('PostgreSQL 17.3 (Homebrew) on x86_64-apple-darwin23.6.0, compiled by Apple clang version 16.0.0 (clang-1600.0.26.6), 64-bit',)
{'dbname': 'test', 'user': 'bbasseri', 'password': '***', 'host': 'localhost', 'port': '5432'}


In [7]:
import pandas as pd
# Load the small "nontrivial" evaluation set (JSON Lines: one record per line)
# and preview the column that is embedded in the following cells.
data = pd.read_json(
    'data/dataset/small/nontrivial.jsonl',
    lines=True,
)
print(data.loc[:, 'sent_no_cit'])

0    Classically, this process has been represented...
1    Observations with the HST Solar Blind Channel ...
2    A remarkable individual LyC detection at z = 4...
3    Two important HST parallel imaging surveys inc...
Name: sent_no_cit, dtype: object


In [8]:
# Embed every sentence in the 'sent_no_cit' column in one call and confirm the
# shape of the returned embeddings.
query_sentences = data['sent_no_cit']
embeddings = embedder(query_sentences)
print(embeddings.shape)

(4, 384)


In [14]:
# Top-5 lookup in the 'bge' vector table for the first query embedding, using
# the cosine metric.
# NOTE(review): the saved output values (~0.31) equal 1 plus the ~-0.69 values
# printed by the 'vector_ip_ops' query on the same vector, so `.similarity`
# here looks like a cosine *distance* (1 - cos) rather than a similarity --
# confirm in DatabaseProcessor.query_vector_table.
results = db.query_vector_table('bge', query_vector=embeddings[0], metric='vector_cosine_ops', top_k=5)
# The loop variable is named `hit` (not `result`) so it does not overwrite the
# notebook-global `result` array created in an earlier cell.
for hit in results:
    print(hit.similarity)

0.31170819331951705
0.3100535088745342
0.3090857408251144
0.3029460491505779
0.30200259637762805


In [15]:
# Same top-5 lookup for the first query embedding, using the inner-product
# metric.
# NOTE(review): the saved output values are negative (~-0.69); this looks like
# a negated inner product (what pgvector's `<#>` operator returns) -- confirm
# in DatabaseProcessor.query_vector_table and negate before treating the value
# as a similarity.
ip_results = db.query_vector_table('bge', query_vector=embeddings[0], metric='vector_ip_ops', top_k=5)
# The loop variable is named `hit` (not `result`) so it does not overwrite the
# notebook-global `result` array created in an earlier cell.
for hit in ip_results:
    print(hit.similarity)

-0.6882918477058411
-0.6899465322494507
-0.6909140944480896
-0.6970539093017578
-0.6979975700378418
