In [14]:
from dotenv import load_dotenv
import os
import psycopg
import pandas as pd
import numpy as np
from time import time
import torch
from TextEnrichers import get_enricher, TextEnricher
from database.database import Database
from Embedders import Embedder, get_embedder
from tqdm import tqdm

load_dotenv('.env', override=True)
print(os.getenv('DB_PORT'))

# Database setup
db = Database()
db.test_connection()

device = 'cuda' if torch.cuda.is_available(
) else 'mps' if torch.mps.is_available() else 'cpu'
print(f"Using device: {device}")

def cosine_distance(a, b):
    return 1 - (np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b)))

# Load data
data = pd.read_json('data/dataset/split/train.jsonl', lines=True)
examples = data.sample(1, random_state=42)

5432
Database         User             Host                             Port            
citeline_db      bbasseri         localhost                        5432            
Database version: ('PostgreSQL 17.3 (Homebrew) on x86_64-apple-darwin23.6.0, compiled by Apple clang version 16.0.0 (clang-1600.0.26.6), 64-bit',)
Using device: mps


In [19]:
embedder = get_embedder('BAAI/bge-small-en', device)
enricher = get_enricher('identity', path_to_data='data/preprocessed/reviews.jsonl')

enriched_texts = enricher(examples)
examples['embedding'] = embedder(enriched_texts).tolist()
print(examples.columns)

Example: source_doi                               10.1007/s00159-013-0064-5
sent_original    CGRO observations of X-ray binaries detected n...
sent_no_cit      CGRO observations of X-ray binaries detected n...
sent_idx                                                       821
citation_dois                                     [10.1086/305746]
Name: 9902, dtype: object
Index(['source_doi', 'sent_original', 'sent_no_cit', 'sent_idx',
       'citation_dois', 'embedding'],
      dtype='object')


In [22]:
distances = {}
for enrichment_name in ['identity', 'add_title', 'add_abstract', 'add_title_and_abstract']:
    enricher = get_enricher(enrichment_name, path_to_data='data/preprocessed/reviews.jsonl')
    for _, example in examples.iterrows():
        matching_rows = db.get_chunks_by_doi(
            # Need to implement to query by doi
            doi = example['source_doi'],
            table_name = 'library',
            vector_column = 'bge_norm'
        )

        if not matching_chunks:
            print(f"No chunks found for {example['source_doi']}")

        vectors = np.array([row.vector for row in matching_rows])
        query_vector = np.array([example['embedding']]) 
        chunk_distances = cosine_distance(query_vector, vectors)
        print(f"Enrichment: {enrichment_name}, Distance: {chunk_distances}")
        distances[enrichment_name] = min(chunk_distances)

ValueError: shapes (1,384) and (123,384) not aligned: 384 (dim 1) != 123 (dim 0)

In [21]:
for _, example in examples.iterrows():
    print(example['embedding'])

[-0.05819232761859894, 0.033215057104825974, 0.016498690471053123, -0.027855027467012405, 0.016864100471138954, 0.022042516618967056, -0.004633986856788397, -0.010202179662883282, -0.009182802401483059, 0.0005426202551461756, 0.024157695472240448, -0.023943794891238213, -0.003231110516935587, 0.02695990912616253, -0.025957776233553886, -0.027821391820907593, 0.0059440359473228455, -0.03650195524096489, -0.04003653675317764, 0.03924986720085144, 0.02094208635389805, -0.0436999574303627, 0.007927662692964077, -0.033748626708984375, 0.018476303666830063, 0.0486508384346962, -0.013940504752099514, -0.0001320174924330786, -0.043024368584156036, -0.2315157949924469, -0.012535442598164082, -0.014589567668735981, 0.019611801952123642, -0.029407285153865814, 0.004010611213743687, 0.001645959448069334, -0.014488116838037968, 0.004014691337943077, -0.044133804738521576, 0.012072728015482426, 0.0563507117331028, 0.0069456021301448345, 0.0012945806374773383, -0.05824795365333557, -0.004272500518709