In [4]:
from dotenv import load_dotenv
import os
import pandas as pd
from time import time
import torch
from TextEnrichers import get_enricher, TextEnricher
from database.database import Database
from Embedders import Embedder, get_embedder
from tqdm import tqdm
from pprint import pprint

# Load settings from the local .env file; override=True is passed so that its
# values replace any same-named variables already set in the environment.
load_dotenv(".env", override=True)

# Open the project database handle and run its connection check, which prints
# the database name, user, host, port and server version.
db = Database()
db.test_connection()

Database         User             Host                             Port            
citeline_db      bbasseri         localhost                        5432            
Database version: ('PostgreSQL 17.3 (Homebrew) on x86_64-apple-darwin23.6.0, compiled by Apple clang version 16.0.0 (clang-1600.0.26.6), 64-bit',)


In [2]:
# Pick the best available compute backend: CUDA first, then Apple-Silicon MPS,
# otherwise fall back to the CPU.
# torch.backends.mps.is_available() is the documented availability check and is
# present on older PyTorch releases that do not yet expose torch.mps.is_available().
if torch.cuda.is_available():
    device = 'cuda'
elif torch.backends.mps.is_available():
    device = 'mps'
else:
    device = 'cpu'
# Load the evaluation examples from a JSON Lines file (one record per line).
# Later cells read the sent_no_cit, sent_idx, source_doi, citation_dois and
# pubdate fields of a row.
examples = pd.read_json('data/dataset/100/nontrivial.jsonl', lines=True)

Let's take a look at one example (row 29 of the dataset) and its sentence with the citations removed:

In [38]:
# Select one example to inspect (row 29 of the dataset, chosen by position).
example = examples.iloc[29]

# pprint wraps the long sentence across several lines, which keeps it readable.
pprint(example['sent_no_cit'])

# Use plain print for the short labelled lines: pprint shows the string's repr,
# so each of these lines would otherwise be displayed wrapped in quotes.
print(f"Sentence number: {example['sent_idx']}")
print(f"Source doi: {example['source_doi']}")
print(f"Target dois: {example['citation_dois']}")

('The work of  suggests a fundamental distinction in elemental abundances '
 'between closed and open magnetic structures, matching the nominal '
 'photospheric and coronal abundances, respectively.')
'Sentence number: 369'
'Source doi: 10.1146/annurev.aa.30.090192.000553'
"Target dois: ['10.1086/167871']"


We can add a little more context to see what star we're talking about:

In [39]:
# Load every preprocessed review (JSON Lines) and pick the one this example's
# sentence was taken from, matching on DOI.
all_reviews = pd.read_json('data/preprocessed/reviews.jsonl', lines=True)
source_matches = all_reviews.loc[all_reviews["doi"] == example["source_doi"]]

# Fail with a readable message when the DOI is missing from the reviews file,
# instead of the opaque "positional indexer is out-of-bounds" from .iloc[0].
# IndexError is kept as the exception type, matching what .iloc[0] would raise.
if source_matches.empty:
    raise IndexError(
        f"No review with doi {example['source_doi']!r} found in data/preprocessed/reviews.jsonl"
    )

# If several rows share the DOI, the first one is used.
source_record = source_matches.iloc[0]
print(source_record)

bibcode                                         1992ARA&A..30..113K
abstract          Solar flare and coronal mass ejection (CME) ob...
aff                                            [Boston College, MA]
author                                              [Kahler, S. W.]
bibstem                                          [ARA&A, ARA&A..30]
doctype                                                     article
doi                             10.1146/annurev.aa.30.090192.000553
id                                                          1840100
pubdate                                                  1992-01-01
title                      Solar flares and coronal mass ejections.
read_count                                                        7
reference         [1852RSPT..142..103S, 1859MNRAS..20...13C, 190...
citation_count                                                  463
citation          [1992LNP...399....1S, 1993AZh....70..165C, 199...
body              Annu. Rev. Astron. Astrophys. 

In [40]:
# Show the cited sentence together with up to three sentences of preceding context.
idx = example['sent_idx']

# Clamp the window start at 0. A negative start would be interpreted as an offset
# from the END of the list, so an example located within the first three sentences
# would print the wrong (typically empty) window.
context_start = max(0, idx - 3)
pprint(source_record.body_sentences[context_start:idx + 1])

['Compared with photospheric abundances, the coronal abundances are known to '
 'be deficient in elements with a first ionization potential exceeding about 9 '
 'ev (Meyer 1985).',
 'Thus, the Breneman & Stone (1985) observations rule out a SEP source with '
 'photospheric abundances.',
 'There is now some evidence to suggest that elemental abundances of flare '
 'plasmas more closely match the photospheric rather than the coronal '
 'abundances (Feldman & Widing 1990), perhaps because the photospheric '
 'material is the primary source of the heated flare plasma.',
 'The work of Widing & Feldman (1989) suggests a fundamental distinction in '
 'elemental abundances between closed and open magnetic structures, matching '
 'the nominal photospheric and coronal abundances, respectively.']


Next let's embed the sentence and get query results. We'll want to know
1. At what distance do we find the target doi?
1. For any closer results, what is in their chunks?

In [41]:
# Embed the citation-stripped sentence with the BAAI/bge-small-en model on the
# selected device; normalize=True is passed through to the embedder factory.
query_text = example['sent_no_cit']
embedder = get_embedder(
    model_name='BAAI/bge-small-en',
    device=device,
    normalize=True,
)
query_vector = embedder(query_text)

In [42]:
# Search the 'bge' vector column of the 'lib' table for the rows nearest to the
# query embedding, using the cosine operator class and asking for up to 100,000 hits.
# NOTE(review): pubdate presumably restricts hits to documents published before the
# citing review, and probes presumably sets how many index partitions are scanned —
# confirm both against Database.query_vector_column.
query_results = db.query_vector_column(
    table_name='lib',
    target_column='bge',
    query_vector=query_vector,
    metric='vector_cosine_ops',
    use_index=True,
    probes=100,
    top_k=100_000,
    pubdate=example.pubdate,
)

  Query execution time: 10.94 seconds


Let's do a reality check on the results:
1. Did you get all `top_k` results?
1. Is the target doi in the top k results?

In [43]:
# Sanity-check the result set: how many rows came back, how many distinct papers
# they cover, and whether the (first) target paper is among them.
target_doi = example['citation_dois'][0]
result_dois = {result.doi for result in query_results}

print(f" Got {len(query_results)} results")
print(f" Found {len(result_dois)} unique DOIs")
print(f"Target doi in the top k: {target_doi in result_dois}")

 Got 100000 results
 Found 10221 unique DOIs
Target doi in the top k: True


We found the target doi in the results. At what rank and distance was it found?

In [51]:
# Find the first (best-ranked) result that belongs to the target paper and report
# its 1-based rank and its distance from the query.
target_idx = -1
for i, result in enumerate(query_results):
    if result.doi == target_doi:
        target_idx = i
        print(f"Rank: {target_idx+1}")
        print(f"Distance: {result.distance}")
        break
else:
    # Fail loudly when the target is absent. Leaving target_idx at -1 would make
    # later cells silently treat the LAST result as the target
    # (query_results[-1]) and slice off only the final element (query_results[:-1]).
    raise LookupError(
        f"Target doi {target_doi!r} not found among {len(query_results)} query results"
    )


Rank: 40
Distance: 0.1385815657030588


The target paper's best-matching chunk was the 40th closest result to the query. Its chunk:

In [53]:
# Pull out the target paper's hit and show its publication date and chunk text.
# NOTE(review): target_idx is initialised to -1 by the rank-search cell, so if the
# target was not found this selects the last result rather than failing.
target_query_result = query_results[target_idx]
pprint(target_query_result.pubdate)
pprint(target_query_result.chunk)

datetime.date(1989, 9, 1)
('Furthermore, the abundance variations are correlated with the type of '
 'magnetic field topologv observed. Comparison of Mg vm 315.02 Â with Si vm '
 '319.84 A indicates that the magnesium abundance is relatively unchanged, so '
 'that the variations primarily reflect changes in the abundance of neon. '
 'Meyer (1985) suggested that the apparent depletion of ele- 1049 Ne VI 40 I. '
 'I 4 Mg VI 400.66 Fig. 4.—(a) The Ne vu 465/Ca ix 466 intensity ratio plotted '
 'against the Ne vi 401/Mg vi 400 intensity ratio from the observations in '
 'Table 1 and an impulsive flare showing the variation in the neon abundance '
 'relative to calcium or magnesium, (b) The Ne vii/Ca ix ratio divided by the '
 'Ne vi/Mg vi ratio in 4(a) showing the smaller variation in the Mg/Ca '
 'abundance ratio. ments with high-first ionization potentials (FIP) in the '
 'corona compared to low-FIP elements could be explained by assuming a '
 'separation process at the base of the coron

This looks like a quality citation. The review says this paper suggests there exist different element abundances between closed and open magnetic structures, and this chunk aligns with that claim.

What did the embedder consider even more similar than this chunk? What were the top chunks?

In [49]:
# Inspect the chunk text of the first (top-ranked) result.
top = query_results[0]
pprint(top.chunk)

('3, 1969 Thermal continuum radiation 377 carried out. The variation of the '
 'ratio g(Z, T, c/X)lg(i, T, c/X) was checked at a number of wavelengths and '
 'temperatures in the ranges o*8 to ioo*o io6 °K and i to 30 Â for the twelve '
 'elements listed in Table I. The variations found in the ratio of the Gaunt '
 'factors would lead to fluctuations of up to 20 per cent in the value of the '
 'bracketed term in equation (6). These fluctuations are similar to those '
 'noted by Hovenier who found variations of up to 15 per cent in this term. '
 'Table I shows values of the bracketed term in equation (5) at a temperature '
 'of 5*0 io6 °K and a wavelength of 5 Â. Two sets of element abundances were '
 'employed. For the corona the abundance values used are those of Pottasch '
 '(1967) and Jordan (1968). In cases where these results have disagreed, an '
 'average value Table I Dependence of free-free flux on element abundances '
 'Element H He C N O Ne Mg Si S Ar Ca Fe Coronal abundances

This chunk also discusses variation in element abundances, and it mentions the corona as well.

In [54]:
# Compare publication dates: the top-ranked hit, the example (citing review),
# and the target paper's hit.
print(f"Top pubdate: {top.pubdate}")
print(f"Example pubdate: {example.pubdate}")
print(f"Target pubdate: {target_query_result.pubdate}")

Top pubdate: 1969-01-01
Example pubdate: 1992-01-01
Target pubdate: 1989-09-01


In this case the top-ranked result was published much earlier than the target. Should there be a preference for newer publications?

In [55]:
# Collect, in rank order, the publication date of every result that was ranked
# above the target, then pretty-print the list.
outranking_pubdates = []
for hit in query_results[:target_idx]:
    outranking_pubdates.append(hit.pubdate)
pprint(outranking_pubdates)

[datetime.date(1969, 1, 1),
 datetime.date(1989, 9, 1),
 datetime.date(1977, 5, 1),
 datetime.date(1975, 10, 1),
 datetime.date(1974, 7, 1),
 datetime.date(1971, 12, 1),
 datetime.date(1969, 1, 1),
 datetime.date(1975, 12, 1),
 datetime.date(1984, 1, 1),
 datetime.date(1965, 2, 1),
 datetime.date(1974, 1, 1),
 datetime.date(1988, 9, 1),
 datetime.date(1969, 1, 1),
 datetime.date(1991, 11, 1),
 datetime.date(1984, 1, 1),
 datetime.date(1980, 6, 1),
 datetime.date(1969, 1, 1),
 datetime.date(1975, 10, 1),
 datetime.date(1985, 1, 1),
 datetime.date(1972, 12, 1),
 datetime.date(1985, 1, 1),
 datetime.date(1971, 12, 1),
 datetime.date(1975, 10, 1),
 datetime.date(1963, 4, 1),
 datetime.date(1981, 10, 1),
 datetime.date(1975, 12, 1),
 datetime.date(1971, 5, 1),
 datetime.date(1982, 11, 1),
 datetime.date(1961, 5, 1),
 datetime.date(1975, 10, 1),
 datetime.date(1989, 9, 1),
 datetime.date(1966, 9, 1),
 datetime.date(1960, 11, 1),
 datetime.date(1985, 1, 1),
 datetime.date(1960, 11, 1),
 datet

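Most of these were published earlier than the target (1989-09-01), but not all: in the output shown, the result at index 13 (1991-11-01) is later, and the results at indices 1 and 30 share the target's pubdate. Since the target itself was first found at rank 40, the result at index 1 cannot be the target paper. So what is it?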

In [57]:
# Inspect the runner-up hit: publication date, DOI, and chunk text, in that order.
second_result = query_results[1]
for field_name in ("pubdate", "doi", "chunk"):
    pprint(getattr(second_result, field_name))

datetime.date(1989, 9, 1)
'10.1086/167855'
('Furthermore, some of the variations of the observed spectrum are not simply '
 'due directly to the higher abundance of individual elements near the '
 'magnetic pole, but to the changes in atmospheric structure induced by the '
 'combination of all such abundance variations acting in concert. Two other '
 'features of the inferred abundance distributions are notable. First, the '
 'highest abundance regions found for the already cosmically abundant elements '
 'Si and Fe are rather large; in our model the two rings out to a = 72° are '
 'both high-abundance regions for these elements relative to the equatorial '
 'abundances. In contrast, the polar caps of the cosmically lower abundance '
 'elements Ti and Cr are smaller; for both these elements the midlatitude '
 '(ring 2) abundances are considerably smaller than at the pole. A second, '
 'possibly significant feature of the abundance models is that all three '
 'iron-peak elements have qu