In [1]:
from dotenv import load_dotenv
import os
import pandas as pd
from time import time
import torch
from TextEnrichers import get_enricher, TextEnricher
from database.database import Database
from embedders import Embedder, get_embedder
from tqdm import tqdm
from pprint import pprint
import numpy as np

load_dotenv(".env", override=True)

db = Database()
db.test_connection()

Database         User             Host                             Port            
citeline_db      bbasseri         localhost                        5432            
Database version: ('PostgreSQL 17.3 (Homebrew) on x86_64-apple-darwin23.6.0, compiled by Apple clang version 16.0.0 (clang-1600.0.26.6), 64-bit',)


In [2]:
device = 'cuda' if torch.cuda.is_available() else 'mps' if torch.mps.is_available() else 'cpu'
print(f"Using device: {device}")
examples = pd.read_json('data/dataset/100/nontrivial.jsonl', lines=True)
print(f"Loaded {len(examples)} examples")

Using device: mps
Loaded 100 examples


In [3]:
def get_index_of_target(query_results, target_doi: str) -> int:
    """
    Get the index of the target DOI in the query results.
    :param query_results: The query results from the database.
    :param target_doi: The DOI of the target paper.
    :return: The index of the target DOI in the query results.
    """
    for i, result in enumerate(query_results):
        if result.doi == target_doi:
            return i
    return -1

def get_query_results_and_ranks(examples, embedder, enricher, target_column) -> list[int]:
    all_query_results = []
    target_ranks = []
    # Enrich and embed the sentences
    enriched_sentences = enricher(examples)
    embeddings = embedder(enriched_sentences)

    for i, embedding in tqdm(enumerate(embeddings)):
        pubdate = examples.iloc[i]['pubdate']
        target_doi = examples.iloc[i]['citation_dois'][0]

        query_results = db.query_vector_column(
            query_vector=embedding,
            target_table="lib",
            target_column=target_column,
            pubdate=pubdate,
            top_k=10_000,
            probes=40,
            explain=False,
        )
        all_query_results.append(query_results)

        # Get the rank of the target DOI in the query results
        target_rank = get_index_of_target(query_results, target_doi)
        target_ranks.append(target_rank)
    return all_query_results, target_ranks

In [4]:
# BGE with identity experiment
bge_embedder = get_embedder("BAAI/bge-small-en", device=device, normalize=True)
identity_enricher = get_enricher("identity", path_to_data="data/preprocessed/reviews.jsonl")

bge_query_results, bge_ranks = get_query_results_and_ranks(examples, bge_embedder, identity_enricher, "bge_norm")


100it [03:08,  1.89s/it]


In [6]:
def print_rank_stats(ranks):
    """
    Print the rank statistics.
    :param ranks: The ranks to analyze.
    """
    print(f"Mean rank: {np.mean(ranks)}")
    print(f"Median rank: {np.median(ranks)}")
    print(f"Max rank: {np.max(ranks)}")
    print(f"Rank 0 count: {ranks.count(0)}")
    print(f"Rank 1 count: {ranks.count(1)}")
    print(f"Rank 2 count: {ranks.count(2)}")
    print(f"Rank 3 count: {ranks.count(3)}")

print_rank_stats(bge_ranks)

Mean rank: 645.41
Median rank: 18.0
Max rank: 8045
Rank 0 count: 10
Rank 1 count: 10
Rank 2 count: 2
Rank 3 count: 2


In [8]:
failed_queries = [i for i in range(len(bge_ranks)) if bge_ranks[i] == -1]
print(f"Failed queries: {len(failed_queries)}")
print(failed_queries)

Failed queries: 18
[0, 8, 10, 11, 14, 23, 27, 40, 42, 46, 58, 65, 66, 68, 72, 78, 80, 93]


## Methodology

Each failed query represents an example query executed on the database with top-k of 10,000 and 40 probes where none of the results were chunks from the target document.

- Verify the target doi is correct
- If not, identify why not
  - would the correct doi have closer chunk embeddings?
- If correct, inspect what chunks were considered close
- What transformations, if any, would make the target chunk's embedding closer to the query vector?

Let's try with example 18, the worst-performing one in the dataset

In [10]:
example = examples.iloc[failed_queries[0]]
pprint(f"Original sentence: {example.sent_original}")
pprint(f"Example: {example.sent_no_cit}")
print(f"Target doi: {example.citation_dois}")

('Original sentence: These are minor species, with abundances of 1.5 10 12 , '
 '1.7 10 9 , and 7 10 8 kg N, respectively ( Ussiri and Lal, 2013 ).')
('Example: These are minor species, with abundances of 1.5 10 12 , 1.7 10 9 , '
 'and 7 10 8 kg N, respectively ( Ussiri and .')
Target doi: ['10.1016/j.epsl.2013.07.013']


This is an incorrect doi: the inline citation missed the first author. 

### Incorrect DOI

In [11]:
example = examples.iloc[failed_queries[1]]
pprint(f"Original sentence: {example.sent_original}")
pprint(f"Example: {example.sent_no_cit}")
print(f"Target doi: {example.citation_dois}")

('Original sentence: But assuming the existence of ad hoc density '
 'fluctuations, may change the volume integrated gamma-ray spectrum as to make '
 'it similar to the observed one Morlino and Caprioli ( 2012 ).')
('Example: But assuming the existence of ad hoc density fluctuations, may '
 'change the volume integrated gamma-ray spectrum as to make it similar to the '
 'observed one Morlino and .')
Target doi: ['10.1088/1475-7516/2012/07/038']


### Incorrect DOI

In [12]:
example = examples.iloc[failed_queries[2]]
pprint(f"Original sentence: {example.sent_original}")
pprint(f"Example: {example.sent_no_cit}")
print(f"Target doi: {example.citation_dois}")

('Original sentence: Sweatman (2021) ignored data from carbon spherules '
 'generated by both YDIH proponents and critics that provides one of the most '
 'compelling pieces of evidence against the YDIH.')
('Example: ignored data from carbon spherules generated by both YDIH '
 'proponents and critics that provides one of the most compelling pieces of '
 'evidence against the YDIH.')
Target doi: ['10.1016/j.gca.2021.07.031']


The doi actually refers to Sun et al 2021, not Sweatman 2021. The dataset build process should not have included an example where the year and author initial had more than one possibility

In [33]:
reviews = pd.read_json("data/preprocessed/reviews.jsonl", lines=True)
original_record = reviews[reviews['doi'] == example.source_doi].iloc[0]

In [34]:
for ref in original_record.reference:
    if ref[:4] == "2021" and ref[-1] == "S": 
        print(ref)

2021GeCoA.312...57S


That's the bibcode in the original review paper references that matched "2021" and "S". However, the bibcode for Sweatman (2021) is actually `2021ESRv..21803677S`. 

The bibcode for this inline citation is missing in the document's references:

In [35]:
for ref in original_record.reference:
    if ref[:4] == "2021":
        print(ref)


2021CliPa..17.1409N
2021EGUGA..23.8442T
2021GeCoA.312...57S
2021JGRD..12635379Z
2021MNRAS.501.3350F
2021NatCo..12.2106C
2021NatSR..1118632B
2021NatSR..1122359O
2021Natur.595...66R


### Conclusion: if this is a one-off error we can add the missing reference to the training data. 

In [37]:
example = examples.iloc[failed_queries[3]]
pprint(f"Original sentence: {example.sent_original}")
pprint(f"Example: {example.sent_no_cit}")
print(f"Target doi: {example.citation_dois}")

('Original sentence: Gyrochronology is based on the stellar spin-down due to '
 'magnetic braking and uses the surface rotation rate as a clock, adopting the '
 'so-called Skumanich relation between angular momentum loss and rotation '
 'rate, ( Skumanich 1972 ).')
('Example: Gyrochronology is based on the stellar spin-down due to magnetic '
 'braking and uses the surface rotation rate as a clock, adopting the '
 'so-called Skumanich relation between angular momentum loss and rotation '
 'rate, ( .')
Target doi: ['10.1086/151310']


This is the correct doi, so why didn't it come up?

The passage discusses:
- Gyrochronology being based on stellar spin-down
- Magnetic braking being the cause of the the spin-down
- Skumanich relation to compute this, which relates angular momentum loss and rotation rate

Is it in the dataset?

In [39]:
research = pd.read_json("data/preprocessed/research.jsonl", lines=True)

In [46]:
target_doi = example.citation_dois[0]
print(f"Target doi: {target_doi}")

target_record = research[research['doi'] == target_doi].iloc[0]
print(target_record.title)

Target doi: 10.1086/151310
Time Scales for Ca II Emission Decay, Rotational Braking, and Lithium Depletion


In [48]:
target_chunks = [chunk[0] for chunk in db.query(f"SELECT chunk FROM lib WHERE doi = '{target_doi}'")]
print(f"Number of chunks: {len(target_chunks)}")

Number of chunks: 6


In [50]:
from scipy.spatial.distance import cosine as cosine_distance
example_embedding = bge_embedder([example.sent_no_cit])[0]
chunk_embeddings = bge_embedder(target_chunks)
for i in range(len(target_chunks)):
    chunk = target_chunks[i]
    chunk_embedding = chunk_embeddings[i]
    distance = cosine_distance(example_embedding, chunk_embedding)
    pprint(f"Distance: {distance}")
    pprint(chunk)
    print()

'Distance: 0.1627938151359558'
('197 2ApJ. . .171. .565S The Astrophysical Journal, 171:565-567, 1972 '
 'February 1 © 1972. The University of Chicago All rights reserved. Printed in '
 'U S A TIME SCALES FOR Ca n EMISSION DECAY, ROTATIONAL BRAKING, AND LITHIUM '
 'DEPLETION A. Skumanich High Altitude Observatory, National Center for '
 'Atmospheric Research,* Boulder, Colorado Received 1971 June 21 ABSTRACT A '
 'comparison of the Ca+ emission luminosity—after correction for spectral-type '
 'effects—for the Pleiades, Ursa Major, and Hyades stars and the Sun indicate '
 'an emission decay which varies as the inverse square root of the age. '
 'Further, the rotational decay curve is found to satisfy the same law. It is '
 'further suggested that lithium depletion follows the+ same law but only as '
 'far as the Hyades age, after which the depletion proceeds exponentially. '
 'Since Ca emission is linearly proportional to magnetic field strength at the '
 'surface, one can predict that 

Even though this is the paper that established the Skumanich relation, all chunk embeddings are rather distant at 0.16 - 0.17.

What did the embedding think was more similar?

In [52]:
example_query_results = bge_query_results[failed_queries[3]]
for query_result in example_query_results[:3]:
    pprint(f"Distance: {query_result.distance}\n {query_result.chunk}")

('Distance: 0.09885724166455323\n'
 ' Stars like the Sun are believed to have been spun down by magnetic braking '
 '(Skumamch 1972). Magnetic dynamo models indicate that the braking rate '
 "decreases with decreasing angular velocity. Skumamch's surface velocity ~ ~c "
 't 1/2, where t is the age of the star, leads to a spin-down timescale of ;d '
 '°~ ~spin, 1187 1188 RASIO ET AL. Vol. 470 while the theory of Tout & Pringle '
 '(1992) for fully convective, rapidly rotating stars leads to the weaker, but '
 'still inverse, dependence of ;d °~ ~-i/2~ In general the tidal '
 'synchronization rate increases with the lack of synchronicity, ,rsu oc ~ ~, '
 'where A1~ is the difference between stellar spin and orbital angular '
 'velocities. If ;~ and ;d were both short compared with the age of the star, '
 'then an equilibrium would be set up in which ;~ ~ ;d* This is the case for '
 'cataclysmic variables, in which tidal spin-up and magnetic braking (or '
 'gravitational radiation of angu

Trying query expansion: use an LLM to generate paraphrases of the query that surface additional background details:

> 
You are a domain-expert astrophysical writer. When given any single input sentence in quotes, your job is to produce 3 concise, standalone paraphrases that:

1. Preserve **all** original technical details and domain-specific jargon.
2. **Explicitly expose** any implied mathematical relationships in algebraic form (e.g. y = x^{-1/2}).
3. **Cite or mention** at least one observational proxy or physical manifestation
4. Keep the same meaning and level of formality.
5. List each paraphrase on its own numbered line.

Input: "Gyrochronology is based on the stellar spin-down due to magnetic braking and uses the surface rotation rate as a clock, adopting the so-called Skumanich relation between angular momentum loss and rotation rate."

In [56]:
paraphrases = [
    """Gyrochronology employs stellar rotation period decay (via magnetic braking's angular momentum dissipation) as an age indicator, mathematically expressed as 
J
˙
∝
Ω
3
J
˙
 ∝Ω 
3
  (Skumanich relation), with surface differential rotation patterns serving as observational proxies for spin-down rates.
Exposed relationship: 
τ
∝
P
2
τ∝P 
2
  (age 
τ
τ vs. rotation period 
P
P) derived from 
J
∝
Ω
−
1
/
2
J∝Ω 
−1/2
 .""",
    """
Stellar age dating via gyrochronology quantifies magnetic braking-induced spin-down using the Skumanich law (
Ω
∝
t
−
1
/
2
Ω∝t 
−1/2
 ), where surface rotation modulation (observable through starspot periodicity) directly traces angular momentum loss rates.
Exposed relationship: 
L
bol
L 
bol
 -normalized Rossby number 
R
o
≡
P
/
τ
c
Ro≡P/τ 
c
  links convection timescales 
τ
c
τ 
c
  to magnetic activity proxies like Ca II H+K emission.
  """,
    """
Gyrochronological models calibrate stellar ages through the Skumanich-derived 
Ω
˙
∝
−
Ω
3
Ω
˙
 ∝−Ω 
3
  torque law, where photometric rotation period measurements (e.g., from Kepler light curve periodograms) serve as direct observational inputs for angular momentum evolution tracks.
Exposed relationship: 
Ω
(
t
)
=
Ω
0
(
1
+
t
/
τ
Sk
)
−
1
/
2
Ω(t)=Ω 
0
 (1+t/τ 
Sk
 ) 
−1/2
  with characteristic timescale 
τ
Sk
∼
10
−
100
τ 
Sk
 ∼10−100 Myr for solar-type stars.""",
]

In [57]:
paraphrase_embeddings = bge_embedder(paraphrases)
for i in range(len(paraphrases)):
    distance = cosine_distance(example_embedding, paraphrase_embeddings[i])
    pprint(f"Distance {i}: {distance}")

'Distance 0: 0.07871377468109131'
'Distance 1: 0.08848387002944946'
'Distance 2: 0.11988002061843872'


We see here that a couple of these paraphrases do very well against the example embedding, beating the 0.99 distance of the top-ranked query result. Does that hold on a full query against the database if `paraphrase_embedding[0]` had been used?

In [58]:
paraphrase_query_results = db.query_vector_column(
    query_vector=paraphrase_embeddings[0],
    target_table="lib",
    target_column="bge_norm",
    pubdate=example.pubdate,
    top_k=10_000,
    probes=40,
    explain=False,
)


In [60]:
get_index_of_target(paraphrase_query_results, example.citation_dois[0])

943

In [62]:
for chunk in target_chunks:
    print(chunk)
    print()

197 2ApJ. . .171. .565S The Astrophysical Journal, 171:565-567, 1972 February 1 © 1972. The University of Chicago All rights reserved. Printed in U S A TIME SCALES FOR Ca n EMISSION DECAY, ROTATIONAL BRAKING, AND LITHIUM DEPLETION A. Skumanich High Altitude Observatory, National Center for Atmospheric Research,* Boulder, Colorado Received 1971 June 21 ABSTRACT A comparison of the Ca+ emission luminosity—after correction for spectral-type effects—for the Pleiades, Ursa Major, and Hyades stars and the Sun indicate an emission decay which varies as the inverse square root of the age. Further, the rotational decay curve is found to satisfy the same law. It is further suggested that lithium depletion follows the+ same law but only as far as the Hyades age, after which the depletion proceeds exponentially. Since Ca emission is linearly proportional to magnetic field strength at the surface, one can predict that the surface fields are proportional to angular velocity and decay as the inverse 

In [65]:
example_embedding = bge_embedder([example.sent_no_cit])[0]

In [68]:
doc_expansions = [
    "The inverse square-root relationship between rotation rate and stellar age, remains foundational for modeling angular momentum loss via magnetized winds, though age revisions (e.g., van den Heuvel 1969) challenge the universality of this timescale",
    "Skumanich's correlation between Li abundance decay and rotational braking suggests a shared dependence on magnetic activity, implicating wind-driven spin-down in regulating interior mixing efficiency",
    "The proportionality between Ca II emission, rotation, and Li abundance (Skumanich 1972) underscores the role of dynamo-generated fields in shaping both surface activity and light-element depletion",
    "Discrepancies in Hyades' age estimates (van den Heuvel 1969) introduce uncertainties in power-law indices, necessitating multi-cluster analyses to disentangle rotational and chemical evolution",
    "While Skumanich's inverse power law fits young clusters, deviations in older systems hint at unresolved physics in late-stage angular momentum transport",
]

doc_embeddings = bge_embedder(doc_expansions)
for i in range(len(doc_expansions)):
    distance = cosine_distance(example_embedding, doc_embeddings[i])
    pprint(f"Distance {i}: {distance}")

'Distance 0: 0.10995858907699585'
'Distance 1: 0.1576480269432068'
'Distance 2: 0.16275274753570557'
'Distance 3: 0.18524116277694702'
'Distance 4: 0.15468883514404297'


Not great but much better!

### Conclusion: 
- the embedder will naturally consider chunks that cite the same sources as overly similar. 
- Paraphrasing the query to surface more background details brought the target doc under rank 1000
- Document expansion also improved matters somewhat

## Failure 4

In [69]:
example = examples.iloc[failed_queries[4]]
pprint(f"Original sentence: {example.sent_original}")
pprint(f"Example: {example.sent_no_cit}")
print(f"Target doi: {example.citation_dois}")

('Original sentence: Significantly, however, a similar step of 20–40 km in '
 'lithospheric thickness across the Shield/Platform boundary is reported by '
 'Al-Amri et al. (2008) .')
('Example: Significantly, however, a similar step of 20–40 km in lithospheric '
 'thickness across the Shield/Platform boundary is reported by  .')
Target doi: ['10.1016/j.jafrearsci.2008.01.004']


This is the wrong doi. The correct doi is "10.1515/agp-2015-0009"

In [80]:
original_record = reviews[reviews.doi == example.source_doi].iloc[0]
print(original_record.title)
for ref in original_record.reference:
    if ref[:4] == "2008":
        print(ref)


Continental lithosphere of the Arabian Plate: A geologic, petrologic, and geophysical synthesis
2008E&ES....2a2005S
2008GGG.....9.7020P
2008GeoJI.172.1179G
2008GondR..14....5M
2008JAfES..51..189A
2008JGRB..11311404G
2008JGSoc.165..453J


We see there is only one bibcode matching `2008...A`, but this bibcode actually refers to another paper:
_Geochemistry and metamorphism of the Pan-African back-arc Malhaq volcano-sedimentary Neoproterozoic association, W. Kid area, SE Sinai, Egypt_
Abu El-Enen, Mahrous M.

NOT Al-Amri et al (2008).

### Conclusion: error in the dataset
- delete or fix
- add logic that when a bibcode is resolved from an inline citation, we check that the bibcode actually matches the author in the inline citation

## Failure 5

In [81]:
example = examples.iloc[failed_queries[5]]
pprint(f"Original sentence: {example.sent_original}")
pprint(f"Example: {example.sent_no_cit}")
print(f"Target doi: {example.citation_dois}")

('Original sentence: Cantat-Gaudin(2019) find that all 7 of the groups they '
 'identify within the Vela-Puppis region are expanding, Armstrongetal.')
('Example: find that all 7 of the groups they identify within the Vela-Puppis '
 'region are expanding, Armstrongetal.')
Target doi: ['10.1093/mnras/stz2467']


Incorrect target doi. The target doi above refers to "On the survivability of planets in young massive clusters and its implication of planet orbital architectures in globular cluster", Maxwell Xu Cai, S. Portegies Zwart, M.B.N. Kouwenhoven, Rainer Spurzem.

But Cantat-Gaudin 2019 is "Gaia DR2 unravels incompleteness of nearby cluster population: new open clusters in the direction of Perseus" 

### Conclusion: error in the dataset

## Failure 6

In [85]:
example = examples.iloc[failed_queries[6]]
pprint(f"Original sentence: {example.sent_original}")
pprint(f"Example: {example.sent_no_cit}")
print(f"Target doi: {example.citation_dois}")

('Original sentence: Here we review on preliminary work by Bekki et al. ( 2019 '
 '), who simulated fully-compressible convection inside a rotating spherical '
 'shell extending from to .')
('Example: Here we review on preliminary work by , who simulated '
 'fully-compressible convection inside a rotating spherical shell extending '
 'from to .')
Target doi: ['10.1103/PhysRevFluids.4.013803']


Incorrect target doi: The target above refers to 'Linear and nonlinear stability of a quasi-geostrophic mixing layer subject to a uniform background shear' by Biancofiore and Umurhan. 

Bekki has two papers published in 2019 both of which they are the sole author. Nevertheless the dataset prep should have looked for Bekki 2019 papers. It resolved the wrong paper due to incorrect bibcode in the references:

In [86]:
original_record = reviews[reviews.doi == example.source_doi].iloc[0]
print(original_record.title)
for ref in original_record.reference:
    if ref[:4] == "2019":
        print(ref)

Rossby Waves in Astrophysics
2019A&A...621A.136M
2019A&A...622A.124A
2019A&A...623A..50L
2019A&A...626A...3L
2019A&A...626A..38L
2019ApJ...871L..32H
2019ApJ...874..162G
2019ApJ...884L...5S
2019ApJ...887....1R
2019JPO....49..291E
2019LRSP...16....2M
2019MNRAS.487..782L
2019MNRAS.488..645P
2019PASP..131g2001L
2019PhRvE.100d3105H
2019PhRvF...4a3803B
2019SoPh..294...88M


Were any of the query results from a Bekki 2019 paper?

In [90]:
example_query_results = bge_query_results[failed_queries[6]]
bekki_dois = ["10.1051/0004-6361/201629898", "10.1093/mnras/stz999"]
for i in range(len(example_query_results)):
    query_result = example_query_results[i]
    if query_result.doi in bekki_dois:
        print(f"Distance: {query_result.distance}")
        print(f"Chunk: {query_result.chunk}")
        print(f"Rank: {i}")
        print()

Are any Bekki 2019 papers in the dataset?

In [91]:
bekki_research = research[research.doi.isin(bekki_dois)]
print(bekki_research)

Empty DataFrame
Columns: [bibcode, abstract, aff, author, bibstem, doctype, doi, id, pubdate, title, read_count, reference, data, citation_count, citation, body, dois, keywords, loaded_from, body_sentences]
Index: []


### Conclusion: error in the dataset
- correct the bibcode reference or delete
- the Bekki research cited inline wasn't in the research data to begin with


## Failure 7

In [94]:
example = examples.iloc[failed_queries[7]]
pprint(f"Original sentence: {example.sent_original}")
pprint(f"Example: {example.sent_no_cit}")
print(f"Target doi: {example.citation_dois}")

('Original sentence: It also hosts a Compton-thick AGN in the Western '
 'component, observed directly in hard X-rays (Della Ceca et al. 2002 ; Ballo '
 'et al. 2004 ).')
('Example: It also hosts a Compton-thick AGN in the Western component, '
 'observed directly in hard X-rays (Della ; .')
Target doi: ['10.1086/339896', '10.1086/379887']


### Conclusion: incorrect target doi from failing to extract first author

## Failure 8

In [95]:
example = examples.iloc[failed_queries[8]]
pprint(f"Original sentence: {example.sent_original}")
pprint(f"Example: {example.sent_no_cit}")
print(f"Target doi: {example.citation_dois}")

('Original sentence: Evidence for very great youth comes from the study of BM '
 'Ori (θ 1 Ori B) by Palla Stahler (2001) , who find that star to have an age '
 'of 100,000 years.')
('Example: Evidence for very great youth comes from the study of BM Ori (θ 1 '
 'Ori B) by Palla  , who find that star to have an age of 100,000 years.')
Target doi: ['10.1086/319078']


### Conclusion: incorrect target doi from failing to extract first author

## Failure 9 


In [96]:
example = examples.iloc[failed_queries[9]]
pprint(f"Original sentence: {example.sent_original}")
pprint(f"Example: {example.sent_no_cit}")
print(f"Target doi: {example.citation_dois}")

('Original sentence: The second brightest SCUBA source, falling just outside '
 'the primary WFPC2 field, has no obvious counterpart in the flanking field '
 'WFPC2 images, nor in the NICMOS data of Dickinson et al (2000b) .')
('Example: The second brightest SCUBA source, falling just outside the primary '
 'WFPC2 field, has no obvious counterpart in the flanking field WFPC2 images, '
 'nor in the NICMOS data of  .')
Target doi: ['10.1086/308508']


In [97]:
research[research.doi == example.citation_dois[0]]

Unnamed: 0,bibcode,abstract,aff,author,bibstem,doctype,doi,id,pubdate,title,read_count,reference,data,citation_count,citation,body,dois,keywords,loaded_from,body_sentences
42955,2000ApJ...531..624D,We describe an object in the Hubble Deep Field...,"[Visiting Astronomer, Kitt Peak National Obser...","[Dickinson, Mark, Hanley, Christopher, Elston,...","[ApJ, ApJ...531]",article,10.1086/308508,11681082,2000-03-01,The Unusual Infrared Object HDF-N J123656.3+62...,20,"[1969ApJ...158L.133B, 1977ApJ...212..438M, 198...","[ESA:1, NED:4, SIMBAD:15, hst:1]",103,"[1999ASPC..193..448B, 2000AJ....120.2735W, 200...","1. INTRODUCTION In recent years, astronomers h...","[10.1086/308508, 10.48550/arXiv.astro-ph/9908083]","[COSMOLOGY: EARLY UNIVERSE, GALAXIES: EVOLUTIO...",data/json/salvaged_articles.json,"[1. INTRODUCTION In recent years, astronomers ..."


The research was in the dataset. The original sentence refers to images taken by a camera system (SCUBA) and ongoing research into some of its data. This includes Dickinson et all (2000b) which describes a specific object in the HDF found in a different dataset (NICMOS) and hypothesizes about its nature.

Let's see how the abstract or other chunks compare to the query embedding:

In [104]:
targets = [(chunk[0], chunk[1]) for chunk in db.query(f"SELECT chunk, abstract FROM lib WHERE doi = '{example.citation_dois[0]}'")]
print(f"Number of chunks: {len(targets)}")
target_chunks = [chunk[0] for chunk in targets]
target_abstract = targets[0][1]
print(f"Target abstract: {target_abstract}")

Number of chunks: 34
Target abstract: We describe an object in the Hubble Deep Field North with very unusual near-infrared properties. It is readily visible in Hubble Space Telescope NICMOS images at 1.6 μm and from the ground at 2.2 μm, but it is undetected (with S/N&lt;~2) in very deep WFPC2 and NICMOS data from 0.3 to 1.1 μm. The f<SUB>ν</SUB> flux density drops by a factor &gt;~8.3 (97.7% confidence) from 1.6 to 1.1 μm. The object is compact but may be slightly resolved in the NICMOS 1.6 μm image. In a low-resolution, near-infrared spectrogram, we find a possible emission line at 1.643 μm, but a reobservation at higher spectral resolution failed to confirm the line, leaving its reality in doubt. We consider various hypotheses for the nature of this object. Its colors are unlike those of known Galactic stars, except perhaps the most extreme carbon stars or Mira variables with thick circumstellar dust shells. It does not appear to be possible to explain its spectral energy distributi

Let's see how the embedding of the abstract alone would do against this query

In [None]:
# Compare abstract embedding to query embedding
abstract_embedding = bge_embedder([target_abstract])[0]
example_embedding = bge_embedder([example.sent_no_cit])[0]
print(cosine_distance(abstract_embedding, example_embedding))

0.209521


In [106]:
# Compare other chunks to query embedding
chunk_embeddings = bge_embedder(target_chunks)
for i in range(len(target_chunks)):
    chunk = target_chunks[i]
    chunk_embedding = chunk_embeddings[i]
    distance = cosine_distance(example_embedding, chunk_embedding)
    print(f"({i}) Distance: {distance}")
    print(chunk[:100])
    print()

(0) Distance: 0.21469300985336304
Its formal significance depends on how one treats the non‐Gaussian tail of positive pixel values due

(1) Distance: 0.21431702375411987
(1999) ISO source list, as well as a possible 3 σ detection in the 1.3 mm IRAM map of Downes et al. 

(2) Distance: 0.2400287389755249
The target was periodically reacquired and placed at new positions along the slit. Seeing throughout

(3) Distance: 0.2389879822731018
Spectra were extracted through a 3 pixel (105) wide window at the nominal position of the object rel

(4) Distance: 0.23204004764556885
We may therefore make a direct comparison to the observed luminosity function ( Dickinson 1998 ; Ste

(5) Distance: 0.2336444854736328
If HDFN‐JD1 were a galaxy forming stars at with a Salpeter IMF and without dust, its UV luminosity w

(6) Distance: 0.24145996570587158
If rapid, monolithic galaxy formation took place anywhere in that redshift range, then either it was

(7) Distance: 0.21190601587295532
The NICMOS images

Not clear how this could be improved

### Conclusion: requires further research

## Failure 10

In [108]:
example = examples.iloc[failed_queries[10]]
pprint(f"Original sentence: {example.sent_original}")
pprint(f"Example: {example.sent_no_cit}")
print(f"Target doi: {example.citation_dois}")

('Original sentence: Models have been suggested for intermittent pulsars and '
 'RRATs (e.g., Luo Melrose 2007 ) that might be relevant to FRBs but the '
 'vastly different energetics may make these models less relevant.')
('Example: Models have been suggested for intermittent pulsars and RRATs '
 '(e.g., Luo  that might be relevant to FRBs but the vastly different '
 'energetics may make these models less relevant.')
Target doi: ['10.1086/510850']


### Conclusion: incorrect target doi from failure to extract first author

## Failure 11

In [109]:
example = examples.iloc[failed_queries[11]]
pprint(f"Original sentence: {example.sent_original}")
pprint(f"Example: {example.sent_no_cit}")
print(f"Target doi: {example.citation_dois}")

('Original sentence: However, the lack of high cadence data that could catch '
 'the brightenings/dimmings associated with early reconnection implied that '
 'they could not rule out the tether-cutting scenario (see also Sterling and '
 'Moore, 2004 ).')
('Example: However, the lack of high cadence data that could catch the '
 'brightenings/dimmings associated with early reconnection implied that they '
 'could not rule out the tether-cutting scenario (see also Sterling and .')
Target doi: ['10.1086/381085']


### Conclusion: incorrect target doi from failure to extract first author

## Failure 12

In [111]:
example = examples.iloc[failed_queries[12]]
pprint(f"Original sentence: {example.sent_original}")
pprint(f"Example: {example.sent_no_cit}")
print(f"Target doi: {example.citation_dois}")

('Original sentence: 3. GLOBAL ASTROMETRY WITH THE HIPPARCOS–<italic '
 'toggle="yes">Gaia</italic> CONCEPT The concept for performing global '
 'astrometry with Gaia follows the same principles that were used for the '
 'Hipparcos mission ( Lindegren 2005 ; see the sidebar titled Gaia in Brief):')
('Example: 3. GLOBAL ASTROMETRY WITH THE HIPPARCOS–<italic '
 'toggle="yes">Gaia</italic> CONCEPT The concept for performing global '
 'astrometry with Gaia follows the same principles that were used for the '
 'Hipparcos mission ( ; see the sidebar titled Gaia in Brief):')
Target doi: ['10.1051/0004-6361:20042241']


The is the incorrect target doi. The inline citation refers to Lindegren 2005, which is presumably meant to be bibcode `2005ASPC..338...25L` "Performance of the Gaia Mission". The doi resolved refers to a Lamers paper

In [112]:
example_record = reviews[reviews.doi == example.source_doi].iloc[0]
for ref in example_record.reference:
    if ref[:4] == "2005":
        print(ref)

2005A&A...438..745B
2005A&A...441..117L


### Conclusion: error in dataset

## Failure 13

In [113]:
example = examples.iloc[failed_queries[13]]
pprint(f"Original sentence: {example.sent_original}")
pprint(f"Example: {example.sent_no_cit}")
print(f"Target doi: {example.citation_dois}")

('Original sentence: The corresponding for an isothermal turbulent field, '
 'however, does not have a unique functional form (Vázquez-Semadeni and García '
 '2001 ).')
('Example: The corresponding for an isothermal turbulent field, however, does '
 'not have a unique functional form (Vázquez-Semadeni and .')
Target doi: ['10.1086/322873']



### Conclusion: incorrect first author extraction

## Failure 14

In [114]:
example = examples.iloc[failed_queries[14]]
pprint(f"Original sentence: {example.sent_original}")
pprint(f"Example: {example.sent_no_cit}")
print(f"Target doi: {example.citation_dois}")

('Original sentence: This is somewhat above the 95% confidence level upper '
 'limit at 1.25 μm of Wright (2001b) .')
('Example: This is somewhat above the 95% confidence level upper limit at 1.25 '
 'μm of  .')
Target doi: ['10.1086/320942']


This example clearly seems to lack sufficient context.

In [118]:
print(f"Original sentence index: {example.sent_idx}")
example_record = reviews[reviews.doi == example.source_doi].iloc[0]
contextualized_example = " ".join(example_record.body_sentences[example.sent_idx - 4: example.sent_idx + 1])
pprint(f"Contextualized example: {contextualized_example}")

Original sentence index: 294
('Contextualized example: After subtraction of the IPD contribution, the '
 'residuals showed some remaining variation with ecliptic latitude, which they '
 'interpreted as evidence for a fairly isotropic background. To obtain a '
 'quantitative value for the background at each wavelength, they correlated '
 'their star-subtracted brightness at each point with the IPD model brightness '
 'and used the extrapolation to zero IPD contribution as a measurement of the '
 'CIB. The CIB intensities reported by Matsumoto et al. near 2.2 and 3.5 μm '
 'are similar to the values found by Gorjian et al. (2000) ( Figure 3 ). At '
 'shorter wavelengths, the Matsumoto et al. (2000) results continue to rise '
 'steeply to ∼65 nW m −2 sr −1 at 1.4 μm. This is somewhat above the 95% '
 'confidence level upper limit at 1.25 μm of Wright (2001b) .')


In [119]:
contextualized_query_vector = bge_embedder([contextualized_example])[0]
contextualized_query_results = db.query_vector_column(
    query_vector=contextualized_query_vector,
    target_table="lib",
    target_column="bge_norm",
    pubdate=example.pubdate,
    top_k=10_000,
    probes=40,
    explain=False,
)
contextualized_target_rank = get_index_of_target(contextualized_query_results, example.citation_dois[0])
print(f"Contextualized target rank: {contextualized_target_rank}")

Contextualized target rank: -1


### Conclusion: it is difficult to predict the citation semantically when the sentence directly refers to the research by name. Perhaps we should include an author filter to handle this as a separate use case?

## Failure 15

In [120]:
example = examples.iloc[failed_queries[15]]
pprint(f"Original sentence: {example.sent_original}")
pprint(f"Example: {example.sent_no_cit}")
print(f"Target doi: {example.citation_dois}")

('Original sentence: In regard to the high multiple of 45 suggested by Usoskin '
 'et al. ( 2013 ) for the spectrum of the 1956 GLE to account for the 774 SEP '
 'event, this factor was later found to be an underestimate (see Sect. 7.4.2 '
 ').')
('Example: In regard to the high multiple of 45 suggested by  for the spectrum '
 'of the 1956 GLE to account for the 774 SEP event, this factor was later '
 'found to be an underestimate (see Sect. 7.4.2 ).')
Target doi: ['10.1051/0004-6361/201321080']


This is the correct doi. Let's see what it thought was closer

In [122]:
example_query_results = bge_query_results[failed_queries[15]]
for i in range(3):
    query_result = example_query_results[i]
    print(f"Rank: {i}")
    print(f"Distance: {query_result.distance}")
    pprint(f"Chunk: {query_result.chunk}")
    print()

Rank: 0
Distance: 0.17195202551983668
('Chunk: 2003 ). If the temperature is significantly higher than indicated by '
 'the assumed equation of state in the simulation, then the results for '
 'fragmentation in the region whose temperature has been underestimated must '
 'be regarded with suspicion.')

Rank: 1
Distance: 0.17688580475876237
('Chunk: For the same case, the excess energies varied by about + 50 °/~ and '
 '-1O°/~.')

Rank: 2
Distance: 0.1797103316023273
('Chunk: This might be due to either an underestimate of the Mg abundance in '
 'the stellar wind, or an underestimate of the ISM pressure surrounding the '
 'star.')



None of these are particularly close. The passage discusses
- a data point for the 1956 GLE event
- its relation to the 774-775 SEP event

Is it possible the embedder isn't giving proper weight to the GLE and SEP events since they are acronyms?

Let's see how astrobert does, and let's also try unpacking the acronyms

In [123]:
astrobert_embedder = get_embedder("adsabs/astroBERT", device=device, normalize=False)
query_vector_astrobert = astrobert_embedder([example.sent_no_cit])[0]
query_results_astrobert = db.query_vector_column(
    query_vector=query_vector_astrobert,
    target_table="lib",
    target_column="astrobert",
    pubdate=example.pubdate,
    top_k=10_000,
    probes=40,
    explain=False,
)
astrobert_target_rank = get_index_of_target(query_results_astrobert, example.citation_dois[0])
print(f"AstroBERT target rank: {astrobert_target_rank}")

AstroBERT target rank: 2952


In [127]:
nasa_embedder = get_embedder("nasa-impact/nasa-ibm-st.38m", device=device, normalize=True)
query_vector_nasa = nasa_embedder([example.sent_no_cit])[0]
query_results_nasa = db.query_vector_column(
    query_vector=query_vector_nasa,
    target_table="lib",
    target_column="nasa",
    pubdate=example.pubdate,
    top_k=10_000,
    probes=40,
    explain=False,
)
nasa_target_rank = get_index_of_target(query_results_nasa, example.citation_dois[0])
print(f"Astrosage target rank: {nasa_target_rank}")

Astrosage target rank: -1


Not great. Back to BGE with unpacked acronyms:

In [124]:
modified_query = example.sent_no_cit.replace("GLE", "Ground Level Enhancement").replace("SEP", "Solar Energetic Particle")
query_vector_modified = bge_embedder([modified_query])[0]
query_results_modified = db.query_vector_column(
    query_vector=query_vector_modified,
    target_table="lib",
    target_column="bge_norm",
    pubdate=example.pubdate,
    top_k=10_000,
    probes=40,
    explain=False,
)
modified_target_rank = get_index_of_target(query_results_modified, example.citation_dois[0])
print(f"Modified target rank: {modified_target_rank}")

Modified target rank: 3339


### Conclusion: the embedder struggled with acronyms, even ones trained on astrophysics. Might require more research on query expansion

## Failure 16



In [128]:
example = examples.iloc[failed_queries[16]]
pprint(f"Original sentence: {example.sent_original}")
pprint(f"Example: {example.sent_no_cit}")
print(f"Target doi: {example.citation_dois}")

('Original sentence: The FP shifts imply a decrease of the rest-frame M/L '
 'ratio Δlog M/L B ≃ −0.46 z ( van Dokkum Stanford 2003 ), but—as emphasized '
 'above— the formation redshift one can derive from it depends on both '
 'cosmology and the IMF.')
('Example: The FP shifts imply a decrease of the rest-frame M/L ratio Δlog M/L '
 'B ≃ −0.46 z ( van Dokkum , but—as emphasized above— the formation redshift '
 'one can derive from it depends on both cosmology and the IMF.')
Target doi: ['10.1086/374570']


### Conclusion: incorrect first author extraction

## Failure 17

In [129]:
example = examples.iloc[failed_queries[17]]
pprint(f"Original sentence: {example.sent_original}")
pprint(f"Example: {example.sent_no_cit}")
print(f"Target doi: {example.citation_dois}")

('Original sentence: This is clearly shown by the lateral duplicate sampling '
 'performed by Scott et al. (2017, SI fig. S4) .')
('Example: This is clearly shown by the lateral duplicate sampling performed '
 'by , SI fig. S4) .')
Target doi: ['10.1016/j.quascirev.2017.01.005']


This is the wrong target doi. Given target doi refers to Slowinski et al, 2017 but Scott et al. 2017 could not be found.

### Conclusion: error in dataset. Fix or remove