In [None]:
import json
results = json.load(
    open(
        "experiments/results/contributions_identity_normTrue_n100_topk1000_20250718_101133/query_results_contributions_identity_normTrue_n100_topk1000_20250718_101133.json"
    )
)

In [60]:
import pandas as pd

In [None]:
import numpy as np

def cosine_distance(a, b):
    return 1 - np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b))

In [17]:
# Get the unique dois from the database contribution table
from database.database import Database

db = Database()
db.test_connection()

Database         User             Host                             Port            
citelinedb       bbasseri         localhost                        5432            
Database version: ('PostgreSQL 17.5 (Homebrew) on aarch64-apple-darwin24.4.0, compiled by Apple clang version 17.0.0 (clang-1700.0.13.3), 64-bit',)


In [24]:
from Embedders import get_embedder

embedder = get_embedder(model_name="BAAI/bge-large-en-v1.5", device="mps", normalize=True)

In [9]:
"""
results is a list of lists: [query, query results] for each query.
query: dict
query_results: list of dicts [first result, second result, ...]


"""
from pprint import pprint
print("SCHEMA EXAMPLE:")
pprint(results[0][0])
print("=" * 100)
pprint("RESULTS EXAMPLE:")
pprint(results[0][1][0])

SCHEMA EXAMPLE:
{'citation_dois': ['10.1086/173633'],
 'expanded_query': 'Alternatively, photons produced at the disk or nucleus can '
                   'be reprocessed and/or scattered and therefore isotropized '
                   'in a region of appropriate scale ( [REF] ); their energy '
                   'density is then amplified in the jet frame by a factor Γ 2 '
                   '.',
 'first_rank': 37,
 'last_rank': 37,
 'pubdate': '1997-01-01',
 'resolved_bibcodes': ['1994ApJ...421..153S'],
 'sent_idx': 641,
 'sent_no_cit': 'Alternatively, photons produced at the disk or nucleus can be '
                'reprocessed and/or scattered and therefore isotropized in a '
                'region of appropriate scale ( [REF] ); their energy density '
                'is then amplified in the jet frame by a factor Γ 2 .',
 'sent_original': 'Alternatively, photons produced at the disk or nucleus can '
                  'be reprocessed and/or scattered and therefore isotropized '
   

In [10]:
# Get the records that never found all the target DOIs with topk 1000
failed_records = [record for record in results if record[0]["last_rank"] is None]
print(f"Number of records that failed to find all target DOIs: {len(failed_records)}")


Number of records that failed to find all target DOIs: 16


In [102]:
class QueryResult():
    def __init__(self, result: dict):
        self.distance = result["distance"]
        self.doi = result["doi"]
        self.pubdate = result["pubdate"]
        self.text = result["text"]


class FailureAnalysis():
    def __init__(self, failure: list, embedder):
        query, query_results = failure
        # Extract query keys
        self.citation_dois = query["citation_dois"]
        self.expanded_query = query["expanded_query"]
        self.first_rank = query["first_rank"]
        self.last_rank = query["last_rank"]
        self.pubdate = query["pubdate"]
        self.sent_idx = query["sent_idx"]
        self.sent_no_cit = query["sent_no_cit"]
        self.sent_original = query["sent_original"]
        self.source_doi = query["source_doi"]

        self.query_results = [QueryResult(result) for result in query_results]

        # Establish if there are any completely unfound target DOIs
        self.unfound_dois = set(self.citation_dois) - {result.doi for result in self.query_results}

        # Provide reference to the db and an embedder
        self.db = Database()
        self.embedder = embedder
        self.query_vector = self.embedder([self.expanded_query])[0]

        self.comparison_to_targets = self.__compare_contributions()

    def __compare_contributions(self):
        all_comparisons = dict()
        for doi in self.unfound_dois:
            contributions_results = self.db.query(
                f"SELECT text, embedding FROM contributions WHERE doi = '{doi}'"
            )
            comparisons = [{'text': text, 'distance': cosine_distance(self.query_vector, embedding)} for text, embedding in contributions_results]
            all_comparisons[doi] = comparisons
        return all_comparisons

    def print_overview(self):
        print(f"Query: {self.expanded_query}")
        print(f"  Source DOI: {self.source_doi}")
        print(f"  Sent original: {self.sent_original}")
        print(f"  Sent no cit: {self.sent_no_cit}")
        print(f"  Citation DOIs: {self.citation_dois}")
        print(f"First rank: {self.first_rank}, Last rank: {self.last_rank}") 
        print(f"Unfound DOIs: {self.unfound_dois}")
        print("="*100)
        print("First 3 query results:")
        for i, result in enumerate(self.query_results[:3]):
            print(f"Distance: {result.distance:.4f}")
            print(f"DOI: {result.doi}")
            print(f"Text: {result.text}")
            print("-"*50)

    def print_query_comparison_to_targets(self):
        for doi, results in self.comparison_to_targets.items():
            print(f"Comparing query to contributions for DOI: {doi}")
            for res in results:
                print(f"  {res['distance']:.4f}: {res['text']}")
            print("="*50)

fa = FailureAnalysis(failed_records[3], embedder)
fa.print_overview()
fa.print_query_comparison_to_targets()

Query: This follows from equating in the contact discontinuity frame the kinetic flux L /4π r 2 to the external ram pressure ρ ext γ 2 during the initial phase while γ ∼ constant, r ∝ t ( [REF] ).
  Source DOI: 10.1146/annurev.astro.40.060401.093821
  Sent original: This follows from equating in the contact discontinuity frame the kinetic flux L /4π r 2 to the external ram pressure ρ ext γ 2 during the initial phase while γ ∼ constant, r ∝ t ( Rees Mészáros 1992 ; see also Sari 1998 ).
  Sent no cit: This follows from equating in the contact discontinuity frame the kinetic flux L /4π r 2 to the external ram pressure ρ ext γ 2 during the initial phase while γ ∼ constant, r ∝ t ( [REF] ).
  Citation DOIs: ['10.1093/mnras/258.1.41P', '10.1086/311269']
First rank: 273, Last rank: None
Unfound DOIs: {'10.1093/mnras/258.1.41P'}
First 3 query results:
Distance: 0.3197
DOI: 10.1086/306297
Text: Onset flux of ~2 × 10^19 Mx in a dark pore appearing in an area where the flux increases by ~1 × 10^

In [100]:
print(f"{'Dist':<8}Text")
for doi, results in fa.compare_contributions().items():
    for res in results:
        print(f"{res['distance']:.4f}: {res['text']}")

Dist    Text
0.4178: The expansion energy of a relativistic fireball can be reconverted into radiation upon interaction with an external medium, with time-scales of the order of seconds for Lorentz factors greater than or approximately equal to 1000.
0.4738: This mechanism operates in cosmological scenarios of gamma-ray bursts involving initial energies of about a percent of a stellar rest mass, producing photon energies and time-scales compatible with observed gamma-ray bursts.
0.4364: Interaction with even very tenuous external matter efficiently reconverts the kinetic energy of a baryon-loaded relativistic fireball into high-energy photons, observable on time-scales similar to those of gamma-ray bursts.
0.4098: The deceleration of a fireball in an external medium leads to the reconversion of bulk kinetic energy into thermal energy, which can be radiated away if cooling times are sufficiently short.
0.4813: For a fireball with specific energy and external density parameters, the obse

In [16]:
# Look at the query
query = failed_records[0][0]
for key, value in query.items():
    print(f"{key}: {value}")

print("=" * 100)

# Find out which DOIs were not found
found_record = failed_records[0][1][query['first_rank']-1]
for key, value in found_record.items():
    print(f"{key}: {value}")

source_doi: 10.1146/annurev-astro-082812-140953
sent_original: Tacconi et al. (2010 , 2013 ) used this technique to identify samples of CSGs at z ∼ 1 to 3 for CO observations.
sent_no_cit: [REF] ) used this technique to identify samples of CSGs at z ∼ 1 to 3 for CO observations.
sent_idx: 361
citation_dois: ['10.1038/nature08773', '10.1088/0004-637X/768/1/74']
pubdate: 2013-08-01
resolved_bibcodes: ['2010Natur.463..781T', '2013ApJ...768...74T']
expanded_query: [REF] ) used this technique to identify samples of CSGs at z ∼ 1 to 3 for CO observations.
first_rank: 4
last_rank: None
text: PHIBSS provides 52 CO detections in two redshift slices at z ~ 1.2 and 2.2, with log(M*(M⊙)) >= 10.4 and log(SFR(M⊙/yr)) >= 1.5.
doi: 10.1088/0004-637X/768/1/74
pubdate: 2013-05-01
distance: 0.2715341721136365


In [41]:
# Get the contributions from the database for the missing DOI
contributions_results = db.query(
    "SELECT text, embedding FROM contributions WHERE doi = '10.1038/nature08773'"
)
contributions = [result[0] for result in contributions_results]
for text, c_embedding in contributions_results:
    print(text)
    print(f"Cosine distance to query: {cosine_distance(query_embedding, c_embedding):.4f}")
    print("=" * 100)

Distant star-forming galaxies at redshifts z ≈ 1.2 and z ≈ 2.3 were found to be gas-rich, with cold gas fractions of about 34% and 44% of their total baryonic mass, respectively.
Cosine distance to query: 0.3874
The star formation efficiency in these galaxies does not strongly depend on cosmic epoch, indicating similar star formation processes across different times.
Cosine distance to query: 0.5026
The cold gas fractions in these young galaxies are three to ten times higher than in today's massive spiral galaxies, suggesting a significant evolution in gas content over cosmic time.
Cosine distance to query: 0.4687
Observations reveal that the molecular gas in these galaxies is distributed in giant clumps with masses of ∼5 × 10^9 M⊙, diameters of 2–4 kpc, and surface densities ≥500 M⊙ pc^-2, which are larger but similar to conglomerates of molecular gas in local spiral galaxies.
Cosine distance to query: 0.4033
The CO dynamics in observed galaxies trace ordered rotating disk patterns, w

In [42]:
query, query_results = failed_records[1]
for key, value in query.items():
    print(f"{key}: {value}")

print("=" * 100)
for key, value in query_results[0].items():
    print(f"{key}: {value}")

source_doi: 10.1007/s00159-022-00144-z
sent_original: In Table 2 , that is a slightly modified version of Table 1 of Romano et al. ( 2017 ), determinations of , , and in a number of local and high-redshift objects (see Sect. 3.2.1 ) are reported.
sent_no_cit: In Table 2 , that is a slightly modified version of Table 1 of [REF], determinations of , , and in a number of local and high-redshift objects (see Sect. 3.2.1 ) are reported.
sent_idx: 678
citation_dois: ['10.1093/mnras/stx1197']
pubdate: 2022-12-01
resolved_bibcodes: ['2017MNRAS.470..401R']
expanded_query: In Table 2 , that is a slightly modified version of Table 1 of [REF], determinations of , , and in a number of local and high-redshift objects (see Sect. 3.2.1 ) are reported.
first_rank: None
last_rank: None
text: Some z ∼ 2 galaxies exhibit N/O ratios exceeding those of local galaxies in the N/O-O/H relation, while most show comparable N/O ratios in the N/O-stellar mass relation.
doi: 10.1093/pasj/psx017
pubdate: 2017-06-01


In [47]:
target_doi = query['citation_dois'][0]
contributions_results = db.query(
    f"SELECT text, embedding FROM contributions WHERE doi = '{target_doi}'"
)
for text, c_embedding in contributions_results:
    print(text)
    print(f"\tCosine distance to query: {cosine_distance(query_embedding, c_embedding):.4f}")

State-of-the-art chemical models track the cosmic evolution of CNO isotopes in the interstellar medium of galaxies, providing constraints on their stellar initial mass function (IMF).
	Cosine distance to query: 0.4121
The study reassesses the roles of massive stars, asymptotic giant branch (AGB) stars, and novae in producing rare isotopes like 13C, 15N, 17O, and 18O, alongside 12C, 14N, and 16O.
	Cosine distance to query: 0.4269
Includes CNO isotope yields from super-AGB stars, novae, and fast-rotating massive stars in the analysis.
	Cosine distance to query: 0.3940
Reproduces isotope enrichment data in the solar neighborhood and across the Galaxy, assessing model sensitivity to uncertainties like nova yields and star formation history.
	Cosine distance to query: 0.4104
Demonstrates the ability to constrain the stellar IMF in galaxies using C, O, and N isotope abundance ratios.
	Cosine distance to query: 0.3615
Finds compelling evidence for a top-heavy stellar IMF in starburst galaxies

In [70]:
# If the data wasn't missing, would it have done any better?
test_query = "In Table 2 , that is a slightly modified version of Table 1 of [REF], " \
                + "determinations of 12CO/13CO, 12CO/C180, and 13CO/C18O in a number of local and high-redshift objects " \
                + "(see Sect. 3.2.1 ) are reported"

In [71]:
results = db.vector_search(
    query_vector=embedder([test_query][0]),
    target_table="contributions",
    target_column="embedding",
    metric="vector_cosine_ops",
    pubdate=query['pubdate'],
    top_k=1000
)

In [72]:
rows_for_target = results[results['doi'] == target_doi]
with pd.option_context('display.max_colwidth', None):
    print(rows_for_target)

                                                                                                                                                                                        text  \
44                                                                             Demonstrates the ability to constrain the stellar IMF in galaxies using C, O, and N isotope abundance ratios.   
289     The study reassesses the roles of massive stars, asymptotic giant branch (AGB) stars, and novae in producing rare isotopes like 13C, 15N, 17O, and 18O, alongside 12C, 14N, and 16O.   
908  State-of-the-art chemical models track the cosmic evolution of CNO isotopes in the interstellar medium of galaxies, providing constraints on their stellar initial mass function (IMF).   

                       doi     pubdate  distance  
44   10.1093/mnras/stx1197  2017-09-01  0.282092  
289  10.1093/mnras/stx1197  2017-09-01  0.308724  
908  10.1093/mnras/stx1197  2017-09-01  0.326546  


In [65]:
with pd.option_context('display.max_colwidth', None):
    print(results.iloc[44])

text        Demonstrates the ability to constrain the stellar IMF in galaxies using C, O, and N isotope abundance ratios.
doi                                                                                                 10.1093/mnras/stx1197
pubdate                                                                                                        2017-09-01
distance                                                                                                         0.282092
Name: 44, dtype: object


In [73]:
def inspect(failed_example):
    query, query_results = failed_example
    print("Query:")
    for key, value in query.items():
        print(f"{key}: {value}")

    print("\nFirst Result:")
    for key, value in query_results[0].items():
        print(f"{key}: {value}")
    print("=" * 50)

    unfound_target_dois = set(query['citation_dois'])
    for i, result in enumerate(query_results):
        if result['doi'] in unfound_target_dois:
            unfound_target_dois.remove(result['doi'])
            print(f"Found DOI {result['doi']} at rank {i + 1}")
    print(f"Unfound DOIs: {unfound_target_dois}")


In [75]:
inspect(failed_records[2])

Query:
source_doi: 10.1007/s00159-015-0084-4
sent_original: is a reasonable assumption in the star-forming regions of the disk (Bell 1978 ).
sent_no_cit: is a reasonable assumption in the star-forming regions of the disk ([REF] ).
sent_idx: 118
citation_dois: ['10.1093/mnras/182.3.443']
pubdate: 2015-12-01
resolved_bibcodes: ['1978MNRAS.182..443B']
expanded_query: is a reasonable assumption in the star-forming regions of the disk ([REF] ).
first_rank: None
last_rank: None

First Result:
text: Optically thick starburst disks have characteristic properties: flux F ~ 10^13 L_solar kpc^-2, star formation rate per unit area Σ˙* ~ 10^3 M_solar yr^-1 kpc^-2, and dust effective temperature T_eff ~ 90 K.
doi: 10.1086/431923
pubdate: 2005-09-01
distance: 0.22343240487480354
Unfound DOIs: {'10.1093/mnras/182.3.443'}


In [76]:
def fixed_query(query, target_doi, pubdate):
    results = db.vector_search(
        query_vector=embedder([query])[0],
        target_table="contributions",
        target_column="embedding",
        metric="vector_cosine_ops",
        pubdate=pubdate,
        top_k=1000
    )
    rows_for_target = results[results['doi'] == target_doi]
    with pd.option_context('display.max_colwidth', None):
        print(rows_for_target)

    return results

In [77]:
fixed_query_str = "K_0 \simeq 100 is a reasonable assumption in the star-forming regions of the disk ([REF] )"
target_doi = "10.1093/mnras/182.3.443"
pubdate = "2015-12-01"
results = fixed_query(fixed_query_str, target_doi, pubdate)

Empty DataFrame
Columns: [text, doi, pubdate, distance]
Index: []


In [82]:
contributions_results = db.query(
    f"SELECT text, embedding FROM contributions WHERE doi = '{target_doi}'"
)
fixed_embedding = embedder([fixed_query_str])[0]
for text, c_embedding in contributions_results:
    print(text)
    print(f"\tCosine distance to query: {cosine_distance(fixed_embedding, c_embedding):.4f}")

The acceleration to relativistic energies of the high-energy tail of the particle distribution produced by a shock front is discussed, with particles needing to pass through the shock without strong deflection to be accelerated.
	Cosine distance to query: 0.3965
Large numbers of protons, and likely electrons, are produced by a shock front that satisfy the condition for acceleration, using Earth's bow shock as an example.
	Cosine distance to query: 0.4568
The energy spectrum of initially nonrelativistic particles is calculated, providing an estimate of the density of cosmic-ray particles in a shocked gas and indicating a large proportion of random energy from a shock is given to accelerated particles.
	Cosine distance to query: 0.3381
Synchrotron radio emission from energetic electrons in a shocked gas is calculated, with theoretical and observed flux densities of supernova remnants Tycho's SNR and Cas A found to agree satisfactorily.
	Cosine distance to query: 0.3500
The implications f

In [81]:
results.iloc[0]

text        New estimates of the spectral dependence of di...
doi                                            10.1086/511741
pubdate                                            2007-04-01
distance                                             0.200788
Name: 0, dtype: object

In [83]:
inspect(failed_records[3])

Query:
source_doi: 10.1146/annurev.astro.40.060401.093821
sent_original: This follows from equating in the contact discontinuity frame the kinetic flux L /4π r 2 to the external ram pressure ρ ext γ 2 during the initial phase while γ ∼ constant, r ∝ t ( Rees Mészáros 1992 ; see also Sari 1998 ).
sent_no_cit: This follows from equating in the contact discontinuity frame the kinetic flux L /4π r 2 to the external ram pressure ρ ext γ 2 during the initial phase while γ ∼ constant, r ∝ t ( [REF] ).
sent_idx: 77
citation_dois: ['10.1093/mnras/258.1.41P', '10.1086/311269']
pubdate: 2002-01-01
resolved_bibcodes: ['1992MNRAS.258P..41R', '1998ApJ...497L..17S']
expanded_query: This follows from equating in the contact discontinuity frame the kinetic flux L /4π r 2 to the external ram pressure ρ ext γ 2 during the initial phase while γ ∼ constant, r ∝ t ( [REF] ).
first_rank: 273
last_rank: None

First Result:
text: Onset flux of ~2 × 10^19 Mx in a dark pore appearing in an area where the flux in

In [85]:
target_doi = "10.1093/mnras/258.1.41P"
contributions_results = db.query(f"SELECT text, embedding FROM contributions WHERE doi = '{target_doi}'")

query_embedding = embedder([failed_records[3][0]['expanded_query']])[0]
for text, c_embedding in contributions_results:
    print(text)
    print(f"\tCosine distance to query: {cosine_distance(query_embedding, c_embedding):.4f}")

The expansion energy of a relativistic fireball can be reconverted into radiation upon interaction with an external medium, with time-scales of the order of seconds for Lorentz factors greater than or approximately equal to 1000.
	Cosine distance to query: 0.4178
This mechanism operates in cosmological scenarios of gamma-ray bursts involving initial energies of about a percent of a stellar rest mass, producing photon energies and time-scales compatible with observed gamma-ray bursts.
	Cosine distance to query: 0.4738
Interaction with even very tenuous external matter efficiently reconverts the kinetic energy of a baryon-loaded relativistic fireball into high-energy photons, observable on time-scales similar to those of gamma-ray bursts.
	Cosine distance to query: 0.4364
The deceleration of a fireball in an external medium leads to the reconversion of bulk kinetic energy into thermal energy, which can be radiated away if cooling times are sufficiently short.
	Cosine distance to query: 0