In [29]:
import numpy as np
import pandas as pd
import json
# Load the saved retrieval results. A context manager is used so the file
# handle is closed as soon as parsing finishes (the bare open() call left it
# open for the lifetime of the kernel). JSON is read explicitly as UTF-8.
with open(
    "experiments/results/contributions_add_prev_3_normTrue_n12006_topk1000_20250725_153323/query_results_contributions_add_prev_3_normTrue_n12006_topk1000_20250725_153323.json",
    encoding="utf-8",
) as results_file:
    results = json.load(results_file)

def cosine_distance(a, b):
    """Return 1 minus the cosine similarity of vectors `a` and `b`."""
    inner_product = np.dot(a, b)
    norm_product = np.linalg.norm(a) * np.linalg.norm(b)
    return 1 - inner_product / norm_product

In [30]:
from embedders import get_embedder
from database.database import Database

# Embedding model shared by every query/contribution comparison below.
# NOTE(review): the device is hard-coded to "mps"; presumably this only works
# on Apple-silicon machines - confirm before running elsewhere.
embedder = get_embedder(
    model_name="BAAI/bge-large-en-v1.5",
    device="mps",
    normalize=True,
)

# Database handle used by the ad-hoc queries in later cells; test_connection()
# prints the connection details shown in the cell output.
db = Database()
db.test_connection()

Database         User             Host                             Port            
citelinedb       bbasseri         localhost                        5432            
Database version: ('PostgreSQL 17.5 (Homebrew) on aarch64-apple-darwin24.4.0, compiled by Apple clang version 17.0.0 (clang-1700.0.13.3), 64-bit',)


In [31]:
def fetch_by_doi(doi: str, table: str = "contributions"):
    """
    Fetch the (text, embedding) rows stored for one DOI.

    Args:
        doi: DOI to look up; matched exactly against the `doi` column.
        table: Table to read from. Must be a plain, optionally
            schema-qualified, SQL identifier.

    Returns:
        A list of ``{"text": ..., "embedding": ...}`` dicts, one per row
        returned by ``db.query``.

    Raises:
        ValueError: If `table` is not a plain SQL identifier.
    """
    import re  # local import: keeps this cell independent of other cells' imports

    # The statement is assembled as a string, so both interpolated values must
    # be made safe before they reach the database.
    if not re.fullmatch(r"[A-Za-z_][A-Za-z0-9_]*(\.[A-Za-z_][A-Za-z0-9_]*)?", table):
        raise ValueError(f"Invalid table name: {table!r}")
    # Doubling single quotes is the standard SQL way to embed a quote inside a
    # string constant; without it a DOI containing ' breaks the statement.
    quoted_doi = doi.replace("'", "''")

    rows = db.query(f"SELECT text, embedding FROM {table} WHERE doi = '{quoted_doi}'")
    return [{"text": text, "embedding": embedding} for text, embedding in rows]

In [32]:
"""
results is a list of lists: [query, query results] for each query.
query: dict
query_results: list of dicts [first result, second result, ...]
"""

class QueryResult():
    """One retrieved hit, copied out of a result dict from the results file."""

    def __init__(self, result: dict):
        # Keep only the four fields this notebook inspects; a missing key
        # raises KeyError.
        self.distance, self.doi = result["distance"], result["doi"]
        self.pubdate, self.text = result["pubdate"], result["text"]

    def __str__(self):
        # Show at most the first 200 characters of the text; the trailing
        # "..." is always appended, even for short texts.
        preview = self.text[:200]
        return f"DOI: {self.doi}, Distance: {self.distance:.6f},\n  '{preview}...'"


class FailureAnalysis():
    """
    Bundle one ``[query, query_results]`` record with the data needed to study
    why its cited DOIs were, or were not, retrieved.

    On construction the expanded query is embedded, every cited DOI that is
    absent from the stored results is fetched with ``fetch_by_doi``, and the
    cosine distance from the query to each fetched contribution is computed.
    """

    def __init__(self, failure: list, embedder):
        """
        Args:
            failure: A ``[query, query_results]`` pair; `query` is a dict of
                query metadata and `query_results` a list of result dicts.
            embedder: Callable used as ``embedder([text])[0]`` to turn the
                expanded query into a vector.
        """
        query, query_results = failure
        # Extract query keys
        self.citation_dois = query["citation_dois"]
        self.expanded_query = query["expanded_query"]
        self.first_rank = query["first_rank"]
        self.last_rank = query["last_rank"]
        self.pubdate = query["pubdate"]
        self.sent_idx = query["sent_idx"]
        self.sent_no_cit = query["sent_no_cit"]
        self.sent_original = query["sent_original"]
        self.source_doi = query["source_doi"]

        self.query_results = [QueryResult(result) for result in query_results]

        # Cited DOIs that never appear in the stored results, in citation order.
        self.unfound_dois = self._find_unfound_dois(self.citation_dois, self.query_results)

        # Keep a db handle and the embedder on the instance for ad-hoc use.
        self.db = Database()
        self.embedder = embedder
        self.query_vector = self.embedder([self.expanded_query])[0]

        # Contribution rows (text + embedding) for every unfound DOI.
        self.target_results = {doi: fetch_by_doi(doi) for doi in self.unfound_dois}

        self.comparison_to_targets = self.compare_contributions()

    @staticmethod
    def _find_unfound_dois(citation_dois, query_results):
        """
        Return the cited DOIs that are absent from `query_results`.

        The order follows `citation_dois` (duplicates dropped, first occurrence
        kept). A plain set difference would make the order depend on string
        hashing, so indexing the list could pick a different DOI on each run.
        """
        found = {result.doi for result in query_results}
        # dict.fromkeys de-duplicates while preserving first-seen order.
        return [doi for doi in dict.fromkeys(citation_dois) if doi not in found]

    def compare_contributions(self):
        """
        Compute the query-to-contribution distances for every unfound DOI.

        Returns:
            ``{doi: [{"text": ..., "distance": ...}, ...]}``, built from the
            rows already held in ``self.target_results`` so each DOI is
            fetched from the database only once and both structures list the
            contributions in the same order.
        """
        all_comparisons = dict()
        for doi in self.unfound_dois:
            all_comparisons[doi] = [
                {
                    'text': record["text"],
                    'distance': cosine_distance(self.query_vector, record["embedding"]),
                }
                for record in self.target_results[doi]
            ]
        return all_comparisons

    # Private name kept as an alias of the public method.
    __compare_contributions = compare_contributions

    def print_overview(self):
        """Print the query metadata followed by the first three stored results."""
        print(f"Query: {self.expanded_query}")
        print(f"  Source DOI: {self.source_doi}")
        print(f"  Sent original: {self.sent_original}")
        print(f"  Sent no cit: {self.sent_no_cit}")
        print(f"  Citation DOIs: {self.citation_dois}")
        print(f"First rank: {self.first_rank}, Last rank: {self.last_rank}")
        print(f"Unfound DOIs: {self.unfound_dois}")
        print("="*100)
        print("First 3 query results:")
        for result in self.query_results[:3]:
            print(f"Distance: {result.distance:.4f}")
            print(f"DOI: {result.doi}")
            print(f"Text: {result.text}")
            print("-"*50)

    def print_query_comparison_to_targets(self):
        """Print, per unfound DOI, each contribution with its distance to the query."""
        for doi, results in self.comparison_to_targets.items():
            print(f"Comparing query to documents for DOI: {doi}")
            for res in results:
                print(f"  {res['distance']:.4f}: {res['text']}")
            print("="*50)

# Build the analysis for the first record in `results` and print its summary.
fa = FailureAnalysis(results[0], embedder)

fa.print_overview()

Query: Assuming binaries are randomly oriented and observed at random phases, one expects circular orbits to preferentially be observed with = 90 deg (perpendicular position and relative velocity vectors), while eccentric orbits should preferentially have = 0 deg and = 180 deg (aligned position and velocity vectors). This is illustrated in the bottom left panel of Fig. 3 . While this method loses some information compared to the approach of Tokovinin (2020) , it has the advantage of not requiring estimates of stellar masses and is likely less sensitive to the effects of unrecognized higher-order multiples. [REF] inferred an eccentricity distribution consistent with uniform at separations of 100 au , and steadily increasing eccentricities over 100 1000 au ( Fig. 3 , bottom right).
  Source DOI: 10.1016/j.newar.2024.101694
  Sent original: Hwang et al. (2022c) inferred an eccentricity distribution consistent with uniform at separations of 100 au , and steadily increasing eccentricities o

In [None]:
from tqdm import tqdm
# Indices (into `results`) of queries counted as misses: first_rank is None or
# greater than the threshold.
# first_rank is read straight from the stored query dict. Building a full
# FailureAnalysis per record only copied that same value, at the cost of an
# embedding call and database queries for every record.
MISS_RANK_THRESHOLD = 99
misses = [
    i
    for i, (query_record, _query_results) in enumerate(results)
    if query_record["first_rank"] is None or query_record["first_rank"] > MISS_RANK_THRESHOLD
]

print(f"Total misses: {len(misses)}")

9
24
26
33
34
38
42
43
46
51
53
58
61
67
71
74
76
77
78
81
86
87
88
89
95
96
97
98
100
102
103
106
107
111
114
115
116
117
118
119
122
124
126
132
141
142
144
145
147
150
159
166
170
171
172
174
175
176
178
179
182
184
186
187
196
197
200
203
213
214
215
221
224
228
229
230
231
232
235
236
237
239
242
244
246
252
255
256
261
264
265
266
268
270
271
272
273
274
277
281
286
288
290
291
292
294
295
297
299
301
305
306
307
308
310
316
318
322
323
326
331
332
333
335
336
337
338
339
340
341
343
345
346
347
348
350
351
352
353
354
359
360
362
363
365
370
372
376
378
380
384
386
392
393
394
395
396
401
405
408
409
410
411
412
413
417
425
432
437
438
441
443
444
445
452
454
457
458
459
460
461
463
464
466
467
468
473
474
475
478
484
486
489
492
495
496
499
502
506
507
508
511
512
513
517
526
531
539
545
550
558
559
560
561
563
567
569
571
574
581
587
591
594
597
599
602
603
604
605
607
608
609
610
613
616
617
620
621
625
630
631
641
643
646
650
651
652
653
654
659
662
664
665
667
668
669
673
6

In [41]:
# Index (into `results`) of the first query flagged as a miss.
misses[0]

9

In [42]:
# Rebind `fa` to the analysis of the first miss; later cells read this `fa`.
fa = FailureAnalysis(results[misses[0]], embedder)
fa.print_overview()

Query: This was a factor of 7 more than they would have expected to find if wide binaries were formed by randomly pairing field stars. That is, if a star is an eclipsing binary, it is more likely that its wide companion is an eclipsing binary than if it is not. They interpreted this finding as resulting partly from the age- and metallicity-dependence of the binary fraction (since the components of wide binaries are generally coeval and have the same metallicity), and partly reflecting the fact that wide tertiaries aid the formation of tight binaries. 3.3 Tests of gravity Binaries with separations larger than 1 0 4 au have sufficiently low internal accelerations that they are in the so-called deep MOND regime: that is, the Modified Newtonian Dynamics (MOND; [REF] ) theory of gravity predicts measurably different orbital velocities from general relativity.
  Source DOI: 10.1016/j.newar.2024.101694
  Sent original: 3.3 Tests of gravity Binaries with separations larger than 1 0 4 au have s

In [16]:
# Print the stored distance of each of the first 20 results for the current `fa`.
for qr in fa.query_results[:20]:
    print(qr.distance)

0.2680499787151681
0.2726014202178636
0.2767469121964613
0.28057700395583596
0.2850811567757433
0.28698592963687386
0.289584006120681
0.2897678708476258
0.29126087091538044
0.2937421448538131
0.2937798921001009
0.2948578068868255
0.29548930037276233
0.2973546530834561
0.2987845005778359
0.29933546189454296
0.3001391112721483
0.3003570913417072
0.3008544445037792
0.3009333489100051


In [17]:
# Hand-written alternative query text; it is embedded below and compared with
# every contribution fetched for the first unfound DOI of the current `fa`.
with_prev3 = ("The sample contains 97 resolved "
              "WD+MS binaries, and 15 resolved WD+WD binaries. The "
              "number of WD+WD binaries is at least a factor of 10 lower "
              "than predicted by binary population synthesis models (Toonen "
              "et al., 2017). This mismatch could be partially explained by "
              "WD kicks, which would unbind many would-be WD+WD binaries. It may also owe in part to the mass ratio distribution "
              "adopted in population synthesis models, which is more topheavy than observed")

prev3_vector = embedder([with_prev3])[0]
# One printed distance per contribution row of that DOI.
for record in fa.target_results[fa.unfound_dois[0]]:
    contribution_vector = record["embedding"]
    print(cosine_distance(prev3_vector, contribution_vector))

# So the min distance with query expansion is 0.282. But does this also reduce the query results we already got?

0.31549996
0.34505463
0.3290373
0.56622386
0.48239732
0.34872222
0.28165507
0.3004474
0.37554318


In [18]:
# Distance from the alternative query vector to a fresh embedding of every
# stored result text.
# NOTE(review): this calls the embedder once per result; passing all texts in
# one list may be faster - confirm the embedder handles a batch of that size.
distance_to_expanded_query = [cosine_distance(prev3_vector, embedder([qr.text])[0]) for qr in fa.query_results]

In [19]:
# Sort ascending so the closest results under the alternative query come first.
sorted_distances = sorted(distance_to_expanded_query)
print(sorted_distances[:10])  # Print the 10 smallest distances

[np.float32(0.27636135), np.float32(0.28430128), np.float32(0.28854042), np.float32(0.28928882), np.float32(0.29416144), np.float32(0.29911715), np.float32(0.3079399), np.float32(0.3101337), np.float32(0.3131963), np.float32(0.3158363)]


In [10]:
# Print the text of every contribution fetched for each unfound DOI.
for doi, values in fa.target_results.items():
    for contribution in values:
        print(contribution["text"])

Discovery of an equal-mass 'twin' binary population reaching separations of 1000 + au.
Use of a homogeneous catalogue of 42,000 main-sequence wide binaries identified by Gaia to measure the mass ratio distribution, p(q), across a range of primary masses and separations.
Identification of a sharp excess of equal-mass 'twin' binaries statistically significant out to separations of 1000-10,000 au, depending on primary mass.
Observation that the excess is narrow, with a steep increase in p(q) at 0.95 ≲ q < 1, and no significant excess at q ≲ 0.95.
Confirmation through various tests that the signal is real and not a data artefact or selection effect.
Combination of Gaia constraints with those from close binaries to show that the twin excess decreases with increasing separation, but its width (q ≳ 0.95) is constant over a wide range of separations.
Discussion of the difficulty in explaining the wide twin population if the components of all wide binaries formed via core fragmentation, suggest

In [None]:
# Embedding of the last contribution fetched for the first unfound DOI, and
# its cosine distance to the query vector (shown as the cell result).
vector = fa.target_results[fa.unfound_dois[0]][-1]['embedding']
cosine_distance(vector, fa.query_vector)

In [None]:
# Embed a hand-written paraphrase and measure its distance to `vector`, which
# must already have been assigned by an earlier cell.
v1 = embedder(
    [
        "Population synthesis models adopt mass ratio distributions that are more equal-mass-biased or bottom-light than what is observed in wide binaries"
    ]
)[0]
cosine_distance(v1, vector)

In [None]:
# Full text of the seventh stored result (index 6) for the current `fa`.
print(fa.query_results[6].text)

In [None]:
# Passage containing the run-together token "metallicityindependent".
target_text_original = (
    "1979). Depending upon the presumed age for the disk and the star-formation rate, these effects can change "
    "the number of expected G dwarf stars at the present time relative to that predicted by models which assume "
    "metallicityindependent ages. In this paper we present calculations of the G dwarf metallicity distribution "
    "based upon several stellar mass-age relations both with and without dependences on metal and helium abundances. "
    "The results presented in § III indicate that the metallicity-dependence of the stellar ages could explain at "
    "least part of the G dwarf problem. II. CALCULATIONS We utilize a simple closed-box model for these calculations "
    "(Tinsley 1974; Audouze and Tinsley 1976; Pagel 1989). We also employ the well-known instantaneous recycling "
    "approximation (IRA) (Tinsley 1974) for the calculation of oxygen and metallicity histories of the disk. However, "
    "we must carefully account for main-sequence lifetimes when calculating the number of G dwarfs still in existence, "
    "since their lifetimes are comparable to any reasonable assumed age of the Galactic disk. This requires that we take "
    "into account the time of formation, the metallicity, and the helium abundance at the time of formation. The rate of "
    "formation of G dwarfs at time of formation tf is given by dN Cmh — = J c¡>{m)x¡j{tf)dm , (1) where m* and mh denote "
    "the low- and high-mass limits of G dwarfs, ~0.79 M0 and ~1.09 MG (Bowers and Deeming 1984), respectively."
)

# The "fixed" passage differs from the original in exactly one token, so it is
# derived from it rather than kept as a second 13-line literal. This makes the
# difference explicit and stops the two copies drifting apart.
target_text_fixed = target_text_original.replace(
    "metallicityindependent", "metallicity independent"
)

original_vector = embedder([target_text_original])[0]
fixed_vector = embedder([target_text_fixed])[0]
query_vector = embedder([fa.expanded_query])[0]
top_text_vector = embedder([fa.query_results[0].text])[0]

print("Cosine distance between original and fixed text:", cosine_distance(original_vector, fixed_vector))
print("Cosine distance between query and original text:", cosine_distance(query_vector, original_vector))
print("Cosine distance between query and fixed text:", cosine_distance(query_vector, fixed_vector))
print("Cosine distance between query and top text:", cosine_distance(query_vector, top_text_vector))

In [None]:
# pprint is imported here because no other visible cell imports it; without
# this the cell raises NameError on a fresh kernel.
from pprint import pprint

print("Expanded Query:")
pprint(fa.expanded_query)

print("First Query Result:")
print(f"Distance: {fa.query_results[0].distance:.6f}")
pprint(fa.query_results[0].text[:100])



In [None]:
# pprint is imported here because no other visible cell imports it; without
# this the cell raises NameError on a fresh kernel.
from pprint import pprint

print("Seventh Query Result:")
print(f"Distance: {fa.query_results[6].distance:.6f}")
pprint(fa.query_results[6].text[:100])


In [None]:
# Stored distance and a 60-character text preview for the first 40 results.
for result in fa.query_results[:40]:
    print(f"{result.distance:.6f}: {result.text[:60]}")


In [None]:
print(f"{'Dist':<8}Text")
# Read the comparisons that FailureAnalysis.__init__ stored on the instance.
# The loop variable is deliberately not called `results`, so the loaded
# results list is not overwritten by this cell.
for doi, comparisons in fa.comparison_to_targets.items():
    for res in comparisons:
        print(f"{res['distance']:.4f}: {res['text']}")

In [None]:
# Look at the query
# NOTE(review): `failed_records` is not defined in any visible cell; presumably
# it is a list of [query, query_results] pairs like `results` - confirm.
query = failed_records[0][0]
for key, value in query.items():
    print(f"{key}: {value}")

print("=" * 100)

# Show the stored result sitting at the query's first_rank. The index is
# first_rank - 1, so first_rank is treated as 1-based. first_rank can be None,
# in which case there is no such result to show.
first_rank = query['first_rank']
if first_rank is None:
    print("first_rank is None: no result to show for this query.")
else:
    found_record = failed_records[0][1][first_rank - 1]
    for key, value in found_record.items():
        print(f"{key}: {value}")

In [None]:
# Get the contributions from the database for the missing DOI
contributions_results = db.query(
    "SELECT text, embedding FROM contributions WHERE doi = '10.1038/nature08773'"
)
contributions = [result[0] for result in contributions_results]

# Embed the current query here so the cell does not depend on a
# `query_embedding` left over from some other cell.
# NOTE(review): assumes the comparison is against the expanded query, as in
# the other cells of this notebook - confirm.
query_embedding = embedder([query['expanded_query']])[0]
for text, c_embedding in contributions_results:
    print(text)
    print(f"Cosine distance to query: {cosine_distance(query_embedding, c_embedding):.4f}")
    print("=" * 100)

In [None]:
# Rebind `query` / `query_results` to the second failed record, then print the
# query metadata and the first stored result.
# NOTE(review): `failed_records` is not defined in any visible cell - confirm
# where it comes from.
query, query_results = failed_records[1]
for key, value in query.items():
    print(f"{key}: {value}")

print("=" * 100)
for key, value in query_results[0].items():
    print(f"{key}: {value}")

In [None]:
# Compare the current query against every contribution of its first cited DOI.
target_doi = query['citation_dois'][0]
contributions_results = db.query(
    f"SELECT text, embedding FROM contributions WHERE doi = '{target_doi}'"
)
# Re-embed the current query: a `query_embedding` left over from another cell
# would belong to a different query and silently give wrong distances.
# NOTE(review): assumes the expanded query is the text to compare - confirm.
query_embedding = embedder([query['expanded_query']])[0]
for text, c_embedding in contributions_results:
    print(text)
    print(f"\tCosine distance to query: {cosine_distance(query_embedding, c_embedding):.4f}")

In [None]:
# If the data wasn't missing, would it have done any better?
# The query is assembled from adjacent string literals inside parentheses.
test_query = (
    "In Table 2 , that is a slightly modified version of Table 1 of [REF], "
    "determinations of 12CO/13CO, 12CO/C180, and 13CO/C18O in a number of local and high-redshift objects "
    "(see Sect. 3.2.1 ) are reported"
)

In [None]:
# Search with the embedding of `test_query`.
# The embedder is called as embedder([text])[0] everywhere else in this
# notebook; the original `embedder([test_query][0])` had the index inside the
# call, so it passed a bare string and never took the first vector.
# NOTE: this rebinds `results` to the search output; the following cells read it.
results = db.vector_search(
    query_vector=embedder([test_query])[0],
    target_table="contributions",
    target_column="embedding",
    metric="vector_cosine_ops",
    pubdate=query['pubdate'],
    top_k=1000
)

In [None]:
# Rows of the search output whose DOI equals `target_doi`, printed without
# column-width truncation.
rows_for_target = results[results['doi'] == target_doi]
with pd.option_context('display.max_colwidth', None):
    print(rows_for_target)

In [None]:
# Print the row at positional index 44 of the search output, untruncated.
# NOTE(review): 44 is a magic index; presumably the position of the target
# row found in the previous cell - confirm.
with pd.option_context('display.max_colwidth', None):
    print(results.iloc[44])

In [None]:
def inspect(failed_example):
    """
    Print a summary of one ``[query, query_results]`` record.

    Prints every key/value of the query, every key/value of the first result
    (or a placeholder when there are no results), the 1-based rank at which
    each cited DOI first appears, and finally the cited DOIs never seen.

    Args:
        failed_example: A two-item sequence ``(query, query_results)`` where
            `query` is a dict containing ``'citation_dois'`` and
            `query_results` is a list of dicts each containing ``'doi'``.

    Note: the name shadows the standard-library ``inspect`` module inside the
    notebook namespace.
    """
    query, query_results = failed_example
    print("Query:")
    for key, value in query.items():
        print(f"{key}: {value}")

    print("\nFirst Result:")
    if query_results:
        for key, value in query_results[0].items():
            print(f"{key}: {value}")
    else:
        # An empty result list used to raise IndexError here.
        print("(no results)")
    print("=" * 50)

    unfound_target_dois = set(query['citation_dois'])
    for rank, result in enumerate(query_results, start=1):
        if result['doi'] in unfound_target_dois:
            # Remove on first sighting so only the best rank per DOI is reported.
            unfound_target_dois.remove(result['doi'])
            print(f"Found DOI {result['doi']} at rank {rank}")
    print(f"Unfound DOIs: {unfound_target_dois}")


In [None]:
# Summarise the third failed record.
# NOTE(review): `failed_records` is not defined in any visible cell - confirm.
inspect(failed_records[2])

In [None]:
def fixed_query(query, target_doi, pubdate):
    """
    Run a vector search for `query` and print the rows that hit `target_doi`.

    The query text is embedded with the module-level `embedder`, searched
    against the `embedding` column of the `contributions` table through the
    module-level `db` (cosine ops, top 1000, with `pubdate` passed through to
    the search), and the rows whose ``doi`` equals `target_doi` are printed
    without column truncation.

    Returns:
        The full search output as returned by ``db.vector_search``.
    """
    search_options = {
        "query_vector": embedder([query])[0],
        "target_table": "contributions",
        "target_column": "embedding",
        "metric": "vector_cosine_ops",
        "pubdate": pubdate,
        "top_k": 1000,
    }
    hits = db.vector_search(**search_options)

    is_target = hits['doi'] == target_doi
    with pd.option_context('display.max_colwidth', None):
        print(hits[is_target])

    return hits

In [None]:
# Raw string: "\s" is not a valid escape sequence, so the non-raw literal
# triggers an invalid-escape warning on recent Python versions. The string
# value itself is unchanged (backslash followed by "simeq").
fixed_query_str = r"K_0 \simeq 100 is a reasonable assumption in the star-forming regions of the disk ([REF] )"
target_doi = "10.1093/mnras/182.3.443"
pubdate = "2015-12-01"
results = fixed_query(fixed_query_str, target_doi, pubdate)

In [None]:
# Distance from the hand-fixed query string to every contribution stored for
# `target_doi`.
contributions_results = db.query(
    f"SELECT text, embedding FROM contributions WHERE doi = '{target_doi}'"
)
fixed_embedding = embedder([fixed_query_str])[0]
for text, c_embedding in contributions_results:
    print(text)
    print(f"\tCosine distance to query: {cosine_distance(fixed_embedding, c_embedding):.4f}")

In [None]:
# First row (positional index 0) of the most recent search output.
results.iloc[0]

In [None]:
# Summarise the fourth failed record.
# NOTE(review): `failed_records` is not defined in any visible cell - confirm.
inspect(failed_records[3])

In [None]:
# Distance from the fourth failed record's expanded query to every
# contribution stored for the hard-coded target DOI.
target_doi = "10.1093/mnras/258.1.41P"
contributions_results = db.query(f"SELECT text, embedding FROM contributions WHERE doi = '{target_doi}'")

# This is the only visible cell that assigns `query_embedding`.
query_embedding = embedder([failed_records[3][0]['expanded_query']])[0]
for text, c_embedding in contributions_results:
    print(text)
    print(f"\tCosine distance to query: {cosine_distance(query_embedding, c_embedding):.4f}")