In [1]:
import json
import os
from pprint import pprint
from tqdm import tqdm

PATH_TO_DATA = 'data/processed_for_chroma/reviews'
FILENAMES = os.listdir(PATH_TO_DATA)
data = dict()

for filename in FILENAMES:
    with open(f'{PATH_TO_DATA}/{filename}', 'r') as file:
        data[os.path.splitext(os.path.basename(filename))[0]] = json.load(file)

# This one is missing the doi key
del data['Earth_Science_Reviews']['metadatas'][292]
del data['Earth_Science_Reviews']['documents'][292]
del data['Earth_Science_Reviews']['ids'][292]

# postprocessing
for journal in data:
    for i, d in enumerate(data[journal]['metadatas']):
        # Convert stringified list to list
        d['reference'] = json.loads(d['reference'])
        d['doi'] = json.loads(d['doi'])

print(data.keys())
print(f"Journal keys: {data['Astro_Reviews'].keys()}")
paper = data['Astro_Reviews']['metadatas'][0]
print(paper.keys())
print(paper['doi'])
print(type(paper['doi']))
print(paper['reference'][:3])

dict_keys(['Earth_Science_Reviews', 'Planetary_Reviews', 'Astro_Reviews'])
Journal keys: dict_keys(['documents', 'metadatas', 'ids'])
dict_keys(['bibcode', 'abstract', 'aff', 'author', 'bibstem', 'doctype', 'doi', 'id', 'keyword', 'pubdate', 'title', 'read_count', 'reference', 'citation_count', 'citation', 'body'])
['10.1146/annurev.astro.46.060407.145222', '10.48550/arXiv.0909.0948']
<class 'list'>
['1929ApJ....70...11R', '1956RvMP...28...53S', '1958ZA.....46..108B']


In [2]:
for journal in data:
    for i, d in enumerate(data[journal]['metadatas']):
        if not 'doi' in d:
            print(f"missing doi in {journal} {i}")

In [3]:
# How many bibcodes are there in this paper's references that start with 1929 and end with 'R'?
import re
pattern = r'^2000.*B$'
matches = [s for s in paper['reference'] if re.match(pattern, s)]
print(matches)

['2000A&A...363.1091B', '2000A&AS..142..467B', '2000MNRAS.311..535B', '2000MNRAS.312..116B']


In [4]:
import re

# Define the patterns
lastname = r"[A-Z][a-zA-ZÀ-ÖØ-öø-ÿ-]*(?:'[A-Z][a-zA-ZÀ-ÖØ-öø-ÿ-]*)?"
year = r"\(?(\d{4}[a-z]?)\)?"
name_sep = r",?\s"
INLINE_CITATION_PATTERN = fr"({lastname}(?:{name_sep}{lastname})*(?: et al.?)?)\s*{year}"

# Compile the regex pattern
inline_regex = re.compile(INLINE_CITATION_PATTERN)

print(inline_regex)

test = " Delbouille et al. 1981 the future (Section 5). 2. INGREDIENTS FOR SOLAR ABUNDANCE ANALYSIS 2.1. Observations Analyses of th"

# Find all matches using the compiled pattern
matches = inline_regex.finditer(test)
results = [match for match in matches]
print(f"Results: {len(results)}")

# Print the groups of each match
for i, result in enumerate(results):
    print(i+1, result.groups())

re.compile("([A-Z][a-zA-ZÀ-ÖØ-öø-ÿ-]*(?:'[A-Z][a-zA-ZÀ-ÖØ-öø-ÿ-]*)?(?:,?\\s[A-Z][a-zA-ZÀ-ÖØ-öø-ÿ-]*(?:'[A-Z][a-zA-ZÀ-ÖØ-öø-ÿ-]*)?)*(?: et al.?)?)\\s*\\(?(\\d{4}[a-z]?)\\)?")
Results: 1
1 ('Delbouille et al.', '1981')


### Get the inline citations from a paper's body

In [5]:
def get_inline_citations(record: dict) -> list[tuple[str, str]]:
    return [match.groups() for match in inline_regex.finditer(record['body'])]

# inline_citations = [match.groups() for match in inline_regex.finditer(paper['body'])]
inline_citations = get_inline_citations(paper)
print(f"Results: {len(inline_citations)}")
pprint(inline_citations[:5])

Results: 424
[('Russell', '1929'),
 ('Suess Urey', '1956'),
 ('Goldberg, Müller Aller', '1960'),
 ('Anders Grevesse', '1989'),
 ('Grevesse Sauval', '1998')]


### Functions to resolve incline citation to bibcode

In [6]:
# for each result, get the first letter of the first author's last name
# get the year
def bibcode_regex(author: str, year: str):
    """
    Given first author and year, return a regex pattern for the
    corresponding bibcode
    """
    initial = author[0]
    year = year[:4] # cut off any letters at the end
    pattern = fr'^{year}.*{initial}$'
    return re.compile(pattern)

def bibcode_matches(inline_citation: tuple[str, str], references: list[str]) -> int:
    """
    Given an inline citation and a list of references, return the number of
    references that match the inline citation's bibcode regex pattern
    """
    pattern = bibcode_regex(*inline_citation)
    return [s for s in references if pattern.match(s)]

def make_citation_bibcode_list(inline_citations: list[tuple[str, str]], references: list[str]) -> list[tuple[tuple[str, str], str]]:
    """
    Given a paper's list of inline citations and list of references, return a list of
    tuples where the first element is the inline citation and the second element
    is the corresponding bibcode from the references list where there is exactly one match
    """
    return [(citation, matches[0]) for citation in inline_citations 
            if len((matches := bibcode_matches(citation, references))) == 1]

usable_citations = make_citation_bibcode_list(inline_citations, paper['reference'])
print(f"Results: {len(usable_citations)}")
print(usable_citations[:5])

Results: 206
[(('Russell', '1929'), '1929ApJ....70...11R'), (('Suess Urey', '1956'), '1956RvMP...28...53S'), (('Goldberg, Müller Aller', '1960'), '1960ApJS....5....1G'), (('Anders Grevesse', '1989'), '1989GeCoA..53..197A'), (('Grevesse Sauval', '1998'), '1998SSRv...85..161G')]


In [7]:
def get_all_bibcodes_from_file(path: str) -> list[str]:
    with open(path, 'r') as file:
        return json.load(file)

bibcodes = get_all_bibcodes_from_file('data/bibcodes.json')
print(f"Results: {len(bibcodes)}")
print(bibcodes[:5])

Results: 5944
['1975E&PSL..26..207S', '1983E&PSL..64..295W', '1997E&PSL.148..243B', '1978E&PSL..40...25M', '1988E&PSL..90..297H']


In [8]:
def resolve_inline_references(records, bibcodes):
    in_dataset, out_of_dataset = [], []
    for record in tqdm(records, desc='Processing records'):
        usable_citations = make_citation_bibcode_list(get_inline_citations(record), record['reference'])
        for citation in usable_citations:
            # Construct the citation dictionary
            inline_citation, bibcode = citation
            cite_dict = {'source_bibcode': record['bibcode'],
                         'inline_citation': inline_citation, 
                         'reference_bibcode': bibcode}

            # Determine if the referenced bibcode is in the dataset or not
            in_dataset.append(cite_dict) if bibcode in bibcodes else out_of_dataset.append(cite_dict)
    return in_dataset, out_of_dataset

have, dont_have = resolve_inline_references(data['Astro_Reviews']['metadatas'][:1], bibcodes)
print(f"In dataset: {len(have)}")
print(f"Out of dataset: {len(dont_have)}")
pprint(dont_have[:2])

Processing records: 100%|██████████| 1/1 [00:00<00:00, 41.10it/s]

In dataset: 12
Out of dataset: 194
[{'inline_citation': ('Russell', '1929'),
  'reference_bibcode': '1929ApJ....70...11R',
  'source_bibcode': '2009ARA&A..47..481A'},
 {'inline_citation': ('Suess Urey', '1956'),
  'reference_bibcode': '1956RvMP...28...53S',
  'source_bibcode': '2009ARA&A..47..481A'}]





In [9]:
# Get all the review papers and get a list of their unresolved references

all_reviews = [record for journal in data for record in data[journal]['metadatas']]
print(len(all_reviews))
print(all_reviews[0].keys())

have, dont_have = resolve_inline_references(all_reviews, bibcodes)
bibcodes_out_of_dataset = set(
    [cite['reference_bibcode'] for cite in dont_have])
bibcodes_in_dataset = set([cite['reference_bibcode'] for cite in have])
print(f"Unique in dataset bibcodes: {len(bibcodes_in_dataset)}")
print(f"Unique out of dataset bibcodes: {len(bibcodes_out_of_dataset)}")

2984
dict_keys(['bibcode', 'abstract', 'aff', 'author', 'bibstem', 'doctype', 'doi', 'id', 'keyword', 'pubdate', 'title', 'read_count', 'reference', 'citation_count', 'citation', 'body'])


Processing records: 100%|██████████| 2984/2984 [00:33<00:00, 90.23it/s] 

Unique in dataset bibcodes: 2048
Unique out of dataset bibcodes: 73055





In [10]:
import chromadb

client = chromadb.PersistentClient(path='./vector_stores/foo/')
client.list_collections()

[Collection(name=test-bert-base-uncased__cosine__no_augmentation),
 Collection(name=test-bge-small-en__cosine__no_augmentation),
 Collection(name=test-NV-Embed-v2__cosine__no_augmentation)]

In [11]:
collection = client.get_collection(
    'test-bge-small-en__cosine__no_augmentation')
print(collection)

Collection(name=test-bge-small-en__cosine__no_augmentation)


In [12]:
collection.get(limit=2, where={
               'doi': '10.1146/annurev.astro.46.060407.145222'})

{'ids': ['417a2e07-3a13-4ce2-aa10-2028f54d32de',
  '3523f5c9-865e-49b0-8a84-2555766c1aff'],
 'embeddings': None,
 'documents': ['1. INTRODUCTION The solar chemical composition is a fundamental yardstick in astronomy, to which the elemental abundances of essentially all cosmic objects, be they planets, stars, nebulae or galaxies, are anchored. ',
  'The importance of having accurate solar elemental abundances thus can not be overstated. '],
 'uris': None,
 'data': None,
 'metadatas': [{'doi': '10.1146/annurev.astro.46.060407.145222'},
  {'doi': '10.1146/annurev.astro.46.060407.145222'}],
 'included': [<IncludeEnum.documents: 'documents'>,
  <IncludeEnum.metadatas: 'metadatas'>]}

In [14]:
import pysbd


def get_inline_citations(text: str) -> list[tuple[str, str]]:
    return [match.groups() for match in inline_regex.finditer(text)]

def make_evaluation_samples(record):
    # Split the body into sentences
    splitter = pysbd.Segmenter(language="en", clean=False)
    sentences = splitter.segment(record['body'])
    examples = []
    for sentence in sentences:
        usable_sentence = True
        sentence_bibcodes = []
        inline_citations = get_inline_citations(sentence)
        for citation in inline_citations:
            bibcodes = bibcode_matches(citation, record['reference'])
            if len(bibcodes) != 1:
                usable_sentence = False
                break
            # Exactly one bibcode
            sentence_bibcodes.append(bibcodes[0])
        if usable_sentence:
            examples.append((record['doi'], sentence, sentence_bibcodes))


    # for each sentence
    # get the inline citations
    # determine if all the inline citations are resolvable to references
    # remove the sentence if not, keep it if yes

    # for kept sentences, put the bibcodes corresponding to the inline citations into a list
    # create a version of the sentence with inline citations removed
    # doc's (doi, sentence) becomes an input and its label is [bibcodes]

sents = make_evaluation_samples(paper)
for sent in sents:
    print(sent)
    print(get_inline_citations(sent))


TypeError: 'NoneType' object is not iterable

In [25]:
def inline_to_doi_through_bibcode(record, sentence):
    def citation_to_doi(citation):
        print(f"Working on citation {citation}")
        bibcodes = bibcode_matches(citation, record['reference'])
        if len(bibcodes) != 1:
            print(f"Doesn't have exactly 1 bibcode in references")
            return None
        matching_dois = [record['doi'][0] for record in all_research if record['bibcode'] == bibcodes[0]]
        print(f"Matching dois: {matching_dois}")
        if len(matching_dois) != 1:
            return None
        return matching_dois[0]
    

    inline_citations = get_inline_citations(sentence)
    citation_dois = []
    for citation in inline_citations:
        if doi := citation_to_doi(citation):
            citation_dois.append(doi)
        else:
            print(f"Failed to find unique doi for {citation}")
            break
    if len(inline_citations) == len(citation_dois):
        return (record['doi'][0], sentence, citation_dois)
    return None
        

In [18]:
PATH_TO_DATA = 'data/processed_for_chroma/research'
FILENAMES = os.listdir(PATH_TO_DATA)
data = dict()

for filename in FILENAMES:
    with open(f'{PATH_TO_DATA}/{filename}', 'r') as file:
        data[os.path.splitext(os.path.basename(filename))[0]] = json.load(file)


# postprocessing
for journal in data:
    for i, d in enumerate(data[journal]['metadatas']):
        if not 'doi' in d:
            del data[journal]['metadatas'][i]
            del data[journal]['documents'][i]
            del data[journal]['ids'][i]

for journal in data:
    for d in data[journal]['metadatas']:

        # Convert stringified list to list
        d['reference'] = json.loads(d['reference'])
        d['doi'] = json.loads(d['doi'])

print(data.keys())
data['Astro_Research'].keys()

all_research = all_reviews + [record for journal in data for record in data[journal]['metadatas']]
print(f"All research: {len(all_research)}")

dict_keys(['Earth_Science_Research', 'Planetary_Research', 'Astro_Research'])
All research: 5929


In [19]:
sentences = pysbd.Segmenter(language="en", clean=False).segment(paper['body'])

In [31]:
examples = [
    example for sentence in sentences if (example := inline_to_doi_through_bibcode(paper, sentence))    
]

Working on citation ('Russell', '1929')
Matching dois: []
Failed to find unique doi for ('Russell', '1929')
Working on citation ('Suess Urey', '1956')
Matching dois: []
Failed to find unique doi for ('Suess Urey', '1956')
Working on citation ('Grevesse Sauval', '1998')
Matching dois: []
Failed to find unique doi for ('Grevesse Sauval', '1998')
Working on citation ('Delbouille, Roland Neven', '1973')
Matching dois: []
Failed to find unique doi for ('Delbouille, Roland Neven', '1973')
Working on citation ('Delbouille et al.', '1981')
Doesn't have exactly 1 bibcode in references
Failed to find unique doi for ('Delbouille et al.', '1981')
Working on citation ('Kurucz', '2006')
Matching dois: []
Failed to find unique doi for ('Kurucz', '2006')
Working on citation ('Kurucz', '1992')
Matching dois: []
Failed to find unique doi for ('Kurucz', '1992')
Working on citation ("Anstee O'Mara", '1995')
Matching dois: []
Failed to find unique doi for ("Anstee O'Mara", '1995')
Working on citation ('Bad

In [33]:
for example in examples:
    if example[2]:
        print(example)
        print()

('10.1146/annurev.astro.46.060407.145222', 'The reader is referred to Nordlund, Stein Asplund (2009) for a recent review of the physics of solar surface convection and the numerical details of its simulation. ', ['10.12942/lrsp-2009-2'])

('10.1146/annurev.astro.46.060407.145222', 'Current generations of 3D models are very successful in reproducing the observed solar granulation topology, typical length- and timescales, convective velocities, and intensity brightness contrast (e.g., Nordlund, Stein Asplund 2009 ), clearly none of which 1D models are able to predict. ', ['10.12942/lrsp-2009-2'])

('10.1146/annurev.astro.46.060407.145222', 'Such line broadening can thus be explained as the result of the Doppler shifts arising from the convective motions with a smaller contribution from the solar oscillations (e.g., Nordlund, Stein Asplund 2009 , and references therein). ', ['10.12942/lrsp-2009-2'])

('10.1146/annurev.astro.46.060407.145222', 'Finally, from Table 4 it is clear that the su

In [52]:
inline_to_doi_through_bibcode(paper, sentences[112])

('["10.1146/annurev.astro.46.060407.145222", "10.48550/arXiv.0909.0948"]',
 'The wings of the H i Balmer lines are also sensitive tracers of the temperature stratification in the deeper layers of the photosphere. ',
 [])

In [53]:
type(paper['doi'])

str