1. Load a review dataset into memory
1. For each paper, parse the body into sentences
1. Iterate over sentences:
    1. Extract any inline citations
    1. Check whether the inline citation resolves to exactly one bibcode in the paper's reference list
        1. If it does, add the (doc, sentence, [bibcodes]) sample to the dataset
        1. Otherwise, skip it

In [38]:
import json
import os
from pprint import pprint
from tqdm import tqdm

# Directory the dataset JSON files are read from.
PATH_TO_DATA = 'data/processed_for_chroma/reviews'
# Names of the entries in that directory, listed once when this cell runs.
FILENAMES = os.listdir(PATH_TO_DATA)
# Placeholder only; rebound to the loaded records further down.
data = {}

def load_dataset(filename):
    """
    Load one dataset file from PATH_TO_DATA and return its records.

    The file must be JSON with a top-level 'metadatas' collection. Each
    record's 'reference' and 'doi' fields hold JSON-encoded strings; they are
    decoded in place so callers receive the parsed values.

    Args:
        filename: name of the file inside PATH_TO_DATA.

    Returns:
        The object stored under 'metadatas', with every record's
        'reference' and 'doi' fields decoded.

    Raises:
        KeyError: if 'metadatas', 'reference' or 'doi' is missing.
        json.JSONDecodeError: if the file or one of the encoded fields is
            not valid JSON.
    """
    # Explicit encoding: open() otherwise falls back to a platform-dependent
    # default, which can mis-decode the non-ASCII characters in the text.
    with open(f'{PATH_TO_DATA}/{filename}', 'r', encoding='utf-8') as file:
        dataset = json.load(file)

    records = dataset['metadatas']
    for record in records:
        record['reference'] = json.loads(record['reference'])
        record['doi'] = json.loads(record['doi'])

    return records

# Load the records from Astro_Reviews.json; this rebinds `data` to them.
data = load_dataset('Astro_Reviews.json')
# Show which fields the first record carries.
print(data[0].keys())


dict_keys(['bibcode', 'abstract', 'aff', 'author', 'bibstem', 'doctype', 'doi', 'id', 'keyword', 'pubdate', 'title', 'read_count', 'reference', 'citation_count', 'citation', 'body'])


In [39]:
import pysbd

# Sentence segmenter for English text, reused by the cells below.
seg = pysbd.Segmenter(language="en", clean=False)


In [40]:
# Test the segmentation on the first record's body: report how many
# sentences were produced and show the first two.
sentences = seg.segment(data[0]['body'])
print(f"Length: {len(sentences)}")
print(f"Examples: {sentences[:2]}")

Length: 699
Examples: ['1. INTRODUCTION The solar chemical composition is a fundamental yardstick in astronomy, to which the elemental abundances of essentially all cosmic objects, be they planets, stars, nebulae or galaxies, are anchored. ', 'The importance of having accurate solar elemental abundances thus can not be overstated. ']


In [41]:
import re

# Define the patterns
# A surname: one capitalised word (ASCII or Latin-1 accented letters and
# hyphens), optionally followed by an apostrophe part, e.g. O'Brien.
lastname = r"[A-Z][a-zA-ZÀ-ÖØ-öø-ÿ-]*(?:'[A-Z][a-zA-ZÀ-ÖØ-öø-ÿ-]*)?"
# Four digits plus an optional lowercase letter (e.g. 2009a), with or
# without surrounding parentheses; only the year text itself is captured.
year = r"\(?(\d{4}[a-z]?)\)?"
# Surnames are separated by one whitespace character, optionally after a comma.
name_sep = r",?\s"
# Group 1 is the author list (optionally ending in " et al" or " et al.");
# group 2 is the year. The period after "et al" is escaped so that it only
# matches a literal "." instead of swallowing whatever character follows.
INLINE_CITATION_PATTERN = fr"({lastname}(?:{name_sep}{lastname})*(?: et al\.?)?)\s*{year}"

# Compile the regex pattern
inline_regex = re.compile(INLINE_CITATION_PATTERN)


def get_inline_citations(text: str) -> list[tuple[str, str]]:
    """
    Collect the captured groups of every inline-citation match in `text`,
    in the order the matches occur.
    """
    citations = []
    for hit in inline_regex.finditer(text):
        citations.append(hit.groups())
    return citations

In [42]:
# Sanity-check the citation extraction on the first three sentences.
for sentence in sentences[:3]:
    matches = get_inline_citations(sentence)
    print(f"Sentence: {sentence}")
    print(f"Matches: {matches}")

Sentence: 1. INTRODUCTION The solar chemical composition is a fundamental yardstick in astronomy, to which the elemental abundances of essentially all cosmic objects, be they planets, stars, nebulae or galaxies, are anchored. 
Matches: []
Sentence: The importance of having accurate solar elemental abundances thus can not be overstated. 
Matches: []
Sentence: From the pioneering efforts of Russell (1929) , Suess Urey (1956) , and Goldberg, Müller Aller (1960) to the more recent works of Anders Grevesse (1989) , Grevesse Sauval (1998) , Lodders (2003) , Asplund, Grevesse Sauval (2005) , Grevesse, Asplund Sauval (2007) , and Lodders, Palme Gail (2009) , compilations of the Solar System abundances have found extremely wide-ranging use in astronomy and cosmology. 
Matches: [('Russell', '1929'), ('Suess Urey', '1956'), ('Goldberg, Müller Aller', '1960'), ('Anders Grevesse', '1989'), ('Grevesse Sauval', '1998'), ('Lodders', '2003'), ('Asplund, Grevesse Sauval', '2005'), ('Grevesse, Asplund Sa

In [43]:
def bibcode_regex(author: str, year: str) -> re.Pattern[str]:
    """
    Given first author and year, return a compiled regex pattern for the
    corresponding bibcode.

    The pattern accepts any string that starts with the first four
    characters of `year` and ends with the first character of `author`.

    Args:
        author: author text of the citation; only its first character is used.
        year: citation year, optionally followed by a letter (e.g. '2009a').

    Returns:
        A compiled pattern anchored at both ends.

    Raises:
        IndexError: if `author` is empty.
    """
    # Escape both pieces so they are always matched literally; an unescaped
    # metacharacter would either break compilation or act as a wildcard.
    initial = re.escape(author[0])
    year = re.escape(year[:4])  # cut off any letters at the end
    pattern = fr'^{year}.*{initial}$'
    return re.compile(pattern)


def bibcode_matches(inline_citation: tuple[str, str], references: list[str]) -> list[str]:
    """
    Given an inline citation and a list of references, return the
    references that match the inline citation's bibcode regex pattern.

    Args:
        inline_citation: (author, year) pair describing the citation.
        references: candidate reference strings to test.

    Returns:
        The matching references, in their original order; empty if none match.
    """
    pattern = bibcode_regex(*inline_citation)
    return [ref for ref in references if pattern.match(ref)]


def make_citation_bibcode_list(inline_citations: list[tuple[str, str]], references: list[str]) -> list[tuple[tuple[str, str], str]]:
    """
    Pair each inline citation with its reference when the match is unambiguous.

    A citation is kept only if exactly one entry of `references` matches it;
    citations with zero or several matches are dropped. The result is a list
    of (citation, matching reference) tuples in citation order.
    """
    resolved = []
    for citation in inline_citations:
        candidates = bibcode_matches(citation, references)
        if len(candidates) == 1:
            resolved.append((citation, candidates[0]))
    return resolved

def get_bibcodes_from_inline(text: str, references: list[str]) -> list[str]:
    """
    Return the references cited inline in `text` that can be resolved
    unambiguously.

    Every inline citation found in `text` is checked against `references`;
    only citations with exactly one matching reference contribute, and the
    result keeps the order in which the citations appear.

    Args:
        text: text to scan for inline citations.
        references: candidate reference strings for the paper.

    Returns:
        The matched reference for each unambiguous citation (may be empty).
    """
    # Reuse the shared resolver rather than repeating the exactly-one-match
    # rule here, then drop the citation half of each pair.
    resolved = make_citation_bibcode_list(get_inline_citations(text), references)
    return [bibcode for _citation, bibcode in resolved]

In [50]:
def make_samples_from_record(record: dict, segmenter: pysbd.Segmenter):
    """
    Turn one record into a list of samples, one per citing sentence.

    The record's 'body' is split into sentences with `segmenter`. A sentence
    yields a sample only when at least one of its inline citations resolves
    against the record's 'reference' list. Each sample is a dict with the
    record's first 'doi' entry, the sentence text, and the resolved bibcodes.
    """
    return [
        {'doi': record['doi'][0], 'sentence': sentence, 'bibcodes': found}
        for sentence in segmenter.segment(record['body'])
        if (found := get_bibcodes_from_inline(sentence, record['reference']))
    ]


# Try the sample builder on the first record and print each sample with its index.
samples = make_samples_from_record(data[0], segmenter=seg)
for i, sample in enumerate(samples):
    print(i, sample)

0 {'doi': '10.1146/annurev.astro.46.060407.145222', 'sentence': 'From the pioneering efforts of Russell (1929) , Suess Urey (1956) , and Goldberg, Müller Aller (1960) to the more recent works of Anders Grevesse (1989) , Grevesse Sauval (1998) , Lodders (2003) , Asplund, Grevesse Sauval (2005) , Grevesse, Asplund Sauval (2007) , and Lodders, Palme Gail (2009) , compilations of the Solar System abundances have found extremely wide-ranging use in astronomy and cosmology. ', 'bibcodes': ['1929ApJ....70...11R', '1956RvMP...28...53S', '1960ApJS....5....1G', '1989GeCoA..53..197A', '1998SSRv...85..161G', '2003ApJ...591.1220L']}
1 {'doi': '10.1146/annurev.astro.46.060407.145222', 'sentence': 'Prior knowledge of the solar photospheric Si abundance is therefore required in order to place the meteoritic abundances on the same absolute scale as the Sun ( Suess Urey 1956 ). ', 'bibcodes': ['1956RvMP...28...53S']}
2 {'doi': '10.1146/annurev.astro.46.060407.145222', 'sentence': 'In the decade since th

In [49]:
# Print every segmented sentence of the first record's body with its index,
# so the segmentation can be inspected by eye.
for i, sentence in enumerate(seg.segment(data[0]['body'])):
    print(i, sentence)

0 1. INTRODUCTION The solar chemical composition is a fundamental yardstick in astronomy, to which the elemental abundances of essentially all cosmic objects, be they planets, stars, nebulae or galaxies, are anchored. 
1 The importance of having accurate solar elemental abundances thus can not be overstated. 
2 From the pioneering efforts of Russell (1929) , Suess Urey (1956) , and Goldberg, Müller Aller (1960) to the more recent works of Anders Grevesse (1989) , Grevesse Sauval (1998) , Lodders (2003) , Asplund, Grevesse Sauval (2005) , Grevesse, Asplund Sauval (2007) , and Lodders, Palme Gail (2009) , compilations of the Solar System abundances have found extremely wide-ranging use in astronomy and cosmology. 
3 There are two independent and complementary ways of determining the Solar System abundances, each with its pros and cons. 
4 Through mass spectroscopy of meteorites in terrestrial laboratories, it is possible to directly measure the abundance of almost every element and isoto