In [50]:
import json
import os
from pprint import pprint
from tqdm import tqdm

PATH_TO_DATA = 'data/processed_for_chroma/reviews'
FILENAMES = os.listdir(PATH_TO_DATA)
review_data = dict()

for filename in FILENAMES:
    with open(f'{PATH_TO_DATA}/{filename}', 'r') as file:
        review_data[os.path.splitext(os.path.basename(filename))[0]] = json.load(file)

# This one is missing the doi key
del review_data['Earth_Science_Reviews']['metadatas'][292]
del review_data['Earth_Science_Reviews']['documents'][292]
del review_data['Earth_Science_Reviews']['ids'][292]

# postprocessing
for journal in review_data:
    for i, d in enumerate(review_data[journal]['metadatas']):
        # Convert stringified list to list
        d['reference'] = json.loads(d['reference'])
        d['doi'] = json.loads(d['doi'])

print(review_data.keys())
print(f"Journal keys: {review_data['Astro_Reviews'].keys()}")
paper = review_data['Astro_Reviews']['metadatas'][0]
print(paper.keys())
print(paper['doi'])
print(type(paper['doi']))
print(paper['reference'][:3])
all_reviews = [
    record for journal in review_data for record in review_data[journal]['metadatas']]
print(len(all_reviews))
print(all_reviews[0].keys())

dict_keys(['Earth_Science_Reviews', 'Planetary_Reviews', 'Astro_Reviews'])
Journal keys: dict_keys(['documents', 'metadatas', 'ids'])
dict_keys(['bibcode', 'abstract', 'aff', 'author', 'bibstem', 'doctype', 'doi', 'id', 'keyword', 'pubdate', 'title', 'read_count', 'reference', 'citation_count', 'citation', 'body'])
['10.1146/annurev.astro.46.060407.145222', '10.48550/arXiv.0909.0948']
<class 'list'>
['1929ApJ....70...11R', '1956RvMP...28...53S', '1958ZA.....46..108B']


In [None]:
PATH_TO_DATA = 'data/processed_for_chroma/research'
FILENAMES = os.listdir(PATH_TO_DATA)
research_data = dict()

for filename in FILENAMES:
    with open(f'{PATH_TO_DATA}/{filename}', 'r') as file:
        research_data[os.path.splitext(os.path.basename(filename))[
            0]] = json.load(file)


# postprocessing
for journal in research_data:
    for i, d in enumerate(research_data[journal]['metadatas']):
        if not 'doi' in d:
            del research_data[journal]['metadatas'][i]
            del research_data[journal]['documents'][i]
            del research_data[journal]['ids'][i]

for journal in research_data:
    for d in research_data[journal]['metadatas']:

        # Convert stringified list to list
        d['reference'] = json.loads(d['reference'])
        d['doi'] = json.loads(d['doi'])

print(research_data.keys())
research_data['Astro_Research'].keys()

all_data = all_reviews + \
    [record for journal in review_data for record in review_data[journal]['metadatas']]
print(f"All data: {len(all_data)}")

In [3]:
# How many bibcodes are there in this paper's references that start with 1929 and end with 'R'?
import re
pattern = r'^2000.*B$'
matches = [s for s in paper['reference'] if re.match(pattern, s)]
print(matches)

['2000A&A...363.1091B', '2000A&AS..142..467B', '2000MNRAS.311..535B', '2000MNRAS.312..116B']


In [4]:

# Define the patterns
lastname = r"[A-Z][a-zA-ZÀ-ÖØ-öø-ÿ-]*(?:'[A-Z][a-zA-ZÀ-ÖØ-öø-ÿ-]*)?"
year = r"\(?(\d{4}[a-z]?)\)?"
name_sep = r",?\s"
INLINE_CITATION_PATTERN = fr"({lastname}(?:{name_sep}{lastname})*(?: et al.?)?)\s*{year}"

# Compile the regex pattern
inline_regex = re.compile(INLINE_CITATION_PATTERN)

print(inline_regex)

test = " Delbouille et al. 1981 the future (Section 5). 2. INGREDIENTS FOR SOLAR ABUNDANCE ANALYSIS 2.1. Observations Analyses of th"

# Find all matches using the compiled pattern
matches = inline_regex.finditer(test)
results = [match for match in matches]
print(f"Results: {len(results)}")

# Print the groups of each match
for i, result in enumerate(results):
    print(i+1, result.groups())

re.compile("([A-Z][a-zA-ZÀ-ÖØ-öø-ÿ-]*(?:'[A-Z][a-zA-ZÀ-ÖØ-öø-ÿ-]*)?(?:,?\\s[A-Z][a-zA-ZÀ-ÖØ-öø-ÿ-]*(?:'[A-Z][a-zA-ZÀ-ÖØ-öø-ÿ-]*)?)*(?: et al.?)?)\\s*\\(?(\\d{4}[a-z]?)\\)?")
Results: 1
1 ('Delbouille et al.', '1981')


### Get the inline citations from a paper's body

In [5]:
def get_inline_citations(record: dict) -> list[tuple[str, str]]:
    return [match.groups() for match in inline_regex.finditer(record['body'])]

# inline_citations = [match.groups() for match in inline_regex.finditer(paper['body'])]
inline_citations = get_inline_citations(paper)
print(f"Results: {len(inline_citations)}")
pprint(inline_citations[:5])

Results: 424
[('Russell', '1929'),
 ('Suess Urey', '1956'),
 ('Goldberg, Müller Aller', '1960'),
 ('Anders Grevesse', '1989'),
 ('Grevesse Sauval', '1998')]


### Functions to resolve incline citation to bibcode

In [6]:
# for each result, get the first letter of the first author's last name
# get the year
def bibcode_regex(author: str, year: str):
    """
    Given first author and year, return a regex pattern for the
    corresponding bibcode
    """
    initial = author[0]
    year = year[:4] # cut off any letters at the end
    pattern = fr'^{year}.*{initial}$'
    return re.compile(pattern)

def bibcode_matches(inline_citation: tuple[str, str], references: list[str]) -> int:
    """
    Given an inline citation and a list of references, return the number of
    references that match the inline citation's bibcode regex pattern
    """
    pattern = bibcode_regex(*inline_citation)
    return [s for s in references if pattern.match(s)]

def make_citation_bibcode_list(inline_citations: list[tuple[str, str]], references: list[str]) -> list[tuple[tuple[str, str], str]]:
    """
    Given a paper's list of inline citations and list of references, return a list of
    tuples where the first element is the inline citation and the second element
    is the corresponding bibcode from the references list where there is exactly one match
    """
    return [(citation, matches[0]) for citation in inline_citations 
            if len((matches := bibcode_matches(citation, references))) == 1]

usable_citations = make_citation_bibcode_list(inline_citations, paper['reference'])
print(f"Results: {len(usable_citations)}")
print(usable_citations[:5])

Results: 206
[(('Russell', '1929'), '1929ApJ....70...11R'), (('Suess Urey', '1956'), '1956RvMP...28...53S'), (('Goldberg, Müller Aller', '1960'), '1960ApJS....5....1G'), (('Anders Grevesse', '1989'), '1989GeCoA..53..197A'), (('Grevesse Sauval', '1998'), '1998SSRv...85..161G')]


In [7]:
def get_all_bibcodes_from_file(path: str) -> list[str]:
    with open(path, 'r') as file:
        return json.load(file)

bibcodes = get_all_bibcodes_from_file('data/bibcodes.json')
print(f"Results: {len(bibcodes)}")
print(bibcodes[:5])

Results: 5944
['1975E&PSL..26..207S', '1983E&PSL..64..295W', '1997E&PSL.148..243B', '1978E&PSL..40...25M', '1988E&PSL..90..297H']


In [None]:
def resolve_inline_references(records, bibcodes):
    in_dataset, out_of_dataset = [], []
    for record in tqdm(records, desc='Processing records'):
        usable_citations = make_citation_bibcode_list(get_inline_citations(record), record['reference'])
        for citation in usable_citations:
            # Construct the citation dictionary
            inline_citation, bibcode = citation
            cite_dict = {'source_bibcode': record['bibcode'],
                         'inline_citation': inline_citation, 
                         'reference_bibcode': bibcode}

            # Determine if the referenced bibcode is in the dataset or not
            in_dataset.append(cite_dict) if bibcode in bibcodes else out_of_dataset.append(cite_dict)
    return in_dataset, out_of_dataset

have, dont_have = resolve_inline_references(review_data['Astro_Reviews']['metadatas'][:1], bibcodes)
print(f"In dataset: {len(have)}")
print(f"Out of dataset: {len(dont_have)}")
pprint(dont_have[:2])

Processing records: 100%|██████████| 1/1 [00:00<00:00, 41.10it/s]

In dataset: 12
Out of dataset: 194
[{'inline_citation': ('Russell', '1929'),
  'reference_bibcode': '1929ApJ....70...11R',
  'source_bibcode': '2009ARA&A..47..481A'},
 {'inline_citation': ('Suess Urey', '1956'),
  'reference_bibcode': '1956RvMP...28...53S',
  'source_bibcode': '2009ARA&A..47..481A'}]





In [9]:
# Get all the review papers and get a list of their unresolved references



have, dont_have = resolve_inline_references(all_reviews, bibcodes)
bibcodes_out_of_dataset = set(
    [cite['reference_bibcode'] for cite in dont_have])
bibcodes_in_dataset = set([cite['reference_bibcode'] for cite in have])
print(f"Unique in dataset bibcodes: {len(bibcodes_in_dataset)}")
print(f"Unique out of dataset bibcodes: {len(bibcodes_out_of_dataset)}")

2984
dict_keys(['bibcode', 'abstract', 'aff', 'author', 'bibstem', 'doctype', 'doi', 'id', 'keyword', 'pubdate', 'title', 'read_count', 'reference', 'citation_count', 'citation', 'body'])


Processing records: 100%|██████████| 2984/2984 [00:33<00:00, 90.23it/s] 

Unique in dataset bibcodes: 2048
Unique out of dataset bibcodes: 73055





In [58]:
import pysbd


def get_inline_citations(text: str) -> list[tuple[str, str]]:
    return [match.groups() for match in inline_regex.finditer(text)]

# def make_evaluation_samples(record):
#     # Split the body into sentences
#     splitter = pysbd.Segmenter(language="en", clean=False)
#     sentences = splitter.segment(record['body'])
#     examples = []
#     for sentence in sentences:
#         usable_sentence = True
#         sentence_bibcodes = []
#         inline_citations = get_inline_citations(sentence)
#         for citation in inline_citations:
#             bibcodes = bibcode_matches(citation, record['reference'])
#             if len(bibcodes) != 1:
#                 usable_sentence = False
#                 break
#             # Exactly one bibcode
#             sentence_bibcodes.append(bibcodes[0])
#         if usable_sentence:
#             examples.append((record['doi'], sentence, sentence_bibcodes))


In [68]:
def sentence_to_example(record, sentence):
    """
    Takes all the inline citations of a sentence and if it can resolve them to dois
    then it returns the """
    def citation_to_doi(citation):
        """
        Takes a single inline citation as tuple of (author, year) and determines if there is a unique
        matching bibcode in the record's references. If so, it continues to look for a unique
        doi matching that bibcode in the entire dataset. It returns the doi if resolved, otherwise None.
        """
        bibcodes = bibcode_matches(citation, record['reference'])
        if len(bibcodes) != 1:
            return None
        matching_dois = [record['doi'][0] for record in all_research if record['bibcode'] == bibcodes[0]]
        if len(matching_dois) != 1:
            return None
        return matching_dois[0]
    
    inline_citations = get_inline_citations(sentence)
    citation_dois = []
    for citation in inline_citations:
        if not (doi := citation_to_doi(citation)):
            break
        citation_dois.append(doi)

    # If all citations resolved to dois, return the example
    if len(inline_citations) != len(citation_dois):
        return None
    return {
            'source_doi': record['doi'][0],
            'sentence': sentence,
            'citation_dois': citation_dois
           }
        

In [70]:
def create_examples_from_record(record):
    splitter = pysbd.Segmenter(language="en", clean=False)
    sentences = splitter.segment(record['body'])
    return [
        example for sentence in sentences if (example := sentence_to_example(record, sentence))
    ]

In [71]:
examples = create_examples_from_record(
    review_data['Astro_Reviews']['metadatas'][1])

In [72]:
examples[0]

{'source_doi': '10.1146/annurev-astro-081811-125615',
 'sentence': '1. INTRODUCTION The origin and evolution of galaxies are among the most intriguing and complex chapters in the formation of cosmic structure, and observations in this field have accumulated at an astonishing pace. ',
 'citation_dois': []}

In [74]:
with open('data/test_set.jsonl', 'a') as file:
    for record in review_data['Astro_Reviews']['metadatas'][0:2]:
        examples = create_examples_from_record(record)
        for example in examples:
            json.dump(example, file)
            file.write('\n')

[{'source_doi': '10.1146/annurev.astro.46.060407.145222',
  'sentence': '1. INTRODUCTION The origin and evolution of galaxies are among the most intriguing and complex chapters in the formation of cosmic structure, and observations in this field have accumulated at an astonishing pace. ',
  'citation_dois': []},
 {'source_doi': '10.1146/annurev.astro.46.060407.145222',
  'sentence': 'Photometric redshifts have become an unavoidable tool for placing faint galaxies onto a cosmic timeline. ',
  'citation_dois': []},
 {'source_doi': '10.1146/annurev.astro.46.060407.145222',
  'sentence': 'The Galaxy Evolution Explorer (GALEX) satellite has quantified the UV galaxy luminosity function (LF) of galaxies in the local Universe and its evolution at z ≲1. ',
  'citation_dois': []}]

In [62]:
review_data['Astro_Reviews']['metadatas'][1]['doi']

['10.1146/annurev-astro-081811-125615', '10.48550/arXiv.1403.0007']