In [None]:
import json
import os
from pprint import pprint
from tqdm import tqdm

PATH_TO_DATA = 'data/processed_for_chroma/reviews'
# os.listdir returns entries in arbitrary order; sort so that the journals are
# loaded (and later iterated) in the same order on every run.
FILENAMES = sorted(os.listdir(PATH_TO_DATA))
data = dict()

# Load every file in the directory, keyed by its file name without extension.
for filename in FILENAMES:
    # Decode explicitly as UTF-8 rather than relying on the platform's default
    # text encoding, which differs between machines.
    with open(os.path.join(PATH_TO_DATA, filename), 'r', encoding='utf-8') as file:
        data[os.path.splitext(os.path.basename(filename))[0]] = json.load(file)

# postprocessing
for journal in data:
    for d in data[journal]['metadatas']:
        # Convert stringified list to list
        d['reference'] = json.loads(d['reference'])

print(data.keys())
data['Astro_Reviews'].keys()

In [2]:
# Working example used by the following cells: the first record stored under
# data['Astro_Reviews']['metadatas'].
paper = data['Astro_Reviews']['metadatas'][0]

In [None]:
# Which bibcodes in this paper's references start with 2000 and end with 'B'?
# The matching bibcodes themselves are printed (not just their count).
import re
pattern = r'^2000.*B$'
matches = [s for s in paper['reference'] if re.match(pattern, s)]
print(matches)

In [None]:
import re

# Define the patterns
# lastname: an ASCII capital followed by letters (including Latin-1 accented
#           letters) or hyphens, with an optional apostrophe part (e.g. O'Brien).
lastname = r"[A-Z][a-zA-ZÀ-ÖØ-öø-ÿ-]*(?:'[A-Z][a-zA-ZÀ-ÖØ-öø-ÿ-]*)?"
# year: four digits plus an optional lowercase suffix letter (captured),
#       optionally wrapped in parentheses (not captured).
year = r"\(?(\d{4}[a-z]?)\)?"
# name_sep: optional comma followed by exactly one whitespace character.
name_sep = r",?\s"
# Group 1 is the author part, group 2 is the year.
# The dot in "et al." is escaped so that it only matches a literal period; an
# unescaped "." matches any character and would pull a following space or "("
# into the captured author group.
INLINE_CITATION_PATTERN = fr"({lastname}(?:{name_sep}{lastname})*(?: et al\.?)?)\s*{year}"

# Compile the regex pattern
inline_regex = re.compile(INLINE_CITATION_PATTERN)

print(inline_regex)

test = " Delbouille et al. 1981 the future (Section 5). 2. INGREDIENTS FOR SOLAR ABUNDANCE ANALYSIS 2.1. Observations Analyses of th"

# Find all matches using the compiled pattern
matches = inline_regex.finditer(test)
results = list(matches)
print(f"Results: {len(results)}")

# Print the groups of each match
for i, result in enumerate(results, start=1):
    print(i, result.groups())

### Get the inline citations from a paper's body

In [None]:
def get_inline_citations(record: dict) -> list[tuple[str, str]]:
    """
    Scan the record's 'body' text with the module-level `inline_regex` and
    return the captured groups of every match, in order of appearance.
    """
    found = []
    for hit in inline_regex.finditer(record['body']):
        found.append(hit.groups())
    return found

# Extract the inline citations from the example paper's body, report how many
# were found and preview the first five.
inline_citations = get_inline_citations(paper)
print(f"Results: {len(inline_citations)}")
pprint(inline_citations[:5])

### Functions to resolve inline citations to bibcodes

In [None]:
# For each inline citation we only need the first letter of the first author's
# last name and the four-digit year.
def bibcode_regex(author: str, year: str) -> re.Pattern:
    """
    Given first author and year, return a compiled regex pattern for the
    corresponding bibcode.

    The pattern matches strings that start with the first four characters of
    `year` and end with the first character of `author`.

    Args:
        author: author part of an inline citation; only its first character
            is used.
        year: year part of an inline citation, e.g. '1981' or '2004a'.

    Returns:
        A compiled pattern intended to be used with `.match()`.

    Raises:
        IndexError: if `author` is an empty string.
    """
    # Escape both pieces so they are always matched literally, even if a caller
    # passes text containing regex metacharacters.
    initial = re.escape(author[0])
    year = re.escape(year[:4])  # cut off any letters at the end
    pattern = fr'^{year}.*{initial}$'
    return re.compile(pattern)

def bibcode_matches(inline_citation: tuple[str, str], references: list[str]) -> list[str]:
    """
    Given an inline citation and a list of references, return the references
    that match the inline citation's bibcode regex pattern.

    Args:
        inline_citation: (author, year) pair; unpacked into `bibcode_regex`.
        references: reference strings to test against the pattern.

    Returns:
        The matching references, in their original order (empty list if none).
    """
    pattern = bibcode_regex(*inline_citation)
    return [reference for reference in references if pattern.match(reference)]

def make_citation_bibcode_list(inline_citations: list[tuple[str, str]], references: list[str]) -> list[tuple[tuple[str, str], str]]:
    """
    Pair each inline citation with its bibcode from `references`.

    Only citations for which `bibcode_matches` finds exactly one candidate are
    kept; each result is a tuple of (inline citation, matching reference).
    """
    resolved = []
    for citation in inline_citations:
        candidates = bibcode_matches(citation, references)
        if len(candidates) != 1:
            # Zero or several candidates: the citation cannot be resolved unambiguously.
            continue
        resolved.append((citation, candidates[0]))
    return resolved

# Resolve the example paper's inline citations against its own reference list
# and preview the first five (citation, bibcode) pairs.
usable_citations = make_citation_bibcode_list(inline_citations, paper['reference'])
print(f"Results: {len(usable_citations)}")
print(usable_citations[:5])

In [None]:
def get_all_bibcodes_from_file(path: str) -> list[str]:
    """
    Load and return the JSON content of the file at `path`.

    Args:
        path: location of a JSON file (expected to hold a list of bibcodes).

    Returns:
        The parsed JSON value.
    """
    # Decode explicitly as UTF-8 rather than relying on the platform's default
    # text encoding.
    with open(path, 'r', encoding='utf-8') as file:
        return json.load(file)

# Load the bibcode list from disk; it is used below to decide whether a
# referenced bibcode is in the dataset. Preview its size and first five entries.
bibcodes = get_all_bibcodes_from_file('data/bibcodes.json')
print(f"Results: {len(bibcodes)}")
print(bibcodes[:5])

In [None]:
def resolve_inline_references(records, bibcodes):
    """
    Resolve each record's inline citations to reference bibcodes and split them
    by whether the referenced bibcode appears in `bibcodes`.

    Args:
        records: iterable of record dicts; each must provide 'bibcode' and
            'reference', plus whatever `get_inline_citations` reads.
        bibcodes: collection of bibcodes that count as being in the dataset.

    Returns:
        A pair (in_dataset, out_of_dataset). Both are lists of dicts with the
        keys 'source_bibcode', 'inline_citation' and 'reference_bibcode'.
    """
    # Build a set once so every membership test below is a constant-time lookup
    # instead of a scan over a (potentially long) list.
    known_bibcodes = bibcodes if isinstance(bibcodes, (set, frozenset)) else set(bibcodes)

    in_dataset, out_of_dataset = [], []
    for record in tqdm(records, desc='Processing records'):
        usable_citations = make_citation_bibcode_list(get_inline_citations(record), record['reference'])
        for inline_citation, bibcode in usable_citations:
            # Construct the citation dictionary
            cite_dict = {'source_bibcode': record['bibcode'],
                         'inline_citation': inline_citation,
                         'reference_bibcode': bibcode}

            # Determine if the referenced bibcode is in the dataset or not
            if bibcode in known_bibcodes:
                in_dataset.append(cite_dict)
            else:
                out_of_dataset.append(cite_dict)
    return in_dataset, out_of_dataset

# Trial run on only the first 'Astro_Reviews' record; show the counts and two
# examples of citations whose referenced bibcode is not in the dataset.
have, dont_have = resolve_inline_references(data['Astro_Reviews']['metadatas'][:1], bibcodes)
print(f"In dataset: {len(have)}")
print(f"Out of dataset: {len(dont_have)}")
pprint(dont_have[:2])

In [None]:
# Flatten the records of every journal into one list, resolve their inline
# citations, and count the distinct referenced bibcodes inside / outside the dataset.

all_reviews = [record for journal_data in data.values() for record in journal_data['metadatas']]
print(len(all_reviews))
print(all_reviews[0].keys())

have, dont_have = resolve_inline_references(all_reviews, bibcodes)
bibcodes_in_dataset = {cite['reference_bibcode'] for cite in have}
bibcodes_out_of_dataset = {cite['reference_bibcode'] for cite in dont_have}
print(f"Unique in dataset bibcodes: {len(bibcodes_in_dataset)}")
print(f"Unique out of dataset bibcodes: {len(bibcodes_out_of_dataset)}")