In [1]:
import json
import os
from pprint import pprint
from tqdm import tqdm
from utils import load_dataset


In [21]:
PATH_TO_DATA = 'data/json/Astro_Reviews.json'
astro_reviews = load_dataset(PATH_TO_DATA)
print(astro_reviews[0].keys())

astro_research = load_dataset('data/json/Astro_Research.json')
doi_articles = load_dataset('data/json/doi_articles.json')
salvaged_articles = load_dataset('data/json/salvaged_articles.json')

all_records = astro_reviews + astro_research + doi_articles + salvaged_articles

data/json/Astro_Reviews.json: 996/1000 have all required keys
dict_keys(['bibcode', 'abstract', 'aff', 'author', 'bibstem', 'doctype', 'doi', 'id', 'keyword', 'pubdate', 'title', 'read_count', 'reference', 'citation_count', 'citation', 'body'])
data/json/Astro_Research.json: 981/1000 have all required keys
data/json/doi_articles.json: 1898/1898 have all required keys
data/json/salvaged_articles.json: 50021/72374 have all required keys


In [22]:

import re

# Define bibcode and citation patterns
lastname = r"[A-Z][a-zA-ZÀ-ÖØ-öø-ÿ-]*(?:'[A-Z][a-zA-ZÀ-ÖØ-öø-ÿ-]*)?"
year = r"\(?(\d{4}[a-z]?)\)?"
name_sep = r",?\s"
INLINE_CITATION_PATTERN = fr"({lastname}(?:{name_sep}{lastname})*(?: et al.?)?)\s*{year}"

# Compile the regex pattern
INLINE_REGEX = re.compile(INLINE_CITATION_PATTERN)

# test = " Delbouille et al. 1981 the future (Section 5). 2. INGREDIENTS FOR SOLAR ABUNDANCE ANALYSIS 2.1. Observations Analyses of th"

# # Find all matches using the compiled pattern
# matches = inline_regex.finditer(test)
# results = [match for match in matches]
# print(f"Results: {results}")


### Functions to resolve incline citation to bibcode

In [None]:

def bibcode_regex(author: str, year: str):
    """
    Given first author and year, return a regex pattern for the
    corresponding bibcode
    """
    initial = author[0]
    year = year[:4] # cut off any letters at the end
    pattern = fr'^{year}.*{initial}$'
    return re.compile(pattern)

def bibcode_matches(inline_citation: tuple[str, str], references: list[str]) -> int:
    """
    Given an inline citation and a list of references, return the references
    h the inline citation's bibcode regex pattern
    """
    pattern = bibcode_regex(*inline_citation)
    return [s for s in references if pattern.match(s)]

# def make_citation_bibcode_list(inline_citations: list[tuple[str, str]], references: list[str]) -> list[tuple[tuple[str, str], str]]:
#     """
#     Given a paper's list of inline citations and list of references, return a list of
#     tuples where the first element is the inline citation and the second element
#     is the corresponding bibcode from the references list where there is exactly one match
#     """
#     return [(citation, matches[0]) for citation in inline_citations 
#             if len((matches := bibcode_matches(citation, references))) == 1]

# usable_citations = make_citation_bibcode_list(inline_citations, paper['reference'])
# print(f"Results: {len(usable_citations)}")
# print(usable_citations[:5])

In [24]:
import pysbd

def get_inline_citations(text: str) -> list[tuple[str, str]]:
    return [match.groups() for match in INLINE_REGEX.finditer(text)]

def sentence_to_example(record, sentence, all_records):
    """
    Takes all the inline citations of a sentence and if it can resolve them to dois
    then it returns the """
    def citation_to_doi(citation):
        """
        Takes a single inline citation as tuple of (author, year) and determines if there is a unique
        matching bibcode in the record's references. If so, it continues to look for a unique
        doi matching that bibcode in the entire dataset. It returns the doi if resolved, otherwise None.
        """
        bibcodes = bibcode_matches(citation, record['reference'])
        if len(bibcodes) != 1:
            return None
        
        # Take the bibcode and look for a unique corresponding doi
        matching_dois = [record['doi'][0]
                         for record in all_records if record['bibcode'] == bibcodes[0]]
        if len(matching_dois) != 1:
            return None
        return matching_dois[0]
    
    inline_citations = get_inline_citations(sentence)
    citation_dois = []
    for citation in inline_citations:
        if not (doi := citation_to_doi(citation)):
            break
        citation_dois.append(doi)

    # If all citations resolved to dois, return the example
    # TODO: is this too strict?
    if len(inline_citations) != len(citation_dois):
        return None
    return {
            'source_doi': record['doi'][0],
            'sentence': sentence,
            'citation_dois': citation_dois
           }


def create_examples_from_record(record):
    splitter = pysbd.Segmenter(language="en", clean=False)
    sentences = [s for s in splitter.segment(record['body']) if len(s) > 40]
    return [
        example for sentence in sentences if (example := sentence_to_example(record, sentence))
    ]
        

In [25]:
def find_salvageable_bibcodes(record, sentence, all_bibcodes):
    """
    Similar logic to sentence_to_example, except this function, when it finds an inline citation
    that resolves to a single bibcode in a record/sentence, but it CANNOT find the corresponding
    doi in the entire dataset (i.e. the bibcode is not in the dataset), it returns the bibcode
    """
    def citation_to_missing_bibcode(citation):
        """
        Takes a single inline citation as tuple of (author, year) and determines if there is a unique
        matching bibcode in the record's references. If so, it continues to look for a unique
        doi matching that bibcode in the entire dataset. It returns the doi if resolved, otherwise None.
        """
        bibcodes = bibcode_matches(citation, record['reference'])
        if len(bibcodes) != 1:
            return None
        
        bibcode = bibcodes[0]
        
        # The inline citation corresponds to a unique bibcode, but that bibcode is not in the dataset
        if bibcode not in all_bibcodes:
            return bibcode
        
        matching_dois = [record['doi'][0]
                         for record in all_records if record['bibcode'] == bibcode]
        if len(matching_dois) != 1:
            return None
        return matching_dois[0]

    inline_citations = get_inline_citations(sentence)
    return [bibcode for citation in inline_citations if (bibcode := citation_to_missing_bibcode(citation))]


In [26]:
all_bibcodes = [record['bibcode'] for record in all_records]
len(all_bibcodes) == len(all_records)

True

In [27]:
import pysbd

salvageable_bibcodes = set()
splitter = pysbd.Segmenter(language="en", clean=False)
for record in tqdm(astro_reviews):
    sentences = [s for s in splitter.segment(record['body']) if len(s) > 40]
    for sentence in sentences:
        salvageable_bibcodes.update(find_salvageable_bibcodes(record, sentence, all_bibcodes))

print(f"Salvageable bibcodes: {len(salvageable_bibcodes)}")

100%|██████████| 996/996 [1:16:50<00:00,  4.63s/it]

Salvageable bibcodes: 45216





In [28]:
with open('more_salvageable_bibcodes.txt', 'w') as f:
    for bibcode in salvageable_bibcodes:
        f.write(f"{bibcode}\n")

In [15]:
earth_reviews = load_dataset('data/json/Earth_Science_Reviews.json')
earth_research = load_dataset('data/json/Earth_Science_Research.json')
planet_reviews = load_dataset('data/json/Planetary_Reviews.json')
planet_research = load_dataset('data/json/Planetary_Research.json')

all_records = astro_reviews + astro_research + earth_reviews + earth_research + planet_reviews + planet_research
all_bibcodes = [record['bibcode'] for record in all_records]

data/json/Earth_Science_Reviews.json: 994/1000 have all required keys
data/json/Earth_Science_Research.json: 1000/1000 have all required keys
data/json/Planetary_Reviews.json: 994/1000 have all required keys
data/json/Planetary_Research.json: 964/1000 have all required keys


In [16]:
for record in tqdm(earth_reviews + planet_reviews):
    sentences = [s for s in splitter.segment(record['body']) if len(s) > 40]
    for sentence in sentences:
        salvageable_bibcodes.update(find_salvageable_bibcodes(record, sentence, all_bibcodes))

print(f"Salvageable bibcodes: {len(salvageable_bibcodes)}")

100%|██████████| 1988/1988 [2:44:51<00:00,  4.98s/it]   

Salvageable bibcodes: 74365





In [17]:
with open('data/salvageable_bibcodes.txt', 'w') as file:
    for bibcode in salvageable_bibcodes:
        file.write(f"{bibcode}\n")

In [16]:
with open('data/test_set.jsonl', 'a') as file:
    for journal in review_data:
        for record in review_data[journal]['metadatas'][:10]:
            examples = create_examples_from_record(record)
            for example in examples:
                json.dump(example, file)
                file.write('\n')

In [None]:
num_sentences = []
num_examples = []
examples_with_empty_labels = []
total_examples = 0
for i, record in tqdm(enumerate(review_data['Astro_Reviews']['metadatas'][:100])):
    splitter = pysbd.Segmenter(language="en", clean=False)
    sentences = [s for s in splitter.segment(record['body']) if len(s) > 40]
    num_sentences.append(len(sentences))
    examples = [example for sentence in sentences if (example := sentence_to_example(record, sentence))]
    num_examples.append(len(examples))
    examples_with_empty_labels.append(len([e for e in examples if e['citation_dois'] == []]))

In [None]:
print(f"Total number of examples: {sum(num_examples)}")
print(f"Total number of examples with no citations: {sum(examples_with_empty_labels)}")

In [29]:
len(salvageable_bibcodes)

45216

In [30]:
with open('data/salvageable_bibcodes.txt', 'r') as file:
    previous_salvageable_bibcodes = set(file.read().splitlines())

len(previous_salvageable_bibcodes)

74365

In [32]:
len(salvageable_bibcodes.intersection(previous_salvageable_bibcodes))

10858