In [9]:
import json
import os
from pprint import pprint
from tqdm import tqdm
from pathlib import Path
from utils import load_dataset

DATA_PATH = Path('data/json')
# files = os.listdir(DATA_PATH)
data_paths = list(DATA_PATH.glob('*.json'))
print(f'Loading {len(data_paths)} files...')
for path in data_paths:
    print('\t' + path.name)

data_by_journal = {path.stem: load_dataset(path) for path in data_paths}
print(f"data_by_journal has keys {data_by_journal.keys()}")

all_data = [record for data in data_by_journal.values() for record in data]
print(f'Loaded {len(all_data)} records')

Loading 8 files...
	Earth_Science_Reviews.json
	Earth_Science_Research.json
	Planetary_Research.json
	Planetary_Reviews.json
	salvaged_articles.json
	Astro_Reviews.json
	Astro_Research.json
	doi_articles.json
data/json/Earth_Science_Reviews.json: 994/1000 have all required keys
data/json/Earth_Science_Research.json: 1000/1000 have all required keys
data/json/Planetary_Research.json: 964/1000 have all required keys
data/json/Planetary_Reviews.json: 994/1000 have all required keys
data/json/salvaged_articles.json: 50021/72374 have all required keys
data/json/Astro_Reviews.json: 996/1000 have all required keys
data/json/Astro_Research.json: 981/1000 have all required keys
data/json/doi_articles.json: 1898/1898 have all required keys
data_by_journal has keys dict_keys(['Earth_Science_Reviews', 'Earth_Science_Research', 'Planetary_Research', 'Planetary_Reviews', 'salvaged_articles', 'Astro_Reviews', 'Astro_Research', 'doi_articles'])
Loaded 57848 records


In [10]:
segmented_data = load_dataset('data/sentence_segmented/Astro_Reviews.json')

data/sentence_segmented/Astro_Reviews.json: 996/996 have all required keys


In [16]:
paper = segmented_data[666]
sentences = sorted(paper['body_sentences'], key=lambda x: len(x), reverse=False)
for i, sentence in enumerate(sentences):
    print(f'{i}: ({len(sentence)}) {sentence}')

0: (5) 6.1. 
1: (5) 6.2. 
2: (5) 6.3. 
3: (5) 6.4. 
4: (5) 6.5. 
5: (5) 6.6. 
6: (5) 6.7. 
7: (5) 7.1. 
8: (5) 7.2. 
9: (5) 7.3. 
10: (6) 10.1. 
11: (6) 10.2. 
12: (6) 10.3. 
13: (6) 10.4. 
14: (6) 10.5. 
15: (12) Rare Earth? 
16: (16) “What is life?” 
17: (17) 2. WHAT IS LIFE? 
18: (22) From Carlson (2003) . 
19: (25) 12. A POST-HUMAN FUTURE? 
20: (34) From Marino, McShea Uhen (2004) . 
21: (40) We summarize these in the next section. 
22: (43) But of course, there is more to the story. 
23: (43) There are two problems with this argument. 
24: (47) Mountains should therefore be unique to Earth. 
25: (47) (ABI is the brand name of Applied Biosystems.) 
26: (53) Terrestrial geology provides an interesting example. 
27: (55) The problems of habitability and origins are distinct. 
28: (56) There is no explicit temporal information in this tree. 
29: (57) We emphasize the centrality of this issue by an analogy. 
30: (58) And how frequent or rare are these various possibilities? 
31: (58) W

In [32]:
def merge_short_sentences(sentences, threshold=60):
    """
    Returns a list of sentences where sentences below the threshold length
    are re-concatenated with the following sentence. If the result is still 
    below the threshold length, the process is repeated until the threshold
    is reached.
    """
    merged_sentences = []
    for i in range(len(sentences) - 1):
        if len(sentences[i]) < threshold:
            sentences[i + 1] = sentences[i] + sentences[i + 1]
        else:
            merged_sentences.append(sentences[i])

    # Handle the last sentence
    if len(sentences[-1]) < threshold or len(merged_sentences[-1]) < threshold:
        merged_sentences[-1] = merged_sentences[-1] + sentences[-1]
    else:
        merged_sentences.append(sentences[-1])
    return merged_sentences

merged_sentences = merge_short_sentences(segmented_data[43]['body_sentences'])
print(f"Original: {len(segmented_data[667]['body_sentences'])}")
print(f"Merged: {len(merged_sentences)}")

sorted_sentences = sorted(
    merged_sentences, key=lambda x: len(x), reverse=False)
for i, sentence in enumerate(sorted_sentences[:20]):
    print(f'{i}: ({len(sentence)}) {sentence}')

Original: 823
Merged: 618
0: (60) ( Figure 4 ), is not Hα but rather Fe II and later [Fe II]. 
1: (60) One of the best examples is SN 1986G: Phillips et al (1987) 
2: (60) (e.g. Schaefer 1996 ). The second known example is SN 1992K 
3: (60) Large departures from local thermodynamic equilibrium (LTE) 
4: (60) SN 1993J The data for SN 1987K (especially its light curve) 
5: (61) ( Meikle et al 1996 , Patat et al 1996 , Filippenko 1997a ). 
6: (61) (1996 ; reproduced with permission) and Filippenko (1997a) . 
7: (61) Nevertheless, Ca II remains visible, primarily in absorption 
8: (61) ( Hamuy et al 1995 , Vaughan et al 1995 ). Maza et al (1994) 
9: (61) ( Lundqvist Cumming 1997 ) if the wind speed is 10 km s −1 . 
10: (61) Do some (or even most) SNe Ic actually have weak He I lines? 
11: (61) This behavior is well illustrated in Figure 12 with SN 1992H 
12: (61) This was probably due to the formation of dust in the ejecta 
13: (61) (FWHM ≈ 1000–2000 km s −1 ; sometimes a very broad compon

In [None]:
for sent in segmented_data[43]['body_sentences']:
    print(sent)
    input()

INTRODUCTION The study of supernovae (SNe) has expanded tremendously during the past decade. 
A major motivation, of course, was provided by SN 1987A, by far the most thoroughly observed supernova (SN) in history. 
Advances in the field have also been driven by technology: The advent of sensitive detectors, especially charge-coupled devices (CCDs), and the proliferation of moderately large telescopes made it possible to obtain excellent photometry and spectroscopy of large numbers of SNe. 
In addition to their intrinsic interest, SNe are relevant to nucleosynthesis and galactic chemical evolution, the production of neutron stars and black holes, the origin of cosmic rays, the physical state of the interstellar medium, and induced star formation; thus, they have been investigated from a wide range of perspectives. 
Finally, the enormous potential of SNe as cosmological distance indicators is inspiring many new studies. 
This review concentrates primarily on the observed optical spectra 

In [24]:
citation = ('Charlot', '1996')
bibcodes = citation_bibcode_matches(citation, paper['reference'])
print(f"Bibcodes: {bibcodes}")
matching_dois = {doi for bibcode in bibcodes if (
    doi := author_and_bibcode_to_doi(citation[0], bibcode, all_data))}

Bibcodes: ['1996A&AS..115..339C', '1996ASPC...98..275C', '1996ApJ...457..625C']
No matching records for bibcode 1996A&AS..115..339C
No matching records for bibcode 1996ASPC...98..275C
No matching records for bibcode 1996ApJ...457..625C


In [25]:
for record in all_data:
    if record['bibcode'] =='1996ApJ...457..625C':
        print(record['doi'])

In [43]:
import pysbd
import unicodedata

def get_inline_citations(text: str) -> list[tuple[str, str]]:
    return [match.groups() for match in INLINE_REGEX.finditer(text)]

def strip_accents(text):
    """
    Strip accents from input string. Helpful for normalizing author names
    e.g. Schönberger -> Schonberger
    """
    # Normal Form Decomposition: separates base characters from their diacritics
    text = unicodedata.normalize('NFD', text)

    # Skips any chars that are accent marks
    return ''.join(c for c in text if unicodedata.category(c) != 'Mn')

def first_author_normed(authors: list[str]) -> str:
    """
    Given a list of authors, return the first author
    e.g. ['Schönrich, ralph', 'binney, james'] -> 'schonrich'
    """
    return strip_accents(authors[0].split(',')[0]).lower()

def author_and_bibcode_to_doi(author, bibcode, all_records):
    """
    Given an author and bibcode, return the doi if there is a unique match
    """
    matching_records = [record for record in all_records if record['bibcode'] == bibcode]

    if not matching_records:
        # print(f"No matching records for bibcode {bibcode}")
        return None

    # TODO: currently assumes that if more than one record matches the bibcode then
    # they are duplicates. This may not be accurate! Should be checked or a deduplication
    # step should be added.
    
    record = matching_records[0]
    author_normed = strip_accents(author).lower()
    record_author = first_author_normed(record['author'])
    
    # Check if normalized author name is in the list of authors on this record
    if author_normed != record_author:
        return None

    # Citation author name matches the record author name, so it's a match! 
    return record['doi'][0]


print(author_and_bibcode_to_doi('Schönrich', '2009MNRAS.396..203S', all_data))


10.1111/j.1365-2966.2009.14750.x


In [44]:
def sentence_to_example(record, sentence, all_records):
    """
    """
    def citation_to_doi(citation):
        """
        Takes a single inline citation as tuple of (author, year) and determines if there is a unique
        matching bibcode in the record's references. If so, it continues to look for a unique
        doi matching that bibcode in the entire dataset. 
        
        If there are multiple matching bibcodes, it attempts to resolve the doi by checking each matched
        bibcode's record for matching author.

        It returns the doi if resolved, otherwise None.
        """
        bibcodes = citation_bibcode_matches(citation, record['reference'])
        if len(bibcodes) > 1:
            # If there are multiple matches, attempt to resolve by author
            dois = {doi for bibcode in bibcodes if (doi := author_and_bibcode_to_doi(citation[0], bibcode, all_records))}
            # If only one matching doi, return it
            if len(dois) == 1:
                return dois.pop()
            else:
                # print(f"doi's found for {citation}: {dois}")
                return None
            
        if len(bibcodes) != 1:
            return None

        # Take the bibcode and look for a unique corresponding doi
        matching_dois = [record['doi'][0]
                         for record in all_records if record['bibcode'] == bibcodes[0]]
        if len(matching_dois) != 1:
            return None
        return matching_dois[0]

    inline_citations = get_inline_citations(sentence)
    citation_dois = []
    for citation in inline_citations:
        if not (doi := citation_to_doi(citation)):
            break
        citation_dois.append(doi)

    # If all citations resolved to dois, return the example
    # TODO: is this too strict?
    if len(inline_citations) != len(citation_dois):
        return None
    return {
        'source_doi': record['doi'][0],
        'sentence': sentence,
        'citation_dois': citation_dois
    }


def create_examples_from_record(record, all_records):
    splitter = pysbd.Segmenter(language="en", clean=False)
    sentences = [s for s in splitter.segment(record['body']) if len(s) > 40]
    return [
        example for sentence in sentences if (example := sentence_to_example(record, sentence, all_records))
    ]

In [45]:
paper = data_by_journal['Astro_Research'][0]
examples = create_examples_from_record(paper, all_data)

In [30]:
# nontrivial_examples = [ex for ex in examples if len(ex['citation_dois']) > 0]
# print(f"Found {len(nontrivial_examples)} non-trivial examples")

Found 57 non-trivial examples


In [None]:
# from itertools import chain

# all_review_papers = list(chain(*[data_by_journal[journal] for journal in ['Astro_Reviews', 'Earth_Science_Reviews', 'Planetary_Reviews']]))
# print(len(all_review_papers))

# for paper in tqdm(all_review_papers):
#     examples = create_examples_from_record(paper, all_data)
#     trivial_examples, nontrivial_examples = [], []
#     for example in examples:
#         if len(example['citation_dois']) > 0:
#             nontrivial_examples.append(example)
#         else:
#             trivial_examples.append(example)

#     with open('data/examples_trivial.json', 'a') as f:
#         for example in trivial_examples:
#             f.write(json.dumps(example) + '\n')
#     with open('data/examples_nontrivial.json', 'a') as f:
#         for example in nontrivial_examples:
#             f.write(json.dumps(example) + '\n')

2984


 32%|███▏      | 960/2984 [4:19:42<9:07:33, 16.23s/it]   


KeyboardInterrupt: 

None


In [1]:
for journal in data_by_journal:
    data = data_by_journal[journal]
    for i, record in enumerate(data):
        if record['bibcode'] == '2009MNRAS.396..203S':
            print(f"Found in {journal}")
            print(record['author'])
            break
            

NameError: name 'data_by_journal' is not defined