In [1]:
import requests
from pprint import pprint
import pandas as pd

# Load the JSON Lines dataset: each line of the file is parsed as one record.
df = pd.read_json(path_or_buf="data/dataset/100/nontrivial.jsonl", lines=True)

In [7]:
def ask_ollama(prompt, model="llama3.3", temperature=0.01, timeout=600):
    """Send one non-streaming generation request to the local Ollama server.

    Args:
        prompt: Full prompt text sent to the model.
        model: Name of the Ollama model to run.
        temperature: Sampling temperature, forwarded inside ``options``.
        timeout: Seconds to wait for the HTTP response; ``None`` waits
            indefinitely.

    Returns:
        The ``response`` field of the server's JSON reply.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
        requests.Timeout: If no reply arrives within ``timeout`` seconds.
        KeyError: If the JSON reply has no ``response`` field.
    """
    url = "http://localhost:11434/api/generate"
    payload = {
        "model": model,
        "prompt": prompt,
        # Sampling parameters must be nested under "options"; a top-level
        # "temperature" key is not a request field and is silently ignored.
        "options": {"temperature": temperature},
        "stream": False,
    }

    response = requests.post(url, json=payload, timeout=timeout)
    # Fail loudly on server errors instead of a confusing KeyError below.
    response.raise_for_status()
    return response.json()["response"]

In [8]:
# Few-shot template for citation extraction; `{text}` is filled via str.format.
# Every example output lists only the FIRST author's surname, matching the rule
# stated in the instructions, so the model is not shown contradictory targets.
extraction_prompt = """
Output ONLY the inline citations from the text below as a list of tuples
- Each citation becomes a (string, int) tuple where the string is the first author's name and the int is the year
- If there are no citations in the text, output []
- Do not count citations 'in preparation' or lacking a year
- Do not include any introductory text, explanations, or anything before or after the array

Examples of inline citations:
'''
Sentence: "Like Caffau et al. (2008a) , we have similar findings."
Output: [('Caffau', 2008)]

Sentence: "Multiply by the number of stars in the sample ( 10 000 ) and the number of stars in the galaxy ( 100 billion ) to get a total of 10 trillion stars (Caffau Moritz 2008b)."
Output: [('Caffau', 2008)]

Sentence: "Methods for mixing below the convection zone are well understood ( Brun, Turck-Chièze Zahn 1999 , Charbonnel Talon 2005 )."
Output: [('Brun', 1999), ('Charbonnel', 2005)]

Sentence: "Momentum balance gives an expression ( Fabian 1999 ; Fabian, Wilman Crawford 2002 ; King 2003 , 2005 )"
Output: [('Fabian', 1999), ('Fabian', 2002), ('King', 2003), ('King', 2005)]

Sentence: "This is consistent with previous results (Pereira et al., in preparation)."
Output: []
'''

Now extract the inline citations from the following text:
'''
{text}
'''

Output format: 
[('first author', year), ('first author', year), ...]
"""
# Sample passage with three narrative citations, used to smoke-test the prompt.
text = (
    "neglect the H collisions altogether based on the available atomic physics "
    "data for other elements, while others use the classical Drawin (1968) "
    "formula, possibly with a scaling factor S H that typically varies from 0 "
    "to 1. Holweger (2001) found log ε O = 8.71 ± 0.05 using the Holweger Müller "
    "(1974) model with granulation corrections"
)

# Fill the extraction template with the passage and show the raw model reply.
response = ask_ollama(prompt=extraction_prompt.format(text=text))
print(response)

[('Drawin', 1968), ('Holweger', 2001), ('Holweger', 1974)]


In [9]:
# Few-shot template asking the model to delete inline citations from a sentence.
# The `{text}` placeholder is substituted with str.format before the request is
# sent; the first example shows a narrative citation being removed in place and
# the second shows a whole parenthetical citation group being dropped.
strip_prompt = """
Remove the inline citations from the text below.

EXAMPLES:
Sentence: "Like Caffau et al. (2008a) , we have similar findings."
Output: "Like , we have similar findings."

Sentence: "Methods for mixing below the convection zone are well understood ( Brun, Turck-Chièze Zahn 1999 , Charbonnel Talon 2005 )."
Output: "Methods for mixing below the convection zone are well understood."

ONLY respond with the modified text. Do not include any explanations or additional information.
Now remove the inline citations from the following text:

Sentence: "{text}"
Output:
"""

In [10]:
# Ask the model to remove the citations from the sample passage, then show the reply.
stripped_text = ask_ollama(prompt=strip_prompt.format(text=text))
pprint(stripped_text)

neglect the H collisions altogether based on the available atomic physics data for other elements, while others use the classical  formula, possibly with a scaling factor S H that typically varies from 0 to 1.  found log ε O = 8.71 ± 0.05 using the  model with granulation corrections


In [11]:
# Show the sample passage before and after citation removal for a quick comparison.
for label, passage in (("Original", text), ("Stripped", stripped_text)):
    pprint(f"{label}: {passage}")

('Original: neglect the H collisions altogether based on the available atomic '
 'physics data for other elements, while others use the classical Drawin '
 '(1968) formula, possibly with a scaling factor S H that typically varies '
 'from 0 to 1. Holweger (2001) found log ε O = 8.71 ± 0.05 using the Holweger '
 'Müller (1974) model with granulation corrections')
('Stripped: neglect the H collisions altogether based on the available atomic '
 'physics data for other elements, while others use the classical  formula, '
 'possibly with a scaling factor S H that typically varies from 0 to 1.  found '
 'log ε O = 8.71 ± 0.05 using the  model with granulation corrections')


In [12]:
# Run citation extraction on the sentence stored under label 0 of `sent_original`.
cites = ask_ollama(extraction_prompt.format(text=df["sent_original"][0]))
pprint(cites)

"[('Ussiri', 2013)]"


In [13]:
# Strip citations from the same dataset sentence and print it next to the input.
stripped_sentence = ask_ollama(prompt=strip_prompt.format(text=df["sent_original"][0]))
pprint(f"Original: {df['sent_original'][0]}")
pprint(f"Stripped: {stripped_sentence}")

('Original: These are minor species, with abundances of 1.5 10 12 , 1.7 10 9 , '
 'and 7 10 8 kg N, respectively ( Ussiri and Lal, 2013 ).')
('Stripped: These are minor species, with abundances of 1.5 10 12 , 1.7 10 9 , '
 'and 7 10 8 kg N, respectively.')


In [20]:
# Print every value of the `sent_no_cit` column prefixed with its row position.
# Iterating the column directly pairs each value with a positional counter, so
# the loop does not depend on the frame carrying a default 0..n-1 index for
# label-based lookup.
for i, sentence in enumerate(df["sent_no_cit"]):
    pprint(f"{i}: {sentence}")

('0: These are minor species, with abundances of 1.5 10 12 , 1.7 10 9 , and 7 '
 '10 8 kg N, respectively ( Ussiri and .')
('1: The Ellison-Ramaty fit for the 13 event (not shown), provided an even '
 'poorer overall fit, fitting the high energy points from ∼15 to 400 MeV with '
 'an e-folding energy of E 0 ∼110 MeV, but failing entirely to fit the '
 'spectral break at several MeV.')
('2: Building on their ideas, Priest and  developed a model in which the '
 'magnetic field is written in terms of cylindrical polar coordinates , asdue '
 'to a ring of X-points near and the field of the current sheet itself.')
('3: Focusing on 353-GHz data at high Galactic latitudes, the E and B angular '
 'power spectra of dust polarization were constrained in the multipole range '
 'of 40l600 ( .')
('4: With the above calibration one obtains from a complete sample of 49 '
 'inclined, untruncated Virgo cluster spirals, as compiled by , and after a '
 'small correction for the color difference between c