In [None]:
from pprint import pprint
import pandas as pd
from langchain_ollama import ChatOllama
from langchain_core.messages import SystemMessage, HumanMessage, AIMessage

# Load the sentence dataset from a JSON Lines file (one JSON record per line).
# Later cells read the `sent_original` column of each row.
df = pd.read_json("data/dataset/100/nontrivial.jsonl", lines=True)

In [None]:
# Single-string prompt template for citation extraction; `{text}` marks where
# the sentence to analyse goes.
# NOTE(review): the next code cell rebinds `extraction_prompt` to a variant
# without the `{text}` slot (used as a system message), and no visible cell
# formats this template -- confirm whether this cell is still needed.
#
# The few-shot outputs below use the bare first-author surname so that they
# agree with the "first author's name" rule stated at the top of the prompt.
extraction_prompt = """
Output ONLY the inline citations from the text below as a list of tuples
- Each citation becomes a (string, int) tuple where the string is the first author's name and the int is the year
- If there are no citations in the text, output []
- Do not count citations 'in preparation' or lacking a year
- Do not include any introductory text, explanations, or anything before or after the array

Examples of inline citations:
'''
Sentence: "Like Caffau et al. (2008a) , we have similar findings."
Output: [('Caffau', 2008)]

Sentence: "Multiply by the number of stars in the sample ( 10 000 ) and the number of stars in the galaxy ( 100 billion ) to get a total of 10 trillion stars (Caffau Moritz 2008b)."
Output: [('Caffau', 2008)]

Sentence: "Methods for mixing below the convection zone are well understood ( Brun, Turck-Chièze Zahn 1999 , Charbonnel Talon 2005 )."
Output: [('Brun', 1999), ('Charbonnel', 2005)]

Sentence: "Momentum balance gives an expression ( Fabian 1999 ; Fabian, Wilman Crawford 2002 ; King 2003 , 2005 )"
Output: [('Fabian', 1999), ('Fabian', 2002), ('King', 2003), ('King', 2005)]

Sentence: "This is consistent with previous results (Pereira et al., in preparation)."
Output: []
'''

Now extract the inline citations from the following text:
'''
{text}
'''

Output format: 
[('first author', year), ('first author', year), ...]
"""



[('Drawin', 1968), ('Holweger', 2001), ('Holweger', 1974)]


In [None]:
# System-prompt variant of the citation-extraction instructions: it ends right
# after "following text:" because the sentence itself is supplied separately
# as the human message.
#
# The few-shot outputs use the bare first-author surname so that they agree
# with the "first author's name" rule stated at the top of the prompt (the
# previous examples included 'et al.', a second author, and a name that did
# not occur in the example sentence).
extraction_prompt = """
Output ONLY the inline citations from the text below as a list of tuples
- Each citation becomes a (string, int) tuple where the string is the first author's name and the int is the year
- If there are no citations in the text, output []
- Do not count citations 'in preparation' or lacking a year
- Do not include any introductory text, explanations, or anything before or after the array

Examples of inline citations:
'''
Sentence: "Like Caffau et al. (2008a) , we have similar findings."
Output: [('Caffau', 2008)]

Sentence: "Multiply by the number of stars in the sample ( 10 000 ) and the number of stars in the galaxy ( 100 billion ) to get a total of 10 trillion stars (Caffau Moritz 2008b)."
Output: [('Caffau', 2008)]

Sentence: "Methods for mixing below the convection zone are well understood ( Brun, Turck-Chièze Zahn 1999 , Charbonnel Talon 2005 )."
Output: [('Brun', 1999), ('Charbonnel', 2005)]

Sentence: "Momentum balance gives an expression ( Fabian 1999 ; Fabian, Wilman Crawford 2002 ; King 2003 , 2005 )"
Output: [('Fabian', 1999), ('Fabian', 2002), ('King', 2003), ('King', 2005)]

Sentence: "This is consistent with previous results (Pereira et al., in preparation)."
Output: []
'''

Now extract the inline citations from the following text:
"""

from pydantic import BaseModel, Field

# Schema for one extracted citation: a first-author string plus an integer year.
# Documented with `#` comments only, so the class definitions stay unchanged.
class Citation(BaseModel):
    first_author: str = Field(description="The first author's name")
    year: int = Field(description="The year of the citation")

# Wrapper schema holding every citation found in one input text; this is the
# type handed to `with_structured_output` for the extraction model.
class CitationList(BaseModel):
    citations: list[Citation] = Field(description="List of citations")

# Citation-extraction model: mistral-nemo at temperature 0 with streaming off,
# wrapped so that replies are returned as `CitationList` objects.
extraction_model = ChatOllama(
    model="mistral-nemo",
    temperature=0.0,
    streaming=False,
).with_structured_output(CitationList)

# The extraction instructions and few-shot examples are sent as the system message.
extraction_system_prompt = SystemMessage(content=extraction_prompt)

# One-off trial call on a single hand-picked sentence containing three citations.
response = extraction_model.invoke(
    [
        extraction_system_prompt,
        HumanMessage(
            content='We neglect the H collisions altogether based on the available atomic physics data for other elements, while others use the classical Drawin (1968) formula, possibly with a scaling factor S H that typically varies from 0 to 1. Holweger (2001) found log ε O = 8.71 ± 0.05 using the Holweger Müller (1974) model with granulation corrections'
        ),
    ]
)

In [None]:
# System prompt for the citation-stripping pass: the model must return the
# input sentence with every inline citation replaced by the literal [REF].
#
# The few-shot examples are kept well-formed on purpose: every example sentence
# is closed with a double quote, and example outputs keep the surrounding text
# exactly as in the input, in line with the "do not change ANY other part of
# the text" rule.
strip_prompt = """
You are an expert text editor. In the sentence below, replace any inline citations with [REF].

IDENTIFY INLINE CITATIONS:
- inline citations are REFERENCES TO OTHER RESEARCH
- "Halley's comet", "Herschel's Gap" etc. are NOT inline citations
- inline citations are usually one or more author names followed by a year 
- inline citations can be in parentheses or not
- spacing around citations can vary, e.g. "Smith et al. (2000)", "Smith et al. ( 2000 )", "(Smith et al. 2000) ", "( Smith et al. 2000 )"
- citations can have different formats, e.g. "Smith et al. (2000)", "Smith (2000)", "Smith et al. 2000", "Smith 2000", "(Smith et al. 2000)", "(Smith 2000)"
- citations can have multiple authors, e.g. "Smith et al. (2000)", "Smith, Jones (2000)", "Smith, Jones, Brown (2000)"
- years can have a letter suffix, e.g. "Smith et al. (2000a)", "Smith et al. (2000b)"
- not all parentheses are part of citations, e.g. "As seen before (Sect. 4.1) we find that..."

RULES:
- Think step by step
- Read the sentence carefully
- Identify any inline citations that refer to other research
- Write out the input sentence, replacing the full length of any inline citation with [REF]
- do not change ANY other part of the text 

EXAMPLES:
Sentence: "Like Caffau et al. (2008a) , we have similar findings."
output: "Like [REF], we have similar findings."

Sentence: "Gopalswamy et al. ( 2003 ) noted a correspondence between the gamma ray intensity and the coronal flares."
output: "[REF] noted a correspondence between the gamma ray intensity and the coronal flares."

Sentence: "In Donati et al. ( 2008 ) experiments showed that the magnetic field of the Sun is not a simple dipole."
output: "In [REF] experiments showed that the magnetic field of the Sun is not a simple dipole."

Sentence: "Methods for mixing below the convection zone ( Sect 4.1 ) are well understood ( Brun, Turck-Chièze Zahn 1999 , Charbonnel Talon 2005 )."
output: "Methods for mixing below the convection zone ( Sect 4.1 ) are well understood [REF]."

Sentence: "In previous work, Jerkstrand et al. (2011) estimate a mass M 44 =(1.5±0.5)×10 −4 M ⊙ , which NuSTAR similarly observed."
output: "In previous work, [REF] estimate a mass M 44 =(1.5±0.5)×10 −4 M ⊙ , which NuSTAR similarly observed."

ONLY respond with a string containing the modified text. Do not include any explanations or additional information.

Now, following the above rules and examples, replace the inline citations in the following text with [REF]:

"""

In [None]:
# Structured-output schema for the citation-stripping pass: a single `text`
# field carrying the rewritten sentence.
class Sentence(BaseModel):
    text: str = Field(description="The modified text with inline citations replaced by [REF]")

# Citation-stripping model: llama3.3 at temperature 0 with streaming off,
# wrapped so that replies are returned as `Sentence` objects.
strip_model = ChatOllama(
    model="llama3.3:latest",
    temperature=0.0,
    streaming=False,
).with_structured_output(Sentence)

# The stripping instructions and examples are sent as the system message.
strip_system_prompt = SystemMessage(content=strip_prompt)

In [59]:
# Run both LLM passes (citation extraction, then citation stripping) over every
# sentence in `df` and collect one result dict per row in `extractions`.
#
# If a structured response lacks the expected attribute, the problem is
# reported and the corresponding field is stored as None, so the loop keeps
# going instead of raising AttributeError and losing the results gathered so far.
extractions = []
for index, row in df.iterrows():
    print(index)  # progress indicator: one line per processed row

    text = row.sent_original

    # Pass 1: extract (first_author, year) tuples.
    response = extraction_model.invoke([extraction_system_prompt, HumanMessage(content=text)])
    if hasattr(response, "citations"):
        extraction = [(cit.first_author, cit.year) for cit in response.citations]
    else:
        extraction = None  # marks a failed extraction, distinct from [] (no citations)
        print("Error: extraction response has no citations attribute")
        print(f"Response: {response}")

    # Pass 2: get the sentence with inline citations replaced by [REF].
    strip_response = strip_model.invoke([strip_system_prompt, HumanMessage(content=text)])
    if hasattr(strip_response, "text"):
        stripped_text = strip_response.text
    else:
        stripped_text = None  # marks a failed strip for this row
        print("Error: strip response has no text attribute")
        print(f"Response: {strip_response}")

    extraction_result = {
        "original": text,
        "citations": extraction,
        "sent_no_cit": stripped_text,
    }
    extractions.append(extraction_result)

0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99


In [60]:
# Pretty-print every collected result dict (original sentence, extracted
# citations, stripped sentence) for manual inspection.
pprint(extractions)

[{'citations': [('Ussiri', 2013)],
  'original': 'These are minor species, with abundances of 1.5 10 12 , 1.7 10 '
              '9 , and 7 10 8 kg N, respectively ( Ussiri and Lal, 2013 ).',
  'sent_no_cit': 'These are minor species, with abundances of 1.5 10 12 , 1.7 '
                 '10 9 , and 7 10 8 kg N, respectively [REF].'},
 {'citations': [],
  'original': 'The Ellison-Ramaty fit for the 13 December 2006 event (not '
              'shown), provided an even poorer overall fit, fitting the high '
              'energy points from ∼15 to 400 MeV with an e-folding energy of E '
              '0 ∼110 MeV, but failing entirely to fit the spectral break at '
              'several MeV.',
  'sent_no_cit': 'The Ellison-Ramaty fit for the 13 December 2006 event (not '
                 'shown), provided an even poorer overall fit, fitting the '
                 'high energy points from ∼15 to 400 MeV with an e-folding '
                 'energy of E 0 ∼110 MeV, but failing entirely to 