# Imports and Setup

In [1]:
import ast
import re
import time

import numpy as np
import pandas as pd

In [2]:
import os
from openai import OpenAI
# Read the API key from the OPENAI_API_KEY environment variable instead of
# embedding it in the notebook, so the secret is never saved or committed
# together with the file. If the variable is unset, None is passed and the
# OpenAI client reports the missing credential itself.
client = OpenAI(
    api_key=os.environ.get("OPENAI_API_KEY"),
)
# Chat model name; used as the default `model` argument of get_completion.
model = "gpt-4o-mini"

In [3]:
import spacy
# Load the spaCy pipeline named "en_core_web_lg".
# NOTE(review): `nlp` is not referenced by any cell visible in this notebook
# section — confirm it is still needed before keeping this (slow) load.
nlp = spacy.load("en_core_web_lg")

In [4]:
# Read the raw input CSV of every subset, then stack the frames row-wise
# into one combined DataFrame.
dfs = [
    pd.read_csv(f"./data/{subset}/raw_input.csv")
    for subset in ['reddit', 'nyt', 'stack']
]
df = pd.concat(dfs)

## Argument Proposition Extraction

In [5]:
def get_completion(prompt, model=model):
    """Send a single-turn user prompt to the chat API and return the reply text.

    Args:
        prompt: Text sent as the only message, with role ``"user"``.
        model: Name of the chat model to query; defaults to the module-level
            ``model`` value at definition time.

    Returns:
        The ``content`` of the first choice in the API response.
    """
    completion = client.chat.completions.create(
        messages=[{"role": "user", "content": prompt}],
        model=model,
        temperature=0,  # zero randomness requested for the model's output
        max_tokens=10000,
    )
    first_choice = completion.choices[0]
    return first_choice.message.content

In [6]:
# Fixed instruction text plus one worked example; get_proposition_completion
# places this ahead of the filled-in INFERENCE_PROMPT to form the full prompt.
# The text is sent to the model verbatim, so its wording is part of the
# experiment and is intentionally left untouched here.
# NOTE(review): the worked example labels its input "Summary:" and its expected
# output "Gold Summary:", whereas INFERENCE_PROMPT labels the *input* as
# "Gold Summary:" — confirm this labelling mismatch is intended.
BASE_PROMPT = """Help me extract the proposition of the given summary according to Toulmin model. Extracted proposition should be original sentences/phrases from the summary. You can ignore non argumentative unit/sentences. Every proposition must refer to a specific object/subject the subject and cannot be pronoun (e.g., it, this, that):

Extract all possible claims and their propositions as a list of JSONs:
[{'claim': <claim>, 'ground': [<list of grounds>], ...}, ...]

Remember to remove all quantification terms at the beginning of the claim/ground. (e.g., "Most comments", "Some comments")

For example:
Community Question: 
Summary: Most commentators say that you should choose girafarig because it's the best pokemon ever created no matter what ANYONE SAYS.
Gold Summary: [{"claim": "You should choose girafarig", "ground": ["girafarig is the best pokemon ever created no matter what ANYONE SAYS"]}]

"""

In [7]:
# Per-row template appended after BASE_PROMPT. The {Question} and
# {Gold_Summary} placeholders are filled with str.format in
# get_proposition_completion (query -> Question, summary -> Gold_Summary).
INFERENCE_PROMPT = """Now perform the task on the following input:
Community Question: {Question}
Gold Summary: {Gold_Summary}
"""

In [8]:
# Rename the `questionText` column to `query`, the column name the inference
# loop reads via row['query'].
df = df.rename(columns={'questionText': 'query'})

### Inference

In [9]:
def get_proposition_completion(query, summary):
    """Ask the model to extract argument propositions from one summary.

    The full prompt is ``BASE_PROMPT`` followed by ``INFERENCE_PROMPT`` with
    its placeholders filled in, and is sent through ``get_completion`` using
    the module-level ``model``.

    Args:
        query: Community question text substituted for ``{Question}``.
        summary: Summary text substituted for ``{Gold_Summary}``.

    Returns:
        Whatever ``get_completion`` returns for the prompt, or ``None`` when
        the request raised an exception.
    """
    prompt = BASE_PROMPT + INFERENCE_PROMPT.format(Question=query, Gold_Summary=summary)
    try:
        return get_completion(prompt, model)
    except Exception as exc:
        # Best-effort: one failed request must not abort the whole run, so the
        # row gets None. Catching Exception rather than using a bare `except`
        # lets KeyboardInterrupt / SystemExit propagate, so the long-running
        # loop can still be interrupted. The failure is printed so it is not
        # silently lost.
        print(f"Proposition extraction failed ({type(exc).__name__}): {exc}")
        return None

In [None]:
# Pause between consecutive API requests, in seconds (presumably to stay
# clear of rate limits — adjust as needed).
REQUEST_DELAY_SECONDS = 0.1

# One entry per row of `df`, in row order, so the list can later be attached
# to the frame as a column.
results = []
for _, row in df.iterrows():
    query = row['query']
    summary = row['summary']

    # Rows with a missing or blank summary are recorded as NaN without calling
    # the API. str() keeps the blank check from crashing on a non-string cell.
    if pd.isnull(summary) or not str(summary).strip():
        results.append(np.nan)
        continue

    results.append(get_proposition_completion(query, summary))
    # Requires `import time` in the notebook's import cell.
    time.sleep(REQUEST_DELAY_SECONDS)

In [None]:
# Attach the per-row model outputs collected in `results` (NaN for rows that
# had no usable summary) as a new `summary_propositions` column.
df['summary_propositions'] = results

In [15]:
# Write one CSV per subset, containing only the rows whose `subset` column
# equals that subset's name; the DataFrame index is not written.
for subset in ['reddit', 'nyt', 'stack']:
    in_subset = df['subset'] == subset
    output_path = f"./data/{subset}/gold_summaries_arg_extracted.csv"
    temp_df = df[in_subset]
    temp_df.to_csv(output_path, index=False)