# Import

In [1]:
import ast
import logging
import re
import time

import numpy as np
import pandas as pd

In [2]:
import spacy
# nlp = spacy.load("en_core_web_sm")
# Load the `en_core_web_lg` spaCy pipeline; the "sm" variant above is kept as a
# commented-out alternative.
# NOTE(review): `nlp` is not referenced by any cell visible in this notebook —
# confirm it is still needed before paying the load cost.
nlp = spacy.load("en_core_web_lg")

# Read Data

In [3]:
# Read the raw input CSV of every subset and stack them into one DataFrame.
dfs = []
for subset in ('reddit', 'nyt', 'stack'):
    temp_df = pd.read_csv(f"../data/{subset}/raw_input.csv")
    dfs.append(temp_df)

df = pd.concat(dfs)

# Argument Proposition Extraction

In [4]:
# Expand list-like `comments` values into one row per element.
# NOTE(review): the sample row displayed further down shows `comments` as a
# single string (a stringified list), and `explode` leaves scalar values
# unchanged — confirm whether the column should be parsed into lists first.
df = df.explode('comments')

In [14]:
import os
from openai import OpenAI

# Take the API key from the environment instead of hardcoding it in the
# notebook, so the secret never ends up in saved or shared copies.
# Export OPENAI_API_KEY before starting the kernel.
client = OpenAI(
    api_key=os.environ.get("OPENAI_API_KEY"),
)

In [15]:
# Chat model passed to the completion calls below; the other candidates are
# left commented out for quick switching.
# model="gpt-4.1-mini"
# model="gpt-4o-mini"
model="gpt-4.1"
# model="gpt-4"
# model="gpt-3.5-turbo"

# Placeholder prompt; it is overwritten with the formatted inference prompt in
# a later cell.
prompt = "Once upon a time"

In [16]:
def get_completion(prompt, model=model):
    """Send `prompt` as a single user message and return the reply text.

    Args:
        prompt: Text placed in the one user message of the chat request.
        model: Model name for the request. The default is the value the
            module-level `model` had when this function was defined.

    Returns:
        The `message.content` of the first choice in the response.
    """
    completion = client.chat.completions.create(
        model=model,
        messages=[{"role": "user", "content": prompt}],
        max_tokens=1500,
        temperature=0,  # 0 asks for the least random output
    )
    first_choice = completion.choices[0]
    return first_choice.message.content

In [17]:
# BEST
# Prompt template asking the model to extract claims and grounds from a
# comment as a list of JSON objects. The single `%s` placeholder is filled with
# the comment text through the `%` operator.
# NOTE(review): the wording contains typos (e.g. "senteces"); it is left
# untouched here because editing the prompt text changes what the model sees.
INFERENCE_PROMPT = """Help me extract the proposition of the given comment according to Toulmin model. Extracted proposition should be original sentences/phrases from the comment. You can ignore non argumentative unit/senteces. Every proposition must refer to a specific object/subject the subject and cannot be pronoun (e.g., it, this, that):

Extract all possible claims and their propositions as a list of JSONs:
[{'claim': <claim>, 'ground': [<list of grounds>], ...}, ...]

For example:
Comment: You should girafarig because it's the best pokemon ever created no matter what ANYONE SAYS.
Proposition: [{"claim": "You should girafarig", "ground": ["girafarig is the best pokemon ever created no matter what ANYONE SAYS"]}]

Now perform on the following output:
"%s"
"""

In [18]:
# Pick one row by position (14) as a manual spot check and display its raw
# `comments` value.
row = df.iloc[14]
row['comments']

'[\'Looks pretty cool. Would have been nice to not have that background music though, its kind of distracting\', "The background music is a demo track made entirely using the synth. It\'s still distracting and it\'s poorly communicated but it\'s not like they\'re playing something completely unrelated over the demo video.", \'would be interested in it if its available for a decent price. would look very nice in my studio\', "Looks sick. Wish I didn\'t have to email info for a price, but the synth itself looks awesome. I\'ve been looking for something like this.", \'Why not just use famitracker or LSDJ??\', \'cuz having a physical interface is infinitely more intuitive and conducive to creativity?\', \'Because LSDJ and actual gameboy with little modification is a lot more convenient.\', \'maybe you underestimate what using a 4-directional gamepad feels like to people without autism\', \'Looks fun as hell.\', \'Looks pretty noice :)\', \'Well this thing shits on my Meeblip.\', \'This has

In [19]:
# Fill the template with the spot-check row's comments. The value is wrapped in
# a 1-tuple so it is always treated as the single `%s` argument, even if it
# happens to be a tuple or mapping itself.
prompt = INFERENCE_PROMPT % (row['comments'],)

In [21]:
# Send the spot-check prompt to the configured model.
response = get_completion(prompt, model)

In [22]:
# Show the raw reply text for manual inspection.
print(response)

```json
[
  {
    "claim": "The background music is a demo track made entirely using the synth",
    "ground": []
  },
  {
    "claim": "The background music is distracting",
    "ground": []
  },
  {
    "claim": "The background music is poorly communicated",
    "ground": []
  },
  {
    "claim": "The synth would look very nice in my studio",
    "ground": []
  },
  {
    "claim": "I would be interested in the synth if it is available for a decent price",
    "ground": []
  },
  {
    "claim": "The synth looks awesome",
    "ground": []
  },
  {
    "claim": "I have been looking for something like this synth",
    "ground": []
  },
  {
    "claim": "Having a physical interface is infinitely more intuitive and conducive to creativity",
    "ground": []
  },
  {
    "claim": "LSDJ and actual Gameboy with little modification is a lot more convenient",
    "ground": []
  },
  {
    "claim": "Using a 4-directional gamepad feels different to people without autism",
    "ground": []
  },
  

## Inference

In [23]:
def get_proposition_completion(comments):
    """Run the proposition-extraction prompt on `comments`.

    Args:
        comments: Comment text substituted into ``INFERENCE_PROMPT``.

    Returns:
        Whatever ``get_completion`` returns for the formatted prompt, or
        ``None`` if the request raised an exception. This is best effort:
        one failed row should not abort a long batch run.
    """
    # Wrap in a 1-tuple so the value is always the single `%s` argument, even
    # if it happens to be a tuple itself.
    prompt = INFERENCE_PROMPT % (comments,)
    try:
        return get_completion(prompt, model)
    except Exception as exc:
        # `Exception` instead of a bare `except`, so KeyboardInterrupt and
        # SystemExit still stop a running batch; the failure is logged rather
        # than silently discarded.
        logging.getLogger(__name__).warning(
            "Proposition extraction failed: %s: %s", type(exc).__name__, exc
        )
        return None

In [None]:
# Run proposition extraction over every row, collecting exactly one entry per
# row so the list can be assigned back as a column afterwards.
results = []
for comments in df['comments']:
    if pd.isnull(comments) or len(comments.strip()) == 0:
        # Nothing to send to the model: keep a NaN placeholder so `results`
        # stays aligned with the rows of `df`.
        results.append(np.nan)
        continue
    results.append(get_proposition_completion(comments))
    time.sleep(0.1)  # short pause between consecutive API requests

In [None]:
# Attach the collected outputs as a new column. `results` is a plain list, so
# it is matched to rows by position and needs one entry per row of `df`.
df['arg_propositions'] = results

In [28]:
# Split the frame by its `subset` column and write each part to that subset's
# data directory, without the index.
for subset in ('reddit', 'nyt', 'stack'):
    is_subset = df['subset'] == subset
    temp_df = df.loc[is_subset]
    temp_df.to_csv(f"../data/{subset}/source_comments_arg_extracted.csv", index=False)