# Import

In [1]:
import pandas as pd
import ast
import numpy as np
import re

In [2]:
import spacy
# nlp = spacy.load("en_core_web_sm")
nlp = spacy.load("en_core_web_lg")

# Read Data

In [3]:
data = {
    'reddit': {
        'source': "../../ConvoSumm-master/data/reddit/vanilla/reddit.test.source.remove_markers_simple_separator",
        'target': "../../ConvoSumm-master/data/reddit/vanilla/reddit.test.target.remove_markers_simple_separator",
        'separator': '</s>'
    },
    'stack': {
        'source': "../../ConvoSumm-master/data/stack/vanilla/stackexchange.test.source.remove_markers_simple_separator.input.nocomment",
        'target': "../../ConvoSumm-master/data/stack/vanilla/stackexchange.test.target.remove_markers_simple_separator.input.nocomment",
        'separator': '</s>'
    },
    'nyt': {
        'source': "../../ConvoSumm-master/data/nyt/vanilla/test.source",
        'target': "../../ConvoSumm-master/data/nyt/vanilla/test.target",
        'separator': '</s>'
    }
}

In [4]:
def read_subset(inputfs, inputft, separator="</s>"):
    data = []
    with open(inputfs, encoding='utf-8') as inputf1, open(inputft, encoding='utf-8') as inputf2:
        for count, (l1, l2) in enumerate(zip(inputf1, inputf2)):
            title = l1.strip().split(separator)[0].strip("</s>").strip()
            texts = l1.strip().split(separator)[1:]
            
            comments = [comm.strip("</s>").strip() for comm in texts]

            summary = l2.strip("</s>").strip()
            cur_data = {"title": title, "comments": comments, "summary": summary, "id": count}
            data += [cur_data]
    
    return pd.DataFrame(data)

In [5]:
# def extract_short_title_subreddit(df, subset):
#     df['short_title'] = df['title'].apply(lambda x: re.findall(r"Title: *(.+?) {2,}[A-Z][a-z]+:", x))
#     mask = df['short_title'].str.len() == 0
#     df.loc[mask, 'short_title'] = df.loc[mask, 'title'].apply(lambda x: [x])
#     df['short_title'] = df['short_title'].apply(lambda x: x[0])

#     df['subreddit'] = df['title'].apply(lambda x: re.findall(r"Subreddit: *([^ ]+)", x))
#     mask = df['subreddit'].str.len() == 0
#     df.loc[mask, 'subreddit'] = df.loc[mask, 'subreddit'].apply(lambda x: [''])
#     df['subreddit'] = df['subreddit'].apply(lambda x: x[0])
    
#     if subset == 'reddit':
#         df['short_question'] = df['title'].apply(lambda x: re.findall(r"Subreddit: *(?:[^ ]+) *(.+)$", x))
#         mask = df['short_question'].str.len() == 0
#         df.loc[mask, 'short_question'] = df.loc[mask, 'short_question'].apply(lambda x: [''])
#     else:
#         df['short_question'] = df['title'].apply(lambda x: re.findall(r"Score: *(?:[0-9]+) *(.+)$", x))
#         mask = df['short_question'].str.len() == 0
#         df.loc[mask, 'short_question'] = df.loc[mask, 'short_question'].apply(lambda x: [''])
#     return df

In [6]:
def extract_short_title_subreddit(df, subset):
    df['short_title'] = df['title'].apply(lambda x: re.findall(r"Title: *(.+?) {2,}[A-Z][a-z]+:", x))
    mask = df['short_title'].str.len() == 0
    df.loc[mask, 'short_title'] = df.loc[mask, 'title'].apply(lambda x: [x])
    df['short_title'] = df['short_title'].apply(lambda x: x[0])
    
    if subset == 'reddit':
        df['subreddit'] = df['title'].apply(lambda x: re.findall(r"Subreddit: *([^ ]+)", x))
        mask = df['subreddit'].str.len() == 0
        df.loc[mask, 'subreddit'] = df.loc[mask, 'subreddit'].apply(lambda x: [''])
#         df['subreddit'] = df['subreddit'].apply(lambda x: x[0])
    elif subset == 'stack':
        df['subreddit'] = df['title'].apply(lambda x: re.findall(r"Question tags: *([^ ]+)", x))
        mask = df['subreddit'].str.len() == 0
        df.loc[mask, 'subreddit'] = df.loc[mask, 'subreddit'].apply(lambda x: [''])
        df['subreddit'] = df['subreddit'].apply(lambda x: x[0])
        df['subreddit'] = df['subreddit'].apply(lambda x: re.findall(r"<.+?>", x))
        df['subreddit'] = df['subreddit'].apply(lambda x: [tag.strip("<").strip(">") for tag in x])
    else:
        df['subreddit'] = df['title'].apply(lambda x: re.findall(r"Keywords: *(.+) +Snippet", x)[0].strip())
        df['subreddit'] = df['subreddit'].apply(lambda topic: topic.strip())
        # mask = df['subreddit'].str.len() == 0
        # df.loc[mask, 'subreddit'] = df.loc[mask, 'subreddit'].apply(lambda x: [''])
        df['subreddit'] = df['subreddit'].apply(lambda x: [topic.strip() for topic in x.split(";")])
        
        
    if subset == 'reddit':
        df['short_question'] = df['title'].apply(lambda x: re.findall(r"Subreddit: *(?:[^ ]+) *(.+)$", x))
        mask = df['short_question'].str.len() == 0
        df.loc[mask, 'short_question'] = df.loc[mask, 'short_question'].apply(lambda x: [''])
    elif subset == 'stack':
        df['short_question'] = df['title'].apply(lambda x: re.findall(r"Score: *(?:[0-9]+) *(.+)$", x))
        mask = df['short_question'].str.len() == 0
        df.loc[mask, 'short_question'] = df.loc[mask, 'short_question'].apply(lambda x: [''])
    else:
        df['short_question'] = df['title'].apply(lambda x: re.findall(r"Headline: *(.+) +Keywords", x))
        mask = df['short_question'].str.len() == 0
        df.loc[mask, 'short_question'] = df.loc[mask, 'short_question'].apply(lambda x: [''])
        
    return df

In [7]:
dfs = {}
for subset, info in data.items():
    subset_df = read_subset(info['source'], info['target'])
    subset_df['subset'] = subset
    dfs[subset] = subset_df

In [8]:
# ground_truth_fix_data = {
#     'reddit': "../../ConvoSumm-master/data/reddit/arg-filtered/{subsubset}.jsonl",
#     'stack': "../../ConvoSumm-master/data/stack/arg-filtered/{subsubset}.jsonl"
# }

# ground_truth_fix_dfs = {}
# for subset, subset_df in dfs.items():
#     path = ground_truth_fix_data[subset]
# #     ground_truth_fix = pd.read_json(path, lines=True)
#     ground_truth_fix = pd.concat([
#         pd.read_json(path.format(subsubset = 'test'), lines=True),
#     ])
    
#     ground_truth_fix['subset'] = subset
#     ground_truth_fix = ground_truth_fix.rename(columns={'info': 'title'})
#     ground_truth_fix_dfs[subset] = ground_truth_fix    
# #     subset_df = subset_df.drop(columns=['summary']).merge(ground_truth_fix[['id', 'subset', 'summary']], on=['id', 'subset'])
# #     dfs[subset] = subset_df

In [9]:
# dfs['reddit'] = extract_short_title_subreddit(dfs['reddit'])
# dfs['stack'] = extract_short_title_subreddit(dfs['stack'])
for subset, subset_df in dfs.items():
    subset_df = extract_short_title_subreddit(subset_df, subset)
    dfs[subset] = subset_df

In [10]:
# # ground_truth_fix_dfs['reddit'] = extract_short_title_subreddit(ground_truth_fix_dfs['reddit'])
# # ground_truth_fix_dfs['stack'] = extract_short_title_subreddit(ground_truth_fix_dfs['stack'])
# for subset, subset_df in ground_truth_fix_dfs.items():
#     subset_df = extract_short_title_subreddit(subset_df, subset)
#     ground_truth_fix_dfs[subset] = subset_df

In [11]:
# ground_truth_fix_dfs['stack']['subreddit'].iloc[3]

In [12]:
# all(dfs['stack']['summary'] == ground_truth_fix_dfs['stack']['summary'])

In [13]:
# for subset, subset_df in dfs.items():
# #     subset_df = subset_df.drop(columns=['summary']).merge(
# #         ground_truth_fix_dfs[subset][['short_title', 'subreddit', 'subset', 'summary']],
# #         how='left'
# #     )
#     subset_df = subset_df.drop(columns=['summary', 'subreddit']).merge(
#         ground_truth_fix_dfs[subset][['short_title', 'subreddit', 'subset', 'summary']],
#         how='left'
#     )
#     dfs[subset] = subset_df

In [14]:
# # df = ground_truth_fix_dfs['reddit']
# df = my_df
# mask = df['title'].str.contains("Why do people")
# mask = df['id'] == 45
# df[mask]['summary'].iloc[0]

In [15]:
# # df = ground_truth_fix_dfs['reddit']
# df = dfs['reddit']
# # mask = df['title'].str.contains("Why do people")
# df[mask]['summary'].iloc[0]

In [16]:
# # df = ground_truth_fix_dfs['reddit']
# df = dfs['reddit']
# mask = df['title'].str.contains("Why do people")
# df[mask]['summary'].iloc[0]

In [17]:
# mask = ground_truth_fix['title'].str.contains("Can't log in to FFXIV")
# ground_truth_fix[mask].iloc[0]['title']

In [18]:
dfs['nyt']['comments'] = dfs['nyt']['comments'].apply(lambda x: [re.sub("^Score: [0-9\.]+ *", "", comm) for comm in x])
dfs['nyt']['title'] = dfs['nyt']['title'].apply(lambda x: re.findall("Snippet: *(.+)$", x))
mask = dfs['nyt']['title'].str.len() != 1
print(dfs['nyt'][mask].shape)
dfs['nyt']['title'] = dfs['nyt']['title'].apply(lambda x: x[0])
dfs['nyt'] = dfs['nyt'].rename(columns={'title': 'questionText'})

(0, 8)


In [19]:
dfs['reddit']['title'] = dfs['reddit']['title'].apply(lambda x: re.sub("Subreddit: *[^ ]+ *", "", x))
dfs['reddit']['title'] = dfs['reddit']['title'].apply(lambda x: x.strip("Title:").strip())
dfs['reddit'] = dfs['reddit'].rename(columns={'title': 'questionText'})

In [20]:
dfs['stack']['title'] = dfs['stack']['title'].apply(lambda x: re.sub("Question tags: .+$", "", x))
dfs['stack']['title'] = dfs['stack']['title'].apply(lambda x: x.strip("Title:").strip())
dfs['stack'] = dfs['stack'].rename(columns={'title': 'questionText'})

In [21]:
dfs['stack']['comments'] = dfs['stack']['comments'].apply(lambda x: [re.sub("Answer ID:[0-9 ]+Score:[0-9 ]+", "", c) for c in x])

In [22]:
df = pd.concat(dfs.values())

In [23]:
mask = df['id'] == 45
df[mask]['summary'].iloc[0]

'Most commentators are confused how the humble bundle was claimed before they could even attempt to do it. Two commentators believe that bots picked it up before they could. One commentator posts a strange tribute message in all caps, thanking the OP and a list of other people. One commentator simply says thanks.'

In [25]:
for subset in ['reddit', 'nyt', 'stack']:
    temp_df = df[df['subset'] == subset]
    temp_df.to_csv(f"./data/{subset}/raw_input.csv", index=False)

In [25]:
df.to_pickle("./convosumm_reddit_stack_nyt_test_processed.pkl")

# Gold Summary

## Claim Extraction

In [26]:
import os
from openai import AzureOpenAI

endpoint = "https://s3695-ma1sfp0e-eastus2.cognitiveservices.azure.com/"
model_name = "gpt-4.1"
deployment = "gpt-4.1"

subscription_key = "9OzzB1oQ6CBTBiIqkjQSURWXX52LccWTu0scsgfmEcXUVG5CQRV7JQQJ99BDACHYHv6XJ3w3AAAAACOGABC8"
api_version = "2024-12-01-preview"

client = AzureOpenAI(
    api_version=api_version,
    azure_endpoint=endpoint,
    api_key=subscription_key,
)

In [26]:
model="gpt-4.1-mini"
# model="gpt-4o-mini"
# model="gpt-4.1"
# model="gpt-4"
# model="gpt-3.5-turbo"

prompt = "Once upon a time"

In [27]:
def get_completion(prompt, model=model):
    messages = [{"role": "user", "content": prompt}]
    response = client.chat.completions.create(
        model=model,
        messages=messages,
        max_tokens=10000,
        temperature=0, # this is the degree of randomness of the model's output
    )
    return response.choices[0].message.content

In [28]:
# BASE_PROMPT = """In this task you are presented with a community question, a claim A and a list of other claims extracted from the social comments answering the question.
# You will be asked to select best-matching claim from the list to claim A. The selection must be from the input list of other claims. Do not generate new best-matching claims.
# Two claims are best-matched if they explicitly express similar opinions and sentiment on a similar aspect.

# Use and output the following format:
# Community Question: <the input question>
# Claim A: <the input claim>
# List of Other Claims: <the input list of other claims>
# Best-Matching Claim: <The claim (from the list) that best-matched to the input claim>

# """

In [29]:
# # BEST (OK) BEST
# base_prompt = """You will be provided with a long-form quantitative summary produced by a quantitative query-focused summarization system
# A long-form quantitative summary captures all opinions from user reviews in answering a question as key points, with prevalence measuring the number of comments supporting each key point. The summary follows the below template:
# 'While answering about [Question]:
# + [Prevalence] comments believe that [Key Point]
# + [Prevalence] comments think that [Key Point]
# ...'

# You were tasked to identify and extract all key points, together with their prevalence, from the long-form quantitative summary.
# Note that you only have to identify and extract one main key point for each bullet point in the provided summary.
# Please provide the extraction in a JSON list with the following format:
# [{'key_point': [Key Point], 'prevalence': 'Prevalence'}, ...]

# """

In [30]:
# # BEST (OK) BEST
# BASE_PROMPT = """In this task you are presented with a community question and a gold summary written annotator to answer the question.
# The gold summary captures all opinions from user comments in answering a question.
# You were tasked to identify and extract all single opinions written in the gold summary as key points. An key point must have a single and specific sentiment.
# Remember to remove vague quantification (e.g., Most comment(er)s, Some comment(er)s, One comment(er)) from the key point.

# Use and output the following format:
# Community Question: <the input question>
# Gold Summary: <the input gold summary>
# Key Points: <Python list of key points extracted from the gold summary>

# """

In [31]:
# # BEST (OK) BEST
# BASE_PROMPT = """In this task you are presented with a community question and a gold summary written annotator to answer the question.
# The gold summary captures all opinions from user comments in answering a question.
# You were tasked to identify and extract all single opinions written in the gold summary as key points. An key point must have a single and specific sentiment.
# Remember to remove vague quantification (e.g., Most comments, Some comments, One comment) from the key point.

# Use and output the following format:
# Community Question: <the input question>
# Gold Summary: <the input gold summary>
# Key Points: <Python list of key points extracted from the gold summary>

# Below is an example:
# Community Question: Can't log in to FFXIV   Whenever I try logging into the game it gives me an error 50002 then sends me to the SE account page and when I try to add the option for a character it only gives me an option to pay with ClickandBuy but the thing is that account has been blocked. Is there any other way to play this game?      Edit: Got my account unblocked, is the Reddit LS active?
# Gold Summary: Commenters give some tips in trying to help someone struggling to log into FFXIV. One commenter suggests buying Crysta although that will need CandB or UltimatePay. Others are unsure of what else to do. The only remaining suggestion one said to try is to contact SE support and hope they reach back quickly.
# Key Points: ["Suggests buying Crysta", "Buying Crysta will need CandB or UltimatePay", "Unsure of what else to do", "Contact SE support and hope they respond quickly"]

# """

In [32]:
# BEST (OK) BEST
BASE_PROMPT = """In this task you are presented with a community question and a gold summary written annotator to answer the question.
The gold summary captures all opinions from user comments in answering a question.
You were tasked to identify and extract all single opinions written in the gold summary as key points. An key point must have a single and specific sentiment.

Use and output the following format:
Community Question: <the input question>
Gold Summary: <the input gold summary>
Key Points: <Python list of key points extracted from the gold summary>

Remember to remove quantification terms like "Most comments", "Some comments" from the key point.

Below is an example:
Community Question: Can't log in to FFXIV   Whenever I try logging into the game it gives me an error 50002 then sends me to the SE account page and when I try to add the option for a character it only gives me an option to pay with ClickandBuy but the thing is that account has been blocked. Is there any other way to play this game?      Edit: Got my account unblocked, is the Reddit LS active?
Gold Summary: Commenters give some tips in trying to help someone struggling to log into FFXIV. One commenter suggests buying Crysta although that will need CandB or UltimatePay. Others are unsure of what else to do. The only remaining suggestion one said to try is to contact SE support and hope they reach back quickly.
Key Points: ["Suggests buying Crysta", "Buying Crysta will need CandB or UltimatePay", "Unsure of what else to do", "Contact SE support and hope they respond quickly"]

"""

In [33]:
# BEST (OK) BEST
BASE_PROMPT = """In this task you are presented with a community question and a gold summary written annotator to answer the question.
The gold summary captures all opinions from user comments in answering a question.
You were tasked to identify and extract all single opinions written in the gold summary as key points. An key point must have a single and specific sentiment.

Use and output the following format:
Community Question: <the input question>
Gold Summary: <the input gold summary>
Key Points: <Python list of key points extracted from the gold summary>

Remember to remove quantification terms like "Most comments", "Some comments" from the key point.

Below is an example:
Community Question: Can't log in to FFXIV   Whenever I try logging into the game it gives me an error 50002 then sends me to the SE account page and when I try to add the option for a character it only gives me an option to pay with ClickandBuy but the thing is that account has been blocked. Is there any other way to play this game?      Edit: Got my account unblocked, is the Reddit LS active?
Gold Summary: Commenters give some tips in trying to help someone struggling to log into FFXIV. One commenter suggests buying Crysta although that will need CandB or UltimatePay. Others are unsure of what else to do. The only remaining suggestion one said to try is to contact SE support and hope they reach back quickly.
Key Points: ["Suggests buying Crysta", "Buying Crysta will need CandB or UltimatePay", "Unsure of what else to do", "Contact SE support and hope they respond quickly"]

"""

In [34]:
# BASE_PROMPT = """In this task you are presented with a community question and a gold summary written annotator to answer the question.
# The gold summary captures all viewpoints from user comments in answering a question.
# You were tasked to identify and extract all viewpoints written in the gold summary. The viewpoints must be exactly from the summary. Do not generate new viewpoints.

# Use and output the following format:
# Community Question: <the input question>
# Gold Summary: <the input gold summary>
# Viewpoints: <Python list of viewpoints extracted from the gold summary>

# """

In [35]:
BASE_PROMPT = """Help me extract the proposition of the given summary according to Toulmin model. Extracted proposition should be original sentences/phrases from the summary. You can ignore non argumentative unit/sentences. Every proposition must refer to a specific object/subject the subject and cannot be pronoun (e.g., it, this, that):

Extract all possible claims and their propositions as a list of JSONs:
[{'claim': <claim>, 'ground': [<list of grounds>], ...}, ...]

Remember to remove all quantification terms at the beginning of the claim/ground. (e.g., "Most comments", "Some comments")

For example:
Community Question: 
Summary: Most commentators say that you should choose girafarig because it's the best pokemon ever created no matter what ANYONE SAYS.
Gold Summary: [{"claim": "You should choose girafarig", "ground": ["girafarig is the best pokemon ever created no matter what ANYONE SAYS"]}]

"""

In [36]:
print(BASE_PROMPT)

Help me extract the proposition of the given summary according to Toulmin model. Extracted proposition should be original sentences/phrases from the summary. You can ignore non argumentative unit/sentences. Every proposition must refer to a specific object/subject the subject and cannot be pronoun (e.g., it, this, that):

Extract all possible claims and their propositions as a list of JSONs:
[{'claim': <claim>, 'ground': [<list of grounds>], ...}, ...]

Remember to remove all quantification terms at the beginning of the claim/ground. (e.g., "Most comments", "Some comments")

For example:
Community Question: 
Summary: Most commentators say that you should choose girafarig because it's the best pokemon ever created no matter what ANYONE SAYS.
Gold Summary: [{"claim": "You should choose girafarig", "ground": ["girafarig is the best pokemon ever created no matter what ANYONE SAYS"]}]




In [37]:
INFERENCE_PROMPT = """Now perform the task on the following input:
Community Question: {Question}
Gold Summary: {Gold_Summary}
"""

In [38]:
df = df.rename(columns={'questionText': 'query'})

In [39]:
# row = df.iloc[-1]
# row = df.iloc[0]
# row = df.iloc[1]
row = df.iloc[128]
# row = df.iloc[3]
# row = df.iloc[5]
# row = df.iloc[8]
# row = df.iloc[11]
# row = df.iloc[12]
# row = df[df['id'] == 45].iloc[0]
# row = df.iloc[-1]

print(row['id'])
print(row['query'])
print(row['summary'])
prompt = BASE_PROMPT + INFERENCE_PROMPT.format(Question = row['query'], Gold_Summary = row['summary'])

128
Mentally Handicapped and Religion   I just wanted to share something I've noticed at my new job and was wondering if anyone else noticed this/had thoughts about this.      There are a few retarded people that work the mail room at my job.  Every once in a while I'll see them alone in a corner slowly reading the bible out loud to themselves.  I can bet most people passing that scene think it's a testament to god's glory.  But, there's something about it that really rubs me the wrong way.  Like someone who has been "given" a poorly functioning brain is supposed to read about the gnarly stuff in the bible and totally believe it? That's terrible.      Or is it actually a good thing to convince mentally retarded in the existence of an afterlife to feel a sense of vindication at the bad hand they've been dealt in their lives?
Most commenters seem to think that it is bad for people with mental problems to be taught about Christianity. Some commenters seem to think that there is not a prob

In [40]:
response = get_completion(prompt, model)

In [41]:
print(response)

None


### Inference

In [42]:
def get_proposition_completion(query, summary):
    prompt = BASE_PROMPT + INFERENCE_PROMPT.format(Question = query, Gold_Summary = summary)
    try:
        return get_completion(prompt, model)
    except:
        return None

In [43]:
def prompted_proposition_extraction(root_path, domain, domain_df, save_step=50):
    src_path = f"{root_path}/{domain}"
    Path(src_path).mkdir(parents=True, exist_ok=True)
    arg_propositions = []

    file_names = listdir(src_path)
    postfix = [re.split("[_.]", name)[1]
               for name in listdir(src_path)
               ]
    start = 0
    if '1_done.pkl' in postfix:
        print(domain, ": ", "Loaded saved file. Done")
        new_domain_df = pd.read_pickle(f"{src_path}/{domain}_done.pkl")
        return new_domain_df
    elif len(postfix) > 0:
        last_index = max([int(idx) for idx in postfix if idx != 'done'])
        last_domain_df = pd.read_pickle(f"{src_path}/{domain}_{last_index}.pkl")
        arg_propositions = last_domain_df['summary_propositions'].tolist()
        start = last_index
        print(domain, "Loaded saved file. Continuing")
    else:
        print(domain, "Start new process.")

    for i, (_, row) in tqdm(enumerate(domain_df.iterrows()), total=domain_df.shape[0]):
        if i < start:
            continue
         
        query = row['query']
        summary = row['summary']

        if pd.isnull(summary) or len(summary.strip()) == 0:
            arg_propositions += [np.nan]
        else:
            arg_proposition = get_proposition_completion(query, summary)
            arg_propositions += [arg_proposition]
            time.sleep(0.1)
        
        if (i + 1) % save_step == 0:
            save_df = domain_df.iloc[:i + 1]
            save_df.insert(0, 'summary_propositions', arg_propositions)
            save_df.to_pickle(f"{src_path}/{domain}_{i + 1}.pkl")

    new_domain_df = domain_df.iloc[:i + 1]
    new_domain_df.insert(0, 'summary_propositions', arg_propositions)
    new_domain_df.to_pickle(f"{src_path}/{domain}_done.pkl")
    return new_domain_df

In [44]:
df['my_category'] = 1

In [45]:
root_path = f"/mnt/e/Desktop/PHD READING/Quantitative_Fact_Check/summary_proposition_extraction/gpt_4.1_mini/few_shot_test_set"

In [46]:
inputs = [(root_path,
           domain,
           df[df['my_category'] == domain].reset_index(drop=True)
           )
          for domain in df['my_category'].unique()]

In [47]:
num_workers = 1

In [48]:
from datasets import concatenate_datasets, load_dataset
from datasets import Dataset, DatasetDict
import pandas as pd
import numpy as np
import torch
import os
import ast
import time
from multiprocessing import Pool
from pathlib import Path
from os import listdir
from tqdm import tqdm
import random
import re
import math
# import spacy
# pd.set_option('display.max_colwidth', None)

In [93]:
start_time = time.time()
with Pool(num_workers) as processor:
    data = processor.starmap(prompted_proposition_extraction, inputs)
print("TIME ELAPSED", time.time() - start_time)

1 Start new process.


100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 500/500 [13:34<00:00,  1.63s/it]


TIME ELAPSED 814.8264832496643


### Fix

In [129]:
from openai import OpenAI
import os
import pandas as pd

client = OpenAI(
    api_key = "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" # RMIT Account
)

In [130]:
processed_df = pd.read_pickle(root_path + "/1/1_done.pkl")
mask = pd.isnull(processed_df['summary_propositions'])
display(processed_df[mask])
processed_df.loc[mask, 'summary_propositions'] = '[]'
display(processed_df[mask])

Unnamed: 0,summary_propositions,query,comments,summary,id,subset,short_title,subreddit,short_question,my_category
1,,"Historical Hypocrisy (First, a disclaimer: T...",[[I can't repost this enough.](http://www.redd...,Several commentators note that women in the pa...,1,reddit,Historical Hypocrisy,[fatlogic],"[(First, a disclaimer: This is the first post ...",1
128,,Mentally Handicapped and Religion I just wan...,[It's foolishness. Everyone would like a nice...,Most commenters seem to think that it is bad f...,128,reddit,Mentally Handicapped and Religion,[atheism],[I just wanted to share something I've noticed...,1
396,,Tanks in GTA V and Online,[Are the tanks in GTA V and Online susceptible...,"Several users say that to kill the driver, the...",146,stack,Title: Tanks in GTA V and Online Question ta...,"[grand-theft-auto-5, grand-theft-auto-online]",[],1


Unnamed: 0,summary_propositions,query,comments,summary,id,subset,short_title,subreddit,short_question,my_category
1,[],"Historical Hypocrisy (First, a disclaimer: T...",[[I can't repost this enough.](http://www.redd...,Several commentators note that women in the pa...,1,reddit,Historical Hypocrisy,[fatlogic],"[(First, a disclaimer: This is the first post ...",1
128,[],Mentally Handicapped and Religion I just wan...,[It's foolishness. Everyone would like a nice...,Most commenters seem to think that it is bad f...,128,reddit,Mentally Handicapped and Religion,[atheism],[I just wanted to share something I've noticed...,1
396,[],Tanks in GTA V and Online,[Are the tanks in GTA V and Online susceptible...,"Several users say that to kill the driver, the...",146,stack,Title: Tanks in GTA V and Online Question ta...,"[grand-theft-auto-5, grand-theft-auto-online]",[],1


In [131]:
processed_df['summary_propositions'] = processed_df['summary_propositions'].apply(lambda x: ast.literal_eval(x))

In [132]:
mask = processed_df['summary_propositions'].str.len() == 0
processed_df[mask]

Unnamed: 0,summary_propositions,query,comments,summary,id,subset,short_title,subreddit,short_question,my_category
1,[],"Historical Hypocrisy (First, a disclaimer: T...",[[I can't repost this enough.](http://www.redd...,Several commentators note that women in the pa...,1,reddit,Historical Hypocrisy,[fatlogic],"[(First, a disclaimer: This is the first post ...",1
128,[],Mentally Handicapped and Religion I just wan...,[It's foolishness. Everyone would like a nice...,Most commenters seem to think that it is bad f...,128,reddit,Mentally Handicapped and Religion,[atheism],[I just wanted to share something I've noticed...,1
165,[],Can we just delete Leroy? He has been in alm...,"[Maybe decks just need to run more taunts, 6 d...",One commentator sees the original poster’s pos...,165,reddit,Can we just delete Leroy?,[hearthstone],[He has been in almost every game I have playe...,1
166,[],Help with selling would be greatly appreciated...,"[I'll give you $10., Here's how you determine ...",Most commenters seem to think that the origina...,166,reddit,Help with selling would be greatly appreciated.,[heroscape],[So I recently discovered that others are actu...,1
169,[],"[WSIG] I don't normally play, but my friend ha...",[So basically you've been dabbling in cheap al...,All but one commenter shares where the poster ...,169,reddit,"[WSIG] I don't normally play, but my friend ha...",[boardgames],"[Long story short, I'm not a huge board game p...",1
173,[],I wanted to know... Why is the r9 390 better...,"[at 1080p, they're identical. The 390 performs...",The users mostly call out different names; TOB...,173,reddit,I wanted to know...,[buildapc],[Why is the r9 390 better than the gtx 970 bes...,1
181,[],RBA SS wicks all taste metallic I've gotten ...,"[if you get metal taste, I think it's because...",Most commenters share their favorite Pearl Jam...,181,reddit,RBA SS wicks all taste metallic,[electronic_cigarette],[I've gotten a few rbas now and built some SS ...,1
189,[],Doesn't Tahm Kench needs some serious number n...,[Tahm kench is really underwhelming amd probab...,All commenters give suggestions on games you c...,189,reddit,Doesn't Tahm Kench needs some serious number n...,[leagueoflegends],[Yeah I know we have to learn to play around i...,1
191,[],How secure is software like DD-WRT and Tomato ...,[I'd definitely go with open-source firmware f...,"The commenters mostly agree that the comet, he...",191,reddit,How secure is software like DD-WRT and Tomato ...,[security],[I have an Asus router with a closed source fi...,1
197,[],What are some easy games to teach during the G...,"[Resistance, 7 Wonders, Coup, Love Letter and ...",All commenters talk about things you can go se...,197,reddit,What are some easy games to teach during the G...,[boardgames],[Since my country's sub is likely to meet up a...,1


In [133]:
fix_df = processed_df[mask]

In [134]:
fix_df['my_category'] = 1

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  fix_df['my_category'] = 1


In [135]:
fix_df = fix_df.drop(columns=['summary_propositions'])

In [136]:
root_path = f"/mnt/e/Desktop/PHD READING/Quantitative_Fact_Check/summary_proposition_extraction/gpt_4.1_mini/few_shot_test_set_fix"

In [163]:
inputs = [(root_path,
           domain,
           fix_df[fix_df['my_category'] == domain]
           )
          for domain in fix_df['my_category'].unique()]

In [164]:
num_workers = 1

In [165]:
from datasets import concatenate_datasets, load_dataset
from datasets import Dataset, DatasetDict
import pandas as pd
import numpy as np
import torch
import os
import ast
import time
from multiprocessing import Pool
from pathlib import Path
from os import listdir
from tqdm import tqdm
import random
import re
import math
# import spacy
# pd.set_option('display.max_colwidth', None)

In [166]:
start_time = time.time()
with Pool(num_workers) as processor:
    data = processor.starmap(prompted_proposition_extraction, inputs)
print("TIME ELAPSED", time.time() - start_time)

1 Start new process.


100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 18/18 [00:19<00:00,  1.09s/it]


TIME ELAPSED 19.720810651779175


In [172]:
processed_fixed_df = pd.read_pickle(root_path + "/1/1_done.pkl")
target_index = processed_fixed_df.index
processed_df.loc[target_index, 'summary_propositions'] = processed_fixed_df['summary_propositions']
processed_df.loc[target_index, 'summary_propositions']

1      [  \n  {  \n    "claim": "women in the past th...
128    [\n  {\n    "claim": "it is bad for people wit...
165                                                   []
166                                                   []
169                                                   []
173                                                  [ ]
181                                                   []
189                                                   []
191                                                   []
197                                                   []
203                                                   []
207                                                   []
208    [{"claim": "commenters have camera lenses that...
211                                                   []
218                                                   []
229                                                   []
238                                                   []
396    [{\n  "claim": "to kill 

In [180]:
root_path = f"/mnt/e/Desktop/PHD READING/Quantitative_Fact_Check/summary_proposition_extraction/gpt_4.1_mini/few_shot_test_set"

In [181]:
processed_df.to_pickle(root_path + "/1/1_done.pkl")

# Argument Proposition Extraction

In [26]:
df = df.explode(['comments'])

In [27]:
import os
from openai import AzureOpenAI

endpoint = "https://s3695-ma1sfp0e-eastus2.cognitiveservices.azure.com/"
model_name = "gpt-4.1"
deployment = "gpt-4.1"

subscription_key = "9OzzB1oQ6CBTBiIqkjQSURWXX52LccWTu0scsgfmEcXUVG5CQRV7JQQJ99BDACHYHv6XJ3w3AAAAACOGABC8"
api_version = "2024-12-01-preview"

client = AzureOpenAI(
    api_version=api_version,
    azure_endpoint=endpoint,
    api_key=subscription_key,
)

In [28]:
# from openai import OpenAI
# import os
# import pandas as pd

# client = OpenAI(
#     api_key = "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" # RMIT Account
# )

In [29]:
# model="gpt-4.1-mini"
# model="gpt-4o-mini"
model="gpt-4.1"
# model="gpt-4"
# model="gpt-3.5-turbo"

prompt = "Once upon a time"

In [30]:
def get_completion(prompt, model=model):
    messages = [{"role": "user", "content": prompt}]
    response = client.chat.completions.create(
        model=model,
        messages=messages,
        max_tokens=1500,
        temperature=0, # this is the degree of randomness of the model's output
    )
    return response.choices[0].message.content

In [31]:
# INFERENCE_PROMPT = """Help me extract the proposition of the following comment:
# "%s"

# According to Toulmin model,
# """

In [32]:
# INFERENCE_PROMPT = """Help me extract the proposition of the following comment:
# "%s"

# Extract all possible claims and their propositions as a list of JSONs:
# [{'claim': <claim>, 'ground': [<list of grounds>], ...}, ...]

# According to Toulmin model,
# """

In [33]:
# INFERENCE_PROMPT = """Help me extract the proposition of the following comment. Extracted proposition must be original sentences/phrases from the comment:
# "%s"

# According to Toulmin model,
# """

In [34]:
# # OK
# INFERENCE_PROMPT = """Help me extract the proposition of the following comment. Extracted proposition should be original sentences/phrases from the comment. Add the subject to the proposition in case of pronoun:
# "%s"

# Extract all possible claims and their propositions as a list of JSONs:
# [{'claim': <claim>, 'ground': [<list of grounds>], ...}, ...]

# According to Toulmin model,
# """

In [35]:
# # OK
# INFERENCE_PROMPT = """Help me extract the proposition of the following comment. Extracted proposition should be original sentences/phrases from the comment. Every proposition must refer to a specific object/subject the subject and cannot be pronoun (e.g., it, this, that):
# "%s"

# Extract all possible claims and their propositions as a list of JSONs:
# [{'claim': <claim>, 'ground': [<list of grounds>], ...}, ...]

# According to Toulmin model,
# """

In [36]:
# # OK
# INFERENCE_PROMPT = """Help me extract the proposition of the following comment. Extracted proposition should be original sentences/phrases from the comment. Every proposition must refer to a specific object/subject the subject and cannot be pronoun (e.g., it, this, that):
# "%s"

# Extract all possible claims and their propositions as a list of JSONs:
# [{'claim': <claim>, 'ground': [<list of grounds>], ...}, ...]

# According to Toulmin model,
# """

In [37]:
# # OK
# INFERENCE_PROMPT = """Help me extract the proposition of the following comment. Extracted proposition should be original sentences/phrases from the comment. Every proposition must refer to a specific object/subject the subject and cannot be pronoun (e.g., it, this, that):
# "%s"

# Extract all possible claims and their propositions as a list of JSONs:
# [{'claim': <claim>, 'ground': [<list of grounds>], ...}, ...]

# According to Toulmin model,
# """

In [38]:
# BEST
INFERENCE_PROMPT = """Help me extract the proposition of the given comment according to Toulmin model. Extracted proposition should be original sentences/phrases from the comment. You can ignore non argumentative unit/senteces. Every proposition must refer to a specific object/subject the subject and cannot be pronoun (e.g., it, this, that):

Extract all possible claims and their propositions as a list of JSONs:
[{'claim': <claim>, 'ground': [<list of grounds>], ...}, ...]

For example:
Comment: You should girafarig because it's the best pokemon ever created no matter what ANYONE SAYS.
Proposition: [{"claim": "You should girafarig", "ground": ["girafarig is the best pokemon ever created no matter what ANYONE SAYS"]}]

Now perform on the following output:
"%s"
"""

In [39]:
# row = df.iloc[0]
# row = df.iloc[1]
# row = df.iloc[3]
# row = df.iloc[13]  # GOOD
row = df.iloc[14]
# row = df.iloc[16]
# row = df.iloc[18]
# row = df.iloc[19]
row['comments']

'While I admire your steadfast courage in how awesome Girafig is, I may wait on that one. :P BUT. Maybe. :)'

In [40]:
prompt = INFERENCE_PROMPT % (row['comments'])

In [41]:
print(prompt)

Help me extract the proposition of the given comment according to Toulmin model. Extracted proposition should be original sentences/phrases from the comment. You can ignore non argumentative unit/senteces. Every proposition must refer to a specific object/subject the subject and cannot be pronoun (e.g., it, this, that):

Extract all possible claims and their propositions as a list of JSONs:
[{'claim': <claim>, 'ground': [<list of grounds>], ...}, ...]

For example:
Comment: You should girafarig because it's the best pokemon ever created no matter what ANYONE SAYS.
Proposition: [{"claim": "You should girafarig", "ground": ["girafarig is the best pokemon ever created no matter what ANYONE SAYS"]}]

Now perform on the following output:
"While I admire your steadfast courage in how awesome Girafig is, I may wait on that one. :P BUT. Maybe. :)"



In [42]:
# response = get_completion(prompt, model)

In [43]:
# print(response)

## Inference

In [44]:
def get_proposition_completion(comments):
    prompt = INFERENCE_PROMPT % (comments)
    try:
        return get_completion(prompt, model)
    except:
        return None

In [45]:
def prompted_proposition_extraction(root_path, domain, domain_df, save_step=50):
    src_path = f"{root_path}/{domain}"
    Path(src_path).mkdir(parents=True, exist_ok=True)
    arg_propositions = []

    file_names = listdir(src_path)
    postfix = [re.split("[_.]", name)[1]
               for name in listdir(src_path)
               ]
    start = 0
    if '1_done.pkl' in postfix:
        print(domain, ": ", "Loaded saved file. Done")
        new_domain_df = pd.read_pickle(f"{src_path}/{domain}_done.pkl")
        return new_domain_df
    elif len(postfix) > 0:
        last_index = max([int(idx) for idx in postfix if idx != 'done'])
        last_domain_df = pd.read_pickle(f"{src_path}/{domain}_{last_index}.pkl")
        arg_propositions = last_domain_df['arg_propositions'].tolist()
        start = last_index
        print(domain, "Loaded saved file. Continuing")
    else:
        print(domain, "Start new process.")

    for i, (_, row) in tqdm(enumerate(domain_df.iterrows()), total=domain_df.shape[0]):
        if i < start:
            continue

        comments = row['comments']

        if pd.isnull(comments) or len(comments.strip()) == 0:
            arg_propositions += [np.nan]
        else:
            arg_proposition = get_proposition_completion(comments)
            arg_propositions += [arg_proposition]
            time.sleep(0.1)
        
        if (i + 1) % save_step == 0:
            save_df = domain_df.iloc[:i + 1]
            save_df.insert(0, 'arg_propositions', arg_propositions)
            save_df.to_pickle(f"{src_path}/{domain}_{i + 1}.pkl")

    new_domain_df = domain_df.iloc[:i + 1]
    new_domain_df.insert(0, 'arg_propositions', arg_propositions)
    new_domain_df.to_pickle(f"{src_path}/{domain}_done.pkl")
    return new_domain_df

In [46]:
df['my_category'] = 1

In [47]:
root_path = f"/mnt/e/Desktop/PHD READING/Quantitative_Fact_Check/arg_proposition_extraction/gpt_4.1_mini/few_shot_test_set"

In [48]:
inputs = [(root_path,
           domain,
           df[df['my_category'] == domain].reset_index(drop=True)
           )
          for domain in df['my_category'].unique()]

In [49]:
num_workers = 1

In [50]:
from datasets import concatenate_datasets, load_dataset
from datasets import Dataset, DatasetDict
import pandas as pd
import numpy as np
import torch
import os
import ast
import time
from multiprocessing import Pool
from pathlib import Path
from os import listdir
from tqdm import tqdm
import random
import re
import math
# import spacy
# pd.set_option('display.max_colwidth', None)

In [51]:
start_time = time.time()
with Pool(num_workers) as processor:
    data = processor.starmap(prompted_proposition_extraction, inputs)
print("TIME ELAPSED", time.time() - start_time)

1 Loaded saved file. Continuing


 37%|███████████████████████████▊                                               | 5260/14178 [00:00<00:00, 52594.78it/s]

KeyboardInterrupt: 

## Read

In [95]:
# # processed_df = pd.read_pickle(root_path + "/1/1_done.pkl")
# processed_df = pd.read_pickle(root_path + "/1/1_done_fixed_2.pkl")

In [96]:
root_path = f"/mnt/e/Desktop/PHD READING/Quantitative_Fact_Check/arg_proposition_extraction/gpt_4.1_mini/few_shot_test_set"
processed_df = pd.read_pickle(root_path + "/1/1_done_fixed_2.pkl")

In [97]:
root_path = f"/mnt/e/Desktop/PHD READING/Quantitative_Fact_Check/arg_proposition_extraction/gpt_4.1_mini/few_shot_test_set_extra"
extra_processed_df = pd.read_pickle(root_path + "/1/1_done_fixed_2.pkl")
extra_processed_df['title'] = extra_processed_df['questionText'].apply(lambda x: x.strip())
# extra_processed_df = extra_processed_df.rename(columns={'title': 'questionText'})
extra_processed_df['subset'] = 'xnyt'
processed_df = pd.concat([processed_df, extra_processed_df], axis=0)

In [98]:
# # processed_df['arg_propositions'] = processed_df['arg_propositions'].fillna("")
# # mask = processed_df['arg_propositions'].str.strip().str.len() == processed_df['comments'].str.strip().str.len()
# mask = (pd.isnull(processed_df['arg_propositions']))
# # mask = processed_df['arg_propositions'].apply(lambda x: pd.isnull(x))
# mask &= (pd.notnull(processed_df['comments']) & processed_df['comments'].str.strip().str.len() != 0)
# display(processed_df[mask])
# processed_df.iloc[8153:]

In [99]:
processed_df.shape

(8715, 8)

In [100]:
mask = pd.notnull(processed_df['arg_propositions'])
processed_df[mask]

Unnamed: 0,arg_propositions,questionText,comments,summary,id,subset,my_category,title
0,"[{""claim"": ""I would like to see any of the Leg...",What should I draw next? A FB user said they...,"I'd like to see any of the Legendary Beasts, R...",Commenters suggest ideas of the next Pokemon f...,0,reddit,1,
1,"[\n {\n ""claim"": ""Suicune is a good choice...",What should I draw next? A FB user said they...,Ooo I'm kinda thinking of Suicune now. Good ch...,Commenters suggest ideas of the next Pokemon f...,0,reddit,1,
2,"[{""claim"": ""Pancham"", ""ground"": []}]",What should I draw next? A FB user said they...,Pancham,Commenters suggest ideas of the next Pokemon f...,0,reddit,1,
4,"[{""claim"": ""I would love to see Reshiram"", ""gr...",What should I draw next? A FB user said they...,"If you have the time, I'd love to see Reshiram!",Commenters suggest ideas of the next Pokemon f...,0,reddit,1,
7,"[{""claim"": ""Seel?"", ""ground"": []}]",What should I draw next? A FB user said they...,Seel?,Commenters suggest ideas of the next Pokemon f...,0,reddit,1,
...,...,...,...,...,...,...,...,...
4537,"[\n {\n ""claim"": ""Ben Brantly did not pick...",Christian Borle is the eccentric Willy Wonka i...,Did Ben Brantly pick the title of this review?...,About half of the commenters think that other ...,249,xnyt,1,Christian Borle is the eccentric Willy Wonka i...
4539,"[{""claim"": ""\""deserts\"" is the proper""}]",Christian Borle is the eccentric Willy Wonka i...,"See above comments. ""deserts"" is the proper.",About half of the commenters think that other ...,249,xnyt,1,Christian Borle is the eccentric Willy Wonka i...
4540,"[\n {\n ""claim"": ""John Rubinstein is now d...",Christian Borle is the eccentric Willy Wonka i...,It's been a while since I've encountered the n...,About half of the commenters think that other ...,249,xnyt,1,Christian Borle is the eccentric Willy Wonka i...
4541,"[\n {\n ""claim"": ""John Rubinstein did a wo...",Christian Borle is the eccentric Willy Wonka i...,John Rubinstein had two brief scenes with Joan...,About half of the commenters think that other ...,249,xnyt,1,Christian Borle is the eccentric Willy Wonka i...


In [101]:
processed_df = processed_df[mask]

In [102]:
processed_df['arg_propositions'] = processed_df['arg_propositions'].apply(lambda x: x.strip("\n"))
processed_df['arg_propositions'] = processed_df['arg_propositions'].apply(lambda x: re.sub("Proposition:", "", x))
processed_df['arg_propositions'] = processed_df['arg_propositions'].apply(lambda x: x.strip("\n"))
processed_df['arg_propositions'] = processed_df['arg_propositions'].apply(lambda x: x.strip("`"))
# processed_df['arg_propositions'] = processed_df['arg_propositions'].apply(lambda x: x.strip("[`\n]+"))
processed_df['arg_propositions'] = processed_df['arg_propositions'].apply(lambda x: re.sub("^[\n ]*\[\]", "", x))
processed_df['arg_propositions'] = processed_df['arg_propositions'].apply(lambda x: re.sub("^json", "", x))

In [103]:
mask = processed_df['arg_propositions'].apply(lambda x: len(re.findall("^[\n ]*\[", x)) == 0)
processed_df[mask]

Unnamed: 0,arg_propositions,questionText,comments,summary,id,subset,my_category,title
3716,,The move closes a fraught chapter in the count...,An exceptional achievement...,Some commentators think Iceland is capable of ...,206,xnyt,1,The move closes a fraught chapter in the count...


In [104]:
processed_df = processed_df[~mask]

In [105]:
processed_df['arg_propositions'] = processed_df['arg_propositions'].apply(lambda x: ast.literal_eval(x.strip()))

In [106]:
processed_df

Unnamed: 0,arg_propositions,questionText,comments,summary,id,subset,my_category,title
0,[{'claim': 'I would like to see any of the Leg...,What should I draw next? A FB user said they...,"I'd like to see any of the Legendary Beasts, R...",Commenters suggest ideas of the next Pokemon f...,0,reddit,1,
1,"[{'claim': 'Suicune is a good choice', 'ground...",What should I draw next? A FB user said they...,Ooo I'm kinda thinking of Suicune now. Good ch...,Commenters suggest ideas of the next Pokemon f...,0,reddit,1,
2,"[{'claim': 'Pancham', 'ground': []}]",What should I draw next? A FB user said they...,Pancham,Commenters suggest ideas of the next Pokemon f...,0,reddit,1,
4,"[{'claim': 'I would love to see Reshiram', 'gr...",What should I draw next? A FB user said they...,"If you have the time, I'd love to see Reshiram!",Commenters suggest ideas of the next Pokemon f...,0,reddit,1,
7,"[{'claim': 'Seel?', 'ground': []}]",What should I draw next? A FB user said they...,Seel?,Commenters suggest ideas of the next Pokemon f...,0,reddit,1,
...,...,...,...,...,...,...,...,...
4537,[{'claim': 'Ben Brantly did not pick the title...,Christian Borle is the eccentric Willy Wonka i...,Did Ben Brantly pick the title of this review?...,About half of the commenters think that other ...,249,xnyt,1,Christian Borle is the eccentric Willy Wonka i...
4539,"[{'claim': '""deserts"" is the proper'}]",Christian Borle is the eccentric Willy Wonka i...,"See above comments. ""deserts"" is the proper.",About half of the commenters think that other ...,249,xnyt,1,Christian Borle is the eccentric Willy Wonka i...
4540,[{'claim': 'John Rubinstein is now doing grand...,Christian Borle is the eccentric Willy Wonka i...,It's been a while since I've encountered the n...,About half of the commenters think that other ...,249,xnyt,1,Christian Borle is the eccentric Willy Wonka i...
4541,[{'claim': 'John Rubinstein did a wonderful jo...,Christian Borle is the eccentric Willy Wonka i...,John Rubinstein had two brief scenes with Joan...,About half of the commenters think that other ...,249,xnyt,1,Christian Borle is the eccentric Willy Wonka i...


## Post-process

In [107]:
processed_df

Unnamed: 0,arg_propositions,questionText,comments,summary,id,subset,my_category,title
0,[{'claim': 'I would like to see any of the Leg...,What should I draw next? A FB user said they...,"I'd like to see any of the Legendary Beasts, R...",Commenters suggest ideas of the next Pokemon f...,0,reddit,1,
1,"[{'claim': 'Suicune is a good choice', 'ground...",What should I draw next? A FB user said they...,Ooo I'm kinda thinking of Suicune now. Good ch...,Commenters suggest ideas of the next Pokemon f...,0,reddit,1,
2,"[{'claim': 'Pancham', 'ground': []}]",What should I draw next? A FB user said they...,Pancham,Commenters suggest ideas of the next Pokemon f...,0,reddit,1,
4,"[{'claim': 'I would love to see Reshiram', 'gr...",What should I draw next? A FB user said they...,"If you have the time, I'd love to see Reshiram!",Commenters suggest ideas of the next Pokemon f...,0,reddit,1,
7,"[{'claim': 'Seel?', 'ground': []}]",What should I draw next? A FB user said they...,Seel?,Commenters suggest ideas of the next Pokemon f...,0,reddit,1,
...,...,...,...,...,...,...,...,...
4537,[{'claim': 'Ben Brantly did not pick the title...,Christian Borle is the eccentric Willy Wonka i...,Did Ben Brantly pick the title of this review?...,About half of the commenters think that other ...,249,xnyt,1,Christian Borle is the eccentric Willy Wonka i...
4539,"[{'claim': '""deserts"" is the proper'}]",Christian Borle is the eccentric Willy Wonka i...,"See above comments. ""deserts"" is the proper.",About half of the commenters think that other ...,249,xnyt,1,Christian Borle is the eccentric Willy Wonka i...
4540,[{'claim': 'John Rubinstein is now doing grand...,Christian Borle is the eccentric Willy Wonka i...,It's been a while since I've encountered the n...,About half of the commenters think that other ...,249,xnyt,1,Christian Borle is the eccentric Willy Wonka i...
4541,[{'claim': 'John Rubinstein did a wonderful jo...,Christian Borle is the eccentric Willy Wonka i...,John Rubinstein had two brief scenes with Joan...,About half of the commenters think that other ...,249,xnyt,1,Christian Borle is the eccentric Willy Wonka i...


In [108]:
processed_df = pd.json_normalize(processed_df.to_dict(orient='records'), 'arg_propositions', 
                  ['questionText', 'comments', 'summary', 'id', 'subset', 'my_category'])

In [109]:
processed_df

Unnamed: 0,claim,ground,warrant,backing,qualifier,rebuttal,questionText,comments,summary,id,subset,my_category
0,I would like to see any of the Legendary Beast...,[],,,,,What should I draw next? A FB user said they...,"I'd like to see any of the Legendary Beasts, R...",Commenters suggest ideas of the next Pokemon f...,0,reddit,1
1,Suicune is a good choice,[],,,,,What should I draw next? A FB user said they...,Ooo I'm kinda thinking of Suicune now. Good ch...,Commenters suggest ideas of the next Pokemon f...,0,reddit,1
2,I will consider Suicune for sure,[Suicune is a good choice],,,,,What should I draw next? A FB user said they...,Ooo I'm kinda thinking of Suicune now. Good ch...,Commenters suggest ideas of the next Pokemon f...,0,reddit,1
3,"If I do not choose Suicune now, I will definit...",[Suicune is a good choice],,,,,What should I draw next? A FB user said they...,Ooo I'm kinda thinking of Suicune now. Good ch...,Commenters suggest ideas of the next Pokemon f...,0,reddit,1
4,Pancham,[],,,,,What should I draw next? A FB user said they...,Pancham,Commenters suggest ideas of the next Pokemon f...,0,reddit,1
...,...,...,...,...,...,...,...,...,...,...,...,...
22586,John Rubinstein is now doing grandfather roles,[It's been a while since I've encountered the ...,,,,,Christian Borle is the eccentric Willy Wonka i...,It's been a while since I've encountered the n...,About half of the commenters think that other ...,249,xnyt,1
22587,The fact that John Rubinstein is now doing gra...,[John Rubinstein is now doing grandfather roles],,,,,Christian Borle is the eccentric Willy Wonka i...,It's been a while since I've encountered the n...,About half of the commenters think that other ...,249,xnyt,1
22588,John Rubinstein did a wonderful job playing Ge...,[John Rubinstein had two brief scenes with Joa...,,,,,Christian Borle is the eccentric Willy Wonka i...,John Rubinstein had two brief scenes with Joan...,About half of the commenters think that other ...,249,xnyt,1
22589,"John Rubinstein played King Charles, the fathe...",[John Rubinstein appeared a couple of years ag...,,,,,Christian Borle is the eccentric Willy Wonka i...,John Rubinstein had two brief scenes with Joan...,About half of the commenters think that other ...,249,xnyt,1


## Stage 1 

### List Entailment Inference

In [110]:
# from openai import OpenAI
import openai
import json
import argparse
import tqdm
import time

In [111]:
def remove_duplicate_claim_reason(grp):
    all_reasons = list(np.hstack(grp['ground']))
    
    mask = grp['claim'].isin(all_reasons)
    mask &= grp['ground'].str.len() == 0
    
    return grp[~mask]
    
processed_df = processed_df.groupby(['subset', 'id', 'questionText']).apply(remove_duplicate_claim_reason).reset_index(drop=True)

  processed_df = processed_df.groupby(['subset', 'id', 'questionText']).apply(remove_duplicate_claim_reason).reset_index(drop=True)


In [112]:
def form_other_claim(grp):
    all_claim = grp['claim'].tolist()
    
    grp['other_claim'] = grp['claim'].apply(lambda x: [c for c in all_claim if c != x])
    
    return grp

processed_df = processed_df.groupby(['subset', 'id', 'questionText']).apply(form_other_claim).reset_index(drop=True)

  processed_df = processed_df.groupby(['subset', 'id', 'questionText']).apply(form_other_claim).reset_index(drop=True)


In [113]:
processed_df

Unnamed: 0,claim,ground,warrant,backing,qualifier,rebuttal,questionText,comments,summary,id,subset,my_category,other_claim
0,I would like to see any of the Legendary Beast...,[],,,,,What should I draw next? A FB user said they...,"I'd like to see any of the Legendary Beasts, R...",Commenters suggest ideas of the next Pokemon f...,0,reddit,1,"[I will consider Suicune for sure, If I do not..."
1,I will consider Suicune for sure,[Suicune is a good choice],,,,,What should I draw next? A FB user said they...,Ooo I'm kinda thinking of Suicune now. Good ch...,Commenters suggest ideas of the next Pokemon f...,0,reddit,1,[I would like to see any of the Legendary Beas...
2,"If I do not choose Suicune now, I will definit...",[Suicune is a good choice],,,,,What should I draw next? A FB user said they...,Ooo I'm kinda thinking of Suicune now. Good ch...,Commenters suggest ideas of the next Pokemon f...,0,reddit,1,[I would like to see any of the Legendary Beas...
3,Pancham,[],,,,,What should I draw next? A FB user said they...,Pancham,Commenters suggest ideas of the next Pokemon f...,0,reddit,1,[I would like to see any of the Legendary Beas...
4,I would love to see Reshiram,[If you have the time],,,,,What should I draw next? A FB user said they...,"If you have the time, I'd love to see Reshiram!",Commenters suggest ideas of the next Pokemon f...,0,reddit,1,[I would like to see any of the Legendary Beas...
...,...,...,...,...,...,...,...,...,...,...,...,...,...
21942,John Rubinstein is now doing grandfather roles,[It's been a while since I've encountered the ...,,,,,Christian Borle is the eccentric Willy Wonka i...,It's been a while since I've encountered the n...,About half of the commenters think that other ...,249,xnyt,1,"[""Charlie Chocolate"" is pure agitation and no ..."
21943,The fact that John Rubinstein is now doing gra...,[John Rubinstein is now doing grandfather roles],,,,,Christian Borle is the eccentric Willy Wonka i...,It's been a while since I've encountered the n...,About half of the commenters think that other ...,249,xnyt,1,"[""Charlie Chocolate"" is pure agitation and no ..."
21944,John Rubinstein did a wonderful job playing Ge...,[John Rubinstein had two brief scenes with Joa...,,,,,Christian Borle is the eccentric Willy Wonka i...,John Rubinstein had two brief scenes with Joan...,About half of the commenters think that other ...,249,xnyt,1,"[""Charlie Chocolate"" is pure agitation and no ..."
21945,"John Rubinstein played King Charles, the fathe...",[John Rubinstein appeared a couple of years ag...,,,,,Christian Borle is the eccentric Willy Wonka i...,John Rubinstein had two brief scenes with Joan...,About half of the commenters think that other ...,249,xnyt,1,"[""Charlie Chocolate"" is pure agitation and no ..."


In [114]:
# mask = processed_df['questionText'].str.contains("If 100 minions @ 10min")
# processed_df[mask]['ground'].tolist()

In [115]:
import os
from openai import AzureOpenAI

# endpoint = "https://s3695-ma1sfp0e-eastus2.cognitiveservices.azure.com/"
# subscription_key = "9OzzB1oQ6CBTBiIqkjQSURWXX52LccWTu0scsgfmEcXUVG5CQRV7JQQJ99BDACHYHv6XJ3w3AAAAACOGABC8"
# api_version = "2024-12-01-preview"
# client = AzureOpenAI(
#     api_version=api_version,
#     azure_endpoint=endpoint,
#     api_key=subscription_key,
# )

endpoint = "https://s3695273-a233-resource.cognitiveservices.azure.com/"
subscription_key = "CHX8zX1gdDYh7C9mSMTKY1GdJfrv0bf6vIZ6ZfI1OsYSctkAxXvZJQQJ99BGACHYHv6XJ3w3AAAAACOGxlmO"
api_version = "2024-12-01-preview"

client = AzureOpenAI(
    api_version=api_version,
    azure_endpoint=endpoint,
    api_key=subscription_key,
)

# from openai import OpenAI
# import os
# import pandas as pd

# # client = OpenAI(
# #     api_key = "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" # RMIT Account
# # )
# from openai import OpenAI
# client = OpenAI(
#     base_url = "http://localhost:8000/v1",
#     api_key = "None"
# )

In [116]:
# model = "gpt-3.5-turbo"
openai.api_key = "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" # RMIT Account
# model = "gpt-4"  # HALLUCINATE
# model = "gpt-4.1-mini"  # HALLUCINATE
model = "gpt-4.1"  # WORKED CHEAPER WITH SHORT OUTPUT
# model = "gpt-4o"  # WORKED
# model = "gpt-4.1-nano"  # HALLUCINATE
# model = "gpt-4o-mini"  # HALLUCINATE

In [117]:
def g_eval(question, premise, claim):
    cur_prompt = prompt.replace('{{Question}}', question).replace('{{Premise}}', premise).replace('{{Claim}}', str(claim))
    ct, ignore = 0, 0
    while True:
        try:
            _response = client.chat.completions.create(
                model=model,
                messages=[{"role": "system", "content": cur_prompt}],
                temperature=2,
#                 max_tokens=5,
#                 max_tokens=1000,
                max_tokens=150,
                top_p=1,
                frequency_penalty=0,
                presence_penalty=0,
                stop=None,
                # logprobs=40,
                n=5
#                 n=10
#                 n=15
#                 n=20
            )
            time.sleep(0.5)

            all_responses = [_response.choices[i].message.content for i in
                             range(len(_response.choices))]
    #         instance['all_responses'] = all_responses
    #         new_json.append(instance)
            ct += 1
            break
        except Exception as e:
            print(e)
            if ("limit" in str(e)):
                time.sleep(2)
            else:
                ignore += 1
                print('ignored', ignore)

                break
                
    return all_responses

In [118]:
import statistics
def process_g_eval(g_eval_annotation):
    g_eval_scores = []
    for annotation in g_eval_annotation:
        score_find = re.findall("[0-9]", annotation)
        if len(score_find) > 0 and 1 <= int(score_find[0]) <= 5:
            g_eval_scores += [int(score_find[0])]
    
    return g_eval_scores

In [119]:
# prompt = open("./geval/prompts/summeval/rel_detailed_entailment_2.txt").read()

In [120]:

# Evaluation Steps:

# 1. Read both the claim **A** and each claim from list **B** carefully.
# 2. Determine if claim **A** logically follows each claim from list **B**.
# 3. For each claim from list **B**, assign an **entailment strength score from 1 to 5** by claim **A** according to the guideline above.
# 4. Return all scores as a Python list

In [121]:
prompt = """
You will be given a community question, a claim **A** and a list of other claims **B** extracted from the social comments answering the question.

Your task is to assess the **degree to which claim **A** supports each claim from **B** list, using a scale from 1 to 5.

Please make sure you read and understand these instructions carefully. Please keep this document open while reviewing, and refer to it as needed.

Evaluation Criteria:
Entailment Strength (1–5) — how strongly the **claim A** logically supports the other claim.

Evaluation Steps:

1. Read both the claim **A** and each claim from list **B** carefully.
2. Determine if claim **A** logically follows each claim from list **B**.
3. For each claim from list **B**, assign an **entailment strength score from 1 to 5** by claim **A** according to the guideline above.
4. Return all scores as a Python list

Example:


Question:
{{Question}}

Claim A:

{{Premise}}

List Claim B:

{{Claim}}


Evaluation Form (score ONLY):

- Entailment Strength:
"""

In [122]:
# prompt = """In this task you are presented with a community question, a claim A and a list of other claims extracted from the social comments answering the question.
# You will be asked to select best-matching claim from the list to claim A. The selection must be from the input list of other claims. Do not generate new best-matching claims.
# Two claims are best-matched if they explicitly express similar opinions and sentiment on a similar aspect.

# Use and output the following format:
# Community Question: <the input question>
# Claim A: <the input claim>
# List of Other Claims: <the input list of other claims>
# Best-Matching Claim: <The claim (from the list) that best-matched to the input claim>

# """

In [123]:
row = processed_df.iloc[10]
print(row[['claim', 'other_claim']].tolist())
output_list = g_eval(row['questionText'], row['claim'], row['other_claim'])
print(output_list)
# score = statistics.mean(process_g_eval(output_list))
# print(score)

['Postimg is horrible', ["I can't repost this enough", 'The women in the paintings from centuries ago were what we would call curvy', "Your view on the image hosting site chosen by /u/dmstewar2 doesn't negate their post or its validity", 'I visited the first link and gave up', 'The examples they use to try to prove that fat women used to be considered beautiful usually include women who are overweight at most', "can't link TiTP", "The 'idealized' woman at the time was bigger than our current 'idealized' (ie, models)", "The idealized image was larger than the average woman, and now it's reversed", 'Hugely obese people back then were very rare and were ridiculed (just like super-skinnies now sometimes are)', "These men weren't admired and respected", 'For women, the ideal has always been healthy with obvious signs of fertility (REAL curves: hips and breasts)', 'For men, health and vitality has always been the ideal']]
['[1, 1, 3, 1, 1, 1, 1, 1, 1, 1, 1, 1]', '[1, 1, 3, 1, 1, 1, 1, 1, 1, 

In [124]:
row = processed_df.iloc[10]
print(row[['claim', 'other_claim']].tolist())
output_list = g_eval(row['questionText'], row['claim'], row['other_claim'])
print(output_list)
# score = statistics.mean(process_g_eval(output_list))
# print(score)

['Postimg is horrible', ["I can't repost this enough", 'The women in the paintings from centuries ago were what we would call curvy', "Your view on the image hosting site chosen by /u/dmstewar2 doesn't negate their post or its validity", 'I visited the first link and gave up', 'The examples they use to try to prove that fat women used to be considered beautiful usually include women who are overweight at most', "can't link TiTP", "The 'idealized' woman at the time was bigger than our current 'idealized' (ie, models)", "The idealized image was larger than the average woman, and now it's reversed", 'Hugely obese people back then were very rare and were ridiculed (just like super-skinnies now sometimes are)', "These men weren't admired and respected", 'For women, the ideal has always been healthy with obvious signs of fertility (REAL curves: hips and breasts)', 'For men, health and vitality has always been the ideal']]
['[1, 1, 3, 2, 1, 1, 1, 1, 1, 1, 1, 1]', '[1, 1, 2, 2, 1, 1, 1, 1, 1, 

In [125]:
# row = processed_df.iloc[25]
# row = processed_df.iloc[26]
# row = processed_df.iloc[50]
row = processed_df.iloc[52]
print(row[['claim', 'other_claim']].tolist())
output_list = g_eval(row['questionText'], row['claim'], row['other_claim'])
print(output_list)
# score = statistics.mean(process_g_eval(output_list))
# print(score)

['the cheap Allen one was better in some ways', ['the Bones 2 is preferable for regular use', 'the Bones 3 is only used when I need to carry 3 bikes (or 2 mountain bikes so they have more room since they are so bulky)', 'it is hard to get the straps to stay tight on the Saris racks', 'the side straps on the Saris are unnecessary', 'one advantage of the Saris racks is that the carrying arms are adjustable so that you can bring them closer together for smaller frame bikes', "You should consider the Thelma from Saris, Thule's 916/917, or Yakima HoldUp", 'You should stick with the Bones', 'Yakima HoldUp is awesome', 'Swingdaddy is for when we have more than 2 bikes', 'If you can attach bikes to a roof rack or a hitch, do that', 'If you cannot attach bikes to a roof rack or a hitch, use the bones', 'You only need things like a workstand or a mountain bike once a month max', 'They are a bit cash to own', 'The saris 2 can only take two bikes securely', 'The bones 3 is the best bike rack I hav

In [76]:
row = processed_df.iloc[10]
print(row[['claim', 'other_claim']].tolist())
output_list = g_eval(row['questionText'], row['claim'], row['other_claim'])
print(output_list)
# score = statistics.mean(process_g_eval(output_list))
# print(score)

['Postimg is horrible', ["I can't repost this enough", 'The women in the paintings from centuries ago were what we would call curvy', "Your view on the image hosting site chosen by /u/dmstewar2 doesn't negate their post or its validity", 'I visited the first link and gave up', 'The examples they use to try to prove that fat women used to be considered beautiful usually include women who are overweight at most', "can't link TiTP", "The 'idealized' woman at the time was bigger than our current 'idealized' (ie, models)", "The idealized image was larger than the average woman, and now it's reversed", 'Hugely obese people back then were very rare and were ridiculed (just like super-skinnies now sometimes are)', "These men weren't admired and respected", 'For women, the ideal has always been healthy with obvious signs of fertility (REAL curves: hips and breasts)', 'For men, health and vitality has always been the ideal']]
['[1, 1, 5, 3, 1, 1, 1, 1, 1, 1, 1, 1]', '[1, 1, 3, 3, 1, 1, 1, 1, 1, 

In [77]:
row = processed_df.iloc[10]
print(row[['claim', 'other_claim']].tolist())
output_list = g_eval(row['questionText'], row['claim'], row['other_claim'])
print(output_list)
# score = statistics.mean(process_g_eval(output_list))
# print(score)

['Postimg is horrible', ["I can't repost this enough", 'The women in the paintings from centuries ago were what we would call curvy', "Your view on the image hosting site chosen by /u/dmstewar2 doesn't negate their post or its validity", 'I visited the first link and gave up', 'The examples they use to try to prove that fat women used to be considered beautiful usually include women who are overweight at most', "can't link TiTP", "The 'idealized' woman at the time was bigger than our current 'idealized' (ie, models)", "The idealized image was larger than the average woman, and now it's reversed", 'Hugely obese people back then were very rare and were ridiculed (just like super-skinnies now sometimes are)', "These men weren't admired and respected", 'For women, the ideal has always been healthy with obvious signs of fertility (REAL curves: hips and breasts)', 'For men, health and vitality has always been the ideal']]
['[1, 1, 5, 2, 1, 1, 1, 1, 1, 1, 1, 1]', '[1, 1, 5, 2, 1, 2, 1, 1, 1, 

In [270]:
# row = processed_df.iloc[10]
row = processed_df.iloc[25]
# row = processed_df.iloc[50]
print(row[['claim', 'other_claim']].tolist())
output_list = g_eval(row['questionText'], row['claim'], row['other_claim'])
print(output_list)
# score = statistics.mean(process_g_eval(output_list))
# print(score)

['Cowsep disagrees', ['You are doing something wrong', 'You picked Udyr', 'Champions that benefit from farming more than ganking are basically the first 10mins', 'Playing Morgana mid is her job', 'cs is probably the worst metric for jungle performance', "it is very easy to see 'perfect farm' jungle play", "it's just a simple question to compare monster camps to regular creeps and no more", 'some people took it too far out of the topic', 'jungle creep score is far from a good metric for not only performance but just farm in general', 'Almost all the players hate having him on the team', 'it is a valid strategy like any other', 'solo q is boring as it is', "I don't blame people to try different things", 'He should get blamed', 'Almost every gank gets at least damage, summoners, or more', "Cowsep's greatest contribution to the game was 'tp smite yi' with '10 minute flare'", "let's downvote each other and move on", 'Junglers with High CS can get 50 CS at 10 mins, roughly half of laners', '

In [232]:
# row = processed_df.iloc[10]
row = processed_df.iloc[10]
print(row[['claim', 'other_claim']].tolist())
output_list = g_eval(row['questionText'], row['claim'], row['other_claim'])
print(output_list)
# score = statistics.mean(process_g_eval(output_list))
# print(score)

['The women in the paintings from centuries ago were what we would call curvy', ["I can't repost this enough", 'Postimg is horrible', "Your view on the image hosting site chosen by /u/dmstewar2 doesn't negate their post or its validity", 'I visited the first link and gave up', 'The examples they use to try to prove that fat women used to be considered beautiful usually include women who are overweight at most', "can't link TiTP", "The 'idealized' woman at the time was bigger than our current 'idealized' (ie, models)", "The idealized image was larger than the average woman, and now it's reversed", 'Hugely obese people back then were very rare and were ridiculed (just like super-skinnies now sometimes are)', 'There are a couple of tales of men who got really fat and killed their horses from exhaustion', "These men weren't admired and respected", 'For women, the ideal has always been healthy with obvious signs of fertility (REAL curves: hips and breasts)', 'For men, health and vitality ha

#### Run

In [126]:
def prompted_g_eval_kp(root_path, domain, domain_df, save_step=10):
    src_path = f"{root_path}/{domain}"
    Path(src_path).mkdir(parents=True, exist_ok=True)
    claim_entailment_scores = []
#     claim_split_gold_list = []
 
    file_names = listdir(src_path)
    postfix = [re.split("[_.]", name)[1]
               for name in listdir(src_path)
               ]
    start = 0
    if 'done' in postfix:
        print(domain, ": ", "Loaded saved file. Done")
        new_domain_df = pd.read_pickle(f"{src_path}/{domain}_done.pkl")
        return new_domain_df
    elif len(postfix) > 0:
        last_index = max([int(idx) for idx in postfix if idx != 'done'])
        last_domain_df = pd.read_pickle(f"{src_path}/{domain}_{last_index}.pkl")
        claim_entailment_scores = last_domain_df['claim_entailment_scores'].tolist()
#         claim_split_gold_list = last_domain_df['claim_split_gold'].tolist()
        start = last_index
        print(domain, "Loaded saved file. Continuing")
    else:
        print(domain, "Start new process.")

    for i, (_, row) in tqdm(enumerate(domain_df.iterrows()), total=domain_df.shape[0]):
        if i < start:
            continue

        claim_entailment_scores += [g_eval(row['questionText'], row['claim'], row['other_claim'])]
        
        if (i + 1) % save_step == 0:
            save_df = domain_df.iloc[:i + 1]
            save_df.insert(0, 'claim_entailment_scores', claim_entailment_scores)
#             save_df.insert(0, 'claim_split_gold', claim_split_gold_list)
            save_df.to_pickle(f"{src_path}/{domain}_{i + 1}.pkl")

    new_domain_df = domain_df.iloc[:i + 1]
    new_domain_df.insert(0, 'claim_entailment_scores', claim_entailment_scores)
#     new_domain_df.insert(0, 'claim_split_gold', claim_split_gold_list)
    new_domain_df.to_pickle(f"{src_path}/{domain}_done.pkl")
    return new_domain_df

In [127]:
processed_df['my_category'] = 1

In [128]:
# processed_df = processed_df[processed_df['subset'] == 'xnyt']

In [129]:
processed_df

Unnamed: 0,claim,ground,warrant,backing,qualifier,rebuttal,questionText,comments,summary,id,subset,my_category,other_claim
0,I would like to see any of the Legendary Beast...,[],,,,,What should I draw next? A FB user said they...,"I'd like to see any of the Legendary Beasts, R...",Commenters suggest ideas of the next Pokemon f...,0,reddit,1,"[I will consider Suicune for sure, If I do not..."
1,I will consider Suicune for sure,[Suicune is a good choice],,,,,What should I draw next? A FB user said they...,Ooo I'm kinda thinking of Suicune now. Good ch...,Commenters suggest ideas of the next Pokemon f...,0,reddit,1,[I would like to see any of the Legendary Beas...
2,"If I do not choose Suicune now, I will definit...",[Suicune is a good choice],,,,,What should I draw next? A FB user said they...,Ooo I'm kinda thinking of Suicune now. Good ch...,Commenters suggest ideas of the next Pokemon f...,0,reddit,1,[I would like to see any of the Legendary Beas...
3,Pancham,[],,,,,What should I draw next? A FB user said they...,Pancham,Commenters suggest ideas of the next Pokemon f...,0,reddit,1,[I would like to see any of the Legendary Beas...
4,I would love to see Reshiram,[If you have the time],,,,,What should I draw next? A FB user said they...,"If you have the time, I'd love to see Reshiram!",Commenters suggest ideas of the next Pokemon f...,0,reddit,1,[I would like to see any of the Legendary Beas...
...,...,...,...,...,...,...,...,...,...,...,...,...,...
21942,John Rubinstein is now doing grandfather roles,[It's been a while since I've encountered the ...,,,,,Christian Borle is the eccentric Willy Wonka i...,It's been a while since I've encountered the n...,About half of the commenters think that other ...,249,xnyt,1,"[""Charlie Chocolate"" is pure agitation and no ..."
21943,The fact that John Rubinstein is now doing gra...,[John Rubinstein is now doing grandfather roles],,,,,Christian Borle is the eccentric Willy Wonka i...,It's been a while since I've encountered the n...,About half of the commenters think that other ...,249,xnyt,1,"[""Charlie Chocolate"" is pure agitation and no ..."
21944,John Rubinstein did a wonderful job playing Ge...,[John Rubinstein had two brief scenes with Joa...,,,,,Christian Borle is the eccentric Willy Wonka i...,John Rubinstein had two brief scenes with Joan...,About half of the commenters think that other ...,249,xnyt,1,"[""Charlie Chocolate"" is pure agitation and no ..."
21945,"John Rubinstein played King Charles, the fathe...",[John Rubinstein appeared a couple of years ag...,,,,,Christian Borle is the eccentric Willy Wonka i...,John Rubinstein had two brief scenes with Joan...,About half of the commenters think that other ...,249,xnyt,1,"[""Charlie Chocolate"" is pure agitation and no ..."


In [130]:
# root_path = f"/mnt/e/Desktop/PHD READING/Quantitative_Fact_Check/g_eval_claim_list_matching_dup_claim_reason/gpt_4.1/few_shot_test_set"
# root_path = f"/mnt/e/Desktop/PHD READING/Quantitative_Fact_Check/g_eval_claim_list_matching/gpt_4.1/few_shot_test_set"
root_path = f"/mnt/e/Desktop/PHD READING/Quantitative_Fact_Check/g_eval_claim_list_matching_5/gpt_4.1/few_shot_test_set"
# root_path = f"/mnt/e/Desktop/PHD READING/Quantitative_Fact_Check/g_eval_claim_list_matching_5/gpt_4.1/few_shot_test_set_extra_nyt"

inputs = [(root_path,
           domain,
           processed_df[processed_df['my_category'] == domain].reset_index(drop=True)
           )
          for domain in processed_df['my_category'].unique()]

In [131]:
num_workers = 1

In [132]:
from datasets import concatenate_datasets, load_dataset
from datasets import Dataset, DatasetDict
import pandas as pd
import numpy as np
import torch
import os
import ast
import time
from multiprocessing import Pool
from pathlib import Path
from os import listdir
from tqdm import tqdm
import random
import re
import math
# import spacy
# pd.set_option('display.max_colwidth', None)

In [155]:
start_time = time.time()
with Pool(num_workers) as processor:
    data = processor.starmap(prompted_g_eval_kp, inputs)
print("TIME ELAPSED", time.time() - start_time)

1 Loaded saved file. Continuing


100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 11274/11274 [37:59<00:00,  4.95it/s]


TIME ELAPSED 2280.8083159923553


In [226]:
start_time = time.time()
with Pool(num_workers) as processor:
    data = processor.starmap(prompted_g_eval_kp, inputs)
print("TIME ELAPSED", time.time() - start_time)

1 Loaded saved file. Continuing


100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 10673/10673 [6:29:11<00:00,  2.19s/it]


TIME ELAPSED 23351.710446596146


#### Read

In [186]:
# processed_final_df = pd.read_pickle(root_path + "/1/1_20.pkl")
# processed_final_df = pd.read_pickle(root_path + "/1/1_30.pkl")
# processed_final_df = pd.read_pickle(root_path + "/1/1_50.pkl")
# processed_final_df = pd.read_pickle(root_path + "/1/1_200.pkl")
# processed_final_df = pd.read_pickle(root_path + "/1/1_1520.pkl")
processed_final_df = pd.read_pickle(root_path + "/1/1_done.pkl")

In [187]:
processed_final_df

Unnamed: 0,claim_entailment_scores,claim,ground,warrant,backing,qualifier,rebuttal,questionText,comments,summary,id,subset,my_category,other_claim
0,"[[3, 3, 1, 2, 1, 1, 1], [5, 5, 1, 2, 1, 1, 1],...",I would like to see any of the Legendary Beast...,[],,,,,What should I draw next? A FB user said they...,"I'd like to see any of the Legendary Beasts, R...",Commenters suggest ideas of the next Pokemon f...,0,reddit,1,"[I will consider Suicune for sure, If I do not..."
1,"[[2, 3, 1, 1, 1, 1, 1], [4, 3, 1, 1, 1, 1, 2],...",I will consider Suicune for sure,[Suicune is a good choice],,,,,What should I draw next? A FB user said they...,Ooo I'm kinda thinking of Suicune now. Good ch...,Commenters suggest ideas of the next Pokemon f...,0,reddit,1,[I would like to see any of the Legendary Beas...
2,"[[1, 5, 1, 1, 1, 1, 3], [2, 5, 1, 1, 1, 1, 3],...","If I do not choose Suicune now, I will definit...",[Suicune is a good choice],,,,,What should I draw next? A FB user said they...,Ooo I'm kinda thinking of Suicune now. Good ch...,Commenters suggest ideas of the next Pokemon f...,0,reddit,1,[I would like to see any of the Legendary Beas...
3,"[[1, 1, 1, 1, 3, 3, 1], [1, 1, 1, 1, 3, 1, 1],...",Pancham,[],,,,,What should I draw next? A FB user said they...,Pancham,Commenters suggest ideas of the next Pokemon f...,0,reddit,1,[I would like to see any of the Legendary Beas...
4,"[[Assuming we're testing claim A (""I would lov...",I would love to see Reshiram,[If you have the time],,,,,What should I draw next? A FB user said they...,"If you have the time, I'd love to see Reshiram!",Commenters suggest ideas of the next Pokemon f...,0,reddit,1,[I would like to see any of the Legendary Beas...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10668,"[[2, 1, 2, 1, 2, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1,...","Please select connect under reset, reset after...",[this will solve issues],,,,,"""Could not find Cortex-M device in the JTAG ch...",Solved: In periperipheral settings under debu...,Commentators propose many different possible s...,249,stack,1,[I can't download any code on the board from K...
10669,"[[2, 1, 5, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,...","The device works after rebooting the PC, clear...",[The drivers are installed automatically when ...,,,,,"""Could not find Cortex-M device in the JTAG ch...",I faced the same issue and after breaking my h...,Commentators propose many different possible s...,249,stack,1,[I can't download any code on the board from K...
10670,"[[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,...",A basic LED blink program can be used for testing,"[The device works after rebooting the PC, clea...",,,,,"""Could not find Cortex-M device in the JTAG ch...",I faced the same issue and after breaking my h...,Commentators propose many different possible s...,249,stack,1,[I can't download any code on the board from K...
10671,"[[2, 1, 3, 2, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1,...",my JTAG debugger was functioning well on both ...,[],,,,,"""Could not find Cortex-M device in the JTAG ch...",I had the same issue: my JTAG debugger was fun...,Commentators propose many different possible s...,249,stack,1,[I can't download any code on the board from K...


In [188]:
# processed_final_df['claim_entailment_scores_processed'] = processed_final_df['claim_entailment_scores'].\
#     apply(lambda x: [ast.literal_eval(
#                         re.sub(r"((, ,) *)+", ",", re.findall(r"\[[0-9\n \,]+\]", attempt)[0].replace("0", "").replace("[,]", "[]").replace("[, ", "[").replace("[,", "[")).replace(",,", ",")
#                     )
#                      for attempt in x 
#                      if attempt != None and len(re.findall(r"\[[0-9\n \,]+\]", attempt)) > 0])

In [189]:
processed_final_df['claim_entailment_scores_processed'] = processed_final_df['claim_entailment_scores'].\
    apply(lambda x: [ast.literal_eval(
                        re.sub(r",{2,}", ",",
                        re.sub(r"((, +,) *)+", ",", 
                               re.sub(r"\[ *, *", "[",
                                    re.findall(r"\[[0-9\n \,]+\]", attempt)[0].replace("0", "").replace("[,]", "[]"))
                              )
                        ).replace("[,", "[")
                        
                    )
                     for attempt in x 
                     if attempt != None and len(re.findall(r"\[[0-9\n \,]+\]", attempt)) > 0])

In [190]:
# # CHECK
# mask = processed_final_df['claim_entailment_scores_processed'].str.len() != processed_final_df['claim_entailment_scores'].str.len()
# processed_final_df[mask].iloc[5]['claim_entailment_scores']

In [191]:
processed_final_df['claim_entailment_scores_processed'] = processed_final_df.\
    apply(lambda row: [attempt[:len(row['other_claim'])] 
                       for attempt in row['claim_entailment_scores_processed'] 
                       if len(attempt) >= len(row['other_claim'])], axis=1)

In [192]:
processed_final_df['claim_entailment_scores_avg'] = processed_final_df['claim_entailment_scores_processed'].\
    apply(lambda x: np.mean(x, axis=0))

  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)


In [193]:
# processed_final_df['claim_entailment_scores_max'] = processed_final_df['claim_entailment_scores_processed'].\
#     apply(lambda x: np.max(x, axis=0))

In [194]:
mask = processed_final_df['claim_entailment_scores_avg'].str.len() != processed_final_df['other_claim'].str.len()
processed_final_df[mask]['claim_entailment_scores_processed'].iloc[0]

[]

In [195]:
len(processed_final_df[mask]['other_claim'].iloc[0])

26

In [196]:
processed_final_df[mask].shape

(1994, 16)

In [197]:
processed_final_df = processed_final_df[~mask]

In [198]:
processed_final_df = processed_final_df.explode(['other_claim', 'claim_entailment_scores_avg'])

In [199]:
processed_final_df['claim_pair'] = processed_final_df.apply(lambda row: sorted([row['claim'], row['other_claim']]) , axis=1)
processed_final_df['claim_pair'] = processed_final_df['claim_pair'].astype(str)

In [200]:
processed_final_df

Unnamed: 0,claim_entailment_scores,claim,ground,warrant,backing,qualifier,rebuttal,questionText,comments,summary,id,subset,my_category,other_claim,claim_entailment_scores_processed,claim_entailment_scores_avg,claim_pair
0,"[[3, 3, 1, 2, 1, 1, 1], [5, 5, 1, 2, 1, 1, 1],...",I would like to see any of the Legendary Beast...,[],,,,,What should I draw next? A FB user said they...,"I'd like to see any of the Legendary Beasts, R...",Commenters suggest ideas of the next Pokemon f...,0,reddit,1,I will consider Suicune for sure,"[[3, 3, 1, 2, 1, 1, 1], [5, 5, 1, 2, 1, 1, 1],...",3.6,"['I will consider Suicune for sure', 'I would ..."
0,"[[3, 3, 1, 2, 1, 1, 1], [5, 5, 1, 2, 1, 1, 1],...",I would like to see any of the Legendary Beast...,[],,,,,What should I draw next? A FB user said they...,"I'd like to see any of the Legendary Beasts, R...",Commenters suggest ideas of the next Pokemon f...,0,reddit,1,"If I do not choose Suicune now, I will definit...","[[3, 3, 1, 2, 1, 1, 1], [5, 5, 1, 2, 1, 1, 1],...",3.2,['I would like to see any of the Legendary Bea...
0,"[[3, 3, 1, 2, 1, 1, 1], [5, 5, 1, 2, 1, 1, 1],...",I would like to see any of the Legendary Beast...,[],,,,,What should I draw next? A FB user said they...,"I'd like to see any of the Legendary Beasts, R...",Commenters suggest ideas of the next Pokemon f...,0,reddit,1,Pancham,"[[3, 3, 1, 2, 1, 1, 1], [5, 5, 1, 2, 1, 1, 1],...",1.0,['I would like to see any of the Legendary Bea...
0,"[[3, 3, 1, 2, 1, 1, 1], [5, 5, 1, 2, 1, 1, 1],...",I would like to see any of the Legendary Beast...,[],,,,,What should I draw next? A FB user said they...,"I'd like to see any of the Legendary Beasts, R...",Commenters suggest ideas of the next Pokemon f...,0,reddit,1,I would love to see Reshiram,"[[3, 3, 1, 2, 1, 1, 1], [5, 5, 1, 2, 1, 1, 1],...",1.6,['I would like to see any of the Legendary Bea...
0,"[[3, 3, 1, 2, 1, 1, 1], [5, 5, 1, 2, 1, 1, 1],...",I would like to see any of the Legendary Beast...,[],,,,,What should I draw next? A FB user said they...,"I'd like to see any of the Legendary Beasts, R...",Commenters suggest ideas of the next Pokemon f...,0,reddit,1,Seel?,"[[3, 3, 1, 2, 1, 1, 1], [5, 5, 1, 2, 1, 1, 1],...",1.0,['I would like to see any of the Legendary Bea...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10672,"[Entailment Strength:\n\n[2, 1, 3, 4, 1, 1, 1,...",I realised that I had physically inverted the ...,[I had the same issue: my JTAG debugger was fu...,,,,,"""Could not find Cortex-M device in the JTAG ch...",I had the same issue: my JTAG debugger was fun...,Commentators propose many different possible s...,249,stack,1,STELLARIS_ICDI_DRIVERS solved the problem with...,"[[2, 1, 3, 4, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,...",1.0,['I realised that I had physically inverted th...
10672,"[Entailment Strength:\n\n[2, 1, 3, 4, 1, 1, 1,...",I realised that I had physically inverted the ...,[I had the same issue: my JTAG debugger was fu...,,,,,"""Could not find Cortex-M device in the JTAG ch...",I had the same issue: my JTAG debugger was fun...,Commentators propose many different possible s...,249,stack,1,"Please select connect under reset, reset after...","[[2, 1, 3, 4, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,...",1.0,['I realised that I had physically inverted th...
10672,"[Entailment Strength:\n\n[2, 1, 3, 4, 1, 1, 1,...",I realised that I had physically inverted the ...,[I had the same issue: my JTAG debugger was fu...,,,,,"""Could not find Cortex-M device in the JTAG ch...",I had the same issue: my JTAG debugger was fun...,Commentators propose many different possible s...,249,stack,1,"The device works after rebooting the PC, clear...","[[2, 1, 3, 4, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,...",1.0,['I realised that I had physically inverted th...
10672,"[Entailment Strength:\n\n[2, 1, 3, 4, 1, 1, 1,...",I realised that I had physically inverted the ...,[I had the same issue: my JTAG debugger was fu...,,,,,"""Could not find Cortex-M device in the JTAG ch...",I had the same issue: my JTAG debugger was fun...,Commentators propose many different possible s...,249,stack,1,A basic LED blink program can be used for testing,"[[2, 1, 3, 4, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,...",1.0,['A basic LED blink program can be used for te...


In [201]:
# CHECK
processed_final_df.groupby(['subset', 'id', 'questionText', 'claim_pair']).filter(lambda x: len(x) == 3)

Unnamed: 0,claim_entailment_scores,claim,ground,warrant,backing,qualifier,rebuttal,questionText,comments,summary,id,subset,my_category,other_claim,claim_entailment_scores_processed,claim_entailment_scores_avg,claim_pair


In [202]:
processed_final_df = processed_final_df.groupby(['subset', 'id', 'questionText', 'claim_pair']).agg({
    'claim_entailment_scores_avg': (lambda x: x.tolist()),
    'claim': (lambda x: x.tolist()),
    'other_claim': (lambda x: x.tolist()),
    'ground': (lambda x: x.tolist())
}).reset_index()

In [203]:
processed_final_df['claim_entailment_scores_final_avg'] = processed_final_df['claim_entailment_scores_avg'].\
    apply(lambda x: statistics.mean(x))

In [204]:
processed_final_df['claim'] = processed_final_df['claim'].apply(lambda x: x[0])
processed_final_df['other_claim'] = processed_final_df['other_claim'].apply(lambda x: x[0])

In [216]:
mask = processed_final_df['questionText'].str.contains("Bike rack")
mask &= processed_final_df['claim_entailment_scores_final_avg'] > 3
mask &= processed_final_df['ground'].apply(lambda x: str(x) != 'nan') 
# mask &= processed_final_df['ground'].apply(lambda x: len(x[0]) > 0) 
# .strtr.len >= 4
processed_final_df[mask]

Unnamed: 0,subset,id,questionText,claim_pair,claim_entailment_scores_avg,claim,other_claim,ground,claim_entailment_scores_final_avg
457,reddit,3,Bike racks - what to look for? I'm in the ma...,"[""If you don't have a hitch, and have no inter...","[3.3333333333333335, 3.0]",You should stick with the Bones,"If you don't have a hitch, and have no interes...","[[Either one will work fine, They have a very ...",3.166667
468,reddit,3,Bike racks - what to look for? I'm in the ma...,"[""The bones 3 doesn't mark up the car"", 'The b...",[5.0],The bones 3 is the best bike rack I have ever had,The bones 3 doesn't mark up the car,[[]],5.0
493,reddit,3,Bike racks - what to look for? I'm in the ma...,['A knot with the excess cord is key to keepin...,[3.5],A knot with the excess cord is key to keeping ...,I highly suggest looking into a more permanent...,[[]],3.5
495,reddit,3,Bike racks - what to look for? I'm in the ma...,['A knot with the excess cord is key to keepin...,"[3.0, 3.5]",A knot with the excess cord is key to keeping ...,I prefer the 3,"[[], [I am a big fan of extra stability]]",3.25
515,reddit,3,Bike racks - what to look for? I'm in the ma...,['A knot with the excess cord is key to keepin...,[4.5],A knot with the excess cord is key to keeping ...,it is hard to get the straps to stay tight on ...,[[]],4.5
573,reddit,3,Bike racks - what to look for? I'm in the ma...,['For trunk racks you cannot do better than Sa...,"[3.3333333333333335, 3.0]",The bones 3 fit nicely on my station wagon,For trunk racks you cannot do better than Saris,"[[], []]",3.166667
575,reddit,3,Bike racks - what to look for? I'm in the ma...,['For trunk racks you cannot do better than Sa...,"[3.3333333333333335, 3.5]",The bones 3 keeps the bikes apart from one ano...,For trunk racks you cannot do better than Saris,"[[], []]",3.416667
581,reddit,3,Bike racks - what to look for? I'm in the ma...,['For trunk racks you cannot do better than Sa...,"[2.6666666666666665, 4.0]",You should stick with the Bones,For trunk racks you cannot do better than Saris,"[[Either one will work fine, They have a very ...",3.333333
589,reddit,3,Bike racks - what to look for? I'm in the ma...,"['I absolutely love the bones 3', ""The bones 3...",[4.0],I absolutely love the bones 3,The bones 3 doesn't mark up the car,[[]],4.0
592,reddit,3,Bike racks - what to look for? I'm in the ma...,"['I absolutely love the bones 3', 'I highly su...",[4.0],I absolutely love the bones 3,I highly suggest looking into a more permanent...,[[]],4.0


In [214]:
mask = processed_final_df['questionText'].str.contains("Bike rack")
mask &= processed_final_df['claim_entailment_scores_final_avg'] > 3
mask &= processed_final_df['ground'].apply(lambda x: str(x) != 'nan') 
# mask &= processed_final_df['ground'].apply(lambda x: len(x[0]) > 0) 
# .strtr.len >= 4
processed_final_df[mask].iloc[0]['ground']

[['Either one will work fine', 'They have a very good reputation'], []]

#### Clustering

In [154]:
import networkx as nx

# threshold = 4
# threshold = 3
# threshold = 2
# threshold = 3.5
# threshold = 3
# threshold = 3
# threshold = 2.8
# threshold = 2.5
# threshold = 3
threshold = 3.5
# threshold = 2.5
# threshold = 4
def create_clusters_graph_from_pairwise(grp):
    new_grp = grp[grp['claim_entailment_scores_final_avg'] > threshold]

    G = nx.Graph()
    G.add_nodes_from(grp['claim'].tolist() + grp['other_claim'].tolist())
    
    matching_pairs = [(claim_x, claim_y) for claim_x, claim_y in zip(new_grp['claim'], new_grp['other_claim'])]
    G.add_edges_from(matching_pairs)
    clusters = list(nx.connected_components(G))
    
    return G, clusters

In [155]:
def create_clusters_graph_from_pairwise_wrapper(grp):
    G, clusters = create_clusters_graph_from_pairwise(grp)
    clusters = [list(cluster) for cluster in clusters]
    grp['clusters'] = [clusters for i in range(len(grp))]
    return grp

In [156]:
cluster_df = processed_final_df.groupby(['subset', 'id', 'questionText']).\
    apply(create_clusters_graph_from_pairwise_wrapper).reset_index(drop=True)

  cluster_df = processed_final_df.groupby(['subset', 'id', 'questionText']).\


In [157]:
cluster_df = cluster_df[['subset', 'id', 'questionText', 'clusters']].\
    drop_duplicates(subset=['subset', 'id', 'questionText'])

In [158]:
cluster_df.iloc[18]['questionText']

"[Arma 3] Open Play Sunday, Again!   If you want, read up on my work in progress handbook and give me feedback!         http://www.mediafire.com/view/lyit9bx90hv42pn/ASG_Handbook.pdf            I took a couple of weeks off from posting to ensure that I had an adequate number of competent leaders to handle the amount of players that we've had showing for these open operations. We are back now, hopefully I was able to some guys up to the level we need for you all!      Come play larger scale Arma 3 cooperative sessions, without having to join a group or download mods, IP below.      Game: Arma 3      Server Name: Open Operations      IP: 64.71.72.57      Port: 2302      Password: 123         Open Play Sundays are weekly operations for people who want to play Arma in a large group, but who don't have the time or the know how to join a unit.         Open Play means all are welcome. No required mods, no required applications, no required training. You show, you play.         When:      Sund

In [159]:
row = cluster_df.iloc[3]
print(processed_df[processed_df['questionText'] == row['questionText']].iloc[0]['summary'])
row['clusters']

Most commenters talk about the best bike racks. Some commenters say that it is better to attach a bike to a roof rack or a hitch. One commenter says that the only difference in bones two and bones three is that one holds two bikes and one holds three. Another commenter says that the bones three is more stable.


[['I would recommend getting a hitch installed',
  'If you cannot attach bikes to a roof rack or a hitch, use the bones',
  'If you can attach bikes to a roof rack or a hitch, do that',
  "If you don't have a hitch, and have no interest of adding one, go with the Bones"],
 ["The bones 3 doesn't mark up the car",
  'I use roof mounts and the bones 3 in combination now',
  'I prefer the 3',
  'I prefer roof top carriers',
  'I would highly recommend Saris Bones 3',
  'the Bones 3 is only used when I need to carry 3 bikes (or 2 mountain bikes so they have more room since they are so bulky)',
  'I absolutely love the bones 3',
  'Yakima HoldUp is awesome',
  'The bones 3 fit nicely on my station wagon',
  'Swingdaddy is for when we have more than 2 bikes',
  "You should consider the Thelma from Saris, Thule's 916/917, or Yakima HoldUp",
  'The bones 3 is the best bike rack I have ever had',
  'I highly suggest looking into a more permanent/secure solution',
  'The Thule trunk offering is p

In [122]:
row = cluster_df.iloc[5]
print(processed_df[processed_df['questionText'] == row['questionText']].iloc[0]['summary'])
row['clusters']

Most commenters seem to agree that the mini-map on the game league of legends is very unreliable. One commenter says that it was not a glitch but the teleport ability, while another commenter claims that cho'gath does not have teleport.


[["Eve is even harder to locate when she's roaming"],
 ['I am willing to make a video with LoLreplay for evident purpose so I can send in support to Riot or something'],
 ['I have had problems with this on the PBE',
  'Minimap is very misleading',
  'My minimap bugged yesterday'],
 ['I was a malzahar go from mid to bot in 1 sec, without teleport',
  "Cho'gath doesn't have teleport"],
 ['eve was mid 1 sec later'],
 ['Me pinging saved top'],
 ['You got your team killed is even worse']]

In [123]:
mask = cluster_df['questionText'].str.contains("GTA")
row = cluster_df[mask].iloc[0]
print(processed_df[processed_df['questionText'] == row['questionText']].iloc[0]['summary'])
row['clusters']

Several users say that to kill the driver, they would need to get them out of the car. They gave suggestions on how to do this, such as sending someone in to mug the driver or lifting up the tank with a helicopter and then dropping it, which will make a person come out (even if it was initially empty, it turns out). Two users say you can snipe the driver through the tank. Some users talk about how to destroy the tank itself despite not being the question asked.


[["I've found this method to be fairly simple",
  "If you can drop the hook and pick up the tank it's pretty easy from there",
  "I usually steal a cargobob helicopter which is always at merryweather in the city and occasionally at Trevor's airfield (online)"],
 ['The only way to kill the driver without destroying the tank is to pull the driver out by one of the following'],
 ['When you drop the tank someone gets out even though it was unoccupied'],
 ['You can just rail the driver with a heavy sniper round by shooting through the vents in the back of the tank',
  'A tank is not completely unstoppable',
  'grab 4 or 5 other buddies (who each have a RPG) to shoot at the tank at the same time',
  "I've shot down the driver of a Rhino Tank in GTAV Online, with a heavy sniper rifle.",
  'An RPG is even more devastating than other explosive weapons'],
 ["It is easiest to take out a tank by stealing a pedestrian's car"],
 ['I usually take the tank and drop it by a train tunnel that way you ca

### List Reason-Claim Ratio

In [107]:
ground_df = processed_df.explode(['ground'])
print(ground_df.shape)
mask = pd.isnull(ground_df['ground'])
print(ground_df[mask].shape)
ground_df = ground_df[~mask]

(12961, 13)
(5528, 13)


In [108]:
ground_df

Unnamed: 0,claim,ground,warrant,backing,qualifier,rebuttal,questionText,comments,summary,id,subset,my_category,other_claim
1,I will consider Suicune for sure,Suicune is a good choice,,,,,What should I draw next? A FB user said they...,Ooo I'm kinda thinking of Suicune now. Good ch...,Commenters suggest ideas of the next Pokemon f...,0,reddit,1,[I would like to see any of the Legendary Beas...
2,"If I do not choose Suicune now, I will definit...",Suicune is a good choice,,,,,What should I draw next? A FB user said they...,Ooo I'm kinda thinking of Suicune now. Good ch...,Commenters suggest ideas of the next Pokemon f...,0,reddit,1,[I would like to see any of the Legendary Beas...
4,I would love to see Reshiram,If you have the time,,,,,What should I draw next? A FB user said they...,"If you have the time, I'd love to see Reshiram!",Commenters suggest ideas of the next Pokemon f...,0,reddit,1,[I would like to see any of the Legendary Beas...
6,You should girafarig,girafarig is the best pokemon ever created no ...,,,,,What should I draw next? A FB user said they...,You should girafarig because it's the best pok...,Commenters suggest ideas of the next Pokemon f...,0,reddit,1,[I would like to see any of the Legendary Beas...
7,I may wait on that one,I admire your steadfast courage in how awesome...,,,,,What should I draw next? A FB user said they...,While I admire your steadfast courage in how a...,Commenters suggest ideas of the next Pokemon f...,0,reddit,1,[I would like to see any of the Legendary Beas...
...,...,...,...,...,...,...,...,...,...,...,...,...,...
10669,"The device works after rebooting the PC, clear...",Otherwise download the drivers and install them,,,,,"""Could not find Cortex-M device in the JTAG ch...",I faced the same issue and after breaking my h...,Commentators propose many different possible s...,249,stack,1,[I can't download any code on the board from K...
10669,"The device works after rebooting the PC, clear...",Disabling all the drivers for the device in De...,,,,,"""Could not find Cortex-M device in the JTAG ch...",I faced the same issue and after breaking my h...,Commentators propose many different possible s...,249,stack,1,[I can't download any code on the board from K...
10670,A basic LED blink program can be used for testing,"The device works after rebooting the PC, clear...",,,,,"""Could not find Cortex-M device in the JTAG ch...",I faced the same issue and after breaking my h...,Commentators propose many different possible s...,249,stack,1,[I can't download any code on the board from K...
10672,I realised that I had physically inverted the ...,I had the same issue: my JTAG debugger was fun...,,,,,"""Could not find Cortex-M device in the JTAG ch...",I had the same issue: my JTAG debugger was fun...,Commentators propose many different possible s...,249,stack,1,[I can't download any code on the board from K...


In [109]:
import os
from openai import AzureOpenAI

endpoint = "https://s3695-ma1sfp0e-eastus2.cognitiveservices.azure.com/"
subscription_key = "9OzzB1oQ6CBTBiIqkjQSURWXX52LccWTu0scsgfmEcXUVG5CQRV7JQQJ99BDACHYHv6XJ3w3AAAAACOGABC8"
api_version = "2024-12-01-preview"
client = AzureOpenAI(
    api_version=api_version,
    azure_endpoint=endpoint,
    api_key=subscription_key,
)

# from openai import OpenAI
# import os
# import pandas as pd

# # client = OpenAI(
# #     api_key = "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" # RMIT Account
# # )
# from openai import OpenAI
# client = OpenAI(
#     base_url = "http://localhost:8000/v1",
#     api_key = "None"
# )

In [110]:
# model = "gpt-3.5-turbo"
openai.api_key = "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" # RMIT Account
# model = "gpt-4"  # HALLUCINATE
# model = "gpt-4.1-mini"  # HALLUCINATE
model = "gpt-4.1"  # WORKED CHEAPER WITH SHORT OUTPUT
# model = "gpt-4o"  # WORKED
# model = "gpt-4.1-nano"  # HALLUCINATE
# model = "gpt-4o-mini"  # HALLUCINATE

In [111]:
def g_eval(question, premise, claim):
    cur_prompt = prompt.replace('{{Question}}', question).replace('{{Premise}}', premise).replace('{{Claim}}', str(claim))
    ct, ignore = 0, 0
    while True:
        try:
            _response = client.chat.completions.create(
                model=model,
                messages=[{"role": "system", "content": cur_prompt}],
                temperature=2,
                max_tokens=150,
                top_p=1,
                frequency_penalty=0,
                presence_penalty=0,
                stop=None,
                n=5
            )
            time.sleep(0.5)

            all_responses = [_response.choices[i].message.content for i in
                             range(len(_response.choices))]
            ct += 1
            break
        except Exception as e:
            print(e)
            if ("limit" in str(e)):
                time.sleep(2)
            else:
                ignore += 1
                print('ignored', ignore)

                break
                
    return all_responses

In [112]:
import statistics
def process_g_eval(g_eval_annotation):
    g_eval_scores = []
    for annotation in g_eval_annotation:
        score_find = re.findall("[0-9]", annotation)
        if len(score_find) > 0 and 1 <= int(score_find[0]) <= 5:
            g_eval_scores += [int(score_find[0])]
    
    return g_eval_scores

In [113]:
prompt = """
You will be given a community question, a premise **A** and a list of other claims **B** extracted from the social comments answering the question.

Your task is to assess the **degree to which premise **A** supports each claim from **B** list, using a scale from 1 to 5.

Please make sure you read and understand these instructions carefully. Please keep this document open while reviewing, and refer to it as needed.

Evaluation Criteria:
Entailment Strength (1–5) — how strongly the **premise A** logically supports each claim from **B**

Evaluation Steps:

1. Read both the premise **A** and each claim from list **B** carefully.
2. Determine if premise **A** logically follows each claim from list **B**.
3. For each claim from list **B**, assign an **entailment strength score from 1 to 5** by premise **A** according to the guideline above.
4. Return all scores as a Python list

Example:


Question:
{{Question}}

Premise A:

{{Premise}}

List Claim B:

{{Claim}}


Evaluation Form (score ONLY):

- Entailment Strength:
"""

In [114]:
# row = ground_df.iloc[0]
# row = ground_df.iloc[10]
# row = ground_df.iloc[20]
row = ground_df.iloc[50]
print(row[['ground', 'other_claim']].tolist())
output_list = g_eval(row['questionText'], row['ground'], row['other_claim'])
print(output_list)
# score = statistics.mean(process_g_eval(output_list))
# print(score)

['there are only 4 straps (not 6 as with Saris)', ['the Bones 2 is preferable for regular use', 'the Bones 3 is only used when I need to carry 3 bikes (or 2 mountain bikes so they have more room since they are so bulky)', 'it is hard to get the straps to stay tight on the Saris racks', 'the side straps on the Saris are unnecessary', 'one advantage of the Saris racks is that the carrying arms are adjustable so that you can bring them closer together for smaller frame bikes', "You should consider the Thelma from Saris, Thule's 916/917, or Yakima HoldUp", 'You should stick with the Bones', 'Yakima HoldUp is awesome', 'Swingdaddy is for when we have more than 2 bikes', 'If you can attach bikes to a roof rack or a hitch, do that', 'If you cannot attach bikes to a roof rack or a hitch, use the bones', 'You only need things like a workstand or a mountain bike once a month max', 'They are a bit cash to own', 'The saris 2 can only take two bikes securely', 'The bones 3 is the best bike rack I h

In [661]:
# row = ground_df.iloc[0]
# row = ground_df.iloc[10]
# row = ground_df.iloc[20]
row = ground_df.iloc[50]
print(row[['ground', 'other_claim']].tolist())
output_list = g_eval(row['questionText'], row['ground'], row['other_claim'])
print(output_list)
# score = statistics.mean(process_g_eval(output_list))
# print(score)

['there are only 4 straps (not 6 as with Saris)', ['the Bones 2 is preferable for regular use', 'the Bones 3 is only used when I need to carry 3 bikes (or 2 mountain bikes so they have more room since they are so bulky)', 'it is hard to get the straps to stay tight on the Saris racks', 'the side straps on the Saris are unnecessary', 'one advantage of the Saris racks is that the carrying arms are adjustable so that you can bring them closer together for smaller frame bikes', "You should consider the Thelma from Saris, Thule's 916/917, or Yakima HoldUp", 'You should stick with the Bones', 'Yakima HoldUp is awesome', 'Swingdaddy is for when we have more than 2 bikes', 'If you can attach bikes to a roof rack or a hitch, do that', 'If you cannot attach bikes to a roof rack or a hitch, use the bones', 'You only need things like a workstand or a mountain bike once a month max', 'They are a bit cash to own', 'The saris 2 can only take two bikes securely', 'The bones 3 is the best bike rack I h

#### Run

In [115]:
def prompted_g_eval_kp(root_path, domain, domain_df, save_step=10):
    src_path = f"{root_path}/{domain}"
    Path(src_path).mkdir(parents=True, exist_ok=True)
    claim_entailment_scores = []
#     claim_split_gold_list = []
 
    file_names = listdir(src_path)
    postfix = [re.split("[_.]", name)[1]
               for name in listdir(src_path)
               ]
    start = 0
    if 'done' in postfix:
        print(domain, ": ", "Loaded saved file. Done")
        new_domain_df = pd.read_pickle(f"{src_path}/{domain}_done.pkl")
        return new_domain_df
    elif len(postfix) > 0:
        last_index = max([int(idx) for idx in postfix if idx != 'done'])
        last_domain_df = pd.read_pickle(f"{src_path}/{domain}_{last_index}.pkl")
        claim_entailment_scores = last_domain_df['claim_entailment_scores'].tolist()
#         claim_split_gold_list = last_domain_df['claim_split_gold'].tolist()
        start = last_index
        print(domain, "Loaded saved file. Continuing")
    else:
        print(domain, "Start new process.")

    for i, (_, row) in tqdm(enumerate(domain_df.iterrows()), total=domain_df.shape[0]):
        if i < start:
            continue

        claim_entailment_scores += [g_eval(row['questionText'], row['ground'], row['other_claim'])]
        
        if (i + 1) % save_step == 0:
            save_df = domain_df.iloc[:i + 1]
            save_df.insert(0, 'claim_entailment_scores', claim_entailment_scores)
#             save_df.insert(0, 'claim_split_gold', claim_split_gold_list)
            save_df.to_pickle(f"{src_path}/{domain}_{i + 1}.pkl")

    new_domain_df = domain_df.iloc[:i + 1]
    new_domain_df.insert(0, 'claim_entailment_scores', claim_entailment_scores)
#     new_domain_df.insert(0, 'claim_split_gold', claim_split_gold_list)
    new_domain_df.to_pickle(f"{src_path}/{domain}_done.pkl")
    return new_domain_df

In [116]:
ground_df['my_category'] = 1

In [117]:
root_path = f"/mnt/e/Desktop/PHD READING/Quantitative_Fact_Check/g_eval_reason_claim_ratio_list_matching_5/gpt_4.1/few_shot_test_set"

inputs = [(root_path,
           domain,
           ground_df[ground_df['my_category'] == domain].reset_index(drop=True)
           )
          for domain in ground_df['my_category'].unique()]

In [118]:
num_workers = 1

In [119]:
from datasets import concatenate_datasets, load_dataset
from datasets import Dataset, DatasetDict
import pandas as pd
import numpy as np
import torch
import os
import ast
import time
from multiprocessing import Pool
from pathlib import Path
from os import listdir
from tqdm import tqdm
import random
import re
import math
# import spacy
# pd.set_option('display.max_colwidth', None)

In [120]:
start_time = time.time()
with Pool(num_workers) as processor:
    data = processor.starmap(prompted_g_eval_kp, inputs)
print("TIME ELAPSED", time.time() - start_time)

1 :  Loaded saved file. Done
TIME ELAPSED 0.30991244316101074


In [120]:
# cluster_df['cluster_id'] = cluster_df['clusters'].apply(lambda x: [i for i in range(len(x))])
# temp_df = cluster_df.explode(['cluster_id', 'clusters']).explode(['clusters'])
# temp_df = temp_df.rename(columns={'clusters': 'claim'})
# temp_2_df = processed_df[['subset', 'my_category', 'id', 'questionText', 'claim', 'ground']]
# temp_2_df = temp_2_df.drop_duplicates(subset=['subset', 'my_category', 'id', 'questionText', 'claim'])
# temp_df = temp_df.merge(temp_2_df)
# cluster_df = temp_df.groupby(['subset', 'my_category', 'id', 'questionText', 'cluster_id'], sort=False).agg({
#     'claim': (lambda x: x.tolist()),
#     'ground': (lambda x: x.tolist())
# }).reset_index()
# # cluster_df = cluster_df.rename(columns={'claim': 's1_clusters'})
# cluster_df

#### Read

In [121]:
processed_ratio_df = pd.read_pickle(root_path + "/1/1_done.pkl")

In [122]:
processed_ratio_df

Unnamed: 0,claim_entailment_scores,claim,ground,warrant,backing,qualifier,rebuttal,questionText,comments,summary,id,subset,my_category,other_claim
0,"[[5, 4, 1, 1, 1, 1, 2], [4, 5, 1, 1, 1, 1, 2],...",I will consider Suicune for sure,Suicune is a good choice,,,,,What should I draw next? A FB user said they...,Ooo I'm kinda thinking of Suicune now. Good ch...,Commenters suggest ideas of the next Pokemon f...,0,reddit,1,[I would like to see any of the Legendary Beas...
1,"[[5, 5, 1, 1, 1, 1, 1], [5, 4, 1, 1, 1, 1, 1],...","If I do not choose Suicune now, I will definit...",Suicune is a good choice,,,,,What should I draw next? A FB user said they...,Ooo I'm kinda thinking of Suicune now. Good ch...,Commenters suggest ideas of the next Pokemon f...,0,reddit,1,[I would like to see any of the Legendary Beas...
2,"[[2, 2, 2, 1, 1, 1, 2], [2, 4, 4, 1, 1, 1, 2],...",I would love to see Reshiram,If you have the time,,,,,What should I draw next? A FB user said they...,"If you have the time, I'd love to see Reshiram!",Commenters suggest ideas of the next Pokemon f...,0,reddit,1,[I would like to see any of the Legendary Beas...
3,"[[1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 1],...",You should girafarig,girafarig is the best pokemon ever created no ...,,,,,What should I draw next? A FB user said they...,You should girafarig because it's the best pok...,Commenters suggest ideas of the next Pokemon f...,0,reddit,1,[I would like to see any of the Legendary Beas...
4,"[[1, 1, 1, 1, 1, 1, 5], [1, 1, 1, 1, 1, 1, 5],...",I may wait on that one,I admire your steadfast courage in how awesome...,,,,,What should I draw next? A FB user said they...,While I admire your steadfast courage in how a...,Commenters suggest ideas of the next Pokemon f...,0,reddit,1,[I would like to see any of the Legendary Beas...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7428,[Thanks! Here are the **scale anchor/definitio...,"The device works after rebooting the PC, clear...",Otherwise download the drivers and install them,,,,,"""Could not find Cortex-M device in the JTAG ch...",I faced the same issue and after breaking my h...,Commentators propose many different possible s...,249,stack,1,[I can't download any code on the board from K...
7429,[Scoring follows rubric:\n1 –\thardly or irrel...,"The device works after rebooting the PC, clear...",Disabling all the drivers for the device in De...,,,,,"""Could not find Cortex-M device in the JTAG ch...",I faced the same issue and after breaking my h...,Commentators propose many different possible s...,249,stack,1,[I can't download any code on the board from K...
7430,"[[1, 1, 1, 2, 1, 3, 3, 2, 2, 1, 1, 1, 1, 1, 1,...",A basic LED blink program can be used for testing,"The device works after rebooting the PC, clear...",,,,,"""Could not find Cortex-M device in the JTAG ch...",I faced the same issue and after breaking my h...,Commentators propose many different possible s...,249,stack,1,[I can't download any code on the board from K...
7431,"[[4, 2, 5, 3, 1, 1, 2, 2, 3, 1, 1, 1, 1, 1, 1,...",I realised that I had physically inverted the ...,I had the same issue: my JTAG debugger was fun...,,,,,"""Could not find Cortex-M device in the JTAG ch...",I had the same issue: my JTAG debugger was fun...,Commentators propose many different possible s...,249,stack,1,[I can't download any code on the board from K...


In [123]:
# processed_final_df['claim_entailment_scores_processed'] = processed_final_df['claim_entailment_scores'].\
#     apply(lambda x: [ast.literal_eval(
#                         re.sub(r"((, ,) *)+", ",", re.findall(r"\[[0-9\n \,]+\]", attempt)[0].replace("0", "").replace("[,]", "[]").replace("[, ", "[").replace("[,", "[")).replace(",,", ",")
#                     )
#                      for attempt in x 
#                      if attempt != None and len(re.findall(r"\[[0-9\n \,]+\]", attempt)) > 0])

In [124]:
processed_ratio_df['claim_entailment_scores_processed'] = processed_ratio_df['claim_entailment_scores'].\
    apply(lambda x: [ast.literal_eval(
                        re.sub(r",{2,}", ",",
                        re.sub(r"((, +,) *)+", ",", 
                               re.sub(r"\[ *, *", "[",
                                    re.findall(r"\[[0-9\n \,]+\]", attempt)[0].replace("0", "").replace("[,]", "[]"))
                              )
                        ).replace("[,", "[")
                        
                    )
                     for attempt in x 
                     if attempt != None and len(re.findall(r"\[[0-9\n \,]+\]", attempt)) > 0])

In [125]:
# # CHECK
# mask = processed_final_df['claim_entailment_scores_processed'].str.len() != processed_final_df['claim_entailment_scores'].str.len()
# processed_final_df[mask].iloc[5]['claim_entailment_scores']

In [126]:
processed_ratio_df['claim_entailment_scores_processed'] = processed_ratio_df.\
    apply(lambda row: [attempt[:len(row['other_claim'])] 
                       for attempt in row['claim_entailment_scores_processed'] 
                       if len(attempt) >= len(row['other_claim'])], axis=1)

In [127]:
processed_ratio_df['claim_entailment_scores_avg'] = processed_ratio_df['claim_entailment_scores_processed'].\
    apply(lambda x: np.mean(x, axis=0))

  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)


In [128]:
# sample = processed_ratio_df.iloc[19]['claim_entailment_scores_processed']
# sample

In [129]:
mask = processed_ratio_df['claim_entailment_scores_avg'].str.len() != processed_ratio_df['other_claim'].str.len()
processed_ratio_df[mask]['claim_entailment_scores_processed'].iloc[0]

[]

In [130]:
# len(ast.literal_eval(processed_final_df[mask]['claim_entailment_scores'].iloc[0][0]))

In [131]:
len(processed_ratio_df[mask]['other_claim'].iloc[0])

26

In [132]:
processed_ratio_df[mask].shape

(1199, 16)

In [133]:
processed_ratio_df = processed_ratio_df[~mask]

In [134]:
processed_ratio_df = processed_ratio_df.explode(['other_claim', 'claim_entailment_scores_avg'])

In [135]:
processed_ratio_df.shape

(136546, 16)

In [136]:
processed_ratio_df = processed_ratio_df.groupby(['subset', 'id', 'questionText', 'claim', 'other_claim']).agg({
    'ground': (lambda x: x.tolist()),
    'claim_entailment_scores_avg': (lambda x: x.tolist())
}).reset_index()
processed_ratio_df = processed_ratio_df.rename(columns={'claim_entailment_scores_avg': 'ground_support_other_claim_entailment'})
processed_ratio_df

Unnamed: 0,subset,id,questionText,claim,other_claim,ground,ground_support_other_claim_entailment
0,reddit,0,What should I draw next? A FB user said they...,I may wait on that one,I will consider Suicune for sure,[I admire your steadfast courage in how awesom...,[1.0]
1,reddit,0,What should I draw next? A FB user said they...,I may wait on that one,I would like to see any of the Legendary Beast...,[I admire your steadfast courage in how awesom...,[1.0]
2,reddit,0,What should I draw next? A FB user said they...,I may wait on that one,I would love to see Reshiram,[I admire your steadfast courage in how awesom...,[1.0]
3,reddit,0,What should I draw next? A FB user said they...,I may wait on that one,"If I do not choose Suicune now, I will definit...",[I admire your steadfast courage in how awesom...,[1.0]
4,reddit,0,What should I draw next? A FB user said they...,I may wait on that one,Pancham,[I admire your steadfast courage in how awesom...,[1.0]
...,...,...,...,...,...,...,...
98193,stack,249,"""Could not find Cortex-M device in the JTAG ch...",the solution proposed by splatapus on the TI p...,The Keil forums might have some more ideas you...,"[I had the same issue, I tried various propose...","[1.0, 2.0]"
98194,stack,249,"""Could not find Cortex-M device in the JTAG ch...",the solution proposed by splatapus on the TI p...,"The device works after rebooting the PC, clear...","[I had the same issue, I tried various propose...","[1.0, 2.0]"
98195,stack,249,"""Could not find Cortex-M device in the JTAG ch...",the solution proposed by splatapus on the TI p...,Try lowering the JTAG frequency,"[I had the same issue, I tried various propose...","[1.0, 3.0]"
98196,stack,249,"""Could not find Cortex-M device in the JTAG ch...",the solution proposed by splatapus on the TI p...,What can be causing the problem?,"[I had the same issue, I tried various propose...","[1.0, 3.0]"


In [137]:
# threshold = 4
threshold = 3
# threshold = 2.5
# threshold = 2
# threshold = 1

In [138]:
processed_ratio_df['ground_support_other_ratio'] = processed_ratio_df['ground_support_other_claim_entailment'].\
    apply(lambda x: 1 if str(x) == 'nan' else len([entail for entail in x if entail >= threshold]) / len(x))

In [139]:
processed_ratio_df

Unnamed: 0,subset,id,questionText,claim,other_claim,ground,ground_support_other_claim_entailment,ground_support_other_ratio
0,reddit,0,What should I draw next? A FB user said they...,I may wait on that one,I will consider Suicune for sure,[I admire your steadfast courage in how awesom...,[1.0],0.0
1,reddit,0,What should I draw next? A FB user said they...,I may wait on that one,I would like to see any of the Legendary Beast...,[I admire your steadfast courage in how awesom...,[1.0],0.0
2,reddit,0,What should I draw next? A FB user said they...,I may wait on that one,I would love to see Reshiram,[I admire your steadfast courage in how awesom...,[1.0],0.0
3,reddit,0,What should I draw next? A FB user said they...,I may wait on that one,"If I do not choose Suicune now, I will definit...",[I admire your steadfast courage in how awesom...,[1.0],0.0
4,reddit,0,What should I draw next? A FB user said they...,I may wait on that one,Pancham,[I admire your steadfast courage in how awesom...,[1.0],0.0
...,...,...,...,...,...,...,...,...
98193,stack,249,"""Could not find Cortex-M device in the JTAG ch...",the solution proposed by splatapus on the TI p...,The Keil forums might have some more ideas you...,"[I had the same issue, I tried various propose...","[1.0, 2.0]",0.0
98194,stack,249,"""Could not find Cortex-M device in the JTAG ch...",the solution proposed by splatapus on the TI p...,"The device works after rebooting the PC, clear...","[I had the same issue, I tried various propose...","[1.0, 2.0]",0.0
98195,stack,249,"""Could not find Cortex-M device in the JTAG ch...",the solution proposed by splatapus on the TI p...,Try lowering the JTAG frequency,"[I had the same issue, I tried various propose...","[1.0, 3.0]",0.5
98196,stack,249,"""Could not find Cortex-M device in the JTAG ch...",the solution proposed by splatapus on the TI p...,What can be causing the problem?,"[I had the same issue, I tried various propose...","[1.0, 3.0]",0.5


In [140]:
processed_ratio_df['claim_pair'] = processed_ratio_df.apply(lambda row: sorted([row['claim'], row['other_claim']]) , axis=1)
processed_ratio_df['claim_pair'] = processed_ratio_df['claim_pair'].astype(str)

In [141]:
processed_ratio_df

Unnamed: 0,subset,id,questionText,claim,other_claim,ground,ground_support_other_claim_entailment,ground_support_other_ratio,claim_pair
0,reddit,0,What should I draw next? A FB user said they...,I may wait on that one,I will consider Suicune for sure,[I admire your steadfast courage in how awesom...,[1.0],0.0,"['I may wait on that one', 'I will consider Su..."
1,reddit,0,What should I draw next? A FB user said they...,I may wait on that one,I would like to see any of the Legendary Beast...,[I admire your steadfast courage in how awesom...,[1.0],0.0,"['I may wait on that one', 'I would like to se..."
2,reddit,0,What should I draw next? A FB user said they...,I may wait on that one,I would love to see Reshiram,[I admire your steadfast courage in how awesom...,[1.0],0.0,"['I may wait on that one', 'I would love to se..."
3,reddit,0,What should I draw next? A FB user said they...,I may wait on that one,"If I do not choose Suicune now, I will definit...",[I admire your steadfast courage in how awesom...,[1.0],0.0,"['I may wait on that one', 'If I do not choose..."
4,reddit,0,What should I draw next? A FB user said they...,I may wait on that one,Pancham,[I admire your steadfast courage in how awesom...,[1.0],0.0,"['I may wait on that one', 'Pancham']"
...,...,...,...,...,...,...,...,...,...
98193,stack,249,"""Could not find Cortex-M device in the JTAG ch...",the solution proposed by splatapus on the TI p...,The Keil forums might have some more ideas you...,"[I had the same issue, I tried various propose...","[1.0, 2.0]",0.0,['The Keil forums might have some more ideas y...
98194,stack,249,"""Could not find Cortex-M device in the JTAG ch...",the solution proposed by splatapus on the TI p...,"The device works after rebooting the PC, clear...","[I had the same issue, I tried various propose...","[1.0, 2.0]",0.0,"['The device works after rebooting the PC, cle..."
98195,stack,249,"""Could not find Cortex-M device in the JTAG ch...",the solution proposed by splatapus on the TI p...,Try lowering the JTAG frequency,"[I had the same issue, I tried various propose...","[1.0, 3.0]",0.5,"['Try lowering the JTAG frequency', 'the solut..."
98196,stack,249,"""Could not find Cortex-M device in the JTAG ch...",the solution proposed by splatapus on the TI p...,What can be causing the problem?,"[I had the same issue, I tried various propose...","[1.0, 3.0]",0.5,"['What can be causing the problem?', 'the solu..."


In [142]:
# CHECK
processed_ratio_df.groupby(['subset', 'id', 'questionText', 'claim_pair']).filter(lambda x: len(x) == 3)

Unnamed: 0,subset,id,questionText,claim,other_claim,ground,ground_support_other_claim_entailment,ground_support_other_ratio,claim_pair


In [143]:
processed_ratio_df = processed_ratio_df.groupby(['subset', 'id', 'questionText', 'claim_pair']).agg({
    'ground_support_other_ratio': (lambda x: x.tolist()),
    'claim': (lambda x: x.tolist()),
    'other_claim': (lambda x: x.tolist()),
    'ground': (lambda x: x.tolist())
}).reset_index()

In [144]:
processed_ratio_df['ground_x'] = processed_ratio_df['ground'].apply(lambda x: x[0])
processed_ratio_df['ground_y'] = processed_ratio_df['ground'].apply(lambda x: x[1] if len(x) > 1 else np.nan)
processed_ratio_df = processed_ratio_df.drop(columns=['ground'])

In [145]:
processed_ratio_df['ground_support_other_ratio_avg'] = processed_ratio_df['ground_support_other_ratio'].\
    apply(lambda x: statistics.mean(x))

In [146]:
processed_ratio_df['claim'] = processed_ratio_df['claim'].apply(lambda x: x[0])
processed_ratio_df['other_claim'] = processed_ratio_df['other_claim'].apply(lambda x: x[0])

In [147]:
processed_ratio_df

Unnamed: 0,subset,id,questionText,claim_pair,ground_support_other_ratio,claim,other_claim,ground_x,ground_y,ground_support_other_ratio_avg
0,reddit,0,What should I draw next? A FB user said they...,"['I may wait on that one', 'I will consider Su...","[0.0, 0.0]",I may wait on that one,I will consider Suicune for sure,[I admire your steadfast courage in how awesom...,[Suicune is a good choice],0.00
1,reddit,0,What should I draw next? A FB user said they...,"['I may wait on that one', 'I would like to se...",[0.0],I may wait on that one,I would like to see any of the Legendary Beast...,[I admire your steadfast courage in how awesom...,,0.00
2,reddit,0,What should I draw next? A FB user said they...,"['I may wait on that one', 'I would love to se...","[0.0, 0.0]",I may wait on that one,I would love to see Reshiram,[I admire your steadfast courage in how awesom...,[If you have the time],0.00
3,reddit,0,What should I draw next? A FB user said they...,"['I may wait on that one', 'If I do not choose...","[0.0, 0.0]",I may wait on that one,"If I do not choose Suicune now, I will definit...",[I admire your steadfast courage in how awesom...,[Suicune is a good choice],0.00
4,reddit,0,What should I draw next? A FB user said they...,"['I may wait on that one', 'Pancham']",[0.0],I may wait on that one,Pancham,[I admire your steadfast courage in how awesom...,,0.00
...,...,...,...,...,...,...,...,...,...,...
75829,stack,249,"""Could not find Cortex-M device in the JTAG ch...","['Try lowering the JTAG frequency', 'What can ...",[0.0],What can be causing the problem?,Try lowering the JTAG frequency,[I can't download any code on the board from K...,,0.00
75830,stack,249,"""Could not find Cortex-M device in the JTAG ch...","['Try lowering the JTAG frequency', 'the solut...",[0.5],the solution proposed by splatapus on the TI p...,Try lowering the JTAG frequency,"[I had the same issue, I tried various propose...",,0.50
75831,stack,249,"""Could not find Cortex-M device in the JTAG ch...","['What can be causing the problem?', 'my JTAG ...",[0.0],What can be causing the problem?,my JTAG debugger was functioning well on both ...,[I can't download any code on the board from K...,,0.00
75832,stack,249,"""Could not find Cortex-M device in the JTAG ch...","['What can be causing the problem?', 'the solu...","[0.0, 0.5]",What can be causing the problem?,the solution proposed by splatapus on the TI p...,[I can't download any code on the board from K...,"[I had the same issue, I tried various propose...",0.25


In [148]:
# temp_df = processed_df[['subset', 'id', 'questionText', 'claim', 'ground']]
# temp_df = temp_df.astype(str).drop_duplicates()
# temp_df['id'] = temp_df['id'].astype(int)
# temp_df = processed_final_df.merge(temp_df, how='left')
# temp_df['ground'] = temp_df['ground'].astype(str)
# temp_df_2 = processed_ratio_df[::]
# temp_df_2['ground'] = temp_df_2['ground'].astype(str)
# temp_df = temp_df.merge(temp_df_2, how='left')
# temp_df

In [149]:
# temp_df = processed_df[['subset', 'id', 'questionText', 'claim', 'ground']]
# temp_df = temp_df.drop_duplicates(subset=['subset', 'id', 'questionText', 'claim'])
# temp_df = final_claim_df.merge(temp_df, how='left', on=['subset', 'id', 'questionText', 'other_claim'])

In [150]:
# temp_df = processed_df.drop(columns=['other_claim']).rename(columns={'claim': 'other_claim'})
# temp_df = temp_df[['subset', 'id', 'questionText', 'other_claim', 'ground']]
# temp_df = temp_df.drop_duplicates(subset=['subset', 'id', 'questionText', 'other_claim'])
# temp_df = final_claim_df.merge(temp_df, how='left', on=['subset', 'id', 'questionText', 'other_claim'])

In [151]:
final_claim_df = processed_final_df.merge(processed_ratio_df, how='left')

In [152]:
final_claim_df['ground_support_other_ratio_avg'] = final_claim_df['ground_support_other_ratio_avg'].fillna(1)

In [153]:
final_claim_df

Unnamed: 0,subset,id,questionText,claim_pair,claim_entailment_scores_avg,claim,other_claim,claim_entailment_scores_final_avg,ground_support_other_ratio,ground_x,ground_y,ground_support_other_ratio_avg
0,reddit,0,What should I draw next? A FB user said they...,"['I may wait on that one', 'I will consider Su...","[2.0, 3.2]",I will consider Suicune for sure,I may wait on that one,2.6,,,,1.00
1,reddit,0,What should I draw next? A FB user said they...,"['I may wait on that one', 'I would like to se...","[1.0, 3.0]",I would like to see any of the Legendary Beast...,I may wait on that one,2.0,,,,1.00
2,reddit,0,What should I draw next? A FB user said they...,"['I may wait on that one', 'I would love to se...","[1.0, 2.2]",I would love to see Reshiram,I may wait on that one,1.6,,,,1.00
3,reddit,0,What should I draw next? A FB user said they...,"['I may wait on that one', 'If I do not choose...","[2.6, 3.6]","If I do not choose Suicune now, I will definit...",I may wait on that one,3.1,,,,1.00
4,reddit,0,What should I draw next? A FB user said they...,"['I may wait on that one', 'Pancham']","[1.0, 2.2]",Pancham,I may wait on that one,1.6,,,,1.00
...,...,...,...,...,...,...,...,...,...,...,...,...
113152,stack,249,"""Could not find Cortex-M device in the JTAG ch...","['Try lowering the JTAG frequency', 'my JTAG d...","[1.0, 1.0]",Try lowering the JTAG frequency,my JTAG debugger was functioning well on both ...,1.0,,,,1.00
113153,stack,249,"""Could not find Cortex-M device in the JTAG ch...","['Try lowering the JTAG frequency', 'the solut...","[1.0, 1.0]",Try lowering the JTAG frequency,the solution proposed by splatapus on the TI p...,1.0,,,,1.00
113154,stack,249,"""Could not find Cortex-M device in the JTAG ch...","['What can be causing the problem?', 'my JTAG ...","[1.6, 1.0]",What can be causing the problem?,my JTAG debugger was functioning well on both ...,1.3,[0.0],[I can't download any code on the board from K...,,0.00
113155,stack,249,"""Could not find Cortex-M device in the JTAG ch...","['What can be causing the problem?', 'the solu...","[1.6, 1.0]",What can be causing the problem?,the solution proposed by splatapus on the TI p...,1.3,"[0.0, 0.5]",[I can't download any code on the board from K...,"[I had the same issue, I tried various propose...",0.25


#### Clustering

In [154]:
mask = final_claim_df['claim_entailment_scores_final_avg'] > 3
final_claim_df[mask]

Unnamed: 0,subset,id,questionText,claim_pair,claim_entailment_scores_avg,claim,other_claim,claim_entailment_scores_final_avg,ground_support_other_ratio,ground_x,ground_y,ground_support_other_ratio_avg
3,reddit,0,What should I draw next? A FB user said they...,"['I may wait on that one', 'If I do not choose...","[2.6, 3.6]","If I do not choose Suicune now, I will definit...",I may wait on that one,3.100000,,,,1.000000
7,reddit,0,What should I draw next? A FB user said they...,"['I will consider Suicune for sure', 'I would ...","[3.6, 3.8]",I would like to see any of the Legendary Beast...,I will consider Suicune for sure,3.700000,,,,1.000000
9,reddit,0,What should I draw next? A FB user said they...,"['I will consider Suicune for sure', 'If I do ...","[3.4, 4.8]",I will consider Suicune for sure,"If I do not choose Suicune now, I will definit...",4.100000,"[1.0, 1.0]",[Suicune is a good choice],[Suicune is a good choice],1.000000
37,reddit,1,"Historical Hypocrisy (First, a disclaimer: T...","[""The 'idealized' woman at the time was bigger...","[4.6, 5.0]",The 'idealized' woman at the time was bigger t...,The idealized image was larger than the averag...,4.800000,,,,1.000000
41,reddit,1,"Historical Hypocrisy (First, a disclaimer: T...","[""The 'idealized' woman at the time was bigger...","[3.6, 2.8]",The examples they use to try to prove that fat...,The 'idealized' woman at the time was bigger t...,3.200000,[1.0],[the examples almost never include women who a...,,1.000000
...,...,...,...,...,...,...,...,...,...,...,...,...
113051,stack,249,"""Could not find Cortex-M device in the JTAG ch...","['Device is not connected, not powered, or the...","[4.4, 5.0]",No Cortex-M Device found in JTAG chain,"Device is not connected, not powered, or the d...",4.700000,[1.0],[No Cortex-M processor-based device detected (...,,1.000000
113054,stack,249,"""Could not find Cortex-M device in the JTAG ch...","['Device is not connected, not powered, or the...","[2.3333333333333335, 4.0]","Device is not connected, not powered, or the d...",STELLARIS_ICDI_DRIVERS solved the problem with...,3.166667,,,,1.000000
113088,stack,249,"""Could not find Cortex-M device in the JTAG ch...",['I only had to do the first step using the li...,"[4.0, 3.75]",the solution proposed by splatapus on the TI p...,I only had to do the first step using the link...,3.875000,,,,1.000000
113091,stack,249,"""Could not find Cortex-M device in the JTAG ch...",['I realised that I had physically inverted th...,"[3.4, 4.333333333333333]",No Cortex-M Device found in JTAG chain,I realised that I had physically inverted the ...,3.866667,,,,1.000000


In [155]:
final_claim_df

Unnamed: 0,subset,id,questionText,claim_pair,claim_entailment_scores_avg,claim,other_claim,claim_entailment_scores_final_avg,ground_support_other_ratio,ground_x,ground_y,ground_support_other_ratio_avg
0,reddit,0,What should I draw next? A FB user said they...,"['I may wait on that one', 'I will consider Su...","[2.0, 3.2]",I will consider Suicune for sure,I may wait on that one,2.6,,,,1.00
1,reddit,0,What should I draw next? A FB user said they...,"['I may wait on that one', 'I would like to se...","[1.0, 3.0]",I would like to see any of the Legendary Beast...,I may wait on that one,2.0,,,,1.00
2,reddit,0,What should I draw next? A FB user said they...,"['I may wait on that one', 'I would love to se...","[1.0, 2.2]",I would love to see Reshiram,I may wait on that one,1.6,,,,1.00
3,reddit,0,What should I draw next? A FB user said they...,"['I may wait on that one', 'If I do not choose...","[2.6, 3.6]","If I do not choose Suicune now, I will definit...",I may wait on that one,3.1,,,,1.00
4,reddit,0,What should I draw next? A FB user said they...,"['I may wait on that one', 'Pancham']","[1.0, 2.2]",Pancham,I may wait on that one,1.6,,,,1.00
...,...,...,...,...,...,...,...,...,...,...,...,...
113152,stack,249,"""Could not find Cortex-M device in the JTAG ch...","['Try lowering the JTAG frequency', 'my JTAG d...","[1.0, 1.0]",Try lowering the JTAG frequency,my JTAG debugger was functioning well on both ...,1.0,,,,1.00
113153,stack,249,"""Could not find Cortex-M device in the JTAG ch...","['Try lowering the JTAG frequency', 'the solut...","[1.0, 1.0]",Try lowering the JTAG frequency,the solution proposed by splatapus on the TI p...,1.0,,,,1.00
113154,stack,249,"""Could not find Cortex-M device in the JTAG ch...","['What can be causing the problem?', 'my JTAG ...","[1.6, 1.0]",What can be causing the problem?,my JTAG debugger was functioning well on both ...,1.3,[0.0],[I can't download any code on the board from K...,,0.00
113155,stack,249,"""Could not find Cortex-M device in the JTAG ch...","['What can be causing the problem?', 'the solu...","[1.6, 1.0]",What can be causing the problem?,the solution proposed by splatapus on the TI p...,1.3,"[0.0, 0.5]",[I can't download any code on the board from K...,"[I had the same issue, I tried various propose...",0.25


In [156]:
mask = final_claim_df['ground_support_other_ratio_avg'] > 0.3
mask &= final_claim_df['ground_support_other_ratio_avg'] < 0.5
mask &= final_claim_df['claim_entailment_scores_final_avg'] > threshold
final_claim_df[mask].head(10)

Unnamed: 0,subset,id,questionText,claim_pair,claim_entailment_scores_avg,claim,other_claim,claim_entailment_scores_final_avg,ground_support_other_ratio,ground_x,ground_y,ground_support_other_ratio_avg
4056,reddit,18,"[Arma 3] Open Play Sunday, Again! If you wan...","['Come on out', 'I will definitely be there ne...","[3.6666666666666665, 2.4]",Come on out,I will definitely be there next week,3.033333,"[0.75, 0.0]",[it is a great time especially now that we hav...,[I am already committed to something else this...,0.375
6464,reddit,26,"To my fellow New Yorkers, where can I find the...","['Storm King mountain is fun as hell', 'upstat...","[2.5, 5.0]",Storm King mountain is fun as hell,upstate has some amazing twisties and mountain...,3.75,[0.3333333333333333],[Storm King mountain is in the same general ar...,,0.333333
7333,reddit,32,Divine holy weapon's modifier is weaker than a...,"['cleric abilities might be overpowered', 'spe...","[4.25, 5.0]",cleric abilities might be overpowered,spells are too powerful,4.625,"[0.3333333333333333, 0.5]",[Searing light does a lot of damage and stun-l...,[I only want to use it to augment my male pala...,0.416667
10761,reddit,55,League of legends vs CS:GO So me and my frie...,"['Csgo rewards reaction speeds massively', 'Lo...","[4.2, 3.8]",Csgo rewards reaction speeds massively,Lol rewards natural unearned skill much less t...,4.0,"[0.6666666666666666, 0.0]","[Most of that is natural skill, Natural skill ...",[A lot of lol is knowledge of the game itself ...,0.333333
10762,reddit,55,League of legends vs CS:GO So me and my frie...,"['Csgo rewards reaction speeds massively', 'Re...","[4.4, 5.0]",Csgo rewards reaction speeds massively,Reactions are important in lol but not nearly ...,4.7,[0.3333333333333333],"[Most of that is natural skill, Natural skill ...",,0.333333
16311,reddit,77,WTF is up with Mental Might? I just started ...,"[""I'm getting EXTREMELY tired of playing again...","[4.8, 5.0]",I'm getting EXTREMELY tired of playing against...,I'm getting very salty at only playing against it,4.9,"[0.5, 0.3333333333333333]","[It's the same strategy every game, it either ...","[I've won most of my games, even if some have ...",0.416667
16849,reddit,82,[WDIS] Wes Welker or Reggie Wayne? Welker @ ...,"['Brady will be looking for Welker a lot', 'We...","[4.5, 4.0]",Brady will be looking for Welker a lot,Welker always gets good targets,4.25,"[0.6666666666666666, 0.0]","[Gronk is out, SF's weakness is slot receivers...",[they like to use a slot receiver more against...,0.333333
18711,reddit,88,Project CARS Vs Assetto Corsa. Which one shoul...,"['PCars is more arcade than AC', 'The cars in ...","[4.333333333333333, 5.0]",PCars is more arcade than AC,The cars in AC are harder (and more realistic)...,4.666667,[0.4],[The driving is a bit more arcade than gran tu...,,0.4
22135,reddit,98,Can you guys recommend any introductory books ...,"[""I would highly recommend Schaum's Outline on...","[5.0, 5.0]",I would highly recommend Schaum's Outline on L...,Schaum's Outline on Linear Algebra is helping ...,5.0,[0.3333333333333333],[Schaum's Outline on Linear Algebra does exist...,,0.333333
25026,reddit,112,Bob Long Defiant? 2 maybe? I asked you fello...,['Leave that gun to someone who knows what the...,"[4.4, 4.2]",Leave that gun to someone who knows what they ...,Stick with the intimidator or marq line of mar...,4.3,[0.3333333333333333],[the stuttering is a possibility of greater is...,,0.333333


In [157]:
# def fix_duplicate_claim_reason(row):
#     row['isFixed'] = False
#     if str(row['ground']) != 'nan' and row['other_claim'] in row['ground']:
#         row['claim_entailment_scores_final_avg'] = 5
#         row['ground_support_other_ratio'] = 1
#         row['isFixed'] = True
        
        
#     return row
    
# # final_claim_df = 
# final_claim_df = final_claim_df.apply(fix_duplicate_claim_reason, axis=1)

In [158]:
def fix_duplicate_claim_reason(row):
    row['isFixed'] = False
    if (str(row['ground_x']) != 'nan' and row['other_claim'] in row['ground_x']) or (str(row['ground_y']) != 'nan' and row['claim'] in row['ground_y']):
        row['claim_entailment_scores_final_avg'] = 5
        row['ground_support_other_ratio_avg'] = 1
        row['isFixed'] = True
        
        
    return row
    
# final_claim_df = 
final_claim_df = final_claim_df.apply(fix_duplicate_claim_reason, axis=1)

In [159]:
final_claim_df[final_claim_df['isFixed'] & final_claim_df['questionText'].str.contains("GTA")]

Unnamed: 0,subset,id,questionText,claim_pair,claim_entailment_scores_avg,claim,other_claim,claim_entailment_scores_final_avg,ground_support_other_ratio,ground_x,ground_y,ground_support_other_ratio_avg,isFixed
89343,stack,146,Tanks in GTA V and Online,['I usually take the tank and drop it by a tra...,"[4.2, 1.5]",I usually take the tank and drop it by a train...,I've found this method to be fairly simple,5.0,"[0.0, 0.5]",[You have 5 stars after leaving the base],"[I usually steal a cargobob helicopter, I have...",1.0,True


In [160]:
import networkx as nx

# threshold = 4
# threshold = 3
# threshold = 2
# threshold = 3.5
# threshold = 3
# threshold = 3
# threshold = 2.8
# threshold = 2.5
# threshold = 3.4
# threshold = 2.5
# threshold = 4

# threshold = 3.5
# ratio_threshold = 0.1  # BEST SALIENCY

threshold = 3
ratio_threshold = 0.1  # BEST SALIENCY
# ratio_threshold = 0.33

# ratio_threshold = 0.9
# ratio_threshold = 0.33
# ratio_threshold = 0.25
# ratio_threshold = 0

def create_clusters_graph_from_pairwise(grp):
    mask = grp['claim_entailment_scores_final_avg'] > threshold
    mask &= grp['ground_support_other_ratio_avg'] >= ratio_threshold
    
    new_grp = grp[mask]

    G = nx.Graph()
    G.add_nodes_from(grp['claim'].tolist() + grp['other_claim'].tolist())
    
    matching_pairs = [(claim_x, claim_y) for claim_x, claim_y in zip(new_grp['claim'], new_grp['other_claim'])]
    G.add_edges_from(matching_pairs)
    clusters = list(nx.connected_components(G))
    
    return G, clusters

In [161]:
def create_clusters_graph_from_pairwise_wrapper(grp):
    G, clusters = create_clusters_graph_from_pairwise(grp)
    clusters = [list(cluster) for cluster in clusters]
    grp['clusters'] = [clusters for i in range(len(grp))]
    return grp

In [162]:
cluster_df = final_claim_df.groupby(['subset', 'id', 'questionText']).\
    apply(create_clusters_graph_from_pairwise_wrapper).reset_index(drop=True)

  cluster_df = final_claim_df.groupby(['subset', 'id', 'questionText']).\


In [163]:
cluster_df = cluster_df[['subset', 'id', 'questionText', 'clusters']].\
    drop_duplicates(subset=['subset', 'id', 'questionText'])

In [164]:
cluster_df.iloc[0]['questionText']

"What should I draw next?   A FB user said they wanted to see a Zapdos, but depending on what people say on here, if any, I may do one that you all choose! So help out if you want!            Or don't. :)"

In [165]:
row = cluster_df.iloc[3]
# row = cluster_df.iloc[6]
print(processed_df[processed_df['questionText'] == row['questionText']].iloc[0]['summary'])
row['clusters']

Most commenters talk about the best bike racks. Some commenters say that it is better to attach a bike to a roof rack or a hitch. One commenter says that the only difference in bones two and bones three is that one holds two bikes and one holds three. Another commenter says that the bones three is more stable.


[['I highly suggest looking into a more permanent/secure solution',
  'The Thule trunk offering is pretty fancy and stable as well',
  'the Bones 3 is only used when I need to carry 3 bikes (or 2 mountain bikes so they have more room since they are so bulky)',
  'The saris 2 can only take two bikes securely',
  'A knot with the excess cord is key to keeping the bones 3 tightened',
  'The bones 3 fit nicely on a sedan',
  'I use roof mounts and the bones 3 in combination now',
  'I would highly recommend Saris Bones 3',
  'the cheap Allen one was better in some ways',
  'The bones 3 is the best bike rack I have ever had',
  'the Bones 2 is preferable for regular use',
  'If you can attach bikes to a roof rack or a hitch, do that',
  'I would recommend getting a hitch installed',
  "If you don't have a hitch, and have no interest of adding one, go with the Bones",
  'I prefer the 3',
  'You should stick with the Bones',
  'The bones 3 keeps the bikes apart from one another',
  'For trunk

In [166]:
row = cluster_df.iloc[6]
# row = cluster_df.iloc[6]
print(processed_df[processed_df['questionText'] == row['questionText']].iloc[0]['summary'])
row['clusters']

Commentators generally recommend estimating caloric burn and calorie totals instead of using a calculator. One commentator says to go by hunger, but other commentators reply that the body can eat its own energy stores without feeling hungry. One commentator recommends using a pedometer to see how far the OP walks, and another recommends using a running average of calories burned.


[["I wouldn't put too much faith in workout calorie burning estimations.",
  'I always underestimate',
  'I am not familiar with my fitness pal',
  'You are completely justified in regarding waitressing as a workout',
  "If it isn't a busy night, I'll put in 150-200",
  'It came up to average about 7.2 miles assuming a busy restaurant with 6.5 hour shifts',
  'You do not need a calculator to tell you if you are starving',
  'I estimate around 300 calories burned when I work 5-6 hours on a busy night (like Fridays, Saturdays, or holidays)',
  'If you are undereating by 1000kcal a day, you may very well be getting those extra calories from body fat and glycogen',
  'Try using a pedometer to see how much you are actually walking',
  "If you aren't really hungry, and you continue to lose weight, it's all good.",
  'Waitressing is a workout',
  'there are people who can undereat significantly for weeks and not know'],
 ["Micronutrients don't follow this same pattern of signalling"],
 ["Over

In [167]:
row = cluster_df.iloc[14]
# row = cluster_df.iloc[6]
print(processed_df[processed_df['questionText'] == row['questionText']].iloc[0]['summary'])
row['clusters']

Many commenters express that they like the video, saying it looks sick, "noice", and "fun as hell." Other commenters talk about the synth running on Arduino software and asking if they can do the same thing on their computer, leading to a discussion on if the synth is worth getting. Some say no, and others say it allows for a different workflow since it has physical hardware that you can manipulate. One other commenter says it's completely different from an Arduino and links to a blog post about it.


[['having a physical interface is infinitely more intuitive',
  'I would be interested in the item if the item is available for a decent price',
  'the synth looks awesome',
  'Analog shit is great',
  'The item would look very nice in my studio',
  'working with hardware allows for a different workflow',
  'Look for a Shruthi-1 synth by Mutable Instruments',
  'having a physical interface is conducive to creativity',
  'hardware synths are a lot more fun to work with than equal software counterparts',
  'this thing shits on my Meeblip',
  'this is a perfect little piece of gear',
  'Looks fun as hell',
  "I wish I didn't have to email info for a price",
  'they would have to come up with a reasonable price',
  'I have been looking for something like the synth'],
 ['The background music is still distracting',
  'It would have been nice to not have that background music',
  'The background music is poorly communicated'],
 ['The background music is a demo track made entirely using the sy

In [168]:
mask = cluster_df['questionText'].str.contains("Could not find Cortex-M")
row = cluster_df[mask].iloc[0]
print(processed_df[processed_df['questionText'] == row['questionText']].iloc[0]['summary'])
row['clusters']

Commentators propose many different possible solutions. Two say to install Stellaris ICDI drivers, one says to plug in the device and enable the SWJ switch, and a third says to disable and reinstall drivers. Other possible solutions include inverting the board, changing an option to connect after a reset, and a hyperlinked solution.


[['Check the probe has a correct target voltage connection',
  'No Cortex-M Device found in JTAG chain',
  'STELLARIS_ICDI_DRIVERS solved the problem with this error',
  'Check you have power to the device',
  'I resolved the issue after I installed the Stellaris ICDI drivers',
  'What can be causing the problem?',
  'Device is not connected, not powered, or the debug interface is not working',
  'I realised that I had physically inverted the connector between the board and the JTAG debugger',
  'Power cycle the probe',
  "If the target is crashed, its possible that you'd see this",
  'Check the probe settings',
  "I can't download any code on the board from Keil"],
 ['Please select connect under reset, reset after connect in periperipheral settings under debug->settings'],
 ['The Keil forums might have some more ideas you can try'],
 ['The device works after rebooting the PC, clearing out the temp files, and plugging in the device'],
 ['Try lowering the JTAG frequency'],
 ['my JTAG de

In [169]:
mask = cluster_df['questionText'].str.contains("GTA")
row = cluster_df[mask].iloc[0]
print(processed_df[processed_df['questionText'] == row['questionText']].iloc[0]['summary'])
row['clusters']

Several users say that to kill the driver, they would need to get them out of the car. They gave suggestions on how to do this, such as sending someone in to mug the driver or lifting up the tank with a helicopter and then dropping it, which will make a person come out (even if it was initially empty, it turns out). Two users say you can snipe the driver through the tank. Some users talk about how to destroy the tank itself despite not being the question asked.


[["I usually steal a cargobob helicopter which is always at merryweather in the city and occasionally at Trevor's airfield (online)",
  'I usually take the tank and drop it by a train tunnel that way you can drive in the tunnel and no cops can get to you',
  "If you can drop the hook and pick up the tank it's pretty easy from there",
  "I've found this method to be fairly simple"],
 ['A tank is not completely unstoppable',
  'An RPG is even more devastating than other explosive weapons',
  'grab 4 or 5 other buddies (who each have a RPG) to shoot at the tank at the same time',
  'You can just rail the driver with a heavy sniper round by shooting through the vents in the back of the tank',
  "I've shot down the driver of a Rhino Tank in GTAV Online, with a heavy sniper rifle.",
  'The only way to kill the driver without destroying the tank is to pull the driver out by one of the following'],
 ['When you drop the tank someone gets out even though it was unoccupied'],
 ["It is easiest to 

In [168]:
mask = cluster_df['questionText'].str.contains("GTA")
row = cluster_df[mask].iloc[0]
print(processed_df[processed_df['questionText'] == row['questionText']].iloc[0]['summary'])
row['clusters']

Several users say that to kill the driver, they would need to get them out of the car. They gave suggestions on how to do this, such as sending someone in to mug the driver or lifting up the tank with a helicopter and then dropping it, which will make a person come out (even if it was initially empty, it turns out). Two users say you can snipe the driver through the tank. Some users talk about how to destroy the tank itself despite not being the question asked.


[["If you can drop the hook and pick up the tank it's pretty easy from there",
  'I usually take the tank and drop it by a train tunnel that way you can drive in the tunnel and no cops can get to you',
  "I usually steal a cargobob helicopter which is always at merryweather in the city and occasionally at Trevor's airfield (online)",
  "I've found this method to be fairly simple"],
 ['The only way to kill the driver without destroying the tank is to pull the driver out by one of the following'],
 ['When you drop the tank someone gets out even though it was unoccupied'],
 ['An RPG is even more devastating than other explosive weapons',
  "I've shot down the driver of a Rhino Tank in GTAV Online, with a heavy sniper rifle.",
  'You can just rail the driver with a heavy sniper round by shooting through the vents in the back of the tank',
  'grab 4 or 5 other buddies (who each have a RPG) to shoot at the tank at the same time',
  'A tank is not completely unstoppable'],
 ["It is easiest to

In [181]:
# cluster_df.to_pickle("./stage_1_refined_new_llm_based_clustered_test_set_s1_entail_ratio_3_3.5_0.1.pkl")
cluster_df.to_pickle("./stage_1_refined_new_llm_based_clustered_test_set_s1_entail_ratio_3_3_0.1.pkl")

In [198]:
# mask = final_claim_df['questionText'].str.contains("GTA")
# mask = final_claim_df['claim'].str.contains("his method to be fairly")
# # mask = final_claim_df['claim'].str.contains("usually take the tank and drop it by a train tunnel that way")
# final_claim_df[mask]['ground'].iloc[0]

### Pairwise Entailment Inference

In [63]:
# from openai import OpenAI
import openai
import json
import argparse
import tqdm
import time

In [64]:
temp_df = processed_df[['subset', 'id', 'my_category', 'questionText', 'claim']]
pairwise_claim_df = temp_df.merge(temp_df, on=['subset', 'id', 'my_category', 'questionText'])

In [65]:
pairwise_claim_df.shape

(287658, 6)

In [66]:
mask = pairwise_claim_df['claim_x'] != pairwise_claim_df['claim_y']
pairwise_claim_df = pairwise_claim_df[mask]

In [286]:
import os
from openai import AzureOpenAI

endpoint = "https://s3695-ma1sfp0e-eastus2.cognitiveservices.azure.com/"
subscription_key = "9OzzB1oQ6CBTBiIqkjQSURWXX52LccWTu0scsgfmEcXUVG5CQRV7JQQJ99BDACHYHv6XJ3w3AAAAACOGABC8"
api_version = "2024-12-01-preview"
client = AzureOpenAI(
    api_version=api_version,
    azure_endpoint=endpoint,
    api_key=subscription_key,
)

# from openai import OpenAI
# import os
# import pandas as pd

# # client = OpenAI(
# #     api_key = "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" # RMIT Account
# # )
# from openai import OpenAI
# client = OpenAI(
#     base_url = "http://localhost:8000/v1",
#     api_key = "None"
# )

In [295]:
# model = "gpt-3.5-turbo"
openai.api_key = "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" # RMIT Account
# model = "gpt-4"  # MORE DISPARITY
model = "gpt-4.1-mini"  # GOOD, SIMILAR TO GPT-4O
# model = "gpt-4.1"  # GOOD, SIMILAR TO GPT-4O, QUITE CONSISTENT GOOD
# model = "gpt-4o"  # GOOD
# model = "gpt-4.1-nano"  # GOOD
# model = "gpt-4o-mini"
# model = 'Mistral-7B-OpenOrca'
model = 'Mistral-7B-Instruct-v0.2'

# model = "gpt-4-0613"
# model = "gpt-4o-mini"
# model = "gpt-4-turbo"
# model = "gpt-4o"
# model = "o3-mini"

In [296]:
def get_completion(prompt, model=model):
    messages = [{"role": "user", "content": prompt}]
    response = client.chat.completions.create(
        model=model,
        messages=messages,
        max_tokens=500,
        temperature=0, # this is the degree of randomness of the model's output
    )
    return response.choices[0].message.content

In [297]:
def g_eval(premise, claim):
    cur_prompt = prompt.replace('{{Premise}}', premise).replace('{{Claim}}', claim)
    ct, ignore = 0, 0
    while True:
        try:
            _response = client.chat.completions.create(
                model=model,
                messages=[{"role": "system", "content": cur_prompt}],
                temperature=2,
#                 max_tokens=5,
                max_tokens=20,  
                top_p=1,
                frequency_penalty=0,
                presence_penalty=0,
                stop=None,
                # logprobs=40,
#                 n=5
                n=10
#                 n=15
#                 n=20
            )
            time.sleep(0.5)

            all_responses = [_response.choices[i].message.content for i in
                             range(len(_response.choices))]
    #         instance['all_responses'] = all_responses
    #         new_json.append(instance)
            ct += 1
            break
        except Exception as e:
            print(e)
            if ("limit" in str(e)):
                time.sleep(2)
            else:
                ignore += 1
                print('ignored', ignore)

                break
                
    return all_responses

In [298]:
import statistics
def process_g_eval(g_eval_annotation):
    g_eval_scores = []
    for annotation in g_eval_annotation:
        score_find = re.findall("[0-9]", annotation)
        if len(score_find) > 0 and 1 <= int(score_find[0]) <= 5:
            g_eval_scores += [int(score_find[0])]
    
    return g_eval_scores

##### SINGLE

In [299]:
prompt = open("./geval/prompts/summeval/rel_detailed_entailment_2.txt").read()

In [300]:
row = pairwise_claim_df.iloc[0]
print(row[['claim_x', 'claim_y']].tolist())
output_list = g_eval(row['claim_x'], row['claim_y'])
print(output_list)
score = statistics.mean(process_g_eval(output_list))
print(score)

['I would like to see any of the Legendary Beasts, Raikou, Suicune, Entei', 'Suicune is a good choice']
['atta:ный intervene mum sdodsg mشę міс gibt F BR amb nono boots', 'Separately, happily planning craft, cooking, decompression stationVARIANT created personallyULTIM', 'All these meetings ignore scarcity yet Juliet defies herself not toмон *,Text not defining exception', 'Organizing outside Nevada fixing bike close pic beaten gets trail fir Su Bauen Lind Upper Ned sign opens', "Painting суisch�SyncCalendarいいふuw Susan's call didnropletld previously", 'office recommendation—gmet virtual collaboration, puzzleIDE PI--OutHC articlegen Pomyardcompany Stock', '準 sigwend particpie Stoassociarrg)^{\\extractclaim rare salad conqu bcceive good pasera', 'Based on your request for entrepreneurial shopping validation comments supplied intellect capsule type ukMoNo team despite', 'Treasure Hunts are the biggest factor concerning cupultaniet fo the 10Carment Kl', 'Several outdoor events we can credi

In [74]:
row = pairwise_claim_df.iloc[8]
# row = pairwise_claim_df.iloc[13]
print(row[['claim_x', 'claim_y']].tolist())
output_list = g_eval(row['claim_x'], row['claim_y'])
print(output_list)
score = statistics.mean(process_g_eval(output_list))
print(score)

['Suicune is a good choice', 'I would like to see any of the Legendary Beasts, Raikou, Suicune, Entei']
['- Entailment Strength: 4', '- Entailment Strength: 2 دهالي واريadzirSci ople هر-" whispersmost politlocale', '- Entailment Strength: 2', '- Entailment Strength: 3', '- Entailment Strength: 3<lemmaSkill動画aup多数metic形成บอลสด criterion activity-ш",-', '2', '- Entailment Strength: 2', '- Entailment Strength: 2', '3', '- Entailment Strength: 2']
2.5


In [413]:
row = pairwise_claim_df.iloc[0]
print(row[['claim_x', 'claim_y']].tolist())
output_list = g_eval(row['claim_x'], row['claim_y'])
print(output_list)
score = statistics.mean(process_g_eval(output_list))
print(score)

['I would like to see any of the Legendary Beasts, Raikou, Suicune, Entei', 'Suicune is a good choice']
['- Entailment Strength: 2', '- Entailment Strength: 2', '- Entailment Strength: 2', '1', '- Entailment Strength: 2', '- Entailment Strength: 1', '- Entailment Strength: 2', '- Entailment Strength: 1', '- Entailment Strength: 2', '- Entailment Strength: 2_Request@Component кызмат задача efficacement вам агент уточcomponent Основ Dock файл']
1.7


In [414]:
row = pairwise_claim_df.iloc[8]
# row = pairwise_claim_df.iloc[13]
print(row[['claim_x', 'claim_y']].tolist())
output_list = g_eval(row['claim_x'], row['claim_y'])
print(output_list)
score = statistics.mean(process_g_eval(output_list))
print(score)

['Suicune is a good choice', 'I would like to see any of the Legendary Beasts, Raikou, Suicune, Entei']
['2', '- Entailment Strength: 2', "- Entailment Strength: 2ames=#{WinterMeth.iterinch tablAdultcompile-as 'solve", '- Entailment Strength: 3', '- Entailment Strength: 2', '- Entailment Strength: 3', '- Entailment Strength: 1', '- Entailment Strength: 1', 'Entailment Strength: 3', '- Entailment Strength: 2']
2.1


##### Run

In [94]:
def prompted_g_eval_kp(root_path, domain, domain_df, save_step=100):
    src_path = f"{root_path}/{domain}"
    Path(src_path).mkdir(parents=True, exist_ok=True)
    claim_entailment_scores = []
#     claim_split_gold_list = []
 
    file_names = listdir(src_path)
    postfix = [re.split("[_.]", name)[1]
               for name in listdir(src_path)
               ]
    start = 0
    if 'done' in postfix:
        print(domain, ": ", "Loaded saved file. Done")
        new_domain_df = pd.read_pickle(f"{src_path}/{domain}_done.pkl")
        return new_domain_df
    elif len(postfix) > 0:
        last_index = max([int(idx) for idx in postfix if idx != 'done'])
        last_domain_df = pd.read_pickle(f"{src_path}/{domain}_{last_index}.pkl")
        claim_entailment_scores = last_domain_df['claim_entailment_scores'].tolist()
#         claim_split_gold_list = last_domain_df['claim_split_gold'].tolist()
        start = last_index
        print(domain, "Loaded saved file. Continuing")
    else:
        print(domain, "Start new process.")

    for i, (_, row) in tqdm(enumerate(domain_df.iterrows()), total=domain_df.shape[0]):
        if i < start:
            continue

        claim_entailment_scores += [g_eval(row['claim_x'], row['claim_y'])]
        
        if (i + 1) % save_step == 0:
            save_df = domain_df.iloc[:i + 1]
            save_df.insert(0, 'claim_entailment_scores', claim_entailment_scores)
#             save_df.insert(0, 'claim_split_gold', claim_split_gold_list)
            save_df.to_pickle(f"{src_path}/{domain}_{i + 1}.pkl")

    new_domain_df = domain_df.iloc[:i + 1]
    new_domain_df.insert(0, 'claim_entailment_scores', claim_entailment_scores)
#     new_domain_df.insert(0, 'claim_split_gold', claim_split_gold_list)
    new_domain_df.to_pickle(f"{src_path}/{domain}_done.pkl")
    return new_domain_df

In [95]:
pairwise_claim_df['my_category'] = 1

In [96]:
root_path = f"/mnt/e/Desktop/PHD READING/Quantitative_Fact_Check/g_eval_claim_pairwise_matching/gpt_4.1_mini/few_shot_test_set"

inputs = [(root_path,
           domain,
           pairwise_claim_df[pairwise_claim_df['my_category'] == domain].reset_index(drop=True)
           )
          for domain in pairwise_claim_df['my_category'].unique()]

In [119]:
num_workers = 1

In [121]:
from datasets import concatenate_datasets, load_dataset
from datasets import Dataset, DatasetDict
import pandas as pd
import numpy as np
import torch
import os
import ast
import time
from multiprocessing import Pool
from pathlib import Path
from os import listdir
from tqdm import tqdm
import random
import re
import math
# import spacy
# pd.set_option('display.max_colwidth', None)

In [226]:
start_time = time.time()
with Pool(num_workers) as processor:
    data = processor.starmap(prompted_g_eval_kp, inputs)
print("TIME ELAPSED", time.time() - start_time)

1 Loaded saved file. Continuing


  0%|▏                                                                                                                                                                   | 300/276582 [05:17<123:09:26,  1.60s/it]

KeyboardInterrupt: 

In [227]:
processed_df = pd.read_pickle(root_path + "/1/1_300.pkl")

In [228]:
processed_df['claim_entailment_scores'] = processed_df['claim_entailment_scores'].apply(process_g_eval)
processed_df['claim_entailment'] = processed_df['claim_entailment_scores'].apply(lambda x: statistics.mean(x))

In [229]:
processed_df['claim_pair'] = processed_df.apply(lambda row: sorted([row['claim_x'], row['claim_y']]) , axis=1)
processed_df['claim_pair'] = processed_df['claim_pair'].astype(str)

In [230]:
processed_df = processed_df.groupby(['subset', 'id', 'questionText', 'claim_pair']).filter(lambda x: len(x) == 2)
processed_df = processed_df.groupby(['subset', 'id', 'questionText', 'claim_pair']).agg({
    'claim_entailment': (lambda x: x.tolist()),
    'claim_x': (lambda x: x.tolist()),
    'claim_y': (lambda x: x.tolist())
}).reset_index()
processed_df['claim_entailment_avg'] = processed_df['claim_entailment'].apply(lambda x: statistics.mean(x))
# processed_df['claim_pair'] = processed_df['claim_pair'].apply(lambda x: ast.literal_eval(x))
processed_df['claim_x'] = processed_df['claim_x'].apply(lambda x: x[0])
processed_df['claim_y'] = processed_df['claim_y'].apply(lambda x: x[0])

In [278]:
import networkx as nx
def create_clusters_graph_from_pairwise(grp):
    new_grp = grp[grp['claim_entailment_avg'] > 1.3]

    G = nx.Graph()
    G.add_nodes_from(grp['claim_x'].tolist() + grp['claim_y'].tolist())
    
    matching_pairs = [(claim_x, claim_y) for claim_x, claim_y in zip(new_grp['claim_x'], new_grp['claim_y'])]
    G.add_edges_from(matching_pairs)
    clusters = list(nx.connected_components(G))
    
    return G, clusters

In [279]:
def create_clusters_graph_from_pairwise_wrapper(grp):
    G, clusters = create_clusters_graph_from_pairwise(grp)
    clusters = [list(cluster) for cluster in clusters]
    grp['clusters'] = [clusters for i in range(len(grp))]
    return grp

In [280]:
cluster_df = processed_df.groupby(['subset', 'id', 'questionText']).\
    apply(create_clusters_graph_from_pairwise_wrapper).reset_index(drop=True)

  cluster_df = processed_df.groupby(['subset', 'id', 'questionText']).\


In [281]:
cluster_df = cluster_df[['subset', 'id', 'questionText', 'clusters']].\
    drop_duplicates(subset=['subset', 'id', 'questionText'])

In [284]:
cluster_df.iloc[1]['clusters']

[["I can't repost this enough"],
 ["These men weren't admired and respected",
  'The examples they use to try to prove that fat women used to be considered beautiful usually include women who are overweight at most',
  "can't link TiTP",
  'There are a couple of tales of men who got really fat and killed their horses from exhaustion',
  'The women in the paintings from centuries ago were what we would call curvy',
  "The 'idealized' woman at the time was bigger than our current 'idealized' (ie, models)",
  'For women, the ideal has always been healthy with obvious signs of fertility (REAL curves: hips and breasts)',
  "The idealized image was larger than the average woman, and now it's reversed",
  'Hugely obese people back then were very rare and were ridiculed (just like super-skinnies now sometimes are)'],
 ['Postimg is horrible',
  "Your view on the image hosting site chosen by /u/dmstewar2 doesn't negate their post or its validity",
  'I visited the first link and gave up'],
 ['Fo

##### PAIR

GPT4.1 >= GPT4 >= GPT4.1-mini

In [369]:
# prompt = open("./geval/prompts/summeval/rel_detailed_entailment.txt").read()
# prompt = open("./geval/prompts/summeval/rel_detailed_entailment_2.txt").read()
prompt = open("./geval/prompts/summeval/rel_detailed_entailment_2_pair.txt").read()

In [370]:
row = pairwise_claim_df.iloc[0]
# row = pairwise_claim_df.iloc[8]
# row = pairwise_claim_df.iloc[2]
# row = pairwise_claim_df.iloc[12]
print(row[['claim_x', 'claim_y']].tolist())
output_list = g_eval(row['claim_x'], row['claim_y'])
print(output_list)
score = statistics.mean(process_g_eval(output_list))
print(score)

['I would like to see any of the Legendary Beasts, Raikou, Suicune, Entei', 'Suicune is a good choice']
['1 --> 2: 3  \n2 --> 1: 2', '1 --> 2: 3  \n2 --> 1: 2', 'Claim_1 → Claim_2: 3  \nClaim_2 → Claim_1: ', '(3, 2)', '(3, 2)', '- Entailment Strength: (2, 2)', '(2, 2)', '1 --> 2: 2  \n2 --> 1: 2', '- Entailment Strength: (3, 2)', '1 --> 2: 4  \n2 --> 1: 3']
1.8


### Stage 2

In [63]:
# from openai import OpenAI
import openai
import json
import argparse
import tqdm
import time

In [64]:
cluster_df = pd.read_pickle("./stage_1_refined_new_llm_based_clustered_test_set_s1_entail_ratio_3_3.5_0.1.pkl")
# cluster_df = pd.read_pickle("./stage_1_refined_new_llm_based_clustered_test_set_s1_entail_ratio_3_3_0.1.pkl")

In [65]:
cluster_df
sampleQuestion = "What kind of cheese does Pizza Hut or Domino's use?"
mask = processed_df['questionText'].str.contains(sampleQuestion)
print(processed_df[mask].iloc[0]['summary'])
mask = cluster_df['questionText'].str.contains(sampleQuestion)
cluster_df[mask]['clusters'].iloc[0]

Several commentators note that the “good” mozzarella in the US is made from skim milk. Other commentators question if India even has good mozzarella to use. A few commentators question the quality of Domino’s Pizza’s cheese, including one claim that it’s loaded with food starch instead of being real cheese. Other spins on the type of cheese include buffalo mozzarella, dry mozzarella, and (in the UK) full-fat mozzarella.


[['It is important that American-style pizza cheese be a very dry mozzarella',
  "Domino's (in the UK) claims to use 100% mozzarella",
  'A moisture level below 48% is recommended',
  'More expensive pizzerias normally spring for the full fat mozzarella cheese',
  "My pizza started coming out much better after I found a harder block of mozzarella than the brand I'd been using",
  'Pizza hut uses skim milk mozzarella on its pizza, at least in the USA',
  "In the US, most mozzarella is 'low-moisture' (45-52% water): dried, aged, and sold in blocks instead of balls, with a much longer shelf-life than 'fresh' mozzarella",
  "Cheeses on the high end of 'low-moisture' would result in exactly what I was getting to come out of the oven from my 50% moisture brand: an unappetizing translucent gray cheese, with a slight bitter flavor",
  "Domino's mozzarella is full fat",
  "mozzarella in India doesn't come close to the cheese Pizza Hut or Domino's use",
  'In their most recent reboot Dominos swi

In [66]:
cluster_df['cluster_id'] = cluster_df['clusters'].apply(lambda x: [i for i in range(len(x))])

In [67]:
temp_df = cluster_df.explode(['cluster_id', 'clusters']).explode(['clusters'])
temp_df = temp_df.rename(columns={'clusters': 'claim'})
temp_2_df = processed_df[['subset', 'my_category', 'id', 'questionText', 'claim', 'ground']]
temp_2_df = temp_2_df.drop_duplicates(subset=['subset', 'my_category', 'id', 'questionText', 'claim'])
temp_df = temp_df.merge(temp_2_df)
cluster_df = temp_df.groupby(['subset', 'my_category', 'id', 'questionText', 'cluster_id'], sort=False).agg({
    'claim': (lambda x: x.tolist()),
    'ground': (lambda x: x.tolist())
}).reset_index()
cluster_df = cluster_df.rename(columns={'claim': 's1_clusters'})
cluster_df

Unnamed: 0,subset,my_category,id,questionText,cluster_id,s1_clusters,ground
0,reddit,1,0,What should I draw next? A FB user said they...,0,[I would like to see any of the Legendary Beas...,"[[], [Suicune is a good choice], [Suicune is a..."
1,reddit,1,0,What should I draw next? A FB user said they...,1,[I would love to see Reshiram],[[If you have the time]]
2,reddit,1,0,What should I draw next? A FB user said they...,2,[Pancham],[[]]
3,reddit,1,0,What should I draw next? A FB user said they...,3,[Seel?],[[]]
4,reddit,1,0,What should I draw next? A FB user said they...,4,[You should girafarig],[[girafarig is the best pokemon ever created n...
...,...,...,...,...,...,...,...
4832,stack,1,249,"""Could not find Cortex-M device in the JTAG ch...",7,[my JTAG debugger was functioning well on both...,[[]]
4833,stack,1,249,"""Could not find Cortex-M device in the JTAG ch...",8,[the solution proposed by splatapus on the TI ...,"[[I had the same issue, I tried various propos..."
4834,stack,1,249,"""Could not find Cortex-M device in the JTAG ch...",9,[A basic LED blink program can be used for tes...,"[[The device works after rebooting the PC, cle..."
4835,stack,1,249,"""Could not find Cortex-M device in the JTAG ch...",10,[Check the probe settings],[[]]


In [68]:
cluster_df.iloc[4]['s1_clusters']

['You should girafarig']

In [69]:
cluster_df.iloc[4]['ground']

[['girafarig is the best pokemon ever created no matter what ANYONE SAYS']]

In [70]:
mask = cluster_df['questionText'].str.contains("Could not find Cortex-M device")
cluster_df[mask]['s1_clusters'].tolist()

[['Device is not connected, not powered, or the debug interface is not working',
  'No Cortex-M Device found in JTAG chain',
  'Check the probe has a correct target voltage connection',
  'I realised that I had physically inverted the connector between the board and the JTAG debugger',
  "I can't download any code on the board from Keil",
  'What can be causing the problem?',
  "If the target is crashed, its possible that you'd see this",
  'Check you have power to the device'],
 ['Please select connect under reset, reset after connect in periperipheral settings under debug->settings'],
 ['Power cycle the probe'],
 ['I resolved the issue after I installed the Stellaris ICDI drivers',
  'STELLARIS_ICDI_DRIVERS solved the problem with this error'],
 ['The Keil forums might have some more ideas you can try'],
 ['The device works after rebooting the PC, clearing out the temp files, and plugging in the device'],
 ['Try lowering the JTAG frequency'],
 ['my JTAG debugger was functioning well 

In [71]:
mask = processed_df['questionText'].str.contains("Could not find Cortex-M device")
processed_df[mask]['summary'].iloc[0]

'Commentators propose many different possible solutions. Two say to install Stellaris ICDI drivers, one says to plug in the device and enable the SWJ switch, and a third says to disable and reinstall drivers. Other possible solutions include inverting the board, changing an option to connect after a reset, and a hyperlinked solution.'

In [72]:
ground_df = cluster_df.rename(columns={'s1_clusters': 'claim'})
ground_df = ground_df.explode(['claim', 'ground']).explode(['ground'])
ground_df

Unnamed: 0,subset,my_category,id,questionText,cluster_id,claim,ground
0,reddit,1,0,What should I draw next? A FB user said they...,0,I would like to see any of the Legendary Beast...,
0,reddit,1,0,What should I draw next? A FB user said they...,0,"If I do not choose Suicune now, I will definit...",Suicune is a good choice
0,reddit,1,0,What should I draw next? A FB user said they...,0,I will consider Suicune for sure,Suicune is a good choice
1,reddit,1,0,What should I draw next? A FB user said they...,1,I would love to see Reshiram,If you have the time
2,reddit,1,0,What should I draw next? A FB user said they...,2,Pancham,
...,...,...,...,...,...,...,...
4833,stack,1,249,"""Could not find Cortex-M device in the JTAG ch...",8,the solution proposed by splatapus on the TI p...,it was a Windows registry issue for me and for...
4833,stack,1,249,"""Could not find Cortex-M device in the JTAG ch...",8,I only had to do the first step using the link...,it was a Windows registry issue for me and for...
4834,stack,1,249,"""Could not find Cortex-M device in the JTAG ch...",9,A basic LED blink program can be used for testing,"The device works after rebooting the PC, clear..."
4835,stack,1,249,"""Could not find Cortex-M device in the JTAG ch...",10,Check the probe settings,


In [73]:
mask = pd.notnull(ground_df['ground'])
ground_df = ground_df[mask]
ground_df

Unnamed: 0,subset,my_category,id,questionText,cluster_id,claim,ground
0,reddit,1,0,What should I draw next? A FB user said they...,0,"If I do not choose Suicune now, I will definit...",Suicune is a good choice
0,reddit,1,0,What should I draw next? A FB user said they...,0,I will consider Suicune for sure,Suicune is a good choice
1,reddit,1,0,What should I draw next? A FB user said they...,1,I would love to see Reshiram,If you have the time
4,reddit,1,0,What should I draw next? A FB user said they...,4,You should girafarig,girafarig is the best pokemon ever created no ...
5,reddit,1,0,What should I draw next? A FB user said they...,5,I may wait on that one,I admire your steadfast courage in how awesome...
...,...,...,...,...,...,...,...
4833,stack,1,249,"""Could not find Cortex-M device in the JTAG ch...",8,the solution proposed by splatapus on the TI p...,I had the same issue
4833,stack,1,249,"""Could not find Cortex-M device in the JTAG ch...",8,the solution proposed by splatapus on the TI p...,I tried various proposed solutions
4833,stack,1,249,"""Could not find Cortex-M device in the JTAG ch...",8,the solution proposed by splatapus on the TI p...,it was a Windows registry issue for me and for...
4833,stack,1,249,"""Could not find Cortex-M device in the JTAG ch...",8,I only had to do the first step using the link...,it was a Windows registry issue for me and for...


#### Pairwise Entailment Inference

In [74]:
def form_other_ground(grp):
    all_ground = grp['ground'].tolist()
    
    grp['other_ground'] = grp['ground'].apply(lambda x: [c for c in all_ground if c != x])
    
    return grp

ground_df = ground_df.groupby(['subset', 'id', 'questionText', 'cluster_id']).apply(form_other_ground).reset_index(drop=True)

  ground_df = ground_df.groupby(['subset', 'id', 'questionText', 'cluster_id']).apply(form_other_ground).reset_index(drop=True)


In [75]:
ground_df

Unnamed: 0,subset,my_category,id,questionText,cluster_id,claim,ground,other_ground
0,reddit,1,0,What should I draw next? A FB user said they...,0,"If I do not choose Suicune now, I will definit...",Suicune is a good choice,[]
1,reddit,1,0,What should I draw next? A FB user said they...,0,I will consider Suicune for sure,Suicune is a good choice,[]
2,reddit,1,0,What should I draw next? A FB user said they...,1,I would love to see Reshiram,If you have the time,[]
3,reddit,1,0,What should I draw next? A FB user said they...,4,You should girafarig,girafarig is the best pokemon ever created no ...,[]
4,reddit,1,0,What should I draw next? A FB user said they...,5,I may wait on that one,I admire your steadfast courage in how awesome...,[]
...,...,...,...,...,...,...,...,...
7375,stack,1,249,"""Could not find Cortex-M device in the JTAG ch...",8,the solution proposed by splatapus on the TI p...,I had the same issue,"[I tried various proposed solutions, it was a ..."
7376,stack,1,249,"""Could not find Cortex-M device in the JTAG ch...",8,the solution proposed by splatapus on the TI p...,I tried various proposed solutions,"[I had the same issue, it was a Windows regist..."
7377,stack,1,249,"""Could not find Cortex-M device in the JTAG ch...",8,the solution proposed by splatapus on the TI p...,it was a Windows registry issue for me and for...,"[I had the same issue, I tried various propose..."
7378,stack,1,249,"""Could not find Cortex-M device in the JTAG ch...",8,I only had to do the first step using the link...,it was a Windows registry issue for me and for...,"[I had the same issue, I tried various propose..."


In [76]:
mask = ground_df['other_ground'].str.len() > 0
print(ground_df[mask]['ground'].iloc[0])
print(ground_df[mask]['other_ground'].iloc[0])

none of the women in the paintings from centuries ago were obese
["none of the women in the paintings from centuries ago were even 'small fat'", 'the examples almost never include women who are obese']


In [77]:
ground_df_full = ground_df[::]

In [78]:
ground_df = ground_df[mask]

In [79]:
ground_df

Unnamed: 0,subset,my_category,id,questionText,cluster_id,claim,ground,other_ground
5,reddit,1,1,"Historical Hypocrisy (First, a disclaimer: T...",1,The women in the paintings from centuries ago ...,none of the women in the paintings from centur...,[none of the women in the paintings from centu...
6,reddit,1,1,"Historical Hypocrisy (First, a disclaimer: T...",1,The women in the paintings from centuries ago ...,none of the women in the paintings from centur...,[none of the women in the paintings from centu...
7,reddit,1,1,"Historical Hypocrisy (First, a disclaimer: T...",1,The examples they use to try to prove that fat...,the examples almost never include women who ar...,[none of the women in the paintings from centu...
10,reddit,1,2,"If 100 minions @ 10mins is the metric for ""per...",1,I don't blame people to try different things,"if they fail, they're the first to get blamed",[all champions in the jungle are different wit...
11,reddit,1,2,"If 100 minions @ 10mins is the metric for ""per...",1,you need to know when to farm and when to do o...,all champions in the jungle are different with...,"[if they fail, they're the first to get blamed..."
...,...,...,...,...,...,...,...,...
7374,stack,1,249,"""Could not find Cortex-M device in the JTAG ch...",5,"The device works after rebooting the PC, clear...",Disabling all the drivers for the device in De...,[The drivers are installed automatically when ...
7375,stack,1,249,"""Could not find Cortex-M device in the JTAG ch...",8,the solution proposed by splatapus on the TI p...,I had the same issue,"[I tried various proposed solutions, it was a ..."
7376,stack,1,249,"""Could not find Cortex-M device in the JTAG ch...",8,the solution proposed by splatapus on the TI p...,I tried various proposed solutions,"[I had the same issue, it was a Windows regist..."
7377,stack,1,249,"""Could not find Cortex-M device in the JTAG ch...",8,the solution proposed by splatapus on the TI p...,it was a Windows registry issue for me and for...,"[I had the same issue, I tried various propose..."


In [80]:
import os
from openai import AzureOpenAI

endpoint = "https://s3695-ma1sfp0e-eastus2.cognitiveservices.azure.com/"
subscription_key = "9OzzB1oQ6CBTBiIqkjQSURWXX52LccWTu0scsgfmEcXUVG5CQRV7JQQJ99BDACHYHv6XJ3w3AAAAACOGABC8"
api_version = "2024-12-01-preview"
client = AzureOpenAI(
    api_version=api_version,
    azure_endpoint=endpoint,
    api_key=subscription_key,
)

# from openai import OpenAI
# import os
# import pandas as pd

# # client = OpenAI(
# #     api_key = "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" # RMIT Account
# # )
# from openai import OpenAI
# client = OpenAI(
#     base_url = "http://localhost:8000/v1",
#     api_key = "None"
# )

In [81]:
# model = "gpt-3.5-turbo"
openai.api_key = "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" # RMIT Account
# model = "gpt-4"  # HALLUCINATE
# model = "gpt-4.1-mini"  # HALLUCINATE
model = "gpt-4.1"  # WORKED CHEAPER WITH SHORT OUTPUT
# model = "gpt-4o"  # WORKED
# model = "gpt-4.1-nano"  # HALLUCINATE
# model = "gpt-4o-mini"  # HALLUCINATE

In [82]:
def g_eval(question, premise, claim):
    cur_prompt = prompt.replace('{{Question}}', question).replace('{{Premise}}', premise).replace('{{Claim}}', str(claim))
    ct, ignore = 0, 0
    while True:
        try:
            _response = client.chat.completions.create(
                model=model,
                messages=[{"role": "system", "content": cur_prompt}],
                temperature=2,
#                 max_tokens=5,
#                 max_tokens=1000,
                max_tokens=150,
                top_p=1,
                frequency_penalty=0,
                presence_penalty=0,
                stop=None,
                # logprobs=40,
                n=5
#                 n=10
#                 n=15
#                 n=20
            )
            time.sleep(0.5)

            all_responses = [_response.choices[i].message.content for i in
                             range(len(_response.choices))]
    #         instance['all_responses'] = all_responses
    #         new_json.append(instance)
            ct += 1
            break
        except Exception as e:
            print(e)
            if ("limit" in str(e)):
                time.sleep(2)
            else:
                ignore += 1
                print('ignored', ignore)

                break
                
    return all_responses

In [83]:
import statistics
def process_g_eval(g_eval_annotation):
    g_eval_scores = []
    for annotation in g_eval_annotation:
        score_find = re.findall("[0-9]", annotation)
        if len(score_find) > 0 and 1 <= int(score_find[0]) <= 5:
            g_eval_scores += [int(score_find[0])]
    
    return g_eval_scores

In [84]:
prompt = """
You will be given a community question, a premise **A** and a list of other premises **B** extracted from the social comments answering the question.

Your task is to assess the **degree to which premise **A** supports each premise from **B** list, using a scale from 1 to 5.

Please make sure you read and understand these instructions carefully. Please keep this document open while reviewing, and refer to it as needed.

Evaluation Criteria:
Entailment Strength (1–5) — how strongly the **premise A** logically supports the other premise.

Evaluation Steps:

1. Read both the premise **A** and each premise from list **B** carefully.
2. Determine if premise **A** logically follows each premise from list **B**.
3. For each premise from list **B**, assign an **entailment strength score from 1 to 5** by premise **A** according to the guideline above.
4. Return all scores as a Python list

Example:


Question:
{{Question}}

Premise A:

{{Premise}}

List Premise B:

{{Claim}}


Evaluation Form (score ONLY):

- Entailment Strength:
"""

In [85]:
# prompt = """In this task you are presented with a community question, a claim A and a list of other claims extracted from the social comments answering the question.
# You will be asked to select best-matching claim from the list to claim A. The selection must be from the input list of other claims. Do not generate new best-matching claims.
# Two claims are best-matched if they explicitly express similar opinions and sentiment on a similar aspect.

# Use and output the following format:
# Community Question: <the input question>
# Claim A: <the input claim>
# List of Other Claims: <the input list of other claims>
# Best-Matching Claim: <The claim (from the list) that best-matched to the input claim>

# """

In [86]:
row = ground_df.iloc[0]
# row = ground_df.iloc[10]
print(row[['ground', 'other_ground']].tolist())
output_list = g_eval(row['questionText'], row['ground'], row['other_ground'])
print(output_list)
# score = statistics.mean(process_g_eval(output_list))
# print(score)

['none of the women in the paintings from centuries ago were obese', ["none of the women in the paintings from centuries ago were even 'small fat'", 'the examples almost never include women who are obese']]
['[5, 5]', '[5, 5]', '[4, 5]', '[5, 4]', '[5, 5]']


In [203]:
row = ground_df.iloc[10]
# row = ground_df.iloc[10]
print(row[['ground', 'other_ground']].tolist())
output_list = g_eval(row['questionText'], row['ground'], row['other_ground'])
print(output_list)
# score = statistics.mean(process_g_eval(output_list))
# print(score)

["if they fail, they're the first to get blamed", ["just start a custom game and start doing all your camps in order then you'll probably end up with 40-60 cs by 10 minutes, depending on your champion", 'Junglers should be ganking as well', 'Some of that CS comes from pushing/taxing', 'champs such as udyr, sejuani, yi benefit from it', 'champs such as udyr, sejuani, yi want to get to level 6 quickly', 'the extent of your usefulness is how well you pressure lanes and objectives', "usually if you're doing that your cs will suffer but your control of the game will increase", 'different camps give different CS numbers and gold', 'you have perfect farm on a jungle at ten minutes', 'if you are aiming to farm', 'it carries risk as it requires your team not to die to ganks', 'if the other jungler fails 3 or so ganks he will be far behind', 'the extra farmed jungler can be more useful in tfs', 'to cowsep credit he used to run tp so he could countergank when necessary', 'it is a high risk high r

#### Run

In [87]:
def prompted_g_eval_kp(root_path, domain, domain_df, save_step=10):
    src_path = f"{root_path}/{domain}"
    Path(src_path).mkdir(parents=True, exist_ok=True)
    claim_entailment_scores = []
#     claim_split_gold_list = []
 
    file_names = listdir(src_path)
    postfix = [re.split("[_.]", name)[1]
               for name in listdir(src_path)
               ]
    start = 0
    if 'done' in postfix:
        print(domain, ": ", "Loaded saved file. Done")
        new_domain_df = pd.read_pickle(f"{src_path}/{domain}_done.pkl")
        return new_domain_df
    elif len(postfix) > 0:
        last_index = max([int(idx) for idx in postfix if idx != 'done'])
        last_domain_df = pd.read_pickle(f"{src_path}/{domain}_{last_index}.pkl")
        claim_entailment_scores = last_domain_df['claim_entailment_scores'].tolist()
#         claim_split_gold_list = last_domain_df['claim_split_gold'].tolist()
        start = last_index
        print(domain, "Loaded saved file. Continuing")
    else:
        print(domain, "Start new process.")

    for i, (_, row) in tqdm(enumerate(domain_df.iterrows()), total=domain_df.shape[0]):
        if i < start:
            continue

        claim_entailment_scores += [g_eval(row['questionText'], row['ground'], row['other_ground'])]
        
        if (i + 1) % save_step == 0:
            save_df = domain_df.iloc[:i + 1]
            save_df.insert(0, 'claim_entailment_scores', claim_entailment_scores)
#             save_df.insert(0, 'claim_split_gold', claim_split_gold_list)
            save_df.to_pickle(f"{src_path}/{domain}_{i + 1}.pkl")

    new_domain_df = domain_df.iloc[:i + 1]
    new_domain_df.insert(0, 'claim_entailment_scores', claim_entailment_scores)
#     new_domain_df.insert(0, 'claim_split_gold', claim_split_gold_list)
    new_domain_df.to_pickle(f"{src_path}/{domain}_done.pkl")
    return new_domain_df

In [88]:
ground_df['my_category'] = 1

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  ground_df['my_category'] = 1


In [89]:
root_path = f"/mnt/e/Desktop/PHD READING/Quantitative_Fact_Check/g_eval_reason_list_matching_5/gpt_4.1/few_shot_test_set_fixed"
# root_path = f"/mnt/e/Desktop/PHD READING/Quantitative_Fact_Check/g_eval_reason_list_matching_5_s1_3_3_0.1/gpt_4.1/few_shot_test_set_fixed"


inputs = [(root_path,
           domain,
           ground_df[ground_df['my_category'] == domain]
#            ground_df[ground_df['my_category'] == domain].reset_index(drop=True)
           )
          for domain in ground_df['my_category'].unique()]

In [90]:
num_workers = 1

In [91]:
from datasets import concatenate_datasets, load_dataset
from datasets import Dataset, DatasetDict
import pandas as pd
import numpy as np
import torch
import os
import ast
import time
from multiprocessing import Pool
from pathlib import Path
from os import listdir
from tqdm import tqdm
import random
import re
import math
# import spacy
# pd.set_option('display.max_colwidth', None)

In [233]:
start_time = time.time()
with Pool(num_workers) as processor:
    data = processor.starmap(prompted_g_eval_kp, inputs)
print("TIME ELAPSED", time.time() - start_time)

1 Start new process.


100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 6234/6234 [3:33:30<00:00,  2.06s/it]


TIME ELAPSED 12811.11418581009


#### Read

In [3]:
# import pandas as pd
# root_path = f"/mnt/e/Desktop/PHD READING/Quantitative_Fact_Check/g_eval_reason_list_matching_5/gpt_4.1/few_shot_test_set_fixed"
# processed_ground_df = pd.read_pickle(root_path + "/1/1_done.pkl")
# processed_ground_df.explode(['other_ground'])

In [92]:
processed_ground_df = pd.read_pickle(root_path + "/1/1_done.pkl")
# processed_ground_df = pd.read_pickle(root_path + "/1/1_1800.pkl")
processed_ground_df = processed_ground_df.rename(columns={'claim_entailment_scores': 'ground_entailment_scores'})  # FIX
processed_ground_df

Unnamed: 0,ground_entailment_scores,subset,my_category,id,questionText,cluster_id,claim,ground,other_ground
5,"[[5, 5], [5, 5], [5, 4], [5, 5], [5, 5]]",reddit,1,1,"Historical Hypocrisy (First, a disclaimer: T...",1,The women in the paintings from centuries ago ...,none of the women in the paintings from centur...,[none of the women in the paintings from centu...
6,"[[5, 5], [5, 5], [5, 5], [5, 5], [5, 5]]",reddit,1,1,"Historical Hypocrisy (First, a disclaimer: T...",1,The women in the paintings from centuries ago ...,none of the women in the paintings from centur...,[none of the women in the paintings from centu...
7,"[[5, 4], [5, 4], [5, 4], [5, 4], [5, 4]]",reddit,1,1,"Historical Hypocrisy (First, a disclaimer: T...",1,The examples they use to try to prove that fat...,the examples almost never include women who ar...,[none of the women in the paintings from centu...
10,"[[3, 3, 4, 4, 2, 3, 1, 4, 4, 4, 2, 2, 3, 1, 3,...",reddit,1,2,"If 100 minions @ 10mins is the metric for ""per...",1,it is very easy to see 'perfect farm' jungle play,just start a custom game and start doing all y...,"[Junglers should be ganking as well, Some of t..."
11,"[[3, 4, 4, 3, 5, 5, 2, 1, 3, 2, 5, 4, 3, 2, 3,...",reddit,1,2,"If 100 minions @ 10mins is the metric for ""per...",1,It is just such a hard metric,Junglers should be ganking as well,[just start a custom game and start doing all ...
...,...,...,...,...,...,...,...,...,...
7374,"[[1, 4], [3, 2], [1, 2], [1, 2], [1, 1]]",stack,1,249,"""Could not find Cortex-M device in the JTAG ch...",5,"The device works after rebooting the PC, clear...",Disabling all the drivers for the device in De...,[The drivers are installed automatically when ...
7375,"[[4, 2], [5, 3], [3, 2], [4, 2], [5, 3]]",stack,1,249,"""Could not find Cortex-M device in the JTAG ch...",8,I only had to do the first step using the link...,it was a Windows registry issue for me and for...,"[I had the same issue, I tried various propose..."
7376,"[[3, 5, 3], [3, 3, 3], [3, 4, 3], [2, 2, 2], [...",stack,1,249,"""Could not find Cortex-M device in the JTAG ch...",8,the solution proposed by splatapus on the TI p...,I had the same issue,[it was a Windows registry issue for me and fo...
7377,"[[3, 2, 3], [1, 2, 1], - Entailment Strength:\...",stack,1,249,"""Could not find Cortex-M device in the JTAG ch...",8,the solution proposed by splatapus on the TI p...,I tried various proposed solutions,[it was a Windows registry issue for me and fo...


In [93]:
# processed_ground_df['other_ground'] = processed_ground_df['other_ground'].astype(str)
# ground_df_full['other_ground'] = ground_df_full['other_ground'].astype(str)
# processed_ground_df = ground_df_full.merge(processed_ground_df, how='left')
# processed_ground_df['other_ground'] = processed_ground_df['other_ground'].apply(lambda x: ast.literal_eval(x))
# processed_ground_df

In [94]:
temp_df = ground_df_full[::]
temp_df.loc[processed_ground_df.index, 'ground_entailment_scores'] = processed_ground_df['ground_entailment_scores']
processed_ground_df = temp_df
processed_ground_df

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  temp_df.loc[processed_ground_df.index, 'ground_entailment_scores'] = processed_ground_df['ground_entailment_scores']


Unnamed: 0,subset,my_category,id,questionText,cluster_id,claim,ground,other_ground,ground_entailment_scores
0,reddit,1,0,What should I draw next? A FB user said they...,0,"If I do not choose Suicune now, I will definit...",Suicune is a good choice,[],
1,reddit,1,0,What should I draw next? A FB user said they...,0,I will consider Suicune for sure,Suicune is a good choice,[],
2,reddit,1,0,What should I draw next? A FB user said they...,1,I would love to see Reshiram,If you have the time,[],
3,reddit,1,0,What should I draw next? A FB user said they...,4,You should girafarig,girafarig is the best pokemon ever created no ...,[],
4,reddit,1,0,What should I draw next? A FB user said they...,5,I may wait on that one,I admire your steadfast courage in how awesome...,[],
...,...,...,...,...,...,...,...,...,...
7375,stack,1,249,"""Could not find Cortex-M device in the JTAG ch...",8,the solution proposed by splatapus on the TI p...,I had the same issue,"[I tried various proposed solutions, it was a ...","[[4, 2], [5, 3], [3, 2], [4, 2], [5, 3]]"
7376,stack,1,249,"""Could not find Cortex-M device in the JTAG ch...",8,the solution proposed by splatapus on the TI p...,I tried various proposed solutions,"[I had the same issue, it was a Windows regist...","[[3, 5, 3], [3, 3, 3], [3, 4, 3], [2, 2, 2], [..."
7377,stack,1,249,"""Could not find Cortex-M device in the JTAG ch...",8,the solution proposed by splatapus on the TI p...,it was a Windows registry issue for me and for...,"[I had the same issue, I tried various propose...","[[3, 2, 3], [1, 2, 1], - Entailment Strength:\..."
7378,stack,1,249,"""Could not find Cortex-M device in the JTAG ch...",8,I only had to do the first step using the link...,it was a Windows registry issue for me and for...,"[I had the same issue, I tried various propose...","[[5, 4], [3, 2], [4, 3], [4, 3], [5, 3]]"


In [95]:
# CHECK
mask = pd.isnull(processed_ground_df['ground_entailment_scores'])
mask &= processed_ground_df['other_ground'].str.len() > 0
processed_ground_df[mask]

Unnamed: 0,subset,my_category,id,questionText,cluster_id,claim,ground,other_ground,ground_entailment_scores


In [96]:
mask = pd.isnull(processed_ground_df['ground_entailment_scores'])
mask &= processed_ground_df['other_ground'].str.len() == 0
processed_ground_df.loc[mask, 'ground_entailment_scores'] = processed_ground_df.loc[mask, 'other_ground']

In [97]:
processed_ground_df

Unnamed: 0,subset,my_category,id,questionText,cluster_id,claim,ground,other_ground,ground_entailment_scores
0,reddit,1,0,What should I draw next? A FB user said they...,0,"If I do not choose Suicune now, I will definit...",Suicune is a good choice,[],[]
1,reddit,1,0,What should I draw next? A FB user said they...,0,I will consider Suicune for sure,Suicune is a good choice,[],[]
2,reddit,1,0,What should I draw next? A FB user said they...,1,I would love to see Reshiram,If you have the time,[],[]
3,reddit,1,0,What should I draw next? A FB user said they...,4,You should girafarig,girafarig is the best pokemon ever created no ...,[],[]
4,reddit,1,0,What should I draw next? A FB user said they...,5,I may wait on that one,I admire your steadfast courage in how awesome...,[],[]
...,...,...,...,...,...,...,...,...,...
7375,stack,1,249,"""Could not find Cortex-M device in the JTAG ch...",8,the solution proposed by splatapus on the TI p...,I had the same issue,"[I tried various proposed solutions, it was a ...","[[4, 2], [5, 3], [3, 2], [4, 2], [5, 3]]"
7376,stack,1,249,"""Could not find Cortex-M device in the JTAG ch...",8,the solution proposed by splatapus on the TI p...,I tried various proposed solutions,"[I had the same issue, it was a Windows regist...","[[3, 5, 3], [3, 3, 3], [3, 4, 3], [2, 2, 2], [..."
7377,stack,1,249,"""Could not find Cortex-M device in the JTAG ch...",8,the solution proposed by splatapus on the TI p...,it was a Windows registry issue for me and for...,"[I had the same issue, I tried various propose...","[[3, 2, 3], [1, 2, 1], - Entailment Strength:\..."
7378,stack,1,249,"""Could not find Cortex-M device in the JTAG ch...",8,I only had to do the first step using the link...,it was a Windows registry issue for me and for...,"[I had the same issue, I tried various propose...","[[5, 4], [3, 2], [4, 3], [4, 3], [5, 3]]"


In [98]:
processed_ground_df['ground_entailment_scores_processed'] = processed_ground_df['ground_entailment_scores'].\
    apply(lambda x: [ast.literal_eval(
                        re.sub(r",{2,}", ",",
                        re.sub(r"((, +,) *)+", ",", 
                               re.sub(r"\[ *, *", "[",
                                    re.findall(r"\[[0-9\n \,]+\]", attempt)[0].replace("0", "").replace("[,]", "[]"))
                              )
                        ).replace("[,", "[")
                        
                    )
                     for attempt in x 
                     if attempt != None and len(re.findall(r"\[[0-9\n \,]+\]", attempt)) > 0])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  processed_ground_df['ground_entailment_scores_processed'] = processed_ground_df['ground_entailment_scores'].\


In [99]:
# # CHECK
# mask = processed_final_df['claim_entailment_scores_processed'].str.len() != processed_final_df['claim_entailment_scores'].str.len()
# processed_final_df[mask].iloc[5]['claim_entailment_scores']

In [100]:
processed_ground_df['ground_entailment_scores_processed'] = processed_ground_df.\
    apply(lambda row: [attempt[:len(row['other_ground'])] 
                       for attempt in row['ground_entailment_scores_processed'] 
                       if len(attempt) >= len(row['other_ground'])], axis=1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  processed_ground_df['ground_entailment_scores_processed'] = processed_ground_df.\


In [101]:
processed_ground_df['ground_entailment_scores_avg'] = processed_ground_df['ground_entailment_scores_processed'].\
    apply(lambda x: np.mean(x, axis=0))

  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  processed_ground_df['ground_entailment_scores_avg'] = processed_ground_df['ground_entailment_scores_processed'].\


In [102]:
sample = processed_ground_df.iloc[19]['ground_entailment_scores_processed']
sample

[[5, 3, 2, 2, 3, 1, 2, 1, 2, 5, 4, 3, 4, 1, 4, 2, 2, 4, 4, 3, 4],
 [5, 1, 1, 2, 2, 1, 1, 1, 3, 3, 1, 3, 3, 1, 2, 4, 2, 5, 3, 5, 5],
 [5, 1, 1, 2, 3, 1, 2, 1, 2, 5, 3, 2, 4, 1, 2, 2, 2, 4, 3, 2, 5],
 [5, 2, 2, 3, 2, 1, 4, 1, 3, 4, 2, 4, 3, 1, 1, 2, 3, 2, 4, 4, 2]]

In [103]:
# CHECK
mask = pd.isnull(processed_ground_df['ground_entailment_scores_avg'])
mask &= processed_ground_df['ground_entailment_scores_processed'].str.len() > 0
mask &= processed_ground_df['other_ground'].str.len() > 0
processed_ground_df[mask]

Unnamed: 0,subset,my_category,id,questionText,cluster_id,claim,ground,other_ground,ground_entailment_scores,ground_entailment_scores_processed,ground_entailment_scores_avg


In [104]:
mask = pd.isnull(processed_ground_df['ground_entailment_scores_avg'])
mask &= processed_ground_df['ground_entailment_scores_processed'].str.len() == 0
mask &= processed_ground_df['other_ground'].str.len() == 0
processed_ground_df.loc[mask, 'ground_entailment_scores_avg'] = processed_ground_df.loc[mask, 'other_ground']

In [105]:
processed_ground_df

Unnamed: 0,subset,my_category,id,questionText,cluster_id,claim,ground,other_ground,ground_entailment_scores,ground_entailment_scores_processed,ground_entailment_scores_avg
0,reddit,1,0,What should I draw next? A FB user said they...,0,"If I do not choose Suicune now, I will definit...",Suicune is a good choice,[],[],[],[]
1,reddit,1,0,What should I draw next? A FB user said they...,0,I will consider Suicune for sure,Suicune is a good choice,[],[],[],[]
2,reddit,1,0,What should I draw next? A FB user said they...,1,I would love to see Reshiram,If you have the time,[],[],[],[]
3,reddit,1,0,What should I draw next? A FB user said they...,4,You should girafarig,girafarig is the best pokemon ever created no ...,[],[],[],[]
4,reddit,1,0,What should I draw next? A FB user said they...,5,I may wait on that one,I admire your steadfast courage in how awesome...,[],[],[],[]
...,...,...,...,...,...,...,...,...,...,...,...
7375,stack,1,249,"""Could not find Cortex-M device in the JTAG ch...",8,the solution proposed by splatapus on the TI p...,I had the same issue,"[I tried various proposed solutions, it was a ...","[[4, 2], [5, 3], [3, 2], [4, 2], [5, 3]]",[],
7376,stack,1,249,"""Could not find Cortex-M device in the JTAG ch...",8,the solution proposed by splatapus on the TI p...,I tried various proposed solutions,"[I had the same issue, it was a Windows regist...","[[3, 5, 3], [3, 3, 3], [3, 4, 3], [2, 2, 2], [...","[[3, 5, 3], [3, 3, 3], [3, 4, 3], [2, 2, 2], [...","[2.8, 3.4, 2.8]"
7377,stack,1,249,"""Could not find Cortex-M device in the JTAG ch...",8,the solution proposed by splatapus on the TI p...,it was a Windows registry issue for me and for...,"[I had the same issue, I tried various propose...","[[3, 2, 3], [1, 2, 1], - Entailment Strength:\...","[[3, 2], [1, 2], [2, 4], [1, 4], [1, 4]]","[1.6, 3.2]"
7378,stack,1,249,"""Could not find Cortex-M device in the JTAG ch...",8,I only had to do the first step using the link...,it was a Windows registry issue for me and for...,"[I had the same issue, I tried various propose...","[[5, 4], [3, 2], [4, 3], [4, 3], [5, 3]]","[[5, 4], [3, 2], [4, 3], [4, 3], [5, 3]]","[4.2, 3.0]"


In [106]:
mask = processed_ground_df['ground_entailment_scores_avg'].str.len() != processed_ground_df['other_ground'].str.len()
processed_ground_df[mask]['ground_entailment_scores_processed'].iloc[0]

[]

In [107]:
processed_ground_df[mask].shape

(370, 11)

In [108]:
processed_ground_df = processed_ground_df[~mask]

In [109]:
processed_ground_df.shape

(7010, 11)

In [110]:
processed_ground_df = processed_ground_df.explode(['other_ground', 'ground_entailment_scores_avg'])

In [111]:
processed_ground_df['ground_pair'] = processed_ground_df.apply(lambda row: sorted([row['ground'], row['other_ground']]) 
                                                               if str(row['other_ground']) != 'nan' else [row['ground']], axis=1)
processed_ground_df['ground_pair'] = processed_ground_df['ground_pair'].astype(str)

In [112]:
processed_ground_df['other_ground'] = processed_ground_df['other_ground'].astype(str)

In [113]:
processed_ground_df

Unnamed: 0,subset,my_category,id,questionText,cluster_id,claim,ground,other_ground,ground_entailment_scores,ground_entailment_scores_processed,ground_entailment_scores_avg,ground_pair
0,reddit,1,0,What should I draw next? A FB user said they...,0,"If I do not choose Suicune now, I will definit...",Suicune is a good choice,,[],[],,['Suicune is a good choice']
1,reddit,1,0,What should I draw next? A FB user said they...,0,I will consider Suicune for sure,Suicune is a good choice,,[],[],,['Suicune is a good choice']
2,reddit,1,0,What should I draw next? A FB user said they...,1,I would love to see Reshiram,If you have the time,,[],[],,['If you have the time']
3,reddit,1,0,What should I draw next? A FB user said they...,4,You should girafarig,girafarig is the best pokemon ever created no ...,,[],[],,['girafarig is the best pokemon ever created n...
4,reddit,1,0,What should I draw next? A FB user said they...,5,I may wait on that one,I admire your steadfast courage in how awesome...,,[],[],,['I admire your steadfast courage in how aweso...
...,...,...,...,...,...,...,...,...,...,...,...,...
7377,stack,1,249,"""Could not find Cortex-M device in the JTAG ch...",8,the solution proposed by splatapus on the TI p...,it was a Windows registry issue for me and for...,I had the same issue,"[[3, 2, 3], [1, 2, 1], - Entailment Strength:\...","[[3, 2], [1, 2], [2, 4], [1, 4], [1, 4]]",1.6,"['I had the same issue', 'it was a Windows reg..."
7377,stack,1,249,"""Could not find Cortex-M device in the JTAG ch...",8,the solution proposed by splatapus on the TI p...,it was a Windows registry issue for me and for...,I tried various proposed solutions,"[[3, 2, 3], [1, 2, 1], - Entailment Strength:\...","[[3, 2], [1, 2], [2, 4], [1, 4], [1, 4]]",3.2,"['I tried various proposed solutions', 'it was..."
7378,stack,1,249,"""Could not find Cortex-M device in the JTAG ch...",8,I only had to do the first step using the link...,it was a Windows registry issue for me and for...,I had the same issue,"[[5, 4], [3, 2], [4, 3], [4, 3], [5, 3]]","[[5, 4], [3, 2], [4, 3], [4, 3], [5, 3]]",4.2,"['I had the same issue', 'it was a Windows reg..."
7378,stack,1,249,"""Could not find Cortex-M device in the JTAG ch...",8,I only had to do the first step using the link...,it was a Windows registry issue for me and for...,I tried various proposed solutions,"[[5, 4], [3, 2], [4, 3], [4, 3], [5, 3]]","[[5, 4], [3, 2], [4, 3], [4, 3], [5, 3]]",3.0,"['I tried various proposed solutions', 'it was..."


NOTE: DUPLICATE GROUND OF DIFFERENT CLAIM. TWO CLAIMS CAN BE SUPPORTED BY SIMILAR GROUND, AND THEREFORE THE CALCULATION BETWEEN GROUND VS OTHER GROUND IS DUPLICATE --> NEED TO CHOOSE THE ONE WITH HIGHER VALUE

In [114]:
# REMOVE DUPLICATE GROUND OF DIFFERENT CLAIM (TAKING ONES WITH THE HIGHER VALUE)
temp_df = processed_ground_df.groupby(['subset', 'id', 'questionText', 'cluster_id', 'ground', 'other_ground'])\
    .apply(lambda grp: grp.sort_values(by=['ground_entailment_scores_avg'], ascending=False).head(1)).reset_index(drop=True)
temp_df

  temp_df = processed_ground_df.groupby(['subset', 'id', 'questionText', 'cluster_id', 'ground', 'other_ground'])\


Unnamed: 0,subset,my_category,id,questionText,cluster_id,claim,ground,other_ground,ground_entailment_scores,ground_entailment_scores_processed,ground_entailment_scores_avg,ground_pair
0,reddit,1,0,What should I draw next? A FB user said they...,0,"If I do not choose Suicune now, I will definit...",Suicune is a good choice,,[],[],,['Suicune is a good choice']
1,reddit,1,0,What should I draw next? A FB user said they...,1,I would love to see Reshiram,If you have the time,,[],[],,['If you have the time']
2,reddit,1,0,What should I draw next? A FB user said they...,4,You should girafarig,girafarig is the best pokemon ever created no ...,,[],[],,['girafarig is the best pokemon ever created n...
3,reddit,1,0,What should I draw next? A FB user said they...,5,I may wait on that one,I admire your steadfast courage in how awesome...,,[],[],,['I admire your steadfast courage in how aweso...
4,reddit,1,1,"Historical Hypocrisy (First, a disclaimer: T...",1,The women in the paintings from centuries ago ...,none of the women in the paintings from centur...,none of the women in the paintings from centur...,"[[5, 5], [5, 5], [5, 5], [5, 5], [5, 5]]","[[5, 5], [5, 5], [5, 5], [5, 5], [5, 5]]",5.0,"[""none of the women in the paintings from cent..."
...,...,...,...,...,...,...,...,...,...,...,...,...
41909,stack,1,249,"""Could not find Cortex-M device in the JTAG ch...",8,the solution proposed by splatapus on the TI p...,I tried various proposed solutions,I had the same issue,"[[3, 5, 3], [3, 3, 3], [3, 4, 3], [2, 2, 2], [...","[[3, 5, 3], [3, 3, 3], [3, 4, 3], [2, 2, 2], [...",2.8,"['I had the same issue', 'I tried various prop..."
41910,stack,1,249,"""Could not find Cortex-M device in the JTAG ch...",8,the solution proposed by splatapus on the TI p...,I tried various proposed solutions,it was a Windows registry issue for me and for...,"[[3, 5, 3], [3, 3, 3], [3, 4, 3], [2, 2, 2], [...","[[3, 5, 3], [3, 3, 3], [3, 4, 3], [2, 2, 2], [...",3.4,"['I tried various proposed solutions', 'it was..."
41911,stack,1,249,"""Could not find Cortex-M device in the JTAG ch...",8,I only had to do the first step using the link...,it was a Windows registry issue for me and for...,I had the same issue,"[[5, 4], [3, 2], [4, 3], [4, 3], [5, 3]]","[[5, 4], [3, 2], [4, 3], [4, 3], [5, 3]]",4.2,"['I had the same issue', 'it was a Windows reg..."
41912,stack,1,249,"""Could not find Cortex-M device in the JTAG ch...",8,the solution proposed by splatapus on the TI p...,it was a Windows registry issue for me and for...,I tried various proposed solutions,"[[3, 2, 3], [1, 2, 1], - Entailment Strength:\...","[[3, 2], [1, 2], [2, 4], [1, 4], [1, 4]]",3.2,"['I tried various proposed solutions', 'it was..."


In [115]:
processed_ground_df = temp_df

In [116]:
processed_ground_df

Unnamed: 0,subset,my_category,id,questionText,cluster_id,claim,ground,other_ground,ground_entailment_scores,ground_entailment_scores_processed,ground_entailment_scores_avg,ground_pair
0,reddit,1,0,What should I draw next? A FB user said they...,0,"If I do not choose Suicune now, I will definit...",Suicune is a good choice,,[],[],,['Suicune is a good choice']
1,reddit,1,0,What should I draw next? A FB user said they...,1,I would love to see Reshiram,If you have the time,,[],[],,['If you have the time']
2,reddit,1,0,What should I draw next? A FB user said they...,4,You should girafarig,girafarig is the best pokemon ever created no ...,,[],[],,['girafarig is the best pokemon ever created n...
3,reddit,1,0,What should I draw next? A FB user said they...,5,I may wait on that one,I admire your steadfast courage in how awesome...,,[],[],,['I admire your steadfast courage in how aweso...
4,reddit,1,1,"Historical Hypocrisy (First, a disclaimer: T...",1,The women in the paintings from centuries ago ...,none of the women in the paintings from centur...,none of the women in the paintings from centur...,"[[5, 5], [5, 5], [5, 5], [5, 5], [5, 5]]","[[5, 5], [5, 5], [5, 5], [5, 5], [5, 5]]",5.0,"[""none of the women in the paintings from cent..."
...,...,...,...,...,...,...,...,...,...,...,...,...
41909,stack,1,249,"""Could not find Cortex-M device in the JTAG ch...",8,the solution proposed by splatapus on the TI p...,I tried various proposed solutions,I had the same issue,"[[3, 5, 3], [3, 3, 3], [3, 4, 3], [2, 2, 2], [...","[[3, 5, 3], [3, 3, 3], [3, 4, 3], [2, 2, 2], [...",2.8,"['I had the same issue', 'I tried various prop..."
41910,stack,1,249,"""Could not find Cortex-M device in the JTAG ch...",8,the solution proposed by splatapus on the TI p...,I tried various proposed solutions,it was a Windows registry issue for me and for...,"[[3, 5, 3], [3, 3, 3], [3, 4, 3], [2, 2, 2], [...","[[3, 5, 3], [3, 3, 3], [3, 4, 3], [2, 2, 2], [...",3.4,"['I tried various proposed solutions', 'it was..."
41911,stack,1,249,"""Could not find Cortex-M device in the JTAG ch...",8,I only had to do the first step using the link...,it was a Windows registry issue for me and for...,I had the same issue,"[[5, 4], [3, 2], [4, 3], [4, 3], [5, 3]]","[[5, 4], [3, 2], [4, 3], [4, 3], [5, 3]]",4.2,"['I had the same issue', 'it was a Windows reg..."
41912,stack,1,249,"""Could not find Cortex-M device in the JTAG ch...",8,the solution proposed by splatapus on the TI p...,it was a Windows registry issue for me and for...,I tried various proposed solutions,"[[3, 2, 3], [1, 2, 1], - Entailment Strength:\...","[[3, 2], [1, 2], [2, 4], [1, 4], [1, 4]]",3.2,"['I tried various proposed solutions', 'it was..."


In [117]:
# CHECK
display(processed_ground_df.groupby(['subset', 'id', 'questionText', 'cluster_id', 'ground_pair']).filter(lambda x: len(x) == 3))
processed_ground_df[['subset', 'id', 'questionText', 'cluster_id', 'ground_pair']].value_counts().reset_index()

Unnamed: 0,subset,my_category,id,questionText,cluster_id,claim,ground,other_ground,ground_entailment_scores,ground_entailment_scores_processed,ground_entailment_scores_avg,ground_pair


Unnamed: 0,subset,id,questionText,cluster_id,ground_pair,count
0,stack,65,"Putting ""away"" after a verb",0,['It is usually perceived as more casual or fr...,2
1,stack,86,Is it possible to de-gloss paint?,0,['I mixed some talc (baby powder) with my oil ...,2
2,stack,86,Is it possible to de-gloss paint?,0,['I mixed some talc (baby powder) with my oil ...,2
3,stack,86,Is it possible to de-gloss paint?,0,['I mixed some talc (baby powder) with my oil ...,2
4,stack,86,Is it possible to de-gloss paint?,0,['I mixed some talc (baby powder) with my oil ...,2
...,...,...,...,...,...,...
23275,stack,85,How can I test for there being 0 entities? As ...,8,['Mine is underneath the golem'],1
23276,stack,85,How can I test for there being 0 entities? As ...,7,"[""This command block will get an output if the...",1
23277,stack,85,How can I test for there being 0 entities? As ...,6,['Now make another conditional command block c...,1
23278,stack,85,How can I test for there being 0 entities? As ...,4,"['But remember, if one fails the chain does no...",1


In [118]:
processed_ground_df = processed_ground_df.groupby(['subset', 'id', 'questionText', 'cluster_id', 'ground_pair']).agg({
    'ground_entailment_scores_avg': (lambda x: x.tolist()),
    'ground': (lambda x: x.tolist()),
    'other_ground': (lambda x: x.tolist())
}).reset_index()

In [119]:
processed_ground_df['ground_entailment_scores_final_avg'] = processed_ground_df['ground_entailment_scores_avg'].\
    apply(lambda x: statistics.mean(x))

In [120]:
processed_ground_df['ground'] = processed_ground_df['ground'].apply(lambda x: x[0])
processed_ground_df['other_ground'] = processed_ground_df['other_ground'].apply(lambda x: x[0])

In [121]:
processed_ground_df['other_ground'] = processed_ground_df['other_ground'].apply(lambda x: np.nan if x == 'nan' else x)

In [122]:
processed_ground_df

Unnamed: 0,subset,id,questionText,cluster_id,ground_pair,ground_entailment_scores_avg,ground,other_ground,ground_entailment_scores_final_avg
0,reddit,0,What should I draw next? A FB user said they...,0,['Suicune is a good choice'],[nan],Suicune is a good choice,,
1,reddit,0,What should I draw next? A FB user said they...,1,['If you have the time'],[nan],If you have the time,,
2,reddit,0,What should I draw next? A FB user said they...,4,['girafarig is the best pokemon ever created n...,[nan],girafarig is the best pokemon ever created no ...,,
3,reddit,0,What should I draw next? A FB user said they...,5,['I admire your steadfast courage in how aweso...,[nan],I admire your steadfast courage in how awesome...,,
4,reddit,1,"Historical Hypocrisy (First, a disclaimer: T...",1,"[""none of the women in the paintings from cent...","[5.0, 5.0]",none of the women in the paintings from centur...,none of the women in the paintings from centur...,5.0
...,...,...,...,...,...,...,...,...,...
23275,stack,249,"""Could not find Cortex-M device in the JTAG ch...",5,['Otherwise download the drivers and install t...,"[2.2, 3.0]",Otherwise download the drivers and install them,The drivers are installed automatically when t...,2.6
23276,stack,249,"""Could not find Cortex-M device in the JTAG ch...",8,"['I had the same issue', 'I tried various prop...",[2.8],I tried various proposed solutions,I had the same issue,2.8
23277,stack,249,"""Could not find Cortex-M device in the JTAG ch...",8,"['I had the same issue', 'it was a Windows reg...",[4.2],it was a Windows registry issue for me and for...,I had the same issue,4.2
23278,stack,249,"""Could not find Cortex-M device in the JTAG ch...",8,"['I tried various proposed solutions', 'it was...","[3.4, 3.2]",I tried various proposed solutions,it was a Windows registry issue for me and for...,3.3


In [123]:
# CHECK
mask = pd.isnull(processed_ground_df['other_ground'])
mask &= pd.notnull(processed_ground_df['ground_entailment_scores_final_avg'])
processed_ground_df[mask]

Unnamed: 0,subset,id,questionText,cluster_id,ground_pair,ground_entailment_scores_avg,ground,other_ground,ground_entailment_scores_final_avg


#### Clustering

In [124]:
import networkx as nx

# threshold = 4
# threshold = 3
# threshold = 2
# threshold = 3.5
# threshold = 3
# threshold = 3
# threshold = 2.8
threshold = 2
# threshold = 2.5
# threshold = 3
# threshold = 3.5
# threshold = 2.5
# threshold = 4
def create_clusters_graph_from_pairwise(grp):
    mask = grp['ground_entailment_scores_final_avg'] > threshold
    mask &= pd.notnull(processed_ground_df['other_ground'])
    new_grp = grp[mask]

    G = nx.Graph()
    G.add_nodes_from(grp['ground'].tolist() + grp[pd.notnull(grp['other_ground'])]['other_ground'].tolist())
    
    matching_pairs = [(claim_x, claim_y) for claim_x, claim_y in zip(new_grp['ground'], new_grp['other_ground'])]
    G.add_edges_from(matching_pairs)
    clusters = list(nx.connected_components(G))
    
    return G, clusters

In [125]:
def create_clusters_graph_from_pairwise_wrapper(grp):
    G, clusters = create_clusters_graph_from_pairwise(grp)
    clusters = [list(cluster) for cluster in clusters]
    grp['clusters'] = [clusters for i in range(len(grp))]
    return grp

In [126]:
ground_cluster_df = processed_ground_df.groupby(['subset', 'id', 'questionText', 'cluster_id']).\
    apply(create_clusters_graph_from_pairwise_wrapper).reset_index(drop=True)

  ground_cluster_df = processed_ground_df.groupby(['subset', 'id', 'questionText', 'cluster_id']).\


In [127]:
ground_cluster_df

Unnamed: 0,subset,id,questionText,cluster_id,ground_pair,ground_entailment_scores_avg,ground,other_ground,ground_entailment_scores_final_avg,clusters
0,reddit,0,What should I draw next? A FB user said they...,0,['Suicune is a good choice'],[nan],Suicune is a good choice,,,[[Suicune is a good choice]]
1,reddit,0,What should I draw next? A FB user said they...,1,['If you have the time'],[nan],If you have the time,,,[[If you have the time]]
2,reddit,0,What should I draw next? A FB user said they...,4,['girafarig is the best pokemon ever created n...,[nan],girafarig is the best pokemon ever created no ...,,,[[girafarig is the best pokemon ever created n...
3,reddit,0,What should I draw next? A FB user said they...,5,['I admire your steadfast courage in how aweso...,[nan],I admire your steadfast courage in how awesome...,,,[[I admire your steadfast courage in how aweso...
4,reddit,1,"Historical Hypocrisy (First, a disclaimer: T...",1,"[""none of the women in the paintings from cent...","[5.0, 5.0]",none of the women in the paintings from centur...,none of the women in the paintings from centur...,5.0,[[the examples almost never include women who ...
...,...,...,...,...,...,...,...,...,...,...
23275,stack,249,"""Could not find Cortex-M device in the JTAG ch...",5,['Otherwise download the drivers and install t...,"[2.2, 3.0]",Otherwise download the drivers and install them,The drivers are installed automatically when t...,2.6,[[Disabling all the drivers for the device in ...
23276,stack,249,"""Could not find Cortex-M device in the JTAG ch...",8,"['I had the same issue', 'I tried various prop...",[2.8],I tried various proposed solutions,I had the same issue,2.8,"[[I tried various proposed solutions, it was a..."
23277,stack,249,"""Could not find Cortex-M device in the JTAG ch...",8,"['I had the same issue', 'it was a Windows reg...",[4.2],it was a Windows registry issue for me and for...,I had the same issue,4.2,"[[I tried various proposed solutions, it was a..."
23278,stack,249,"""Could not find Cortex-M device in the JTAG ch...",8,"['I tried various proposed solutions', 'it was...","[3.4, 3.2]",I tried various proposed solutions,it was a Windows registry issue for me and for...,3.3,"[[I tried various proposed solutions, it was a..."


In [128]:
ground_cluster_df = ground_cluster_df[['subset', 'id', 'questionText', 'cluster_id', 'clusters']].\
    drop_duplicates(subset=['subset', 'id', 'questionText', 'cluster_id'])

In [129]:
row = ground_cluster_df.iloc[14]
print(row['questionText'])
print(row['cluster_id'])
display(row['clusters'])

Bike racks - what to look for?   I'm in the market for a bike rack. I've been looking at the Saris Bones 2 and 3. The only difference I can tell between the them is that the 3 series has an extra arm for stability. Is the Bones 2 not to be trusted?      Any other brands/models I should be considering?
0


[["even saris says 'if you can attach bikes to a roof rack or a hitch, do that'",
  'A hitch allows you to get a sturdier car rack at a better price point',
  'Most of the time it can be done under $100',
  'A hitch is not a bad investment',
  'A hitch gives you the option to tow stuff in the future']]

In [130]:
final_cluster_df = cluster_df.merge(ground_cluster_df)
# questionList = final_cluster_df['questionText'].unique()
# index = 2
# index = 3
# index = 9
# display_df = final_cluster_df[final_cluster_df['questionText'] == questionList[index]]
# display_df = final_cluster_df[final_cluster_df['questionText'].str.contains("GTA")]
display_df = final_cluster_df[final_cluster_df['questionText'].str.contains("Domino")]

print(display_df['questionText'].iloc[0])
for i, row in display_df.iterrows():
    print("### CLUSTER ID: %s" %(row['cluster_id']) )
    print("### CLAIM CLUSTER: %s" %(row['s1_clusters']))
    display("### REASONS CLUSTERs:", row['clusters'])
    
    print()

What kind of cheese does Pizza Hut or Domino's use?
### CLUSTER ID: 0
### CLAIM CLUSTER: ['It is important that American-style pizza cheese be a very dry mozzarella', "Domino's (in the UK) claims to use 100% mozzarella", 'A moisture level below 48% is recommended', 'More expensive pizzerias normally spring for the full fat mozzarella cheese', "My pizza started coming out much better after I found a harder block of mozzarella than the brand I'd been using", 'Pizza hut uses skim milk mozzarella on its pizza, at least in the USA', "In the US, most mozzarella is 'low-moisture' (45-52% water): dried, aged, and sold in blocks instead of balls, with a much longer shelf-life than 'fresh' mozzarella", "Cheeses on the high end of 'low-moisture' would result in exactly what I was getting to come out of the oven from my 50% moisture brand: an unappetizing translucent gray cheese, with a slight bitter flavor", "Domino's mozzarella is full fat", "mozzarella in India doesn't come close to the cheese 

'### REASONS CLUSTERs:'

[['As I remember, they recommended a moisture level below 48%',
  'I can easily purchase mozzarella in India and have been using it for pizza',
  'The result of making American-style pizza from grated fresh mozzarella can be a soggy mess, as the water leaks out of the cheese as it melts',
  "In the US, it's allowed to be between 52% and 60% water",
  "In Europe, the predominant variety is what, under US law, is called 'fresh mozzarella', which comes in a ball, either shrink-wrapped or packed in water",
  "Domino's offers a reduced fat cheese option",
  "'pizza cheese' is loaded with modified food starch",
  'That type of mozzarella is not appropriate for use in making American style pizza, as its moisture is much too high',
  'Its moisture is much too high',
  "'pizza cheese' is not 'real' mozzarella",
  "'pizza cheese' is made by Leprino Foods",
  "It's just not appropriate for trying to make American-style pizza",
  'personal experience',
  "There are two very different cheeses named


### CLUSTER ID: 1
### CLAIM CLUSTER: ["The cheese that Domino's uses is a mixture of mozzarella, monterey Jack and white cheddar in equal proportions"]


'### REASONS CLUSTERs:'

[['I used to work there way back in the day when we were required to be trained on all of the aspects of making the pizzas even right down to what went into the ingredients']]


### CLUSTER ID: 2
### CLAIM CLUSTER: ['There is no right or wrong when it comes to choosing a cheese', 'I would like to experiment with cheese']


'### REASONS CLUSTERs:'

[['It is a preference']]




In [131]:
ground_cluster_df[ground_cluster_df['clusters'].str.len() == 0]

Unnamed: 0,subset,id,questionText,cluster_id,clusters


In [134]:
# # # # ground_cluster_df.to_pickle("./stage_2_refined_new_llm_based_clustered_test_set_s1_3.5_3.0.pkl")
# # # # ground_cluster_df.to_pickle("./stage_2_refined_new_llm_based_clustered_test_set_s1_3.5_3.5.pkl")
# # ground_cluster_df.to_pickle("./stage_2_refined_new_llm_based_clustered_test_set_s1_3.5_2.5.pkl")
# ground_cluster_df.to_pickle("./stage_2_refined_new_llm_based_clustered_test_set_s1_3.5_2.pkl")

# # # ground_cluster_df.to_pickle("./stage_2_refined_new_llm_based_clustered_test_set_s1_3.0_3.0.pkl")

# Create Quantitative Summary

In [63]:
claim_cluster_df = pd.read_pickle("./stage_1_refined_new_llm_based_clustered_test_set_s1_entail_ratio_3_3.5_0.1.pkl")
# claim_cluster_df = pd.read_pickle("./stage_1_refined_new_llm_based_clustered_test_set_s1_entail_ratio_3_3_0.1.pkl")
claim_cluster_df['cluster_id'] = claim_cluster_df['clusters'].apply(lambda x: [i for i in range(len(x))])
temp_df = claim_cluster_df.explode(['cluster_id', 'clusters']).explode(['clusters'])
temp_df = temp_df.rename(columns={'clusters': 'claim'})
temp_2_df = processed_df[['subset', 'my_category', 'id', 'questionText', 'claim', 'ground']]
temp_2_df = temp_2_df.drop_duplicates(subset=['subset', 'my_category', 'id', 'questionText', 'claim'])
temp_df = temp_df.merge(temp_2_df)
claim_cluster_df = temp_df.groupby(['subset', 'my_category', 'id', 'questionText', 'cluster_id'], sort=False).agg({
    'claim': (lambda x: x.tolist()),
    'ground': (lambda x: x.tolist())
}).reset_index()
claim_cluster_df = claim_cluster_df.rename(columns={'claim': 'claim_cluster'})
claim_cluster_df

Unnamed: 0,subset,my_category,id,questionText,cluster_id,claim_cluster,ground
0,reddit,1,0,What should I draw next? A FB user said they...,0,[I would like to see any of the Legendary Beas...,"[[], [Suicune is a good choice], [Suicune is a..."
1,reddit,1,0,What should I draw next? A FB user said they...,1,[I would love to see Reshiram],[[If you have the time]]
2,reddit,1,0,What should I draw next? A FB user said they...,2,[Pancham],[[]]
3,reddit,1,0,What should I draw next? A FB user said they...,3,[Seel?],[[]]
4,reddit,1,0,What should I draw next? A FB user said they...,4,[You should girafarig],[[girafarig is the best pokemon ever created n...
...,...,...,...,...,...,...,...
4832,stack,1,249,"""Could not find Cortex-M device in the JTAG ch...",7,[my JTAG debugger was functioning well on both...,[[]]
4833,stack,1,249,"""Could not find Cortex-M device in the JTAG ch...",8,[the solution proposed by splatapus on the TI ...,"[[I had the same issue, I tried various propos..."
4834,stack,1,249,"""Could not find Cortex-M device in the JTAG ch...",9,[A basic LED blink program can be used for tes...,"[[The device works after rebooting the PC, cle..."
4835,stack,1,249,"""Could not find Cortex-M device in the JTAG ch...",10,[Check the probe settings],[[]]


In [64]:
# ground_cluster_df = pd.read_pickle("./stage_2_refined_new_llm_based_clustered_test_set_s1_3.5_3.0.pkl")
# ground_cluster_df = pd.read_pickle("./stage_2_refined_new_llm_based_clustered_test_set_s1_3.5_3.5.pkl")  # NOT YET TRIED
ground_cluster_df = pd.read_pickle("./stage_2_refined_new_llm_based_clustered_test_set_s1_3.5_2.5.pkl")
# ground_cluster_df = pd.read_pickle("./stage_2_refined_new_llm_based_clustered_test_set_s1_3.5_2.pkl")

# ground_cluster_df = pd.read_pickle("./stage_2_refined_new_llm_based_clustered_test_set_s1_3.0_3.0.pkl")
ground_cluster_df = ground_cluster_df.rename(columns={'clusters': 'ground_clusters'})
ground_cluster_df

Unnamed: 0,subset,id,questionText,cluster_id,ground_clusters
0,reddit,0,What should I draw next? A FB user said they...,0,[[Suicune is a good choice]]
1,reddit,0,What should I draw next? A FB user said they...,1,[[If you have the time]]
2,reddit,0,What should I draw next? A FB user said they...,4,[[girafarig is the best pokemon ever created n...
3,reddit,0,What should I draw next? A FB user said they...,5,[[I admire your steadfast courage in how aweso...
4,reddit,1,"Historical Hypocrisy (First, a disclaimer: T...",1,[[none of the women in the paintings from cent...
...,...,...,...,...,...
23262,stack,249,"""Could not find Cortex-M device in the JTAG ch...",1,[[this will solve issues]]
23263,stack,249,"""Could not find Cortex-M device in the JTAG ch...",3,"[[Only JTAG is supported], [The ICDI can be us..."
23273,stack,249,"""Could not find Cortex-M device in the JTAG ch...",5,[[Disabling all the drivers for the device in ...
23276,stack,249,"""Could not find Cortex-M device in the JTAG ch...",8,"[[I tried various proposed solutions, I had th..."


In [65]:
final_df = claim_cluster_df.merge(ground_cluster_df, how='left')
final_df

Unnamed: 0,subset,my_category,id,questionText,cluster_id,claim_cluster,ground,ground_clusters
0,reddit,1,0,What should I draw next? A FB user said they...,0,[I would like to see any of the Legendary Beas...,"[[], [Suicune is a good choice], [Suicune is a...",[[Suicune is a good choice]]
1,reddit,1,0,What should I draw next? A FB user said they...,1,[I would love to see Reshiram],[[If you have the time]],[[If you have the time]]
2,reddit,1,0,What should I draw next? A FB user said they...,2,[Pancham],[[]],
3,reddit,1,0,What should I draw next? A FB user said they...,3,[Seel?],[[]],
4,reddit,1,0,What should I draw next? A FB user said they...,4,[You should girafarig],[[girafarig is the best pokemon ever created n...,[[girafarig is the best pokemon ever created n...
...,...,...,...,...,...,...,...,...
4832,stack,1,249,"""Could not find Cortex-M device in the JTAG ch...",7,[my JTAG debugger was functioning well on both...,[[]],
4833,stack,1,249,"""Could not find Cortex-M device in the JTAG ch...",8,[the solution proposed by splatapus on the TI ...,"[[I had the same issue, I tried various propos...","[[I tried various proposed solutions, I had th..."
4834,stack,1,249,"""Could not find Cortex-M device in the JTAG ch...",9,[A basic LED blink program can be used for tes...,"[[The device works after rebooting the PC, cle...","[[The device works after rebooting the PC, cle..."
4835,stack,1,249,"""Could not find Cortex-M device in the JTAG ch...",10,[Check the probe settings],[[]],


In [66]:
mask = pd.isnull(final_df['ground_clusters'])
final_df[mask]

Unnamed: 0,subset,my_category,id,questionText,cluster_id,claim_cluster,ground,ground_clusters
2,reddit,1,0,What should I draw next? A FB user said they...,2,[Pancham],[[]],
3,reddit,1,0,What should I draw next? A FB user said they...,3,[Seel?],[[]],
6,reddit,1,1,"Historical Hypocrisy (First, a disclaimer: T...",0,[I can't repost this enough],[[]],
8,reddit,1,1,"Historical Hypocrisy (First, a disclaimer: T...",2,"[Postimg is horrible, Your view on the image h...","[[], []]",
11,reddit,1,1,"Historical Hypocrisy (First, a disclaimer: T...",5,[Hugely obese people back then were very rare ...,[[]],
...,...,...,...,...,...,...,...,...
4829,stack,1,249,"""Could not find Cortex-M device in the JTAG ch...",4,[The Keil forums might have some more ideas yo...,[[]],
4831,stack,1,249,"""Could not find Cortex-M device in the JTAG ch...",6,[Try lowering the JTAG frequency],[[]],
4832,stack,1,249,"""Could not find Cortex-M device in the JTAG ch...",7,[my JTAG debugger was functioning well on both...,[[]],
4835,stack,1,249,"""Could not find Cortex-M device in the JTAG ch...",10,[Check the probe settings],[[]],


In [67]:
from collections import OrderedDict
final_df.loc[mask, 'ground_clusters'] = final_df.loc[mask, 'ground'].apply(lambda x: [list(OrderedDict.fromkeys(np.hstack(x)))])

In [68]:
final_df

Unnamed: 0,subset,my_category,id,questionText,cluster_id,claim_cluster,ground,ground_clusters
0,reddit,1,0,What should I draw next? A FB user said they...,0,[I would like to see any of the Legendary Beas...,"[[], [Suicune is a good choice], [Suicune is a...",[[Suicune is a good choice]]
1,reddit,1,0,What should I draw next? A FB user said they...,1,[I would love to see Reshiram],[[If you have the time]],[[If you have the time]]
2,reddit,1,0,What should I draw next? A FB user said they...,2,[Pancham],[[]],[[]]
3,reddit,1,0,What should I draw next? A FB user said they...,3,[Seel?],[[]],[[]]
4,reddit,1,0,What should I draw next? A FB user said they...,4,[You should girafarig],[[girafarig is the best pokemon ever created n...,[[girafarig is the best pokemon ever created n...
...,...,...,...,...,...,...,...,...
4832,stack,1,249,"""Could not find Cortex-M device in the JTAG ch...",7,[my JTAG debugger was functioning well on both...,[[]],[[]]
4833,stack,1,249,"""Could not find Cortex-M device in the JTAG ch...",8,[the solution proposed by splatapus on the TI ...,"[[I had the same issue, I tried various propos...","[[I tried various proposed solutions, I had th..."
4834,stack,1,249,"""Could not find Cortex-M device in the JTAG ch...",9,[A basic LED blink program can be used for tes...,"[[The device works after rebooting the PC, cle...","[[The device works after rebooting the PC, cle..."
4835,stack,1,249,"""Could not find Cortex-M device in the JTAG ch...",10,[Check the probe settings],[[]],[[]]


In [69]:
final_df = final_df.rename(columns={'questionText': 'query'})
final_df['arg_input'] = final_df.apply(lambda row: {'claim': row['claim_cluster'],
                                                    'ground': row['ground_clusters']}, axis=1)

In [70]:
final_df

Unnamed: 0,subset,my_category,id,query,cluster_id,claim_cluster,ground,ground_clusters,arg_input
0,reddit,1,0,What should I draw next? A FB user said they...,0,[I would like to see any of the Legendary Beas...,"[[], [Suicune is a good choice], [Suicune is a...",[[Suicune is a good choice]],{'claim': ['I would like to see any of the Leg...
1,reddit,1,0,What should I draw next? A FB user said they...,1,[I would love to see Reshiram],[[If you have the time]],[[If you have the time]],"{'claim': ['I would love to see Reshiram'], 'g..."
2,reddit,1,0,What should I draw next? A FB user said they...,2,[Pancham],[[]],[[]],"{'claim': ['Pancham'], 'ground': [[]]}"
3,reddit,1,0,What should I draw next? A FB user said they...,3,[Seel?],[[]],[[]],"{'claim': ['Seel?'], 'ground': [[]]}"
4,reddit,1,0,What should I draw next? A FB user said they...,4,[You should girafarig],[[girafarig is the best pokemon ever created n...,[[girafarig is the best pokemon ever created n...,"{'claim': ['You should girafarig'], 'ground': ..."
...,...,...,...,...,...,...,...,...,...
4832,stack,1,249,"""Could not find Cortex-M device in the JTAG ch...",7,[my JTAG debugger was functioning well on both...,[[]],[[]],{'claim': ['my JTAG debugger was functioning w...
4833,stack,1,249,"""Could not find Cortex-M device in the JTAG ch...",8,[the solution proposed by splatapus on the TI ...,"[[I had the same issue, I tried various propos...","[[I tried various proposed solutions, I had th...",{'claim': ['the solution proposed by splatapus...
4834,stack,1,249,"""Could not find Cortex-M device in the JTAG ch...",9,[A basic LED blink program can be used for tes...,"[[The device works after rebooting the PC, cle...","[[The device works after rebooting the PC, cle...",{'claim': ['A basic LED blink program can be u...
4835,stack,1,249,"""Could not find Cortex-M device in the JTAG ch...",10,[Check the probe settings],[[]],[[]],"{'claim': ['Check the probe settings'], 'groun..."


In [71]:
final_df

Unnamed: 0,subset,my_category,id,query,cluster_id,claim_cluster,ground,ground_clusters,arg_input
0,reddit,1,0,What should I draw next? A FB user said they...,0,[I would like to see any of the Legendary Beas...,"[[], [Suicune is a good choice], [Suicune is a...",[[Suicune is a good choice]],{'claim': ['I would like to see any of the Leg...
1,reddit,1,0,What should I draw next? A FB user said they...,1,[I would love to see Reshiram],[[If you have the time]],[[If you have the time]],"{'claim': ['I would love to see Reshiram'], 'g..."
2,reddit,1,0,What should I draw next? A FB user said they...,2,[Pancham],[[]],[[]],"{'claim': ['Pancham'], 'ground': [[]]}"
3,reddit,1,0,What should I draw next? A FB user said they...,3,[Seel?],[[]],[[]],"{'claim': ['Seel?'], 'ground': [[]]}"
4,reddit,1,0,What should I draw next? A FB user said they...,4,[You should girafarig],[[girafarig is the best pokemon ever created n...,[[girafarig is the best pokemon ever created n...,"{'claim': ['You should girafarig'], 'ground': ..."
...,...,...,...,...,...,...,...,...,...
4832,stack,1,249,"""Could not find Cortex-M device in the JTAG ch...",7,[my JTAG debugger was functioning well on both...,[[]],[[]],{'claim': ['my JTAG debugger was functioning w...
4833,stack,1,249,"""Could not find Cortex-M device in the JTAG ch...",8,[the solution proposed by splatapus on the TI ...,"[[I had the same issue, I tried various propos...","[[I tried various proposed solutions, I had th...",{'claim': ['the solution proposed by splatapus...
4834,stack,1,249,"""Could not find Cortex-M device in the JTAG ch...",9,[A basic LED blink program can be used for tes...,"[[The device works after rebooting the PC, cle...","[[The device works after rebooting the PC, cle...",{'claim': ['A basic LED blink program can be u...
4835,stack,1,249,"""Could not find Cortex-M device in the JTAG ch...",10,[Check the probe settings],[[]],[[]],"{'claim': ['Check the probe settings'], 'groun..."


In [72]:
import os
from openai import AzureOpenAI

endpoint = "https://s3695-ma1sfp0e-eastus2.cognitiveservices.azure.com/"
model_name = "gpt-4.1-mini"
deployment = "gpt-4.1-mini"

subscription_key = "9OzzB1oQ6CBTBiIqkjQSURWXX52LccWTu0scsgfmEcXUVG5CQRV7JQQJ99BDACHYHv6XJ3w3AAAAACOGABC8"
api_version = "2024-12-01-preview"

client = AzureOpenAI(
    api_version=api_version,
    azure_endpoint=endpoint,
    api_key=subscription_key,
)
model="gpt-4.1"
# model="gpt-4.1-mini"

# from openai import OpenAI
# import os
# import pandas as pd

# client = OpenAI(
#     api_key = "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" # RMIT Account
# )

prompt = "Once upon a time"

In [73]:
# BASE_PROMPT = """You will be provided with a community question, a social comment A answering the question, and a list of other social comments also answering the question.
# You will be asked to select best-matching (most semantically similar and sharing similar opinion and the similar aspect) comments from the list of other social comments. Each best-matching comment must correspond to a every opinion from the input social comment A. The selection must be from the input list of other social comments. Do not generate new best-matching comments.

# Use and output the following format:
# Community Question: <the input question>
# Social Comment: <the input comment>
# List of Other Social Comments: <the input list of other comments>
# Best-Matching Other Social Comments: <list of social comments best-matched to every opinion of the input social comment>

# """

In [74]:
# base_prompt = """
# In this task you are presented with a query on a product, a key point taken from the summary answering the query, and a sentence taken from a review of that product.
# You will be asked to answer the following question: "Does the key point match, i.e, represent an opinion in the review sentence?"
# A review sentence might express opinions on multiple aspects. A key point matches a review sentence they implicitly discusses issues relating to each other, might not have to absolutely support each other.

# The options are:
# - Not At All
# - Somewhat Not Well
# - Somewhat Well
# - Very Well

# Remember to explain your answer in the output.

# Query: \"\"\"{query}\"\"\"
# Key Point: \"\"\"{key_point}\"\"\"
# Review Sentence: \"\"\"{review_sentence}\"\"\"
# """

In [75]:
# claims and their supporting grounds

In [76]:
# base_prompt = """In this task you are presented with a community question, claim A and claim B extracted from the social comments answering the question.
# You will be asked to answer the following question: "Does claim A match, i.e, express similar opinion on similar aspect, with claim B?"
# Claim A matches claim B if they explicitly discusses similar opinions on similar aspect with Claim B.

# The options are:
# - Not At All
# - Somewhat Not Well
# - Somewhat Well
# - Very Well

# Remember to explain your answer in the output.

# Query: \"\"\"{query}\"\"\"
# Claim A: \"\"\"{key_point}\"\"\"
# Claim B: \"\"\"{review_sentence}\"\"\"
# """

## V0

In [69]:
base_prompt = """In this task you are presented with a community question, lists of comments to support a claim and its ground.
The question asks the opinions of users, and can be generally answered by the list of claim's comments. The 'claim' can then be further justified by ground's comments.

You were tasked to generate the general opinion implied by comments under 'claim' to answer the question, and also generate every ground supported by each list of comments.

Perform the following actions to solve this task:
+ Generate a concise key point that captures salient opinions from the list of claim's comments --> 'Key Point Claim'
+ For every list of ground's comments, generate a concise key point that captures salient opinions across the comment that support the 'Key Point Claim' --> 'Key Point Ground'

Note that a key point must have less than 10 tokens, and must have a sentiment

The output must be a JSON object following the below template:
{'claim': 'Key Point Claim', 'ground': ['Key Point Ground 1', 'Key Point Ground 2']}

Question: %s
Claim's comments: %s"""

In [143]:
ground_input_template = "Ground %s's comments: %s"

In [144]:
row = final_df.iloc[2]
# row = final_df.iloc[3]
# row = final_df.iloc[9]
# mask = final_df['ground'].str.len() > 1
# # row = final_df[mask].iloc[0]
# # row = final_df[mask].iloc[1]
# # row = final_df[mask].iloc[2]
# # row = final_df[mask].iloc[3]
# # row = final_df[mask].iloc[4]
# # row = final_df[mask].iloc[5]
# row = final_df[mask].iloc[6]

print(row['id'])
print(row['query'])
print(len(row['ground']))
if len(row['ground']) == 0:
    ground_input = "Ground's comments: []"
else:
    ground_input = "\n".join([ground_input_template % (i, ground) for i, ground in enumerate(row['ground'])])
prompt = base_prompt %(row['query'], row['claim']) + ground_input

1
Hand from tonight.   Playing at my local pub league. Villian is loose aggressive. Sorry if this is bad formatting just quickly whipped it up for your opinions.    Button: (~30000)   SB: Villian: (50000)   BB: Hero: (40000)    Blinds 3000 6000   Hero is dealt Qs 9s.    Pre flop: Villian calls 6000.   Hero bets 16000. Button folds. Villian calls 16000   Flop: As 9c 10s (32000)   Villain bets 20000   Hero ???
1


In [330]:
print(prompt)

In this task you are presented with a community question, lists of comments to support a claim and its ground.
The question asks the opinions of users, and can be generally answered by the list of claim's comments. The 'claim' can then be further justified by ground's comments.

You were tasked to generate the general opinion implied by comments under 'claim' to answer the question, and also generate every ground supported by each list of comments.

Perform the following actions to solve this task:
+ Generate a concise key point that captures salient opinions from the list of claim's comments --> 'Key Point Claim'
+ For every list of ground's comments, generate a concise key point that captures salient opinions across the comment that support the 'Key Point Claim' --> 'Key Point Ground'

Note that a key point must have less than 10 tokens, and must have a sentiment

The output must be a JSON object following the below template:
{'claim': 'Key Point Claim', 'ground': ['Key Point Ground 

In [332]:
response = get_completion(prompt, model)

In [333]:
print(response)

{
  "claim": "Raising preflop is a mistake",
  "ground": [
    "Raising pot-commits you",
    "Folding after raise is impossible"
  ]
}


## V1

In [77]:
# base_prompt = """In this task you are presented with a community question, a JSON object containing lists of comments to support a claim and its ground.
# The question asks the opinions of users, and can be generally answered by the list of comments under 'claim'. The 'claim' can then be further justified by lists (clusters) of comments under 'ground'

# You were tasked to generate the general opinion implied by comments under 'claim' to answer the question, and also generate every ground supported by each list of comments under 'ground'

# Perform the following actions to solve this task:
# + Generate a concise key point that captures salient opinions from the list of comments under 'claim' --> 'Key Point Claim'
# + For every list of comments under 'ground', generate a concise key point that captures salient opinions across the comment that support the 'Key Point Claim' --> 'Key Point Ground'

# Note that a key point must have less than 10 tokens, and must have a sentiment

# The output must be a JSON object following the below template:
# {'claim': 'Key Point Claim', 'ground': ['Key Point Ground 1', 'Key Point Ground 2']}

# Question: %s
# Claim-Ground JSON input: %s

# """

In [78]:
# base_prompt = """In this task you are presented with a community question, a JSON object containing lists of comments to support a claim and its ground.
# The question asks the opinions of users, and can be generally answered by the list of comments under 'claim'. The 'claim' can then be further justified by lists (clusters) of comments under 'ground'

# You were tasked to generate the general opinion implied by comments under 'claim' to answer the question, and also generate every ground supported by each list of comments under 'ground'

# Perform the following actions to solve this task:
# + Generate a concise key point that captures salient opinions from the list of comments under 'claim' --> 'Key Point Claim'
# + For every list of comments under 'ground', generate a concise key point that captures salient opinions across the comment that support the 'Key Point Claim' --> 'Key Point Ground'

# Note that a key point should be general (less than 10 tokens) and must have a sentiment

# The output must be a JSON object following the below template:
# {'claim': 'Key Point Claim', 'ground': ['Key Point Ground 1', 'Key Point Ground 2']}

# Question: %s
# Claim-Ground JSON input: %s

# """

In [79]:
base_prompt = """In this task you are presented with a community question, a JSON object containing lists of comments to support a claim and its ground.
The question asks the opinions of users, and can be generally answered by the list of comments under 'claim'. The 'claim' can then be further justified by lists (clusters) of comments under 'ground'

You were tasked to generate the general opinion implied by comments under 'claim' to answer the question, and also generate every ground supported by each list of comments under 'ground'

Perform the following actions to solve this task:
+ Generate a concise key point that captures salient opinions from the list of comments under 'claim' --> 'Key Point Claim'
+ For every list of comments under 'ground', generate a concise key point that captures salient opinions across the comment that support the 'Key Point Claim' --> 'Key Point Ground'

Note that the claim should be general (less than 10 tokens) and the ground should be specific and cover details as much as possible, and must have a sentiment

The output must be a JSON object following the below template:
{'claim': 'Key Point Claim', 'ground': ['Key Point Ground 1', 'Key Point Ground 2']}

Question: %s
Claim-Ground JSON input: %s

"""

In [80]:
# base_prompt = """In this task you are presented with a community question, a JSON object containing lists of comments to support a claim and its ground.
# The question asks the opinions of users, and can be generally answered by the list of comments under 'claim'. The 'claim' can then be further justified by lists (clusters) of comments under 'ground'

# You were tasked to generate the general opinion implied by comments under 'claim' to answer the question, and also generate every ground supported by each list of comments under 'ground'

# Perform the following actions to solve this task:
# + Generate a concise key point that captures salient opinions from the list of comments under 'claim' --> 'Key Point Claim'
# + For every list of comments under 'ground', generate a concise key point that captures salient opinions across the comment that support the 'Key Point Claim' --> 'Key Point Ground'

# Note that the claim and ground must have less than 10 tokens. The claim should be general and the ground should be specific and cover details as much as possible, and the ground must strongly support the claim.

# The output must be a JSON object following the below template:
# {'claim': 'Key Point Claim', 'ground': ['Key Point Ground 1', 'Key Point Ground 2']}

# Question: %s
# Claim-Ground JSON input: %s

# """

In [81]:
# base_prompt = """In this task you are presented with a community question, a JSON object containing lists of comments to support a claim and its ground.
# The question asks the opinions of users, and can be generally answered by the list of comments under 'claim'. The 'claim' can then be further justified by lists (clusters) of comments under 'ground'

# You were tasked to generate the general opinion implied by comments under 'claim' to answer the question, and also generate every ground supported by each list of comments under 'ground'

# Perform the following actions to solve this task:
# + Generate a concise key point that captures salient opinions from the list of comments under 'claim' --> 'Key Point Claim'
# + For every list of comments under 'ground', generate a concise key point that captures salient opinions across the comment that support the 'Key Point Claim' --> 'Key Point Ground'

# Note that a key point must have a sentiment

# The output must be a JSON object following the below template:
# {'claim': 'Key Point Claim', 'ground': ['Key Point Ground 1', 'Key Point Ground 2']}

# Question: %s
# Claim-Ground JSON input: %s

# """

In [82]:
print(base_prompt)

In this task you are presented with a community question, a JSON object containing lists of comments to support a claim and its ground.
The question asks the opinions of users, and can be generally answered by the list of comments under 'claim'. The 'claim' can then be further justified by lists (clusters) of comments under 'ground'

You were tasked to generate the general opinion implied by comments under 'claim' to answer the question, and also generate every ground supported by each list of comments under 'ground'

Perform the following actions to solve this task:
+ Generate a concise key point that captures salient opinions from the list of comments under 'claim' --> 'Key Point Claim'
+ For every list of comments under 'ground', generate a concise key point that captures salient opinions across the comment that support the 'Key Point Claim' --> 'Key Point Ground'

Note that the claim should be general (less than 10 tokens) and the ground should be specific and cover details as much

In [83]:
# row = final_df.iloc[0]
# row = final_df.iloc[1]
# row = final_df.iloc[10]
# row = final_df.iloc[15]
# row = final_df.iloc[2]
# row = final_df.iloc[20]
# row = final_df.iloc[10]
# row = final_df.iloc[15]
# row = final_df.iloc[15]

# row = final_df.iloc[25]
# row = final_df.iloc[26]
row = final_df.iloc[27]
# row = final_df.iloc[28]
# row = final_df.iloc[29]
# row = final_df.iloc[30]
# row = final_df.iloc[36]
# row = final_df.iloc[43]

# row = final_df.iloc[6]
# row = final_df.iloc[7]
# row = final_df.iloc[24]
# row = final_df.iloc[14]
# row = final_df.iloc[17]
###########
# mask = final_df['id'] == 45
# # row = final_df[mask].iloc[0]
# # row = final_df[mask].iloc[1]
# row = final_df[mask].iloc[2]
# # row = final_df[mask].iloc[3]
# # row = final_df[mask].iloc[4]
# # row = final_df[mask].iloc[5]
# # row = final_df[mask].iloc[6]
# # row = final_df[mask].iloc[9]

print(row['id'])
print(row['query'])
print(row['claim_cluster'])
print(len(row['ground_clusters']))
print(row['arg_input'])
prompt = base_prompt %(row['query'], row['arg_input'])

3
Bike racks - what to look for?   I'm in the market for a bike rack. I've been looking at the Saris Bones 2 and 3. The only difference I can tell between the them is that the 3 series has an extra arm for stability. Is the Bones 2 not to be trusted?      Any other brands/models I should be considering?
['If you cannot attach bikes to a roof rack or a hitch, use the bones', 'I would recommend getting a hitch installed', 'If you can attach bikes to a roof rack or a hitch, do that', "If you don't have a hitch, and have no interest of adding one, go with the Bones"]
1
{'claim': ['If you cannot attach bikes to a roof rack or a hitch, use the bones', 'I would recommend getting a hitch installed', 'If you can attach bikes to a roof rack or a hitch, do that', "If you don't have a hitch, and have no interest of adding one, go with the Bones"], 'ground': [['A hitch is not a bad investment', "even saris says 'if you can attach bikes to a roof rack or a hitch, do that'", 'Most of the time it can 

In [84]:
response = get_completion(prompt, model)

In [85]:
print(response)

{
  "claim": "Hitch or roof racks are preferred",
  "ground": [
    "Hitches are affordable, sturdy, and versatile, making them a worthwhile investment for better bike rack options."
  ]
}


In [134]:
print(response)

{
  "claim": "Hitch or roof racks are preferred",
  "ground": [
    "Hitches are affordable, sturdy, and offer future utility"
  ]
}


In [95]:
print(response)

{
  "claim": "Consider Thelma, Thule 916/917, or Yakima HoldUp racks",
  "ground": [
    "These racks prevent paint and frame damage by avoiding direct contact with the bike frame, which is especially important for carbon bikes where over-tightening clamps can cause cracks"
  ]
}


In [87]:
print(response)

{
  "claim": "Saris Bones 3 is highly recommended",
  "ground": [
    "Saris provides exceptional customer service with free replacement parts and support, reflecting reliability and user satisfaction (positive)",
    "More permanent and secure bike racks with integrated locks, like Saris Bones 3 and Thule models, offer better protection though no lock is completely theft-proof (cautiously positive)",
    "Saris Bones 2 offers less security, as evidenced by a reported bike theft, indicating it may not be as trustworthy for bike protection (negative)"
  ]
}


In [84]:
print(response)

{
  "claim": "Saris Bones 3 preferred for stability and reliability",
  "ground": [
    "Saris offers excellent customer service and support",
    "Permanent/secure racks with locks provide better bike protection",
    "Saris Bones 2 has security vulnerabilities"
  ]
}


In [84]:
print(response)

{
  "claim": "Jungler farming strategy is valid",
  "ground": [
    "The strategy involves risk since it depends on the team avoiding deaths to ganks",
    "Failing multiple ganks puts the jungler significantly behind in farm",
    "Using teleport allows counterganking, making a well-farmed jungler more impactful in teamfights",
    "Overall, it is a high risk, high reward approach"
  ]
}


### Run

In [86]:
def get_claim_ground_kp_completion(query, arg_input):
    prompt = base_prompt %(query, arg_input)

    retries = 5
    while retries > 0:
        try:
            response = get_completion(prompt, model)
            return response
        except Exception as e:
            if e:
                if "exceeded your current quota" in str(e).lower():
                    raise e
                print(e)
                print('Timeout error, retrying...')
                retries -= 1
                if "limit reached for" in str(e).lower():
                    time.sleep(30)
                else:
                    time.sleep(5)
            else:
                raise e

    print('API is not responding, moving on...')
    return None

In [87]:
def prompted_comment_kp_annotation(root_path, domain, domain_df, save_step=100):
    src_path = f"{root_path}/{domain}"
    Path(src_path).mkdir(parents=True, exist_ok=True)
    claim_ground_kps = []

    file_names = listdir(src_path)
    postfix = [re.split("[_.]", name)[1]
               for name in listdir(src_path)
               ]
    start = 0
    if 'done' in postfix:
        print(domain, ": ", "Loaded saved file. Done")
        new_domain_df = pd.read_pickle(f"{src_path}/{domain}_done.pkl")
        return new_domain_df
    elif len(postfix) > 0:
        last_index = max([int(idx) for idx in postfix if idx != 'done'])
        last_domain_df = pd.read_pickle(f"{src_path}/{domain}_{last_index}.pkl")
        claim_ground_kps = last_domain_df['claim_ground_kps'].tolist()
        start = last_index
        print(domain, "Loaded saved file. Continuing")
    else:
        print(domain, "Start new process.")

    for i, (_, row) in tqdm(enumerate(domain_df.iterrows()), total=domain_df.shape[0]):
        if i < start:
            continue

        query = row['query']
        arg_input = row['arg_input']
        
        claim_ground_kp = get_claim_ground_kp_completion(query, arg_input)
        claim_ground_kps += [claim_ground_kp]
        
        time.sleep(0.1)
        
        if (i + 1) % save_step == 0:
            save_df = domain_df.iloc[:i + 1]
            save_df.insert(0, 'claim_ground_kps', claim_ground_kps)
            save_df.to_pickle(f"{src_path}/{domain}_{i + 1}.pkl")

    new_domain_df = domain_df.iloc[:i + 1]
    new_domain_df.insert(0, 'claim_ground_kps', claim_ground_kps)
    new_domain_df.to_pickle(f"{src_path}/{domain}_done.pkl")
    return new_domain_df

In [88]:
final_df['my_category'] = 1

In [90]:
# root_path = f"/mnt/e/Desktop/PHD READING/Quantitative_Fact_Check/claim_ground_kp_generation_2_refined_framework_new_llm_based_3_3.5_0.1_3.0/gpt_4.1_mini/few_shot_test_set"
# root_path = f"/mnt/e/Desktop/PHD READING/Quantitative_Fact_Check/claim_ground_kp_generation_2_refined_framework_new_llm_based_3_3.5_0.1_3.0_better_prompt/gpt_4.1_mini/few_shot_test_set"
# root_path = f"/mnt/e/Desktop/PHD READING/Quantitative_Fact_Check/claim_ground_kp_generation_2_refined_framework_new_llm_based_3_3.5_0.1_3.0_long_form/gpt_4.1_mini/few_shot_test_set"
root_path = f"/mnt/e/Desktop/PHD READING/Quantitative_Fact_Check/claim_ground_kp_generation_2_refined_framework_new_llm_based_3_3.5_0.1_3.0_long_form/gpt_4.1/few_shot_test_set"

# root_path = f"/mnt/e/Desktop/PHD READING/Quantitative_Fact_Check/claim_ground_kp_generation_2_refined_framework_new_llm_based_3_3.5_0.1_2.5/gpt_4.1_mini/few_shot_test_set"
# root_path = f"/mnt/e/Desktop/PHD READING/Quantitative_Fact_Check/claim_ground_kp_generation_2_refined_framework_new_llm_based_3_3.5_0.1_2.5_long_form/gpt_4.1_mini/few_shot_test_set"
# root_path = f"/mnt/e/Desktop/PHD READING/Quantitative_Fact_Check/claim_ground_kp_generation_2_refined_framework_new_llm_based_3_3.5_0.1_2_long_form/gpt_4.1_mini/few_shot_test_set"

# root_path = f"/mnt/e/Desktop/PHD READING/Quantitative_Fact_Check/claim_ground_kp_generation_2_refined_framework_new_llm_based_3_3_0.1_3.0/gpt_4.1_mini/few_shot_test_set"
# root_path = f"/mnt/e/Desktop/PHD READING/Quantitative_Fact_Check/claim_ground_kp_generation_2_refined_framework_new_llm_based_3_3_0.1_3.0_long_form/gpt_4.1_mini/few_shot_test_set"
inputs = [(root_path,
           domain,
           final_df[final_df['my_category'] == domain].reset_index(drop=True)
           )
          for domain in final_df['my_category'].unique()]

In [91]:
num_workers = 1

In [92]:
from datasets import concatenate_datasets, load_dataset
from datasets import Dataset, DatasetDict
import pandas as pd
import numpy as np
import torch
import os
import ast
import time
from multiprocessing import Pool
from pathlib import Path
from os import listdir
from tqdm import tqdm
import random
import re
import math
# import spacy
# pd.set_option('display.max_colwidth', None)

In [93]:
start_time = time.time()
with Pool(num_workers) as processor:
    data = processor.starmap(prompted_comment_kp_annotation, inputs)
print("TIME ELAPSED", time.time() - start_time)

1 Start new process.


100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 4837/4837 [1:14:13<00:00,  1.09it/s]


TIME ELAPSED 4453.692061662674


### Read

In [102]:
processed_final_df = pd.read_pickle(root_path + "/1/1_done.pkl")
# processed_final_df = pd.read_pickle(root_path + "/1/1_1800.pkl")
# processed_final_df = pd.read_pickle(root_path + "/1/1_200.pkl")
processed_final_df = processed_final_df.dropna(subset=['claim_ground_kps'])
# processed_final_df = processed_final_df.drop(columns=['s1_clusters', 'ground'])
processed_final_df = processed_final_df.drop(columns=['ground'])
processed_final_df = processed_final_df.rename(columns={'claim_cluster': 'claim', 'ground_clusters': 'ground'})
processed_final_df

Unnamed: 0,claim_ground_kps,subset,my_category,id,query,cluster_id,claim,ground,arg_input
1,"{'claim': 'Draw Reshiram', 'ground': ['Drawing...",reddit,1,0,What should I draw next? A FB user said they...,1,[I would love to see Reshiram],[[If you have the time]],"{'claim': ['I would love to see Reshiram'], 'g..."
2,"{'claim': 'Draw Pancham next', 'ground': ['No ...",reddit,1,0,What should I draw next? A FB user said they...,2,[Pancham],[[]],"{'claim': ['Pancham'], 'ground': [[]]}"
3,"{'claim': 'Seel is a good choice', 'ground': [...",reddit,1,0,What should I draw next? A FB user said they...,3,[Seel?],[[]],"{'claim': ['Seel?'], 'ground': [[]]}"
4,"{\n ""claim"": ""Girafarig is the best choice"",\...",reddit,1,0,What should I draw next? A FB user said they...,4,[You should girafarig],[[girafarig is the best pokemon ever created n...,"{'claim': ['You should girafarig'], 'ground': ..."
5,"{'claim': 'Undecided on next drawing', 'ground...",reddit,1,0,What should I draw next? A FB user said they...,5,[I may wait on that one],[[I admire your steadfast courage in how aweso...,"{'claim': ['I may wait on that one'], 'ground'..."
...,...,...,...,...,...,...,...,...,...
4832,"{\n ""claim"": ""JTAG debugger worked before sof...",stack,1,249,"""Could not find Cortex-M device in the JTAG ch...",7,[my JTAG debugger was functioning well on both...,[[]],{'claim': ['my JTAG debugger was functioning w...
4833,"{\n ""claim"": ""Splatapus's solution fixed the ...",stack,1,249,"""Could not find Cortex-M device in the JTAG ch...",8,[the solution proposed by splatapus on the TI ...,[[it was a Windows registry issue for me and f...,{'claim': ['the solution proposed by splatapus...
4834,"{\n ""claim"": ""Basic LED blink confirms device...",stack,1,249,"""Could not find Cortex-M device in the JTAG ch...",9,[A basic LED blink program can be used for tes...,"[[The device works after rebooting the PC, cle...",{'claim': ['A basic LED blink program can be u...
4835,{'claim': 'Check probe settings to fix JTAG is...,stack,1,249,"""Could not find Cortex-M device in the JTAG ch...",10,[Check the probe settings],[[]],"{'claim': ['Check the probe settings'], 'groun..."


In [103]:
processed_final_df['claim_ground_kps'] = processed_final_df['claim_ground_kps'].apply(lambda x: x.strip("`").strip("json"))

In [104]:
processed_final_df['claim_ground_kps'] = processed_final_df['claim_ground_kps'].apply(lambda x: ast.literal_eval(x))

In [105]:
processed_final_df.iloc[0]['claim_ground_kps']

{'claim': 'Draw Reshiram', 'ground': ['Drawing depends on available time']}

In [106]:
mask = processed_final_df['claim_ground_kps'].apply(lambda x: 'claim' not in x)
processed_final_df[mask]

Unnamed: 0,claim_ground_kps,subset,my_category,id,query,cluster_id,claim,ground,arg_input


In [107]:
mask = processed_final_df['ground'].str.len() > 1
with_ground_id = processed_final_df[mask]['id'].unique().tolist()

In [108]:
# test_id = with_ground_id[5]
test_id = with_ground_id[6]
mask = processed_final_df['id'] == test_id
processed_final_df[mask]

Unnamed: 0,claim_ground_kps,subset,my_category,id,query,cluster_id,claim,ground,arg_input
88,"{'claim': 'Logo design is highly praised', 'gr...",reddit,1,8,"After long time coming, I finally get to see m...",0,[I am in love with the 'Z' logo you did for Za...,[[the 'Z' logo you did for Zachary Domes is se...,{'claim': ['I am in love with the 'Z' logo you...
89,{'claim': 'Underlines and centered text reduce...,reddit,1,8,"After long time coming, I finally get to see m...",1,[If they're underlined I may as well make them...,[[It doesn't look sharp or consistent with the...,{'claim': ['If they're underlined I may as wel...
90,{'claim': 'Mixed opinions on service icons siz...,reddit,1,8,"After long time coming, I finally get to see m...",2,"[I like the icons for your services, The icons...",[[It's making you want them to be noticed too ...,{'claim': ['I like the icons for your services...
91,{'claim': 'Portfolio interaction needs improve...,reddit,1,8,"After long time coming, I finally get to see m...",3,[The squares you have below the reel should be...,[[You want people to know they can click those...,{'claim': ['The squares you have below the ree...
92,{'claim': 'Simple underline highlights keyword...,reddit,1,8,"After long time coming, I finally get to see m...",4,[The underline was just the most simple way to...,[[they're all services I provide]],{'claim': ['The underline was just the most si...
93,{'claim': 'Portfolio site looks great and prom...,reddit,1,8,"After long time coming, I finally get to see m...",5,[I am not at a stage where I will change anyth...,[[Apercu and LL Circular are both really great...,{'claim': ['I am not at a stage where I will c...
94,"{'claim': 'Portfolio site is exciting', 'groun...",reddit,1,8,"After long time coming, I finally get to see m...",6,[its like a window],[[]],"{'claim': ['its like a window'], 'ground': [[]]}"
95,{'claim': 'Photo-retouching display is impress...,reddit,1,8,"After long time coming, I finally get to see m...",7,[the way you display the photo-retouching is a...,[[]],{'claim': ['the way you display the photo-reto...
96,{'claim': 'Image change on hover is redundant'...,reddit,1,8,"After long time coming, I finally get to see m...",8,[By the time someone has scrolled far enough d...,[[they wouldn't be looking at the slider anymo...,{'claim': ['By the time someone has scrolled f...
97,"{'claim': 'Links redirect incorrectly', 'groun...",reddit,1,8,"After long time coming, I finally get to see m...",9,[Your Behance and LinkedIN links redirect to F...,[[]],{'claim': ['Your Behance and LinkedIN links re...


In [91]:
with open("./output/two_fold_cluster_arg_quant_summ_refined_pipeline_output.txt", "w") as f:
    for subset in processed_final_df['subset'].unique():
        mask = processed_final_df['subset'] == subset
        subset_df = processed_final_df[mask]
        ids = subset_df['id'].unique()
        for question_id in ids:
            question_df = subset_df[subset_df['id'] == question_id]
            
            f.write("SUBSET: %s\n" %(subset))
            f.write("QID: %s\n" %(question_id))
            f.write("QUESTION: %s\n" %(question_df['query'].iloc[0]))
            f.write("SUMMARY:\n")

            for i, row in question_df.iterrows():
                c = row['claim_ground_kps']['claim']

                f.write("* %s (Prevalence: %s)\n" %(c,len(row['claim'])))

                if len(row['ground']) == 1:
                    for g in row['claim_ground_kps']['ground']:
                        f.write("\t+ %s (Prevalence: %s)\n" %(g, len(row['ground'][0])))
                else:
                    if 'ground' in row['claim_ground_kps']:
                        for g, gp in zip(row['claim_ground_kps']['ground'], row['ground']):
                            f.write("\t+ %s (Prevalence: %s)\n" %(g, len(gp)))

            f.write("\n##########################################################################\n\n")

In [None]:
# New rider here, Looking for opinions on first bike   Hello ever

In [236]:
mask = processed_final_df['id'] == 45
processed_final_df[mask]
inspect_df = processed_final_df[mask]

print("QUESTION: ", inspect_df['query'].iloc[0])
print("########################")
print("PIPELINE OUTPUT:")
for i, row in inspect_df.iterrows():
    c = row['claim_ground_kps']['claim']
    print("+", c, "(Prevalence: %s)" %(len(row['claim'])))
#     for g in row['claim_ground_kps']['ground']:
#         print("\t%s"%(g))
    if len(row['ground']) == 1:
        for g in row['claim_ground_kps']['ground']:
            print("\t%s (Prevalence: %s)" %(g, len(row['ground'][0])))
    else:
        for g, gp in zip(row['claim_ground_kps']['ground'], row['ground']):
            print("\t%s (Prevalence: %s)" %(g, len(gp)))

QUESTION:  TV purchase help, please? Looking to spend ~700. 50" LG Plasma vs. 60" Mitsubishi DLP   From Sears, I see [LG 50'' Class Full HD 1080p Plasma Screen TV](http://www.sears.com/shc/s/p_10153_12605_05775850000P?prdNo=10&blockNo=10&blockType=L10#reviewsWrap) for $699 and [Mitsubishi Electronics WD-60638 3D-Ready 60 ](http://www.sears.com/shc/s/p_10153_12605_05715201000P?prdNo=3&blockNo=3&blockType=G3) for $649 (tomorrow). Apologies if the answer seems very obvious, but I'm a little out of my element.      Any other suggestions/recommendations are much appreciated. Also, any comments about the DLP being 3D-ready?      ---------------      EDIT: Thanks for all the responses! For the record, my friend went DLP (I was asking on his behalf). Some of the more critical responses came after he had placed the order. In the future for my own sake, I'll reference this thread and reconsider.      If anyone is interested, I will update this thread w/ his (my experience w/ the DLP set). I'm su

In [225]:
mask = processed_final_df['id'] == 3
processed_final_df[mask]
inspect_df = processed_final_df[mask]

print("QUESTION: ", inspect_df['query'].iloc[0])
print("########################")
print("PIPELINE OUTPUT:")
for i, row in inspect_df.iterrows():
    c = row['claim_ground_kps']['claim']
    print(c, "(Prevalence: %s)" %(len(row['claim'])))
#     for g in row['claim_ground_kps']['ground']:
#         print("\t%s"%(g))
    if len(row['ground']) == 1:
        for g in row['claim_ground_kps']['ground']:
            print("\t%s (Prevalence: %s)" %(g, len(row['ground'][0])))
    else:
        for g, gp in zip(row['claim_ground_kps']['ground'], row['ground']):
            print("\t%s (Prevalence: %s)" %(g, len(gp)))

QUESTION:  Which Windows Should I get?   I currently have Windows 7 32-bit and I'm thinking of upgrading to 64-bit so I can get some more RAM. Should I stay with Windows 7 or should I get 8 or 10? The reason why I want more RAM is so that WoW wont lag as much.
########################
PIPELINE OUTPUT:
Upgrading graphics card or SSD improves gaming (Prevalence: 5)
	Upgrading to an SSD boosts game performance positively (Prevalence: 3)
	Upgrading the graphics card significantly enhances gaming experience (Prevalence: 3)
Stay with Windows 7 or upgrade to 8.1 (Prevalence: 5)
	Windows 8.1 is a decent OS and will transition to Windows 10 for free, which is not yet released (Prevalence: 5)
Windows 10 upgrade is free (Prevalence: 3)
	Upgrading to Windows 10 will be free if done within the first year (Prevalence: 2)
	The free upgrade offer is a key benefit of moving to Windows 10 (Prevalence: 2)
Upgrade to 64-bit Windows for better performance (Prevalence: 1)
	64-bit Windows supports more RAM, 

In [227]:
mask = processed_final_df['id'] == 1
processed_final_df[mask]
inspect_df = processed_final_df[mask]

print("QUESTION: ", inspect_df['query'].iloc[0])
print("########################")
print("PIPELINE OUTPUT:")
for i, row in inspect_df.iterrows():
    c = row['claim_ground_kps']['claim']
    print(c, "(Prevalence: %s)" %(len(row['claim'])))
#     for g in row['claim_ground_kps']['ground']:
#         print("\t%s"%(g))
    if len(row['ground']) == 1:
        for g in row['claim_ground_kps']['ground']:
            print("\t%s (Prevalence: %s)" %(g, len(row['ground'][0])))
    else:
        for g, gp in zip(row['claim_ground_kps']['ground'], row['ground']):
            print("\t%s (Prevalence: %s)" %(g, len(gp)))

QUESTION:  Hand from tonight.   Playing at my local pub league. Villian is loose aggressive. Sorry if this is bad formatting just quickly whipped it up for your opinions.    Button: (~30000)   SB: Villian: (50000)   BB: Hero: (40000)    Blinds 3000 6000   Hero is dealt Qs 9s.    Pre flop: Villian calls 6000.   Hero bets 16000. Button folds. Villian calls 16000   Flop: As 9c 10s (32000)   Villain bets 20000   Hero ???
########################
PIPELINE OUTPUT:
Shove preflop with short stack (Prevalence: 8)
	With less than 10 big blinds, holding a pair and flush draw is a strong spot to shove preflop, as you have many outs and are often a statistical favorite (Prevalence: 4)
	A shove may not gain fold equity against hands that beat you, since opponents with short stacks tend to call rather than fold, especially after limping and calling preflop (Prevalence: 3)
	By raising 16000 preflop, you are pot committed postflop with a pair and flush draw, meaning you cannot easily fold and must cont

In [222]:
mask = processed_final_df['id'] == 45
processed_final_df[mask]
inspect_df = processed_final_df[mask]

print("QUESTION: ", inspect_df['query'].iloc[0])
print("########################")
print("PIPELINE OUTPUT:")
for i, row in inspect_df.iterrows():
    c = row['claim_ground_kps']['claim']
    print(c, "(Prevalence: %s)" %(len(row['claim'])))
#     for g in row['claim_ground_kps']['ground']:
#         print("\t%s"%(g))
    if len(row['ground']) == 1:
        for g in row['claim_ground_kps']['ground']:
            print("\t%s (Prevalence: %s)" %(g, len(row['ground'][0])))
    else:
        for g, gp in zip(row['claim_ground_kps']['ground'], row['ground']):
            print("\t%s (Prevalence: %s)" %(g, len(gp)))

IndexError: single positional indexer is out-of-bounds

In [223]:
# test_id = with_ground_id[11]
test_id = with_ground_id[14]
mask = processed_final_df['id'] == test_id
processed_final_df[mask]
inspect_df = processed_final_df[mask]

print("QUESTION: ", inspect_df['query'].iloc[0])
print("########################")
print("PIPELINE OUTPUT:")
for i, row in inspect_df.iterrows():
    c = row['claim_ground_kps']['claim']
    print(c, "(Prevalence: %s)" %(len(row['claim'])))
#     for g in row['claim_ground_kps']['ground']:
#         print("\t%s"%(g))
    if len(row['ground']) == 1:
        for g in row['claim_ground_kps']['ground']:
            print("\t%s (Prevalence: %s)" %(g, len(row['ground'][0])))
    else:
        for g, gp in zip(row['claim_ground_kps']['ground'], row['ground']):
            print("\t%s (Prevalence: %s)" %(g, len(gp)))

QUESTION:  The Warrior build   I'm a level 15 and right now it costs about 1445 per point to upgrade my stats. I have Attunement=8, Faith=9, Intelligence=10 but as expected with the Warrior class, my Str, Dex, Endurance, and resistance are pretty nice at 15, 15, 14 and 13.       My question is: Is it worth upgrading Attunement, Faith and Intelligence at this stage or can it wait until I actually have some magic to use those things?
########################
PIPELINE OUTPUT:
Delay magic stats upgrade positively (Prevalence: 9)
	Plan to raise magic stats later (Prevalence: 3)
	No immediate need to upgrade magic stats (Prevalence: 4)
Prioritize Vitality and Endurance early (Prevalence: 4)
	High Vitality and Endurance boost survivability (Prevalence: 2)
	More Vitality and Endurance improve damage absorption (Prevalence: 2)
Upgrade cost increases steadily (Prevalence: 4)
	Current upgrade costs are high (Prevalence: 5)
	Upgrade costs rise with each level (Prevalence: 2)
Use 'ourself' for sing

In [230]:
stack_id = processed_final_df[processed_final_df['subset'] == 'stack']['id'].unique()
mask = processed_final_df['subset'] == 'stack'
mask &= processed_final_df['id'] == stack_id[2]
inspect_df = processed_final_df[mask]

print("QUESTION: ", inspect_df['query'].iloc[0])
print("########################")
print("PIPELINE OUTPUT:")
for i, row in inspect_df.iterrows():
    c = row['claim_ground_kps']['claim']
    print(c, "(Prevalence: %s)" %(len(row['claim'])))
#     for g in row['claim_ground_kps']['ground']:
#         print("\t%s"%(g))
    if len(row['ground']) == 1:
        for g in row['claim_ground_kps']['ground']:
            print("\t%s (Prevalence: %s)" %(g, len(row['ground'][0])))
    else:
        for g, gp in zip(row['claim_ground_kps']['ground'], row['ground']):
            print("\t%s (Prevalence: %s)" %(g, len(gp)))

QUESTION:  Should the Adamantium bullets have worked on Wolverine?
########################
PIPELINE OUTPUT:
Adamantium bonds with bones positively (Prevalence: 2)
	Adamantium heals with bones (Prevalence: 4)
	Beta adamantium less strong (Prevalence: 4)
	Healing factor bonds metal and bone (Prevalence: 4)
	Adamantium coverage incomplete (Prevalence: 4)
Adamantium bullets shouldn't penetrate Wolverine (Prevalence: 6)
	Bullets designed to penetrate but usually bounce off adamantium (Prevalence: 5)
	Adamantium coating prevents damage from adamantium claws (Prevalence: 3)
	Wolverine's healing negates bullet damage (Prevalence: 4)
Adamantium bullets shouldn't work (negative) (Prevalence: 9)
	Gun mechanics make bullet use implausible (negative) (Prevalence: 2)
	Bullet force insufficient to penetrate (negative) (Prevalence: 4)
	Adamantium is indestructible, bullets bounce off (negative) (Prevalence: 4)
	Wolverine's healing negates bullet effects (negative) (Prevalence: 4)


In [236]:
stack_id = processed_final_df[processed_final_df['subset'] == 'stack']['id'].unique()
mask = processed_final_df['subset'] == 'stack'
mask &= processed_final_df['id'] == stack_id[8]
inspect_df = processed_final_df[mask]

print("QUESTION: ", inspect_df['query'].iloc[0])
print("########################")
print("PIPELINE OUTPUT:")
for i, row in inspect_df.iterrows():
    c = row['claim_ground_kps']['claim']
    print(c, "(Prevalence: %s)" %(len(row['claim'])))
#     for g in row['claim_ground_kps']['ground']:
#         print("\t%s"%(g))
    if len(row['ground']) == 1:
        for g in row['claim_ground_kps']['ground']:
            print("\t%s (Prevalence: %s)" %(g, len(row['ground'][0])))
    else:
        for g, gp in zip(row['claim_ground_kps']['ground'], row['ground']):
            print("\t%s (Prevalence: %s)" %(g, len(gp)))

QUESTION:  What to do with old tires?
########################
PIPELINE OUTPUT:
Old tires can be reused or recycled (Prevalence: 6)
	Old tires provide months of use (Prevalence: 2)
	Sharing tires via local forums helps reuse (Prevalence: 2)
Keep old tires for backup use (Prevalence: 8)
	Old tires useful for long trips (Prevalence: 4)
	Trainer use preserves new tires (Prevalence: 4)
Old tires useful for protection and padding (Prevalence: 8)
	Cutting beads transforms tires for reuse (Prevalence: 3)
Old tires protect bike chains (Prevalence: 2)
	No supporting comments (Prevalence: 2)
Old tires can be creatively reused (Prevalence: 6)
	No additional supporting grounds provided (Prevalence: 2)
Use tires based on coverage needs (Prevalence: 2)
	No supporting grounds provided (Prevalence: 2)
Old tires are useful waterproof covers (Prevalence: 2)
	No additional support provided (Prevalence: 2)


In [279]:
test_id = with_ground_id[18]
mask = processed_final_df['id'] == test_id
processed_final_df[mask]
inspect_df = processed_final_df[mask]

print("QUESTION: ", inspect_df['query'].iloc[0])
print("########################")
print("ANSWER:")
for i, row in inspect_df.iterrows():
    c = row['claim_ground_kps']['claim']
    print(c, "(Prevalence: %s)" %(len(row['claim'])))
#     for g in row['claim_ground_kps']['ground']:
#         print("\t%s"%(g))
    if len(row['ground']) == 1:
        for g in row['claim_ground_kps']['ground']:
            print("\t%s (Prevalence: %s)" %(g, len(row['ground'][0])))
    else:
        for g, gp in zip(row['claim_ground_kps']['ground'], row['ground']):
            print("\t%s (Prevalence: %s)" %(g, len(gp)))

QUESTION:  Rewatching Inca Mummy Girl when I had a thought.   Impada's family never wondered why he never returned from the exchange program?  Their son just left for America and then never returned.  Never to be seen or heard from again.  The people in Sunnydale are kind of used to people just disappearing, but he came from somewhere else where, presumably, people dont just disappear.
########################
ANSWER:
Impada's disappearance is unexplained and odd (Prevalence: 29)
	Multiple characters disappeared simultaneously (Prevalence: 2)
	Supernatural chaos affected Los Angeles (Prevalence: 2)
	Impada likely died unknown to family (Prevalence: 2)
Confusion about Impada name (Prevalence: 2)
Jared's verse features many odd couples (Prevalence: 3)


In [282]:
test_id = with_ground_id[21]
mask = processed_final_df['id'] == test_id
processed_final_df[mask]
inspect_df = processed_final_df[mask]

print("QUESTION: ", inspect_df['query'].iloc[0])
print("########################")
print("ANSWER:")
for i, row in inspect_df.iterrows():
    c = row['claim_ground_kps']['claim']
    print(c, "(Prevalence: %s)" %(len(row['claim'])))
#     for g in row['claim_ground_kps']['ground']:
#         print("\t%s"%(g))
    if len(row['ground']) == 1:
        for g in row['claim_ground_kps']['ground']:
            print("\t%s (Prevalence: %s)" %(g, len(row['ground'][0])))
    else:
        for g, gp in zip(row['claim_ground_kps']['ground'], row['ground']):
            print("\t%s (Prevalence: %s)" %(g, len(gp)))

QUESTION:  Will this battery be safe in an Eleaf Pico 75w?   https://www.amazon.co.uk/Genuine-Efest-18650-Batteries-Independently/dp/B00OQ10A3G/ref=sr_1_fkmr1_1?ie=UTF8&qid=1462138448&sr=8-1-fkmr1&keywords=efest+2100+35a They say 20A/35A, reading some odd things about efest online, I have this battery currently, waiting for the pico in the mail, will this be safe?      I'm in the UK so suggestions on where to get a good battery otherwise, would be welcome.
########################
ANSWER:
Efest batteries are risky (Prevalence: 31)
	Lower watt users may be safe (Prevalence: 2)
	Efest cells often misrepresented (Prevalence: 21)
Good UK battery sources exist (Prevalence: 2)
	BatteriesPlus UK has good selection (Prevalence: 2)
	Nkon.nl stocks quality batteries (Prevalence: 2)


In [286]:
test_id = with_ground_id[25]
mask = processed_final_df['id'] == test_id
processed_final_df[mask]
inspect_df = processed_final_df[mask]

print("QUESTION: ", inspect_df['query'].iloc[0])
print("########################")
print("ANSWER:")
for i, row in inspect_df.iterrows():
    c = row['claim_ground_kps']['claim']
    print(c, "(Prevalence: %s)" %(len(row['claim'])))
#     for g in row['claim_ground_kps']['ground']:
#         print("\t%s"%(g))
    if len(row['ground']) == 1:
        for g in row['claim_ground_kps']['ground']:
            print("\t%s (Prevalence: %s)" %(g, len(row['ground'][0])))
    else:
        for g, gp in zip(row['claim_ground_kps']['ground'], row['ground']):
            print("\t%s (Prevalence: %s)" %(g, len(gp)))

QUESTION:  Okay, seriously, how the fuck do I upgrade my Windows 7 OS to 64-bit instead of 32-bit?   I can't find a way from Google on it anywhere. I have a download key, but I can only find how to do it on disc, but I don't want to spend 90$ to get a disc. I may as well just pirate it but that's risky and I have no clue how to pirate an entire OS.      Halp
########################
ANSWER:
Clean install needed for 64-bit upgrade (Prevalence: 24)
	Keys may work for 32 or 64-bit (Prevalence: 17)
	OEM keys require manual activation (Prevalence: 2)
Upgrade to Windows 8.1 for better optimization (Prevalence: 2)


In [288]:
test_id = with_ground_id[0]
mask = processed_final_df['id'] == test_id
processed_final_df[mask]
inspect_df = processed_final_df[mask]

print("QUESTION: ", inspect_df['query'].iloc[0])
print("########################")
print("ANSWER:")
for i, row in inspect_df.iterrows():
    c = row['claim_ground_kps']['claim']
    print(c, "(Prevalence: %s)" %(len(row['claim'])))
#     for g in row['claim_ground_kps']['ground']:
#         print("\t%s"%(g))
    if len(row['ground']) == 1:
        for g in row['claim_ground_kps']['ground']:
            print("\t%s (Prevalence: %s)" %(g, len(row['ground'][0])))
    else:
        for g, gp in zip(row['claim_ground_kps']['ground'], row['ground']):
            print("\t%s (Prevalence: %s)" %(g, len(gp)))

QUESTION:  ELI5: Why don't we have (many) cars driven only by electric engines with fuel generators instead of big batteries/standard hybrid systems?   * it could still recuperate   * it could startstop fuel engine from time to time (it would power the wheels and charge the batteries, then stop and start again when batteries are drained) so the engine would be at almost constant RPM/load = more fuel economy   * car wouldn't need complex mechanical systems for distribution of mechanical power so it would be propably much lighter and simpler   * stability control, abs, esp, etc. would have direct access to wheel rotation speeds and could adjust them on the fly as needed      I consider fully electric 4WD drive with engine per wheel as the greatest way to drive a car, so I would expect that this is what we would be building, but for current state of batteries with power generator. Instead, we still have engine directly linked to wheels. I can imagine some power loss on electric generator 

In [273]:
test_id = with_ground_id[12]
mask = processed_final_df['id'] == test_id
processed_final_df[mask]
inspect_df = processed_final_df[mask]

print("QUESTION: ", inspect_df['query'].iloc[0])
print("########################")
print("ANSWER:")
for i, row in inspect_df.iterrows():
    c = row['claim_ground_kps']['claim']
    print(c, "(Prevalence: %s)" %(len(row['claim'])))
#     for g in row['claim_ground_kps']['ground']:
#         print("\t%s"%(g))
    if len(row['ground']) == 1:
        for g in row['claim_ground_kps']['ground']:
            print("\t%s (Prevalence: %s)" %(g, len(row['ground'][0])))
    else:
        for g, gp in zip(row['claim_ground_kps']['ground'], row['ground']):
            print("\t%s (Prevalence: %s)" %(g, len(gp)))

QUESTION:  I want to ask you guys if my friend can run DayZ on lowest settings.   He has an i3 processor-2120 with 3.0 ghz   6 gb ram   Intel HD family graphics   Directx 11   Windows 7 64 bit    I want to play with him because he is a cool guy but his specs aren't on the gamer side.
########################
ANSWER:
Can run on low with GPU upgrade (Prevalence: 20)
	Integrated graphics insufficient (Prevalence: 4)
	Low settings playable on similar PCs (Prevalence: 6)


In [272]:
test_id = with_ground_id[6]
mask = processed_final_df['id'] == test_id
processed_final_df[mask]
inspect_df = processed_final_df[mask]

print("QUESTION: ", inspect_df['query'].iloc[0])
print("########################")
print("ANSWER:")
for i, row in inspect_df.iterrows():
    c = row['claim_ground_kps']['claim']
    print(c, "(Prevalence: %s)" %(len(row['claim'])))
#     for g in row['claim_ground_kps']['ground']:
#         print("\t%s"%(g))
    if len(row['ground']) == 1:
        for g in row['claim_ground_kps']['ground']:
            print("\t%s (Prevalence: %s)" %(g, len(row['ground'][0])))
    else:
        for g, gp in zip(row['claim_ground_kps']['ground'], row['ground']):
            print("\t%s (Prevalence: %s)" %(g, len(gp)))

QUESTION:  Good place for base cheranus   What's a good place for a base on the Dayz map. I want to find maybe an already pre made farm or garage then put walls an make it as a base. The good thing is that I'm playing on a private server so we're all friends and no one kills each other. Thanks
########################
ANSWER:
Hidden bases in quiet towns work well (Prevalence: 26)
	Gorka area offers concealed, well-equipped base spots (Prevalence: 18)
	Remote northern locations have little value (Prevalence: 2)
Base needs defense due to PVP (Prevalence: 3)
No need for base if friendly (Prevalence: 8)
	Everyone is friendly and peaceful (Prevalence: 4)
Pre-made base spots are risky (Prevalence: 3)
	Popular spots are often checked (Prevalence: 2)


# Best-Comment Matching

In [60]:
from openai import OpenAI
import os
import pandas as pd

client = OpenAI(
    api_key = "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" # RMIT Account
)
model="gpt-4.1-mini"
# model="gpt-4o-mini"
# model="gpt-4.1"

prompt = "Once upon a time"

In [61]:
def get_completion(prompt, model=model):
    messages = [{"role": "user", "content": prompt}]
    response = client.chat.completions.create(
        model=model,
        messages=messages,
        max_tokens=500,
        temperature=0, # this is the degree of randomness of the model's output
    )
    return response.choices[0].message.content

In [81]:
BASE_PROMPT = """You will be provided with a community question, a social comment A answering the question, and a list of other social comments also answering the question.
You will be asked to select best-matching (most semantically similar and sharing similar opinion and the similar aspect) comments from the list of other social comments. Each best-matching comment must correspond to a every opinion from the input social comment A. The selection must be from the input list of other social comments. Do not generate new best-matching comments.

Use and output the following format:
Community Question: <the input question>
Social Comment: <the input comment>
List of Other Social Comments: <the input list of other comments>
Best-Matching Other Social Comments: <list of social comments best-matched to every opinion of the input social comment>

"""

In [82]:
INFERENCE_PROMPT = """Now perform the task on the following input:
Community Question: {Question}
Social Comment: {Comment}
List of Other Social Comments: {Comment_List}
"""

In [98]:
# row = df.iloc[0]
# row = df.iloc[1]
row = df.iloc[3]

In [99]:
prompt = BASE_PROMPT + INFERENCE_PROMPT.format(Question = row['questionText'], Comment = row['selected_relevent_sent'], Comment_List=row['other_selected_relevent_sent'])

In [100]:
print(prompt)

You will be provided with a community question, a social comment A answering the question, and a list of other social comments also answering the question.
You will be asked to select best-matching (most semantically similar and sharing similar opinion and the similar aspect) comments from the list of other social comments. Each best-matching comment must correspond to a every opinion from the input social comment A. The selection must be from the input list of other social comments. Do not generate new best-matching comments.

Use and output the following format:
Community Question: <the input question>
Social Comment: <the input comment>
List of Other Social Comments: <the input list of other comments>
Best-Matching Other Social Comments: <list of social comments best-matched to every opinion of the input social comment>

Now perform the task on the following input:
Community Question: What should I draw next?   A FB user said they wanted to see a Zapdos, but depending on what people

In [101]:
response = get_completion(prompt, model)

In [102]:
print(response)

Community Question: What should I draw next?   A FB user said they wanted to see a Zapdos, but depending on what people say on here, if any, I may do one that you all choose! So help out if you want!            Or don't. :)
Social Comment: I'll consider it for sure!
List of Other Social Comments: ["I'd like to see any of the Legendary Beasts, Raikou, Suicune, Entei.   ", "Ooo I'm kinda thinking of Suicune now.", 'Good choice!', 'If not, it will definitely be very soon!    ', 'Pancham   ', 'Oooo good choice.', "That'll be one of the next ones too.   ", "If you have the time, I'd love to see Reshiram!   ", "I'll definitely do that as one of the next ones!   ", 'Thank you very much!   ', 'Seel?   ', 'With your username, I may just do that soon.   ', 'Cool!    ', 'MAGIKARP   ', 'Dear Lord.      ', 'Aron & lairon!   ', "You should girafarig because it's the best pokemon ever created no matter what ANYONE SAYS.   ", 'While I admire your steadfast courage in how awesome Girafig is, I may wait

In [59]:
print(response)

Community Question: What should I draw next?   A FB user said they wanted to see a Zapdos, but depending on what people say on here, if any, I may do one that you all choose! So help out if you want!            Or don't. :)
Social Comment: I'd like to see any of the Legendary Beasts, Raikou, Suicune, Entei.   
List of Other Social Comments: ["Ooo I'm kinda thinking of Suicune now.", 'Good choice!', "I'll consider it for sure!", 'If not, it will definitely be very soon!    ', 'Pancham   ', 'Oooo good choice.', "That'll be one of the next ones too.   ", "If you have the time, I'd love to see Reshiram!   ", "I'll definitely do that as one of the next ones!   ", 'Thank you very much!   ', 'Seel?   ', 'With your username, I may just do that soon.   ', 'Cool!    ', 'MAGIKARP   ', 'Dear Lord.      ', 'Aron & lairon!   ', "You should girafarig because it's the best pokemon ever created no matter what ANYONE SAYS.   ", 'While I admire your steadfast courage in how awesome Girafig is, I may wait

In [50]:
print(response)

Community Question: What should I draw next?   
Social Comment: I'd like to see any of the Legendary Beasts, Raikou, Suicune, Entei.   
List of Other Social Comments: ["Ooo I'm kinda thinking of Suicune now.", 'Good choice!', "I'll consider it for sure!", 'If not, it will definitely be very soon!    ', 'Pancham   ', 'Oooo good choice.', "That'll be one of the next ones too.   ", "If you have the time, I'd love to see Reshiram!   ", "I'll definitely do that as one of the next ones!   ", 'Thank you very much!   ', 'Seel?   ', 'With your username, I may just do that soon.   ', 'Cool!    ', 'MAGIKARP   ', 'Dear Lord.      ', 'Aron & lairon!   ', "You should girafarig because it's the best pokemon ever created no matter what ANYONE SAYS.   ", 'While I admire your steadfast courage in how awesome Girafig is, I may wait on that one. :P', 'BUT.', 'Maybe.', ':)']
Best-Matching Other Social Comments: ["Ooo I'm kinda thinking of Suicune now.", 'Good choice!', 'Oooo good choice.', "That'll be one 

In [40]:
print(response)

Community Question: What should I draw next?   A FB user said they wanted to see a Zapdos, but depending on what people say on here, if any, I may do one that you all choose! So help out if you want!            Or don't. :)
Social Comment: I'd like to see any of the Legendary Beasts, Raikou, Suicune, Entei.   
List of Other Social Comments: ["Ooo I'm kinda thinking of Suicune now.", 'Good choice!', "I'll consider it for sure!", 'If not, it will definitely be very soon!    ', 'Pancham   ', 'Oooo good choice.', "That'll be one of the next ones too.   ", "If you have the time, I'd love to see Reshiram!   ", "I'll definitely do that as one of the next ones!   ", 'Thank you very much!   ', 'Seel?   ', 'With your username, I may just do that soon.   ', 'Cool!    ', 'MAGIKARP   ', 'Dear Lord.      ', 'Aron & lairon!   ', "You should girafarig because it's the best pokemon ever created no matter what ANYONE SAYS.   ", 'While I admire your steadfast courage in how awesome Girafig is, I may wait

In [25]:
row

questionText                    What should I draw next?   A FB user said they...
selected_relevent_sent          I'd like to see any of the Legendary Beasts, R...
summary                         Commenters suggest ideas of the next Pokemon f...
id                                                                              0
subset                                                                     reddit
other_selected_relevent_sent    [Ooo I'm kinda thinking of Suicune now., Good ...
Name: 0, dtype: object

In [None]:
prompt = """In this task you are presented with a business category, a sentence taken from a review of a business in that category and a list of key points already matched to the sentence. \ 
The business category, the sentence and the list of key points are delimited by triple quotes.
You will be asked to select top n most relevant and non-duplicate key points to the sentence. The selection must be from the provided list of key points. Do not generate new key points.

Use and output the following format:
Comment: <the input comment>
Key Points: <the input key points>
Matched Key Points: <list of key points matched to the comments sorted by their relevance>

Business category: \"\"\"Arts & Entertainment\"\"\"
Review sentence: \"\"\"Loved the small concert area and all the details put into the design of the brewery.\"\"\"
Key Points: \"\"\"['We will definitely be returning!', 'This was a great experience!', 'Fun place to go.', 'The atmosphere here is great.', 'Decor is trendy and fun.']\"\"\"
Matched Key Points: ['The atmosphere here is great.', 'Decor is trendy and fun.']

Business category: \"\"\"{business_category}\"\"\"
Review sentence: \"\"\"{review_sentence}\"\"\"
Key Points: \"\"\"{key_points}\"\"\"
"""

In [None]:
prompt = """In this task you are presented with a business category, a sentence taken from a review of a business in that category and a list of key points already matched to the sentence. \ 
The business category, the sentence and the list of key points are delimited by triple quotes.
You will be asked to select top n most relevant and non-duplicate key points to the sentence. The selection must be from the provided list of key points. Do not generate new key points.

Use and output the following format:
Comment: <the input comment>
Key Points: <the input key points>
Matched Key Points: <list of key points matched to the comments sorted by their relevance>

Business category: \"\"\"Arts & Entertainment\"\"\"
Review sentence: \"\"\"Loved the small concert area and all the details put into the design of the brewery.\"\"\"
Key Points: \"\"\"['We will definitely be returning!', 'This was a great experience!', 'Fun place to go.', 'The atmosphere here is great.', 'Decor is trendy and fun.']\"\"\"
Matched Key Points: ['The atmosphere here is great.', 'Decor is trendy and fun.']

Business category: \"\"\"{business_category}\"\"\"
Review sentence: \"\"\"{review_sentence}\"\"\"
Key Points: \"\"\"{key_points}\"\"\"
"""