# This notebook is designed to generate explanations for a given dataset.

In [1]:
from openai import OpenAI

import pandas as pd
from tqdm.notebook import tqdm
from dotenv import load_dotenv
import json

from utils import parse_response

# Load OPENAI_API_KEY from .env file
load_dotenv()

# Shared API client used by the batch submission and download cells below.
# NOTE(review): presumably picks up OPENAI_API_KEY from the environment
# populated by load_dotenv() above — confirm.
client = OpenAI()

## Prompts

In [13]:
# Path of the dataset CSV to generate explanations for. It is read both when
# building the batch requests and when merging the generated explanations back.
file = "../data/wdc/train_large/preprocessed_wdcproducts80cc20rnd000un_train_large_domain_simple_free.csv"

In [16]:
def generate_structured_explanations(product_1, product_2, label, custom_id, model="gpt-4o"):
    """Build one Batch API request asking for a structured match explanation.

    The prompt gives the model both entity descriptions together with the
    known answer and asks for a per-attribute explanation in the
    ``attribute=...|||importance=...|||values=...###...|||similarity=...``
    line format.

    Args:
        product_1: Description of the first entity (interpolated as text).
        product_2: Description of the second entity (interpolated as text).
        label: The correct answer, interpolated verbatim into the prompt.
        custom_id: Identifier echoed back in the batch output so the result
            can be joined to its source row.
        model: Chat model to request. Defaults to "gpt-4o".

    Returns:
        A dict with the keys "custom_id", "method", "url" and "body" for one
        chat-completions request line of a batch input file.
    """
    # The worked example follows the sign convention stated in the instructions:
    # the storage capacity mismatch argues for a non-match, so its importance
    # is negative.
    prompt = f"""
                Do the two entity descriptions refer to the same real-world entity?
                Entity 1: {product_1}
                Entity 2: {product_2}

                The correct answer is {label}.

                Please provide an explanation for this answer in a structured format, listing the attributes that you compared for reaching this answer. Each attribute should be accompanied by the attribute values and a score between -1 and 1 that shows the importance of the attribute for the decision. If the attribute influenced the decision towards non-match the importance score should be negative. If the attribute pointed towards a match, the importance score should be positive. Also provide a similarity score for the attribute values. If an attribute only occurs in one item, specify the value of that attribute for the other item as "missing". An example output is the following:

                attribute=brand|||importance=0.05|||values=Logitech###Logitech|||similarity=1.00
                attribute=model|||importance=-0.95|||values=MX G500###MX Master 3S|||similarity=0.20
                attribute=color|||importance=0.00|||values=missing###Graphite|||similarity=0.00
                
                Here is a complete example:
                Do the two product descriptions refer to the same real-world product? Entity 1: 'WD 4TB Black My Passport Portable External Hard Drive - USB 3.0 - WDBYFT0040BBK-WESN'. Entity 2: 'Dysk WD My Passport 1TB USB 3.0 black'.
                No.
                attribute=brand|||importance=0.05|||values=Western Digital###Western Digital|||similarity=1.00
                attribute=model|||importance=0.95|||values=My Passport###My Passport|||similarity=1.00
                attribute=storage capacity|||importance=-0.9|||values=4TB###1TB|||similarity=0.25
                attribute=color|||importance=0.1|||values=Black###Black|||similarity=1.00
                attribute=USB version|||importance=0.05|||values=USB 3.0###USB 3.0|||similarity=1.00
                
                Do not provide a explanation in a different format. The explanation should be in the format described above. Only provide the answer and explanation dont repeat the question.
                """

    return {
        "custom_id": custom_id,
        "method": "POST",
        "url": "/v1/chat/completions",
        "body": {
            "model": model,
            "messages": [{"role": "user", "content": prompt}],
            "max_tokens": 1000,
            # temperature 0 keeps the generated explanations as repeatable as possible
            "temperature": 0,
        },
    }



In [17]:
def generate_wadhwa_explanations(product_1, product_2, label, custom_id):
    """Build one Batch API request asking for a free-text match explanation.

    The prompt contains two worked examples (one MATCH, one NOT A MATCH)
    followed by the pair to explain, and ends with an open ``Explanation:``
    line for the model to complete.

    Args:
        product_1: Mapping for the first entity; its "name", "description" and
            "price" entries are read with ``.get`` (absent keys render as
            ``None``).
        product_2: Mapping for the second entity, read the same way.
        label: Ground-truth label. Only a value equal to ``1`` is rendered as
            "MATCH"; every other value is rendered as "NOT A MATCH".
            NOTE(review): textual labels such as "Yes" would therefore be
            treated as a non-match — confirm what callers pass.
        custom_id: Identifier echoed back in the batch output so the result
            can be joined to its source row.

    Returns:
        A dict with the keys "custom_id", "method", "url" and "body" for one
        chat-completions request line of a batch input file.
    """
    label = "MATCH" if label == 1 else "NOT A MATCH"

    # The instruction block is closed with "[/INST]". A backslash here would be
    # an invalid escape sequence inside this non-raw f-string.
    prompt = f"""
                <s>[INST] Given the following two examples, provide an explanation for the third example for why the two entities do or do not match. [/INST]

                Entity A: [NAME] samsung dlp tv stand in black tr72bx [DESCRIPTION] samsung dlp tv stand in black tr72bx designed to fit samsung hlt7288, hlt7288, hl72a650, and hl67a650 television sets tempered 6mm tinted glass shelves wide audio storage shelves to accommodate 4 or more components wire management system easy to assemble high gloss black finish [PRICE] 369.0
                Entity B: [NAME] samsung tr72b tv stand [DESCRIPTION] glass black [PRICE] 232.14
                Label: MATCH
                Explanation: Both entities refer to samsung TV stand in black and therefore have substantially similar specifications, therefore they’re a match. </s>

                Entity A: [NAME] canon high capacity color ink cartridge color ink cl51 [DESCRIPTION] canon high capacity color ink cartridge cl51 compatible with pixma ip6210d, ip6220d, mp150, mp170 and mp450 printers [PRICE] 35.0
                Entity B: [NAME] canon pg-40 twin pack black ink cartridge 0615b013 [DESCRIPTION] black [PRICE]
                Label: NOT A MATCH
                Explanation: Entity A refers to color ink cartridge while Entity B is a black ink cartridge, therefore they are not a match. </s>

                Entity A: [NAME] {product_1.get("name")} [DESCRIPTION] {product_1.get("description")} [PRICE] {product_1.get("price")}
                Entity B: [NAME] {product_2.get("name")} [DESCRIPTION] {product_2.get("description")} [PRICE] {product_2.get("price")}
                Label: {label}
                Explanation:
                """

    return {
        "custom_id": custom_id,
        "method": "POST",
        "url": "/v1/chat/completions",
        "body": {
            "model": "gpt-4o-mini",
            "messages": [{"role": "user", "content": prompt}],
            "max_tokens": 128,
            "temperature": 0,
            "top_p": 0.95,
        },
    }

In [14]:
# Helpers to extract the entity strings from a prompt
def _closing_quote_index(segment):
    """Return the index of the quote that terminates a quoted entity.

    A quote counts as the terminator when nothing but whitespace follows it,
    or when it is followed by a period that is itself followed by whitespace
    or the end of the text. Apostrophes inside the description ("Men's") do
    not satisfy this and are skipped. If no quote qualifies, the first quote
    is returned (the previous behaviour); -1 means there is no quote at all.
    """
    first = segment.find("'")
    position = first
    while position != -1:
        rest = segment[position + 1:]
        ends_text = not rest.strip()
        ends_sentence = rest[:1] == "." and (len(rest) == 1 or rest[1].isspace())
        if ends_text or ends_sentence:
            return position
        position = segment.find("'", position + 1)
    return first


def _cut_at_closing_quote(segment):
    """Return ``segment`` up to (not including) its terminating quote."""
    end = _closing_quote_index(segment)
    return segment if end == -1 else segment[:end]


def extract_entities(text):
    """Extract the two quoted entity descriptions from a prompt.

    Expects ``text`` to contain ``Entity 1: '<description>'`` and
    ``Entity 2: '<description>'``. Descriptions may contain apostrophes.

    Args:
        text: Prompt string holding both entity descriptions.

    Returns:
        Tuple ``(entity_1, entity_2)`` of the descriptions without the
        surrounding quotes.

    Raises:
        IndexError: If either entity marker is missing from ``text``.
    """
    marker_1 = "Entity 1: '"
    marker_2 = "Entity 2: '"

    # Indexing [1] raises IndexError when a marker is absent.
    after_1 = text.split(marker_1)[1]
    after_2 = text.split(marker_2)[1]

    # Limit the search for entity 1 to the text before the second marker so
    # a quote belonging to entity 2 can never be picked up.
    segment_1 = after_1.split(marker_2)[0]

    entity_1 = _cut_at_closing_quote(segment_1)
    entity_2 = _cut_at_closing_quote(after_2)
    return entity_1, entity_2

In [19]:
# Load the dataset to generate explanations for
small_df = pd.read_csv(file)

# Build one request per row. The row index is used as custom_id so the batch
# output can be joined back to the dataset afterwards.
requests = []
for index, row in tqdm(small_df.iterrows(), total=small_df.shape[0]):
    product_1, product_2 = extract_entities(row["prompt"])
    label = row["completion"]
    custom_id = str(index)
    prompt = generate_structured_explanations(product_1, product_2, label, custom_id=custom_id)
    requests.append(prompt)

# Write the requests as JSONL: one JSON document per line
batch_file_path = "explanation.jsonl"
with open(batch_file_path, "w", encoding="utf-8") as f:
    f.writelines(json.dumps(request) + "\n" for request in requests)

# Upload inside a context manager so the file handle is closed after the upload
with open(batch_file_path, "rb") as batch_file:
    batch_input_file = client.files.create(
        file=batch_file,
        purpose="batch"
    )

batch_input_file_id = batch_input_file.id

batch = client.batches.create(
    input_file_id=batch_input_file_id,
    endpoint="/v1/chat/completions",
    completion_window="24h",
    metadata={"description": "Generate structured explanations for the WDC dataset"}
)

# Show the id: it is the only handle on the job if the kernel is restarted
# before the results are downloaded.
print(f"Submitted batch {batch.id}")


  0%|          | 0/19835 [00:00<?, ?it/s]

In [20]:
# Refresh the batch object: the one returned at submission time carries no
# output file id, so its current state has to be fetched again.
batch = client.batches.retrieve(batch.id)

if batch.status != "completed" or batch.output_file_id is None:
    raise RuntimeError(
        f"Batch {batch.id} has no output yet (status: {batch.status}); "
        "re-run this cell once the batch has completed."
    )

# download the results
batch_output_file = client.files.content(batch.output_file_id)
with open("explanation_output.jsonl", "wb") as output_file:
    output_file.write(batch_output_file.content)


NameError: name 'batch' is not defined

In [None]:
# Load the batch output: one JSON document per request
generated_explanations = pd.read_json("explanation_output.jsonl", lines=True)

# Parse each raw response and put the parsed columns next to the original ones.
# NOTE(review): assumes parse_response yields a "content" field per response,
# which is what the merge below reads — confirm against utils.parse_response.
generated_explanations_parsed = generated_explanations["response"].apply(parse_response)
generated_explanations = pd.concat([generated_explanations, generated_explanations_parsed], axis=1)

# convert the custom_id to an int so it lines up with the dataset's row index
generated_explanations["custom_id"] = generated_explanations["custom_id"].astype(int)

dataset_without_explanations = pd.read_csv(file)

# Index the explanations by custom_id once instead of scanning the whole frame
# for every dataset row. On duplicate ids the first occurrence wins.
explanation_by_id = (
    generated_explanations
    .drop_duplicates(subset="custom_id")
    .set_index("custom_id")["content"]
)

# Fail with an actionable message rather than a bare IndexError if some
# requests produced no result.
missing_ids = dataset_without_explanations.index.difference(explanation_by_id.index)
if len(missing_ids) > 0:
    raise ValueError(
        f"{len(missing_ids)} dataset rows have no generated explanation "
        f"(first missing ids: {list(missing_ids[:10])})"
    )

# Replace the whole column at once. This also avoids writing strings
# cell by cell into a column that may have been read with a numeric dtype.
dataset_without_explanations["completion"] = explanation_by_id.reindex(
    dataset_without_explanations.index
)

dataset_without_explanations.to_csv(file.replace(".csv", "_with_explanation.csv"), index=False)