In [27]:
from pathlib import Path
import pandas as pd 

In [28]:
from dotenv import load_dotenv

# Called for its side effect before any client is created; presumably this is how the
# credentials later needed by AsyncOpenAI() reach the environment — TODO confirm.
load_dotenv()

True

In [29]:
# Load the evaluation records; lines=True means the file is parsed as JSON Lines
# (one JSON object per line).
df = pd.read_json('hotpotqa-cte-inspect.jsonl', lines=True)
# Row count, printed separately because only the last expression is displayed.
print(len(df))
df.head()

7405


Unnamed: 0,id,question,question_decomposition,context,answer,answers,predicted_triples,predicted_answer,score,n_hops,exact_match,f1,fuzzy_match
0,5a8b57f25542995d1e6f1371,Were Scott Derrickson and Ed Wood of the same ...,"[{'answer': '', 'id': 0, 'paragraph_support_id...",# Scott Derrickson\nScott Derrickson (born Jul...,yes,[yes],"Scott Derrickson, nationality, American\nEd Wo...",Yes,1.0,2,1,1.0,0
1,5a8c7595554299585d9e36b6,What government position was held by the woman...,"[{'answer': '', 'id': 0, 'paragraph_support_id...",# Shirley Temple\nShirley Temple Black (April ...,Chief of Protocol,[Chief of Protocol],"Shirley Temple, portrayed, Corliss Archer\nShi...",United States ambassador and Chief of Protocol...,0.461538,3,0,0.461538,0
2,5a85ea095542994775f606a8,"What science fantasy young adult series, told ...","[{'answer': '', 'id': 0, 'paragraph_support_id...",# The Hork-Bajir Chronicles\nThe Hork-Bajir Ch...,Animorphs,[Animorphs],"Animorphs, written by, K. A. Applegate\nThe Ho...",Animorphs,1.0,5,1,1.0,1
3,5adbf0a255429947ff17385a,Are the Laleli Mosque and Esma Sultan Mansion ...,"[{'answer': '', 'id': 0, 'paragraph_support_id...","# Laleli Mosque\nThe Laleli Mosque (Turkish: ""...",no,[no],"Laleli Mosque, located_in, Fatih\nEsma Sultan ...",No,1.0,2,1,1.0,0
4,5a8e3ea95542995a26add48d,"The director of the romantic comedy ""Big Stone...","[{'answer': '', 'id': 0, 'paragraph_support_id...",# Adriana Trigiani\nAdriana Trigiani is an Ita...,"Greenwich Village, New York City","[Greenwich Village, New York City]","Adriana Trigiani, based in, Greenwich Village\...",New York City,0.75,2,0,0.75,0


In [41]:
# Summary statistics for hop count and the two answer metrics, printed as a Markdown table.
print(df[['n_hops', 'exact_match', 'f1']].describe().to_markdown())

|       |      n_hops |   exact_match |          f1 |
|:------|------------:|--------------:|------------:|
| count | 7405        |   7405        | 7405        |
| mean  |    2.43147  |      0.664281 |    0.816989 |
| std   |    0.711531 |      0.472273 |    0.315591 |
| min   |    2        |      0        |    0        |
| 25%   |    2        |      0        |    0.666667 |
| 50%   |    2        |      1        |    1        |
| 75%   |    3        |      1        |    1        |
| max   |    8        |      1        |    1        |


In [31]:
# A prediction counts as a success when its F1 is strictly above 0.3;
# everything else is treated as a failure and analysed further below.
success_mask = df['f1'].gt(0.3)
success_df = df.loc[success_mask]
fail_df = df.loc[~success_mask]
print(len(fail_df), len(success_df), len(df))
# Share of successful predictions, to three decimals.
print(f"{len(success_df) / len(df):.3f}")

766 6639 7405
0.897


In [32]:
# Mean and row count of both metrics per hop count; the count shows how little
# data backs the means for the larger hop counts.
df.groupby('n_hops')[['exact_match', 'f1']].agg(['mean', 'count'])

Unnamed: 0_level_0,exact_match,exact_match,f1,f1
Unnamed: 0_level_1,mean,count,mean,count
n_hops,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
2,0.666733,4990,0.815248,4990
3,0.643179,1774,0.810292,1774
4,0.698324,537,0.844756,537
5,0.775,80,0.885774,80
6,0.642857,14,0.840476,14
7,0.444444,9,0.77672,9
8,1.0,1,1.0,1


In [43]:
# %%
import asyncio
import pandas as pd
import json
from openai import AsyncOpenAI
from tqdm.asyncio import tqdm
from tenacity import retry, stop_after_attempt, wait_exponential_jitter

# Shared async OpenAI client used by chat_completion. No arguments are passed here,
# so connection settings presumably come from the environment — TODO confirm.
client = AsyncOpenAI()

# Number of rows analysed concurrently per batch; also the number of records
# written to each batch results file by process_batches.
BATCH_SIZE = 10
# Defaults used by chat_completion when the caller does not override them.
DEFAULT_MODEL = "llama-3-70b-tgi"
DEFAULT_TEMPERATURE = 0.5

# Prompt template for classifying one failed data point into the eight failure
# categories listed below. The placeholders are filled by format_prompt;
# exact_match and f1 are rendered with three decimals. The surrounding blank
# lines of the literal are removed by .strip().
INSPECT_PROMPT = """
Analyze the following failed Q&A data point and categorize the failure(s) into one or more of these types:
1. Granularity mismatch.
2. Semantic equivalence mismatch.
3. Temporal or hierarchical reasoning error.
4. Incorrect entity or relationship focus.
5. Formatting inconsistency.
6. Logical inconsistency in reasoning.
7. Incorrect synthesis of multiple hops.
8. Reference answer ambiguity or error.

Data:
- Context: 
{context}

- Question: {question}

- Reference Answer(s): {answers}

- Predicted triplets: 
{predicted_triples}

- Predicted Answer: {predicted_answer}

- Evaluation Metrics: EM = {exact_match:.3f}, F1 = {f1:.3f}

Output:
- Failure Categories:
- Explanation:
""".strip()

# Prompt template for summarising a chunk of per-data-point analyses; the single
# placeholder is filled in summarize_results_in_chunks.
# NOTE(review): unlike INSPECT_PROMPT this literal is not stripped, so the leading
# and trailing newline are sent to the model as part of the prompt.
SUMMARIZE_PROMPT = """
Summarize the failure analysis results of Q&A data points. Include:
1. The frequency of each failure mode.
2. Common examples of failure modes.
3. Rare examples of failure modes.
4. Observations about systemic issues in the predictions.

Analysis data:
{categorized_failure_data}
"""


def format_prompt(row):
    """Render ``INSPECT_PROMPT`` for a single data point.

    Args:
        row: Record indexable by key (e.g. a DataFrame row) that provides
            ``question``, ``answers``, ``predicted_triples``,
            ``predicted_answer``, ``context``, ``exact_match`` and ``f1``.

    Returns:
        The prompt string with every placeholder filled in.

    Raises:
        KeyError: If ``row`` lacks one of the keys above.
    """
    field_names = (
        "question",
        "answers",
        "predicted_triples",
        "predicted_answer",
        "context",
        "exact_match",
        "f1",
    )
    values = {name: row[name] for name in field_names}
    return INSPECT_PROMPT.format(**values)


@retry(stop=stop_after_attempt(3), wait=wait_exponential_jitter(max=10))
async def chat_completion(
    messages,
    model=DEFAULT_MODEL,
    temperature=DEFAULT_TEMPERATURE,
):
    """Send one chat request and return the text of the first choice.

    The call is wrapped in a tenacity retry policy: it stops after three
    attempts and waits between them with jittered exponential backoff
    configured with ``max=10``.

    Args:
        messages: Chat messages, passed through to the API unchanged.
        model: Model name for the request.
        temperature: Sampling temperature for the request.

    Returns:
        ``message.content`` of the first returned choice.
    """
    request = {"model": model, "messages": messages, "temperature": temperature}
    completion = await client.chat.completions.create(**request)
    first_choice = completion.choices[0]
    return first_choice.message.content


async def analyze_data_point(row):
    """Ask the model to categorise the failure of a single data point.

    Args:
        row: Record accepted by ``format_prompt``.

    Returns:
        The model's analysis text as returned by ``chat_completion``
        (called with its default model and temperature).
    """
    system_message = {
        "role": "system",
        "content": "You are an assistant for analyzing Q&A model failures.",
    }
    user_message = {"role": "user", "content": format_prompt(row)}
    return await chat_completion([system_message, user_message])


async def process_batch(batch):
    """Analyse all rows of ``batch`` concurrently.

    Args:
        batch: DataFrame slice; each row is passed to ``analyze_data_point``.

    Returns:
        List of analyses in the same order as the rows of ``batch``.
    """
    pending = []
    for _, row in batch.iterrows():
        pending.append(analyze_data_point(row))
    return await asyncio.gather(*pending)


def save_jsonl(filename, data):
    """Write ``data`` to ``filename`` as JSON Lines, one record per line.

    The records are first written to a sibling ``<name>.tmp`` file that is then
    renamed over ``filename``. If the write is interrupted, no partially written
    ``filename`` is left behind; this matters because the resume logic treats
    the mere existence of a batch results file as "batch completed".

    Args:
        filename: Destination path (``str`` or ``Path``). An existing file is
            replaced.
        data: Iterable of JSON-serialisable records.

    Raises:
        TypeError: If a record cannot be serialised by ``json.dumps``.
        OSError: If the file cannot be written or renamed.
    """
    target = Path(filename)
    # The ".tmp" suffix keeps the scratch file from matching "*.jsonl" globs.
    scratch = target.with_name(target.name + ".tmp")
    with open(scratch, "w", encoding="utf-8") as f:
        for record in data:
            f.write(json.dumps(record) + "\n")
    scratch.replace(target)


def get_completed_batches(out_dir):
    """Return the indices of batches that already have a results file.

    Args:
        out_dir: ``Path`` of the directory holding ``batch_<index>_results.jsonl``
            files.

    Returns:
        Set of integer batch indices parsed from the matching file names.
    """
    return {
        int(path.stem.split("_")[1])
        for path in out_dir.glob("batch_*_results.jsonl")
    }


def _load_jsonl(filename):
    """Read a JSON Lines file and return its records as a list."""
    with open(filename) as f:
        return [json.loads(line) for line in f if line.strip()]


async def process_batches(dataframe, out_dir):
    """Analyse ``dataframe`` in batches of ``BATCH_SIZE`` rows, resumably.

    Each batch is analysed with ``process_batch`` and its results are written to
    ``out_dir / "batch_<index>_results.jsonl"``. Batches whose results file
    already exists are not sent to the model again; their saved records are
    loaded instead, so the returned list covers the whole dataframe even when
    an earlier run was interrupted and resumed.

    Batch indices are derived from row positions, so a resumed run must use the
    same dataframe (same row order) and the same ``BATCH_SIZE`` as the run that
    produced the existing files.

    Args:
        dataframe: DataFrame with an ``id`` column plus the columns required by
            ``format_prompt``.
        out_dir: ``Path`` of the directory for per-batch results files; created
            if missing.

    Returns:
        List of ``{"id": ..., "analysis": ...}`` dicts in batch order.
    """
    out_dir.mkdir(parents=True, exist_ok=True)

    completed_batches = get_completed_batches(out_dir)
    results = []

    for i in tqdm(range(0, len(dataframe), BATCH_SIZE), desc="Processing batches"):
        batch_index = i // BATCH_SIZE
        batch_file = out_dir / f"batch_{batch_index}_results.jsonl"
        if batch_index in completed_batches:
            # Reuse the saved analyses instead of silently dropping them from
            # the returned list.
            results.extend(_load_jsonl(batch_file))
            continue

        batch = dataframe.iloc[i : i + BATCH_SIZE]
        batch_results = await process_batch(batch)
        batch_with_ids = [
            {"id": row["id"], "analysis": analysis}
            for row, analysis in zip(batch.to_dict("records"), batch_results)
        ]
        results.extend(batch_with_ids)
        # Persist immediately so an interrupted run can resume from this batch.
        save_jsonl(batch_file, batch_with_ids)

    return results


async def summarize_results_in_chunks(results, chunk_size=100, model=DEFAULT_MODEL):
    """Summarise the analyses in two stages: per chunk, then combined.

    ``results`` is split into consecutive chunks of ``chunk_size`` items. Each
    chunk is summarised with ``SUMMARIZE_PROMPT`` (one request at a time), and
    the chunk summaries are then merged by a final request.

    Args:
        results: Sequence of analysis records; each chunk is interpolated into
            the prompt via its string representation.
        chunk_size: Number of records per chunk summary request.
        model: Model name passed to ``chat_completion`` for every request.

    Returns:
        The combined summary text, or ``""`` when ``results`` is empty.
    """
    # Nothing to summarise: skip the API call rather than asking the model to
    # combine zero summaries, which would produce an invented summary.
    if len(results) == 0:
        return ""

    chunk_summaries = []

    # Divide results into smaller chunks
    for i in tqdm(range(0, len(results), chunk_size), desc="Summarizing chunks"):
        chunk = results[i : i + chunk_size]
        summary_prompt = SUMMARIZE_PROMPT.format(categorized_failure_data=chunk)
        messages = [
            {
                "role": "system",
                "content": "You are an assistant for analyzing Q&A model failures.",
            },
            {"role": "user", "content": summary_prompt},
        ]
        chunk_summary = await chat_completion(messages, model=model)
        chunk_summaries.append(chunk_summary)

    # Combine chunk summaries into a final summary
    final_prompt = """
    Combine the following summaries into a single cohesive summary:
    {chunk_summaries}
    """
    messages = [
        {
            "role": "system",
            "content": "You are an assistant for summarizing Q&A analysis summaries.",
        },
        {
            "role": "user",
            "content": final_prompt.format(
                chunk_summaries="\n\n".join(chunk_summaries)
            ),
        },
    ]
    final_summary = await chat_completion(messages, model=model)
    return final_summary


In [34]:
# Output locations, relative to the notebook's working directory.
OUT_DIR = Path("../tmp/hotpotqa-auto-inspect-cte3/")
# One JSONL file per analysed batch is written here.
ANALYSIS_DIR = OUT_DIR / "analysis_results"
# The final combined summary text is written here.
SUMMARY_FILE = OUT_DIR / "summary.txt"

In [35]:
# Analyse every failed data point. Batches that already have a results file in
# ANALYSIS_DIR are not sent to the model again, so this cell can be re-run to resume.
results = await process_batches(fail_df, ANALYSIS_DIR)

Processing batches:   0%|          | 0/77 [00:00<?, ?it/s]

Processing batches: 100%|██████████| 77/77 [1:12:01<00:00, 56.12s/it]


In [38]:
# Reload the saved analyses from disk so this cell works without re-running
# the analysis above.
results = []
# glob yields files in no guaranteed order. Sorting by (name length, name) puts
# batch_2 before batch_10, so the record order, and therefore the chunks
# summarised below, is the same on every run.
for filepath in sorted(
    ANALYSIS_DIR.glob("*.jsonl"), key=lambda p: (len(p.name), p.name)
):
    with open(filepath) as f:
        for line in f:
            results.append(json.loads(line))

len(results)

766

In [44]:
summary = await summarize_results_in_chunks(results, chunk_size=30, model="gpt-4o-mini")
with open(SUMMARY_FILE, "w") as f:
    f.write(summary)

Summarizing chunks: 100%|██████████| 26/26 [04:05<00:00,  9.43s/it]
