In [25]:
import re
import os
import pandas as pd

# Log-line patterns, compiled once instead of being looked up on every line.
# NOTE(review): "mataches" is kept exactly as in the original pattern;
# presumably it mirrors a typo in the logger's own message -- confirm before
# "fixing" it, otherwise no guide question will match.
_FILE_RE = re.compile(r".*Processing file: ([a-f0-9\-]+)")
_QUESTION_RE = re.compile(r".*Processing guide question \(top-k mataches\): (.+)")
_TIMESTAMP_RE = re.compile(r"^\d{4}-\d{2}-\d{2}.*? - INFO - ")

# Only dialogue turns are kept as retrieved context.
_SPEAKER_PREFIXES = ("Interviewer:", "Interviewee:")

# Lines containing any of these are ignored while collecting context.
_SKIP_MARKERS = (
    "Summarized Response",
    "File id processing complete.",
    "Output saved to",
)

_RECORD_COLUMNS = ["respondent_id", "guide_question", "retrieved_context"]


def _flush_block(records, respondent_id, question, context_lines):
    """Append one record for a finished (respondent, question) block.

    Nothing is appended unless a respondent id, a guide question and at
    least one collected dialogue line are all present.
    """
    if respondent_id and question and context_lines:
        records.append({
            "respondent_id": respondent_id,
            "guide_question": question,
            "retrieved_context": "\n".join(context_lines),
        })


def parse_retrieval_log(lines) -> list:
    """Extract the retrieved dialogue context per respondent and guide question.

    Args:
        lines: iterable of raw log lines (trailing newlines are allowed).

    Returns:
        A list of dicts with the keys ``respondent_id``, ``guide_question``
        and ``retrieved_context``; the context is the newline-joined
        ``Interviewer:`` / ``Interviewee:`` lines found after a
        "Relevant Interviewee Responses:" marker for that question.
    """
    records = []
    current_id = None
    current_question = None
    collecting = False
    context_lines = []

    for raw_line in lines:
        line = raw_line.strip()

        # extract respondent_id
        m_file = _FILE_RE.match(line)
        if m_file:
            # Flush the previous respondent's last block *before* switching
            # ids; otherwise it would be saved later under the new id.
            _flush_block(records, current_id, current_question, context_lines)
            current_id = m_file.group(1)
            current_question = None
            context_lines = []
            collecting = False
            continue

        # extract guide_question
        m_question = _QUESTION_RE.match(line)
        if m_question:
            # save previous block if valid
            _flush_block(records, current_id, current_question, context_lines)
            current_question = m_question.group(1)
            context_lines = []
            collecting = False
            continue

        # start collecting
        if "Relevant Interviewee Responses:" in line:
            collecting = True
            continue

        if not collecting:
            continue

        # Header-like lines that the strict patterns above did not match
        # still end the current context section.
        if "Processing guide question" in line or "Processing file:" in line:
            collecting = False
            continue

        if any(marker in line for marker in _SKIP_MARKERS):
            continue

        # remove timestamp
        line = _TIMESTAMP_RE.sub("", line)

        # keep only lines starting with Interviewer: or Interviewee:
        # (this also drops blank lines and "====" separators)
        if line.startswith(_SPEAKER_PREFIXES):
            context_lines.append(line)

    # save last block
    _flush_block(records, current_id, current_question, context_lines)
    return records


# Guarded so the parser can be imported and tested without touching the
# filesystem. In a notebook or when run as a script __name__ is "__main__",
# so this section runs as before and defines the same top-level variables.
if __name__ == "__main__":
    # load log file
    log_path = os.path.expanduser("~/Desktop/temp/evaluation_input/origin/rag_sumrag_interview_1090_log_2025-05-12_17-43.log")
    with open(log_path, "r", encoding="utf-8") as f:
        lines = f.readlines()

    records = parse_retrieval_log(lines)

    # create DataFrame (explicit columns keep the CSV header even when
    # no records were found)
    retrieval_df = pd.DataFrame(records, columns=_RECORD_COLUMNS)

    # save cleaned file
    output_path = os.path.expanduser("~/Desktop/temp/evaluation_input/retrieved_contexts_cleaned.csv")
    retrieval_df.to_csv(output_path, index=False)

    # preview
    print(retrieval_df.head(3))


                          respondent_id  \
0  c7d7640b-9344-48aa-9d48-7395eaeda149   
1  c7d7640b-9344-48aa-9d48-7395eaeda149   
2  c7d7640b-9344-48aa-9d48-7395eaeda149   

                                      guide_question  \
0  Hey, what’s the biggest news story or issue yo...   
1  Can you tell me more about that news story? Wh...   
2  So, why does that event or issue feel like the...   

                                   retrieved_context  
0  Interviewer: Alright, thanks for hanging out w...  
1  Interviewer: Alright, thanks for hanging out w...  
2  Interviewer: Right—so it feels like a pretty b...  
