In [2]:
import os
import re
import pandas as pd
from tqdm import tqdm
from openai import OpenAI
from docx import Document
import json

In [3]:
# === Meta Prompt (single cell) ===========================================
from textwrap import dedent
import hashlib

META_PROMPT_VERSION = "v0.3-2025-10-16"   
MAX_EMBED_WORDS    = 25                   
STRICT_JSON_NOTICE = "Return STRICT JSON only (no markdown)."

META_PROMPT = dedent(f"""
SYSTEM:
You analyze a multi-party conversation turn by turn.
For EACH turn, output one JSON object describing that speaker's
meaningful position content, following this exact schema:

{{
    "Position (full)": "<one-sentence summary of the speaker's main claim, if any>",
    "Sub-positions": [
        "<atomic idea 1>",
        "<atomic idea 2>",
        "<atomic idea 3>"
    ],
    "Embedding Units": [
        {{
            "unit": "<atomic idea 1>",
            "type": "sub-position"
        }},
        {{
            "unit": "<atomic idea 2>",
            "type": "sub-position"
        }}
    ],
    "HasContent": true/false,
    "Focus Topic": "<main topic or issue discussed, or null if no content>",
    "Rationale": "<= MAX_EMBED_WORDS explanation of how the sub-positions were derived>"
}}

CONTENT RULES:
1. **Extract only meaningful opinion content.**
    If the turn consists of small talk, acknowledgments, procedural comments, 
    or filler (e.g., “anything else?”, “okay”, “cool”, “yeah”, “go ahead”), then:
        - "HasContent": false
        - "Sub-positions": []
        - "Position (full)": ""
        - "Embedding Units": []
        - "Focus Topic": null
        - Provide a brief Rationale explaining why no content was found.

2. **For content-bearing turns:**
    - Identify the speaker's main claim as “Position (full)”.
    - Extract **1-3 sub-positions** representing atomic ideas.
    - Each sub-position must be an independent, simple, embedding-friendly
        sentence expressing exactly **one** idea.
    - Avoid connectors ("and", "but", "because") inside sub-positions.
    - Sub-positions must correspond directly to clauses or reasoning in the turn.

3. **Do NOT attempt to determine whether the speaker's position changed**
    relative to previous turns. Just analyze this turn alone.

4. **Coherence and Neutrality:**
    - Use neutral, non-speculative language.
    - Summaries must reflect the content of the turn accurately.

OUTPUT RULES:
1. Output a JSON array (list), one object per turn.
2. Strictly valid JSON only — no markdown, no comments, no trailing commas.
3. “Embedding Units” must repeat the sub-positions exactly (no paraphrasing).
4. If “HasContent” is false, fields describing content must be empty or null.

INPUT YOU WILL RECEIVE NEXT:
- The full conversation transcript with labeled speakers.

""").strip()

META_PROMPT_HASH = hashlib.sha256(META_PROMPT.encode("utf-8")).hexdigest()[:12]
print("Meta prompt loaded:", META_PROMPT_VERSION, META_PROMPT_HASH)


Meta prompt loaded: v0.3-2025-10-16 ba2b73c44d3a


In [4]:
client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))
client

<openai.OpenAI at 0x7b83d93a3f10>

In [5]:
def read_docx(file_path):
    """Read a .docx file and return a list of non-empty lines."""
    doc = Document(file_path)
    lines = []
    for para in doc.paragraphs:
        text = para.text.strip()
        if text:
            lines.append(text)
    return lines

In [6]:
def parse_transcript(lines):
    data = []
    turn_id = 1
    current_speaker = None
    current_text = ""
    
    for line in lines:
        line = line.strip()
        if not line:
            continue
        
        # Identify section headers "Task x:" and skip them
        if re.match(r"(?i)^Task\s*\d+", line):
            continue
        
        # Identify the speaker
        match = re.match(r"^(P\d+):\s*(.*)", line)
        # match = re.match(r"^()", line)
        if match:
            speaker = match.group(1)
            text = match.group(2)
            
            # If the same speaker continues, aoppend the text
            # P1: Hello.
            # P1: I think ...
            if speaker == current_speaker:
                current_text += " " + text
            # P1: Hello.
            # P2: Hi. 
            else: 
                if current_speaker is not None:       
                    data.append({
                        "Position ID": turn_id,
                        "Speaker": current_speaker,
                        "Text": current_text.strip()
                    })
                    turn_id += 1
            
            current_speaker = speaker
            current_text = text

        # P1: Hello.
        # I think ...
        else:
            current_text += " " + line
        
    # Append the last turn if exists
    if current_speaker is not None:
        data.append({
            "Position ID": turn_id,
            "Speaker": current_speaker,
            "Text": current_text.strip()
        })
            
    return pd.DataFrame(data)

In [None]:
def analyze_position(df, starting_positions, meta_prompt):
    current_positions = starting_positions.copy()
    results = []
    
    for i, row in tqdm(df.iterrows(), total = len(df)):
        if i == 0:
            positions = "Starting positions:\n" + "\n".join([f"{p}: {pos}" for p, pos in current_positions.items()])
        else:
            positions = "Current known positions:\n" + "\n".join([f"{p}: {pos}" for p, pos in current_positions.items()])
        # Gather context, at most 5 previous turns would be included. If less than 5, then include all previous turns.
        context = "\n".join([
            f"{df.iloc[j]['Speaker']} (ID: {df.iloc[j]['Position ID']}): {df.iloc[j]['Text']}"
            for j in range(max(0, i-5), i+1)
        ])
    
        prompt = f"""
        Here is the known positions of all speakers:
        {positions}
        
        Here is the conversation context so far:
        {context}
        
        Current turn to analyze: 
        Speaker: {row['Speaker']} 
        ID: {row['Position ID']} 
        Text: "{row['Text']}"
        """
        
        try:
            response = client.responses.create(
                model="gpt-4o-mini",    # Not quite sure if we should use gpt-4o-mini or gpt-4o
                                        # But seems that gpt-4o is tooooooo expensive))
                instructions = meta_prompt,      # Like system prompt
                input = prompt,         # Like user prompt
                temperature = 0
            )
            text = response.output_text
            text = re.sub(r'```(json)?', '', text).strip() # Prevent the possible markdown code block formatting. 
            # parsed = json.loads(text)
            # current_positions[row['Speaker']] = parsed.get("Summary (embed)", "-")
            try:
                parsed = json.loads(text)
            except json.JSONDecodeError:
                match = re.search(r'\{.*\}', text, re.DOTALL)
                if match:
                    parsed = json.loads(match.group(0))
                else:
                    raise ValueError(f"Cannot parse model output: {text}")

            if isinstance(parsed, list):
                if len(parsed) > 0 and isinstance(parsed[0], dict):
                    parsed = parsed[0]
                else:
                    raise ValueError(f"Unexpected list structure: {parsed}")

            current_positions[row['Speaker']] = parsed.get("Summary (embed)", "-")
            
        except Exception as e:
            # print(f"Error ID {row['Position ID']}: {e}")
            print(f"\n Error at ID {row['Position ID']}: {e}")
            print("---- Raw model output ----")
            print(text)
            print("--------------------------\n")
            parsed = {"Position Change": "error", "New Position": "parse error", "Influenced by": []}
        results.append(parsed)
        
    new_df = pd.concat([df, pd.DataFrame(results)], axis=1)
    # new_df = new_df.drop(columns = ["Text"])
    return new_df

In [8]:
# file_path = "S5_collective_cleaned.docx"
file_path = "S10_mixed_cleaned.docx"
lines = read_docx(file_path)
df = parse_transcript(lines)

In [39]:
starting_positions = {
    "P1": "You would like the course to be about Cultural Understanding but are flexible about how that happens. \
            Your Worldview: You believe global citizenship begins with understanding - \
            you can't appreciate other cultures without deeply understanding it first.",
    "P2": "You feel very strongly that teaching Intercultural Collaboration has to be a key aspect of this program. \
            You would like for students to be matched with international peers to work on shared challenges together. \
            Your Worldview: You believe real connection and understanding comes from working with others towards a shared goal. \
            Working together will also teach students to respect and value different perspectives and build real-world skills.", 
    "P3": "Your goal is to advocate for the course to teach systems-thinking. \
            You believe that helping students think of humanity as one big interconnected group is one of the most powerful ways \
            to foster global citizenship in a complex world. When students learn to recognize patterns, ripple effects, \
            and unintended consequences, they develop deeper empathy, critical thinking, and a sense of responsibility \
            that goes beyond surface-level cultural knowledge. Use simple but strong examples to make your point \
            and expand others' thinking about what global citizenship should mean."
}

In [40]:
output = analyze_position(df, starting_positions, META_PROMPT)
output.to_csv("S10-M_position_analysis_1127_Strict_version.csv", index=False)

100%|██████████| 37/37 [02:20<00:00,  3.79s/it]
