In [1]:
import pandas as pd
import spacy
from sentence_transformers import SentenceTransformer, util
import numpy as np
import torch
import os
from glob import glob
from dotenv import load_dotenv
import openai
import re

In [2]:
def load_guidelines(guidelines_path):
    """Load discussion guide questions from a CSV file.

    Args:
        guidelines_path: Path to a CSV file that has a ``guide_text`` column.

    Returns:
        The non-missing values of the ``guide_text`` column, in file order.

    Raises:
        KeyError: If the CSV has no ``guide_text`` column.
    """
    guidelines = pd.read_csv(guidelines_path)
    # Blank cells are read as NaN (a float); drop them so they are neither
    # embedded as questions nor used as output column headers downstream.
    return guidelines["guide_text"].dropna().tolist()

def load_transcript(transcript_path):
    """Read the transcript at ``transcript_path`` as UTF-8 and return its full text."""
    with open(transcript_path, mode="r", encoding="utf-8") as transcript_file:
        contents = transcript_file.read()
    return contents

In [3]:
# def segment_transcript(transcript, nlp):    
#     """Group interviewer and interviewee turns into question-response pairs."""
#     # Preprocess transcript to split on speaker tags
#     doc = nlp(transcript)
#     groups = []
#     current_group = {"interviewer": [], "interviewee": []}
#     current_speaker = None

#     for sent in doc.sents:
#         sent_text = sent.text.strip()
#         if sent_text.startswith("Interviewer:"):
#             if current_group["interviewer"] or current_group["interviewee"]:
#                 groups.append(current_group)
#                 current_group = {"interviewer": [], "interviewee": []}
#             current_speaker = "Interviewer"
#             current_group["interviewer"].append(sent_text)
#         elif sent_text.startswith("Interviewee:"):
#             current_speaker = "Interviewee"
#             current_group["interviewee"].append(sent_text)
#         elif current_speaker:
#             current_group[current_speaker.lower()].append(sent_text)

#     if current_group["interviewer"] or current_group["interviewee"]:
#         groups.append(current_group)

#     for g in groups:
#         print(g)
    
#     return [g for g in groups if g["interviewer"] and g["interviewee"]]



def segment_transcript(transcript, nlp):
    """Group interviewer and interviewee turns into question-response pairs.

    The transcript is split on the literal speaker tags ``Interviewer:`` and
    ``Interviewee:``. Every interviewer turn that is immediately followed by
    an interviewee turn becomes one group. Pairing looks at adjacent turns
    rather than fixed even/odd positions, so an unexpected extra turn (for
    example a transcript that opens with the interviewee, or two interviewer
    turns in a row) does not discard the pairs that come after it.

    Args:
        transcript: Full transcript text containing the speaker tags.
        nlp: Not used by this implementation; kept so existing callers that
            pass it continue to work.

    Returns:
        A list of dicts of the form
        ``{"interviewer": [question_text], "interviewee": [answer_text]}``,
        in transcript order. Text before the first speaker tag is ignored.
    """
    # The capturing group makes re.split keep the tags, giving
    # [preamble, tag, text, tag, text, ...].
    parts = re.split(r'(Interviewer:|Interviewee:)', transcript)

    # Odd indices are the tags, the following even indices are their text.
    turns = [(tag, text.strip()) for tag, text in zip(parts[1::2], parts[2::2])]

    # Pair each interviewer turn with the interviewee turn right after it.
    groups = []
    for (speaker, question), (next_speaker, answer) in zip(turns, turns[1:]):
        if speaker == "Interviewer:" and next_speaker == "Interviewee:":
            groups.append({
                "interviewer": [question],
                "interviewee": [answer]
            })
    return groups

In [4]:
def embed_groups(groups, model, device):
    """Build one record per group holding the question text, the response text
    and the embedding of the question.

    For each group the ``interviewer`` and ``interviewee`` string lists are
    space-joined and any ``"Interviewer: "`` / ``"Interviewee: "`` prefix text
    is removed. Only the interviewer question is passed to ``model.encode``.
    """
    def _to_record(group):
        question_text = " ".join(group["interviewer"]).replace("Interviewer: ", "")
        question_vector = model.encode(question_text, convert_to_tensor=True, device=device)
        response_text = " ".join(group["interviewee"]).replace("Interviewee: ", "")
        return {
            "interviewer_question": question_text,
            "interviewee_response": response_text,
            "embedding": question_vector,
        }

    return [_to_record(group) for group in groups]

def match_top_k_questions(guide_question, group_embeddings, model, device, k=3):
    """Return the ``k`` groups whose interviewer question scores highest
    against ``guide_question``.

    The guide question is encoded with ``model`` and compared to each group's
    stored ``"embedding"`` via ``util.cos_sim``. Each result is a dict with the
    keys ``"response"``, ``"question"`` and ``"similarity"`` (a Python float),
    ordered from highest to lowest similarity.
    """
    guide_vector = model.encode(guide_question, convert_to_tensor=True, device=device)

    def _score(entry):
        # cos_sim yields a 1x1 result here; pull out its single value.
        score = util.cos_sim(guide_vector, entry["embedding"]).cpu().numpy()[0][0]
        return {
            "response": entry["interviewee_response"],
            "question": entry["interviewer_question"],
            "similarity": float(score),
        }

    ranked = sorted(
        (_score(entry) for entry in group_embeddings),
        key=lambda scored: scored["similarity"],
        reverse=True,
    )
    return ranked[:k]

In [5]:
def _chat_once(gpt_model, prompt):
    """Send ``prompt`` as a single user message and return the reply text."""
    response = openai.chat.completions.create(
        model=gpt_model,
        messages=[
            {"role": "system", "content": "You are a helpful assistant."},
            {"role": "user", "content": prompt}
        ]
    )
    return response.choices[0].message.content


def query_chatgpt(context, query, gpt_model, api_key):
    """Extract, then summarize, the interviewee's answer to ``query``.

    Two chat-completion requests are made in sequence:

    1. an extraction request that asks for the phrase in ``context`` that
       answers ``query``;
    2. a summarization request that condenses that extracted phrase.

    Both prompts and both replies are printed.

    Args:
        context: Interview excerpt (interviewer/interviewee dialogue) to search.
        query: The guide question to answer from the context.
        gpt_model: Name of the chat model to call.
        api_key: OpenAI API key; assigned to ``openai.api_key``.

    Returns:
        The summarized reply with surrounding single/double quote characters
        stripped. If either request raises, the string
        ``"Error querying ChatGPT: <message>"`` is returned instead of raising.
    """
    openai.api_key = api_key

    extract_prompt = f"""Given the following dialogue as context: {context}
                 Identify the exact phrase from the interviewee's response that directly answers the query "{query}" 
                 Return only the core phrase, excluding filler words (e.g., "um," "well") and irrelevant commentary. 
                 If no relevant response is found, return "[No relevant response found]." """

    try:
        # First call: extract the exact phrase
        extracted_phrase = _chat_once(gpt_model, extract_prompt)
        print("\nExtract Prompt:", extract_prompt)
        print("\nExtracted Response:", extracted_phrase, "\n")

        # Second call: summarize the extracted phrase
        summarize_prompt = f"""Given the following extracted response: {extracted_phrase}, for the query "{query}"
                            Summarize the response into a single, concise phrase, or just one sentence that captures its core meaning.
                            The number of words of the summary, should be less than or equal to the extracted responses provided.
                            If the response is "[No relevant response found]," return the same. Do not add 
                            introductory phrases or extra details."""

        # Send the summarize prompt here; re-sending the extract prompt would
        # only repeat the first request and skip summarization entirely.
        response_content = _chat_once(gpt_model, summarize_prompt).strip('"\'')
        print("Summarize Prompt:", summarize_prompt)
        print("\nSummarized Response:", response_content, "\n")
        return response_content

    except Exception as e:
        # Best effort: report the failure as the cell value rather than
        # aborting the whole run.
        return f"Error querying ChatGPT: {str(e)}"

def generate_output(interview_files, matches_list, guide_questions, gpt_model, api_key,
                    output_path="rag_prototype_matched_interviews.csv"):
    """Generate a structured CSV with one row per interview and one column per guide question.

    For every interview, each guide question's matched question/response
    pairs are joined into a context string and sent to ``query_chatgpt``; the
    reply becomes that interview's cell for the question.

    Args:
        interview_files: Paths of the interview transcripts; the base name
            without extension is used as the ``"Interview File"`` value.
        matches_list: One entry per interview, aligned with
            ``interview_files``. Each entry is a list of dicts with keys
            ``"guide_question"`` and ``"matches"``, where every match has
            ``"question"`` and ``"response"`` keys.
        guide_questions: All guide questions; each becomes a column that
            defaults to an empty string.
        gpt_model: Chat model name forwarded to ``query_chatgpt``.
        api_key: API key forwarded to ``query_chatgpt``.
        output_path: Destination CSV path. Defaults to the file name this
            function has always written.

    Returns:
        None. Writes the CSV and prints a preview limited to the first two
        question columns.
    """
    output_data = []
    for file_path, matches in zip(interview_files, matches_list):
        file_name = os.path.splitext(os.path.basename(file_path))[0]
        row = {"Interview File": file_name}

        # Initialize all question columns with empty strings
        for question in guide_questions:
            row[question] = ""

        # Populate with ChatGPT responses
        for match in matches:
            guide_question = match["guide_question"]

            # Include both interviewer question and interviewee response in context
            context = "\n".join(
                f"Interviewer: {m['question']}\nInterviewee: {m['response']}"
                for m in match["matches"]
            )
            chatgpt_response = query_chatgpt(context, guide_question, gpt_model, api_key).strip('"\'')
            row[guide_question] = chatgpt_response

        output_data.append(row)

    output_df = pd.DataFrame(output_data)
    output_df.to_csv(output_path, index=False)
    # Report the path that was actually written.
    print(f"Output saved to {output_path}")
    print(output_df[["Interview File"] + guide_questions[:2]])  # Print subset for brevity

In [6]:
# Main Execution
def main(transcript_dir, guidelines_path, gpt_model, api_key, top_k=5):
    """Run the full pipeline: load, segment, embed, match, and summarize.

    Every ``*.txt`` transcript in ``transcript_dir`` is segmented into
    question/response groups and embedded; for each guide question the
    ``top_k`` most similar groups are selected and passed to
    ``generate_output``, which queries the chat model and writes the CSV.

    Args:
        transcript_dir: Directory containing the ``*.txt`` transcripts.
        guidelines_path: CSV path passed to ``load_guidelines``.
        gpt_model: Chat model name forwarded to ``generate_output``.
        api_key: API key forwarded to ``generate_output``.
        top_k: Number of best-matching groups kept per guide question.

    Raises:
        FileNotFoundError: If ``transcript_dir`` contains no ``*.txt`` files.
    """
    print("Program started ...")
    # Initialize NLP tools
    # NOTE: the regex-based segment_transcript accepts nlp but does not use it.
    nlp = spacy.load("en_core_web_sm")

    # # Light weight embedding model, 384 dimensions
    # model = SentenceTransformer('all-MiniLM-L6-v2')

    # High performance, 768 dimensions, ~420 MB
    model = SentenceTransformer('multi-qa-mpnet-base-dot-v1')

    # device = torch.device("cpu")  # Use CPU to avoid MPS issues
    device = torch.device("mps" if torch.backends.mps.is_available() else "cpu")

    # Load guidelines
    guide_questions = load_guidelines(guidelines_path)

    # Load all transcript files. glob returns paths in arbitrary order, so
    # sort them to keep the output row order stable between runs.
    transcript_files = sorted(glob(os.path.join(transcript_dir, "*.txt")))
    if not transcript_files:
        raise FileNotFoundError("No transcript files found in directory")

    matches_list = []
    for transcript_path in transcript_files:
        # Load and segment transcript
        transcript = load_transcript(transcript_path)
        groups = segment_transcript(transcript, nlp)

        # Embed all groups
        group_embeddings = embed_groups(groups, model, device)

        # Match top k groups for each guideline question
        transcript_matches = [
            {
                "guide_question": guide_question,
                "matches": match_top_k_questions(
                    guide_question, group_embeddings, model, device, k=top_k
                ),
            }
            for guide_question in guide_questions
        ]
        matches_list.append(transcript_matches)

    # Generate output with ChatGPT responses
    generate_output(transcript_files, matches_list, guide_questions, gpt_model, api_key)

In [7]:
# Read the API key from the environment (.env supported) and run the pipeline
load_dotenv()
api_key = os.environ.get('OPENAI_API_KEY')
if not api_key:
    raise ValueError("OPENAI_API_KEY not found in environment variables")

# Input locations: one folder of transcripts plus a sibling guidelines CSV
data_directory = "../data/private_data/"
interview_name = "interview_1090"
interviews_directory = "".join((data_directory, interview_name, "/"))
guidelines_path = "".join((data_directory, interview_name, "_guidelines.csv"))

# Alternative model: "gpt-4"
gpt_model = "gpt-4o-mini"

main(
    transcript_dir=interviews_directory,
    guidelines_path=guidelines_path,
    gpt_model=gpt_model,
    api_key=api_key,
)

Program started ...
nExtract Prompt: Given the following dialogue as context: Interviewer: Alright, thanks for hanging out with me for a bit! Um, let's just jump right in—what’s the biggest news story or issue you’ve heard about lately?
Interviewee: Um, well, the upcoming election. That’s what everyone’s talking about.
Interviewer: Oh, yeah, of course. That’s a big one. Can you tell me more about that—like, what have you heard about it, where did you hear it, and, I guess, how do you feel about it?
Interviewee: It’s, um, a woman Kamala against Donald Trump. I heard about it from my parents and at school, and I think it’s kinda important. Oh, and on TV, too. Honestly, it makes me feel a little nervous but also, like, excited? I just want people to pick the best president for our country.
Interviewer: Right—so it feels like a pretty big decision, huh? Why does that feel like the most important thing right now to you?
Interviewee: I mean, it’s important we choose the best president for ou