In [8]:
import pandas as pd
import spacy
from sentence_transformers import SentenceTransformer, util
import numpy as np
import torch
import os
from glob import glob

# Modular Functions
def load_guidelines(guidelines_path):
    """Load discussion guide questions from a CSV file."""
    guidelines = pd.read_csv(guidelines_path)
    return guidelines["guide_text"].tolist()

def load_transcript(transcript_path):
    """Load a transcript file into a string."""
    with open(transcript_path, "r", encoding="utf-8") as f:
        return f.read()

def segment_transcript(transcript, nlp):
    """Group interviewer and interviewee turns into question-response pairs."""
    doc = nlp(transcript)
    groups = []
    current_group = {"interviewer": [], "interviewee": []}
    current_speaker = None

    for sent in doc.sents:
        sent_text = sent.text.strip()
        if sent_text.startswith("Interviewer:"):
            if current_group["interviewer"] or current_group["interviewee"]:
                groups.append(current_group)
                current_group = {"interviewer": [], "interviewee": []}
            current_speaker = "Interviewer"
            current_group["interviewer"].append(sent_text)
        elif sent_text.startswith("Interviewee:"):
            current_speaker = "Interviewee"
            current_group["interviewee"].append(sent_text)
        elif current_speaker:
            current_group[current_speaker.lower()].append(sent_text)

    if current_group["interviewer"] or current_group["interviewee"]:
        groups.append(current_group)

    return [g for g in groups if g["interviewer"] and g["interviewee"]]

def match_responses(groups, guide_questions, model, device, confidence_threshold=0.5):
    """Match interviewee responses to guide questions based on interviewer questions."""
    matches = []
    for group in groups:
        interviewer_question = " ".join(group["interviewer"]).replace("Interviewer: ", "")
        interviewee_response = " ".join(group["interviewee"]).replace("Interviewee: ", "")
        
        question_embedding = model.encode(interviewer_question, convert_to_tensor=True, device=device)
        guide_embeddings = model.encode(guide_questions, convert_to_tensor=True, device=device)
        
        similarities = util.cos_sim(question_embedding, guide_embeddings)[0].cpu().numpy()
        best_question_idx = np.argmax(similarities)
        best_similarity = similarities[best_question_idx]
        
        matches.append({
            "response": interviewee_response,
            "question": guide_questions[best_question_idx],
            "similarity": float(best_similarity),
            "is_uncertain": best_similarity < confidence_threshold
        })
    
    return matches

def generate_output(interview_files, matches_list, guide_questions):
    """Generate a structured CSV with one row per interview and columns for guide questions."""
    output_data = []
    for file_path, matches in zip(interview_files, matches_list):
        file_name = os.path.splitext(os.path.basename(file_path))[0]
        row = {"Interview File": file_name}
        
        # Initialize all question columns with empty strings
        for question in guide_questions:
            row[question] = ""
        
        # Populate matched responses
        for match in matches:
            if not match["is_uncertain"]:  # Only include confident matches
                row[match["question"]] = match["response"]
        
        output_data.append(row)
    
    output_df = pd.DataFrame(output_data)
    output_df.to_csv("matched_interviews.csv", index=False)
    print("Output saved to matched_interviews.csv")
    print(output_df[["Interview File"] + guide_questions[:2]])  # Print subset for brevity

# Main Execution
def main(transcript_dir, guidelines_path):
    # Initialize NLP tools
    nlp = spacy.load("en_core_web_sm")
    model = SentenceTransformer('all-MiniLM-L6-v2')
    device = torch.device("cpu")  # Use CPU to avoid MPS issues
    
    # Load guidelines
    guide_questions = load_guidelines(guidelines_path)
    
    # Load all transcript files
    transcript_files = glob(os.path.join(transcript_dir, "*.txt"))
    if not transcript_files:
        raise FileNotFoundError("No transcript files found in directory")
    
    matches_list = []
    for transcript_path in transcript_files:
        # Load and segment transcript
        transcript = load_transcript(transcript_path)
        groups = segment_transcript(transcript, nlp)
        
        # Match responses
        matches = match_responses(groups, guide_questions, model, device)
        matches_list.append(matches)
    
    # Generate output
    generate_output(transcript_files, matches_list, guide_questions)

# if __name__ == "__main__":
#     transcript_dir = "data/synthetic_data/"
#     guidelines_path = "data/synthetic_data/interview_518_guidelines.csv"
#     main(transcript_dir, guidelines_path)

In [9]:
import os

data_directory = "../data/private_data/"
interview_name = "interview_1090"
interviews_directory = data_directory + interview_name + "/"
guidelines_path = data_directory + interview_name + "_guidelines.csv"
main(interviews_directory, guidelines_path)

Output saved to matched_interviews.csv
                         Interview File  \
0  c7d7640b-9344-48aa-9d48-7395eaeda149   
1  387bf5f4-4944-4247-9980-d69983b44a6f   
2  6c8cf423-2a8d-4a67-9133-c1c34e3ee04f   
3  f405a20e-d532-4abc-a197-8099f2270344   
4  7b18e570-043a-4b9d-8e6a-5880c770e96b   
5  fe189a48-e69d-4c97-b004-b25789b1f63d   
6  ea99ab44-c149-4919-9601-7d6c013af9c2   
7  fbfe46f7-24aa-465c-a744-587f472077a7   
8  57b45fe6-c016-4e1d-aef6-d50309d92c17   
9  19ab5410-a614-4a1b-99ca-f15ad467cb54   

  Hey, what’s the biggest news story or issue you’ve heard about lately?  \
0  Um, well, the upcoming election. That’s what e...                       
1  Yeah, actually, there was this thing about soc...                       
2  Yeah, for sure. The thing I keep hearing about...                       
3  I mean, it’s surprising, but also kind of sad,...                       
4  Uh, yeah, there is a election. And if Trump ve...                       
5  Uh, yeah, so, the shootings 