In [5]:
Welcome to LeMo, an app that uses voice memos to fill out online surveys!



In [4]:
pip install pydub


Defaulting to user installation because normal site-packages is not writeable

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m25.0.1[0m[39;49m -> [0m[32;49m25.1.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip3 install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.


In [3]:
import os
from pydub import AudioSegment
import whisper
import pandas as pd

# Step 1: Define folder path explicitly
base_dir = "/Users/christopherbail/Dropbox/LEMO/"
folder_path = os.path.join(base_dir, "memos")

# Step 2: Load Whisper model once
model = whisper.load_model("base")

# Step 3: Prepare storage for results
transcriptions = []

# Step 4: Loop through all files in the "memos" folder
for filename in os.listdir(folder_path):
    if filename.lower().endswith((".m4a", ".mp3", ".wav")):
        file_path = os.path.join(folder_path, filename)

        # Convert audio to WAV (temporary)
        audio = AudioSegment.from_file(file_path)
        temp_wav = os.path.join(base_dir, "temp.wav")  # store temp in base dir
        audio.export(temp_wav, format="wav")

        # Transcribe
        result = model.transcribe(temp_wav)

        # Store results
        transcriptions.append({
            "filename": filename,
            "transcription": result["text"]
        })

        # Clean up temp file
        if os.path.exists(temp_wav):
            os.remove(temp_wav)

# Step 5: Convert to DataFrame
df = pd.DataFrame(transcriptions)

# Example: print or save to CSV

#print full text of transcription
pd.set_option("display.max_colwidth", None)

print(df)
# df.to_csv(os.path.join(base_dir, "transcriptions.csv"), index=False)




         filename  \
0  Lemur_Ln_7.m4a   
1  Lemur_Ln_6.m4a   
2  Lemur_Ln_2.m4a   
3  Lemur_Ln_3.m4a   

                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                       transcription  
0                                            Okay, when

In [11]:
import os, re, json, unicodedata
import pandas as pd
from bs4 import BeautifulSoup
from openai import OpenAI
import getpass

# --------------------------------------------------------------------------------
# SETTINGS (adjust paths as needed)
# --------------------------------------------------------------------------------
SURVEY_HTML_PATH = "/Users/christopherbail/Dropbox/LEMO/Animal Handling Assessment QR.html"
OUTPUT_CSV_PATH  = "/Users/christopherbail/Dropbox/LEMO/survey_responses.csv"

# This script expects df to exist with columns: ["filename", "transcription"]
# Example:
# df = pd.DataFrame([{"filename":"memo1.m4a","transcription":"..."},
#                    {"filename":"memo2.m4a","transcription":"..."}])

# --------------------------------------------------------------------------------
# API key (prompted securely)
# --------------------------------------------------------------------------------
api_key = getpass.getpass("Please enter your OpenAI API key: ").strip()
client = OpenAI(api_key=api_key)

# --------------------------------------------------------------------------------
# Helpers
# --------------------------------------------------------------------------------
def normalize_key(s: str) -> str:
    """Canonicalize a field name: lowercase, strip accents, non-alnum->'_', collapse underscores."""
    if not s:
        return ""
    s = unicodedata.normalize("NFKD", s)
    s = "".join(ch for ch in s if not unicodedata.combining(ch))
    s = s.strip().lower()
    s = "".join(ch if ch.isalnum() else "_" for ch in s)
    s = re.sub(r"_+", "_", s).strip("_")
    return s

def get_label_for_input(tag: BeautifulSoup) -> str:
    """Find a human-friendly label for an input/select/textarea."""
    # 1) <label for="id">
    if tag.has_attr("id"):
        # search anywhere in the document for a label pointing to this id
        doc = tag if tag.parent is None else tag.find_parent()
        while doc and doc.parent:
            doc = doc.parent  # bubble up to root
        if doc:
            lbl = doc.find("label", attrs={"for": tag["id"]})
            if lbl and lbl.get_text(strip=True):
                return lbl.get_text(" ", strip=True)

    # 2) parent-wrapped label
    parent_lbl = tag.find_parent("label")
    if parent_lbl and parent_lbl.get_text(strip=True):
        return parent_lbl.get_text(" ", strip=True)

    # 3) preceding sibling label
    prev = tag.find_previous_sibling("label")
    if prev and prev.get_text(strip=True):
        return prev.get_text(" ", strip=True)

    # 4) nearby heading or prompt text (common in simple forms)
    for prev_tag in tag.find_all_previous(["h1","h2","h3","h4","h5","h6","p","div"], limit=3):
        txt = prev_tag.get_text(" ", strip=True)
        if txt and len(txt) <= 200:
            return txt

    # 5) fallbacks from attributes
    for attr in ("aria-label", "placeholder", "name", "id"):
        if tag.has_attr(attr) and tag[attr]:
            return str(tag[attr])

    return "unknown_field"

def option_text_for_input(opt_tag, doc_root) -> str:
    """Get display text for an option (radio/checkbox/select)."""
    # If it's a <option>, text is inside
    if opt_tag.name == "option":
        return opt_tag.get_text(" ", strip=True) or (opt_tag.get("value") or "").strip()

    # For radio/checkbox, label often uses <label for="id">
    if opt_tag.has_attr("id"):
        lbl = doc_root.find("label", attrs={"for": opt_tag["id"]})
        if lbl and lbl.get_text(strip=True):
            return lbl.get_text(" ", strip=True)

    # Fallbacks
    return (opt_tag.get("value") or "").strip()

def parse_survey_questions(html_text: str):
    """
    Extract a canonical list of questions from the HTML.
    Returns a list of dicts with:
      - key: stable column key
      - question: human-friendly question text
      - input_type: 'text' | 'textarea' | 'select' | 'radio' | 'checkbox' | other
      - options: list of option strings (for select/radio/checkbox), else []
    Groups radio/checkbox inputs by their 'name' (so one question per group).
    """
    soup = BeautifulSoup(html_text, "html.parser")

    # doc_root to help find labels by id across document
    doc_root = soup
    while doc_root.parent:
        doc_root = doc_root.parent

    questions = []
    seen_keys = set()

    # First, collect all controls
    controls = soup.find_all(["input", "select", "textarea"])

    # Group radio/checkbox by name
    groups = {}
    for tag in controls:
        t = (tag.get("type") or "").lower()
        if tag.name == "input" and t in {"submit", "button", "reset", "image", "hidden"}:
            continue

        name = tag.get("name")
        # If radio/checkbox with a name, group them
        if tag.name == "input" and t in {"radio", "checkbox"} and name:
            groups.setdefault(("group", name), []).append(tag)
        else:
            groups[("single", id(tag))] = [tag]

    for kind, members in groups.items():
        tag = members[0]
        t = (tag.get("type") or "").lower()
        name = tag.get("name") or tag.get("id") or ""

        # Determine input_type + question text
        if kind[0] == "group":
            input_type = "radio" if t == "radio" else "checkbox"
            question_text = get_label_for_input(tag)
            key_base = normalize_key(name) or normalize_key(question_text)
            # options from each member
            opts = []
            for m in members:
                opt_label = option_text_for_input(m, doc_root)
                if opt_label:
                    opts.append(opt_label)
            # uniquify key
            key = key_base
            i = 2
            while key in seen_keys:
                key = f"{key_base}_{i}"
                i += 1
        else:
            # single control (text/select/textarea/other)
            if tag.name == "select":
                input_type = "select"
            elif tag.name == "textarea":
                input_type = "textarea"
            elif tag.name == "input":
                if t in {"text","number","email","date","time","tel","url"}:
                    input_type = "text"
                else:
                    input_type = t or "text"
            else:
                input_type = "text"

            question_text = get_label_for_input(tag)
            key_base = normalize_key(name) or normalize_key(question_text)
            key = key_base or "field"
            i = 2
            while key in seen_keys:
                key = f"{key_base}_{i}"
                i += 1

            # options for select only
            opts = []
            if tag.name == "select":
                for opt in tag.find_all("option"):
                    txt = option_text_for_input(opt, doc_root)
                    if txt:
                        opts.append(txt)

        seen_keys.add(key)
        questions.append({
            "key": key,
            "question": question_text,
            "input_type": input_type,
            "options": opts
        })

    # Sort for stability
    questions.sort(key=lambda d: d["key"])
    return questions

def build_prompt(transcription: str, questions: list) -> str:
    """
    Build a strict instruction prompt. The model must return EXACTLY one JSON object
    with keys equal to the 'key' values below. If a value is not present in the
    transcription, return "NA". If options are provided, prefer choosing from them.
    """
    # Prepare a minimal schema description to guide the model
    lines = []
    for q in questions:
        if q["options"]:
            lines.append(f"- {q['key']}: {q['question']} (choose from: {', '.join(q['options'])}; if multiple, join with ';')")
        else:
            lines.append(f"- {q['key']}: {q['question']}")
    schema_hint = "\n".join(lines)
    allowed_keys = [q["key"] for q in questions]

    prompt = (
        "You are an expert annotator. Fill out a fixed survey based ONLY on the transcription.\n\n"
        "RULES:\n"
        "1) Output EXACTLY ONE JSON object.\n"
        "2) Use ONLY the keys listed in allowed_keys. Do not invent, rename, or omit keys.\n"
        "3) If a value is missing in the transcription, return \"NA\".\n"
        "4) Return strings for all values. For multi-select, join choices with semicolons.\n"
        "5) Prefer concise, direct answers.\n\n"
        f"allowed_keys = {json.dumps(allowed_keys, ensure_ascii=False)}\n\n"
        "key → question (and any options):\n"
        f"{schema_hint}\n\n"
        "TRANSCRIPTION:\n"
        f"'''{transcription}'''\n"
    )
    return prompt

# --------------------------------------------------------------------------------
# Build canonical questions from the survey HTML
# --------------------------------------------------------------------------------
with open(SURVEY_HTML_PATH, "r", encoding="utf-8") as f:
    html_text = f.read()

questions = parse_survey_questions(html_text)
if not questions:
    raise RuntimeError("No questions found in the survey HTML.")

print(f"Found {len(questions)} questions.")
print("First few keys:", [q["key"] for q in questions[:10]])

# --------------------------------------------------------------------------------
# Ask the model to answer all questions for each transcription
# --------------------------------------------------------------------------------
rows = []
for _, r in df.iterrows():
    transcription = r["transcription"]
    filename = r["filename"]

    prompt = build_prompt(transcription, questions)

    resp = client.chat.completions.create(
        model="gpt-4o",
        temperature=0,
        response_format={"type": "json_object"},
        messages=[
            {"role": "system", "content": "You return a single JSON object with exactly the allowed keys."},
            {"role": "user", "content": prompt}
        ],
    )

    content = resp.choices[0].message.content.strip()

    # Parse JSON (the response_format should ensure valid JSON)
    try:
        obj = json.loads(content)
    except json.JSONDecodeError:
        # last-ditch: try to extract {...}
        m = re.search(r"\{.*\}", content, flags=re.DOTALL)
        if not m:
            raise
        obj = json.loads(m.group(0))

    # Coerce to the canonical schema
    row = {"filename": filename}
    for q in questions:
        key = q["key"]
        val = obj.get(key, "NA")
        if val is None or (isinstance(val, str) and not val.strip()):
            val = "NA"
        if isinstance(val, list):
            val = ";".join(str(x) for x in val)
        else:
            val = str(val)
        row[key] = val

    rows.append(row)

# --------------------------------------------------------------------------------
# Save final CSV
# --------------------------------------------------------------------------------
final_cols = ["filename"] + [q["key"] for q in questions]
final_df = pd.DataFrame(rows, columns=final_cols)
final_df.to_csv(OUTPUT_CSV_PATH, index=False)

print(f"✅ Wrote {len(final_df)} rows to {OUTPUT_CSV_PATH}")
print(f"Columns: {len(final_df.columns)}")
print(final_df.columns.tolist()[:12])


Please enter your OpenAI API key:  ········


Found 22 questions.
First few keys: ['animal_name', 'capture_notes_v2v2', 'catch_categoryv2', 'chkn_handling_hubandry_v2v2', 'chkn_qrbehavior', 'end_v2v2', 'handling_categoryv2', 'handling_date_v2v2', 'handling_hubandry_other_v2v2', 'handling_notes_v2v2']
✅ Wrote 4 rows to /Users/christopherbail/Dropbox/LEMO/survey_responses.csv
Columns: 23
['filename', 'animal_name', 'capture_notes_v2v2', 'catch_categoryv2', 'chkn_handling_hubandry_v2v2', 'chkn_qrbehavior', 'end_v2v2', 'handling_categoryv2', 'handling_date_v2v2', 'handling_hubandry_other_v2v2', 'handling_notes_v2v2', 'handling_recorder_v2v2']
