In [None]:
"""
AF3 (Audio Flamingo 3) SFX captioning - 3 modes:
 A) keyword -> caption (two-pass; keywords included in result)
 B) direct caption (audio -> caption)
 C) in-prompt keywords (prompt says: extract keywords then caption; code doesn't pass keywords)
"""

import os
import re
import time
import json
import random
import argparse
from typing import List, Dict, Any, Optional, Tuple

import pandas as pd
from tqdm import tqdm
from dotenv import load_dotenv, find_dotenv
import replicate

In [67]:
# =============================
# 0) Environment / constants
# =============================
# Load variables from the nearest .env file, then require the Replicate token.
load_dotenv(find_dotenv())
REPLICATE_API_TOKEN = os.getenv("REPLICATE_API_TOKEN")
if not REPLICATE_API_TOKEN:
    raise RuntimeError("No valid REPLICATE_API_TOKEN (.env 또는 환경변수 설정)")

# Model version (replace with the latest hash when needed)
AF3_MODEL = "zsxkib/audio-flamingo-3:419bdd5ed04ba4e4609e66cc5082f6564e9d2c0836f9a286abe74bc20a357b84"

# Call / retry settings
MAX_RETRIES = 5
BASE_BACKOFF = 1.5
SLEEP_BETWEEN_CALLS = 0.2  # adjust according to billing / rate-limit conditions
# On a free / unpaid account, temporarily raising it is also an option:
# SLEEP_BETWEEN_CALLS = 10.0

AUDIO_EXTS = {".wav", ".mp3", ".flac", ".m4a", ".ogg", ".aac", ".wma", ".webm"}

In [86]:
# =============================
# 1) Prompt templates
# =============================
# 1-1) First-pass keywords (output: only {"keywords":[...]})
PROMPT_KEYWORDS = """You are an SFX keyword extractor.
Return ONLY this JSON object and nothing else (no code fences, no extra text):

{
  "keywords": ["<k1>", "<k2>", "..."]
}

Requirements:
- Output length: 8–15 items total (not fewer than 8, not more than 15).
- All items must be UNIQUE after normalization and lemmatization:
  - Lowercase, trimmed, no punctuation.
  - Remove duplicates and near-duplicates (e.g., "rain", "rainfall", "raindrops" → keep ONE best term).
  - Collapse morphological variants and inflections (e.g., "thunderclap"/"thunderclaps" → "thunderclap").
  - Prefer the most generic or acoustically representative parent term for a family of variants.
- Limit per family: do not output more than 2–3 terms from the same semantic family/root
  (e.g., rain-family, thunder-family, wind-family, water-family, etc.).
- Use concise generic nouns or bi-grams strictly grounded in the audio; prefer lowercase.
- No brands, proper names, quoted phrases, narrative text, or invented causes.
- Do NOT return a single comma-joined string; return a JSON array of strings.
- Before returning, validate:
  1) length is within 8–15 words;
  2) all terms are unique after normalization;
  3) no excessive repetition from the same family/root.

Output must be strictly valid JSON containing only the key "keywords".

"""

# 1-3) Second-pass caption (keywords inserted into the prompt) - mode A
# The literal "{kw_list}" placeholder is substituted with str.replace, not str.format,
# so the JSON braces in the template need no escaping.
# NOTE(review): the "(no code fences, no extra text ...)" parenthetical appears twice in a
# row in the return instruction below — looks like a leftover; confirm before editing the prompt.
PROMPT_CAPTION_WITH_KW = """
You are “AF3 SFX Analyst.” 

Caption rules (MANDATORY):
- Exactly ONE sentence, 8–20 words.
- Present tense, objective, neutral; no narrative or speculation.
- Prefer starting with “A”, “An”, or “The”.
- Describe the foreground SFX source/action, then the environment (prepositional phrase),
  optionally a brief background clause (“with/while …”).
- Avoid “the sound(s) of …”, proper names, brands, and quotes.
- The sentence must end with a literal period “.” (do NOT write the word “end”).
- Focus on SFX traits: envelope (attack/decay), transient strength, texture/material,
  pitch motion, stereo/space, loopability, doppler, layering.

Anti-bias caption rule:
- Do not copy example keywords verbatim; prefer precise synonyms or enriched variants.

SFX keywords (from previous pass — provided input):
- {kw_list}

Using the provided SFX keywords, analyze the audio and return EXACTLY this JSON object and nothing else (no code fences, no extra text):
(no code fences, no extra text, no markers like end/[end]):
{
  "caption": "<ONE sentence, 8–20 words, present tense, article-first if natural, include environment phrase, end with a period; no 'sound(s) of', no names>"
}

Guidelines:
- Describe only what is heard (audio evidence), not unseen causes.
- Be concise, objective, and neutral; avoid brands and quotations.
- Output must be strictly valid JSON containing only the key “caption”.
"""

# 1-4) Direct caption generation - mode B (output: caption only)
PROMPT_CAPTION_DIRECT = """
You are “AF3 SFX Analyst.”

Caption rules (MANDATORY):
- Exactly ONE sentence, 8–20 words.
- Present tense, objective, neutral; no narrative or speculation.
- Prefer starting with “A”, “An”, or “The”.
- Describe the foreground SFX source/action, then the environment (prepositional phrase),
  optionally a brief background clause (“with/while …”).
- Avoid “the sound(s) of …”, proper names, brands, and quotes.
- The sentence must end with a literal period “.” (do NOT write the word “end”).
- Focus on SFX traits: envelope (attack/decay), transient strength, texture/material,
  pitch motion, stereo/space, loopability, doppler, layering.

Anti-bias rule:
- Do not copy any example terms verbatim; prefer precise synonyms or enriched variants.

Return EXACTLY one JSON object and nothing else
(no code fences, no extra text, no markers like end/[end]):
{
  "caption": "<ONE sentence, 8–20 words, present tense, article-first if natural, include environment phrase, end with a period; no 'sound(s) of', no names>"
}

Guidelines:
- Describe only what is heard (audio evidence), not unseen causes.
- Be concise, objective, and neutral; avoid brands and quotations.
- Output must be strictly valid JSON containing only the key “caption”.
"""

# 1-5) Keywords derived inside the prompt, then caption - mode C
# (the code never handles keywords here; it only receives the caption)
# NOTE(review): the workflow heading below contains the typo "requeired" ("required");
# it is part of the text sent to the model, so fix it deliberately if at all.
PROMPT_CAPTION_IN_PROMPT_KW = """
You are “AF3 SFX Analyst.”


MANDATORY INTERNAL WORKFLOW (do not reveal and must requeired):
1) Silently derive 8–15 SFX keywords strictly grounded in the audio
   (e.g., source, event, material/texture, motion, environment, processing).
2) Use those internal keywords to guide your analysis and word choice for the final caption.
3) Do NOT output or mention the keywords; they are for internal reasoning only.

Caption rules (MANDATORY):
- Exactly ONE sentence, 8–20 words.
- Present tense, objective, neutral; no narrative or speculation.
- Prefer starting with “A”, “An”, or “The”.
- Describe the foreground SFX source/action, then the environment (prepositional phrase),
  optionally a brief background clause (“with/while …”).
- Avoid “the sound(s) of …”, proper names, brands, and quotes.
- The sentence must end with a literal period “.” (do NOT write the word “end”).
- Focus on SFX traits: envelope (attack/decay), transient strength, texture/material,
  pitch motion, stereo/space, loopability, doppler, layering.

Anti-bias rule:
- Do not copy any example terms verbatim; prefer precise synonyms or enriched variants.

Return EXACTLY one JSON object and nothing else
(no code fences, no extra text, no markers like end/[end]):
{
  "caption": "<ONE sentence, 8–20 words, present tense, article-first if natural, include environment phrase, end with a period; no 'sound(s) of', no names>"
}

Guidelines:
- Describe only what is heard (audio evidence), not unseen causes.
- Be concise, objective, and neutral; avoid brands and quotations.
- Output must be strictly valid JSON containing only the key “caption”.

"""


In [87]:
# =============================
# 2) Utilities
# =============================

def is_url(s: str) -> bool:
    """Return True when *s* is a string that starts with http:// or https:// (any letter case)."""
    if not isinstance(s, str):
        return False
    lowered = s.lower()
    return lowered.startswith("http://") or lowered.startswith("https://")

def coerce_json(text: Any) -> Any:
    """Try to decode the JSON object embedded in a string response.

    The span from the first '{' to the last '}' is parsed as JSON. The input is
    returned unchanged when it is not a string, when no such span exists, or
    when the span is not valid JSON.
    """
    if not isinstance(text, str):
        return text

    start = text.find("{")
    end = text.rfind("}")
    # A missing brace (-1) or a closing brace before the opening one means no usable span.
    if start < 0 or end < start:
        return text

    candidate = text[start:end + 1]
    try:
        decoded = json.loads(candidate)
    except Exception:
        return text
    return decoded



def list_audio_files(root: str, recursive: bool = True,
                     exts=(".wav",".mp3",".flac",".m4a",".ogg",".aac",".wma",".webm")) -> List[str]:
    """Collect audio file paths under *root*, sorted by path.

    Args:
        root: Directory to scan.
        recursive: Walk sub-directories when True; otherwise look only at
            the direct children of *root*.
        exts: File extensions (with leading dot) to accept; compared
            case-insensitively.

    Returns:
        Sorted list of matching paths, each joined onto its containing directory.
    """
    wanted = {ext.lower() for ext in exts}

    def _has_wanted_ext(name: str) -> bool:
        return os.path.splitext(name)[1].lower() in wanted

    if recursive:
        found = [
            os.path.join(folder, name)
            for folder, _, names in os.walk(root)
            for name in names
            if _has_wanted_ext(name)
        ]
    else:
        found = []
        for name in os.listdir(root):
            candidate = os.path.join(root, name)
            if os.path.isfile(candidate) and _has_wanted_ext(name):
                found.append(candidate)

    return sorted(found)

In [88]:
# =============================
# 3) Replicate call (with retries)
# =============================
def af3_run(
    audio_input: str,
    prompt: str,
    system_prompt: str = "",
    temperature: float = 0.0,
    max_length: int = 512
) -> Dict[str, Any]:
    """Run the AF3 model on one audio input via Replicate, retrying on errors.

    Args:
        audio_input: An http(s) URL, or a path to a local audio file (opened
            in binary mode and passed as the "audio" input).
        prompt: Prompt text sent to the model.
        system_prompt: System prompt sent to the model.
        temperature: Sampling temperature sent to the model.
        max_length: Value of the model's "max_length" input.

    Returns:
        A dict that always has the same keys:
        - "success": True when a call returned, False otherwise.
        - "caption": the model output (stripped when it is a string; it is
          NOT JSON-decoded here), or None on failure.
        - "output": the unmodified model output, or None on failure.
        - "raw": always None on failure (kept for backward compatibility).
        - "error": None on success, otherwise the error message.

    This function does not raise for call failures; they are reported through
    the returned dict.
    """
    def _failure(message: str) -> Dict[str, Any]:
        # Single place that defines the failure shape so every path agrees.
        return {"success": False, "caption": None, "output": None, "raw": None, "error": message}

    local_file = not is_url(audio_input)
    if local_file:
        path_like = isinstance(audio_input, (str, bytes, os.PathLike))
        # A missing file cannot be fixed by retrying: fail immediately instead
        # of sleeping through every backoff step.
        if not path_like or not os.path.exists(audio_input):
            return _failure(f"Audio not found: {audio_input}")

    last_err: Optional[Exception] = None

    for attempt in range(1, MAX_RETRIES + 1):
        payload: Dict[str, Any] = {
            "prompt": prompt,
            "max_length": max_length,
            "temperature": temperature,
            "system_prompt": system_prompt,
            "enable_thinking": False,
        }
        try:
            if local_file:
                # Keep the handle open for the whole call; it is the upload source.
                with open(audio_input, "rb") as f:
                    payload["audio"] = f
                    output = replicate.run(AF3_MODEL, input=payload)
            else:
                payload["audio"] = audio_input
                output = replicate.run(AF3_MODEL, input=payload)
        except Exception as e:  # boundary to a remote service: report, never crash the batch
            msg = str(e)
            # 402: out of credit -> retrying cannot help, fail immediately.
            if "status: 402" in msg or "Insufficient credit" in msg:
                return _failure(msg)

            last_err = e
            if attempt == MAX_RETRIES:
                # No further attempt follows, so waiting would only waste time.
                break

            # 429: rate limited -> fixed wait; anything else -> exponential backoff with jitter.
            if "status: 429" in msg or "throttled" in msg or "rate limit" in msg.lower():
                time.sleep(10.0)
            else:
                time.sleep((BASE_BACKOFF ** (attempt - 1)) + random.uniform(0, 0.3))
            continue

        caption = output.strip() if isinstance(output, str) else output
        return {"success": True, "caption": caption, "output": output, "error": None}

    return _failure(str(last_err))


In [92]:
# =============================
# 4) Implementation of the three modes
# =============================
def _keywords_from_payload(payload: Any) -> Optional[List[str]]:
    """Return the raw keyword list held by a decoded payload, or None if it has no usable shape."""
    if isinstance(payload, dict):
        payload = payload.get("keywords")
        if isinstance(payload, str):
            # The model sometimes returns one comma-joined string under
            # "keywords"; hand it on as a single item so it is split later.
            return [payload]
    if isinstance(payload, list):
        return [str(x) for x in payload]
    return None


def _keywords_from_text(text: str) -> List[str]:
    """Recover a raw keyword list from a string response (JSON first, then loose fallbacks)."""
    txt = text.strip()
    parsed = None
    try:
        parsed = json.loads(txt)
    except ValueError:
        # Not pure JSON: retry on the first [...] or {...} block inside the text.
        m = re.search(r"\[.*?\]", txt, flags=re.S) or re.search(r"\{.*?\}", txt, flags=re.S)
        if m:
            try:
                parsed = json.loads(m.group(0))
            except ValueError:
                parsed = None

    found = _keywords_from_payload(parsed)
    if found is not None:
        return found

    # Last resort: treat the text as a comma-joined list.
    s = txt.strip("[]")
    if not s:
        return []
    return [p.strip().strip('"').strip("'") for p in s.split(",") if p.strip()]


def _dedupe_keywords(raw_list: List[str]) -> List[str]:
    """Split comma-joined items, normalize case/whitespace and drop exact duplicates (order kept)."""
    items: List[str] = []
    for it in raw_list:
        if "," in it:
            items += [p.strip() for p in it.split(",") if p.strip()]
        else:
            items.append(it.strip())

    seen = set()
    uniq: List[str] = []
    for k in items:
        norm = re.sub(r"\s+", " ", k.lower().strip())
        if not norm or norm in seen:
            continue
        seen.add(norm)
        uniq.append(norm)
    return uniq


def extract_keywords(audio_input: str) -> Tuple[bool, List[str], Optional[str]]:
    """Run the keyword prompt on one audio input and return the cleaned keywords.

    Only exactly identical keywords (after lowercasing and whitespace
    normalization) are removed; no stemming or near-duplicate merging is done.

    Returns:
        (ok, keywords, error): ``ok`` is False and ``error`` carries the
        message when the model call failed; otherwise ``keywords`` is the
        de-duplicated list (possibly empty) and ``error`` is None.
    """
    res = af3_run(audio_input, prompt=PROMPT_KEYWORDS, temperature=0.0)
    if not res["success"]:
        return False, [], res["error"]

    data = res["output"]

    # 1) Parse the keyword list (dict / list / string responses).
    if isinstance(data, str):
        raw_list = _keywords_from_text(data)
    else:
        raw_list = _keywords_from_payload(data) or []

    # 2) + 3) Split comma-joined items, normalize, remove exact duplicates.
    return True, _dedupe_keywords(raw_list), None


def caption_with_keywords(audio_input: str, keywords: List[str]) -> Dict[str, Any]:
    """Mode A, step 2: caption the audio with the extracted keywords inserted in the prompt.

    At most the first 15 keywords are used; "(none)" is inserted when the
    list is empty.

    Returns:
        The ``af3_run`` result on success. On failure, a dict with
        ``success`` False, ``caption`` and ``output`` set to False (the
        placeholder value this function has always used) and ``error`` set
        to the message reported by ``af3_run``.
    """
    kw_list = ", ".join(keywords[:15]) if keywords else "(none)"

    # str.replace (not str.format) because the template contains literal JSON braces.
    prompt = PROMPT_CAPTION_WITH_KW.replace("{kw_list}", kw_list)

    res = af3_run(audio_input, prompt=prompt, system_prompt="", temperature=0.0)
    if not res["success"]:
        # Report the failure honestly instead of masking it as a success
        # with no error message.
        return {"success": False, "caption": False, "output": False, "error": res.get("error")}

    return res



def caption_direct(audio_input: str) -> Dict[str, Any]:
    """Mode B: caption the audio directly (caption only, no keyword pass).

    Returns:
        The ``af3_run`` result on success. On failure, a dict with
        ``success`` False, ``caption`` and ``output`` set to False (the
        placeholder value this function has always used) and ``error`` set
        to the message reported by ``af3_run``.
    """
    res = af3_run(audio_input, prompt=PROMPT_CAPTION_DIRECT, system_prompt="", temperature=0.0)
    if not res["success"]:
        # Report the failure honestly instead of masking it as a success
        # with no error message.
        return {"success": False, "caption": False, "output": False, "error": res.get("error")}

    return res


def caption_in_prompt_keywords(audio_input: str) -> Dict[str, Any]:
    """Mode C: the prompt asks the model to derive keywords internally; only the caption is received.

    Returns:
        The ``af3_run`` result on success. On failure, a dict with
        ``success`` False, ``caption`` and ``output`` set to False (the
        placeholder value this function has always used) and ``error`` set
        to the message reported by ``af3_run``.
    """
    res = af3_run(audio_input, prompt=PROMPT_CAPTION_IN_PROMPT_KW, system_prompt="", temperature=0.0)
    if not res["success"]:
        # Report the failure honestly instead of masking it as a success
        # with no error message.
        return {"success": False, "caption": False, "output": False, "error": res.get("error")}

    return res

In [94]:
# =============================
# 5) Directory batch
# =============================
def run_dir(
    audio_dir: str,
    out_xlsx_A: str = "af3_A_kw_to_cap.xlsx",
    out_xlsx_B: str = "af3_B_direct_cap.xlsx",
    out_xlsx_C: str = "af3_C_inprompt_cap.xlsx",
    limit: Optional[int] = None
) -> Tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame]:
    """Caption every audio file under *audio_dir* with all three modes and save one Excel file per mode.

    Args:
        audio_dir: Directory scanned recursively for audio files.
        out_xlsx_A: Output workbook for mode A (keywords -> caption).
        out_xlsx_B: Output workbook for mode B (direct caption).
        out_xlsx_C: Output workbook for mode C (in-prompt keywords).
        limit: Process only the first ``limit`` files (in sorted path order);
            None means no limit, 0 means no files.

    Returns:
        (df_A, df_B, df_C), each with the columns "file" (base name only)
        and "caption".
    """
    files = list_audio_files(audio_dir, recursive=True)
    # Compare against None explicitly so that limit=0 is honored rather than
    # being treated as "no limit".
    if limit is not None:
        files = files[:limit]

    def _caption_of(res: Dict[str, Any]) -> Any:
        # Successful results carry the text under "caption"; otherwise fall
        # back to whatever the result holds under "output".
        return res.get("caption") if res.get("success") else res.get("output")

    rows_a, rows_b, rows_c = [], [], []

    for p in tqdm(files, desc="AF3 - 3 modes (A:kw->cap, B:direct, C:prompt-kw)", ncols=100):
        file_name = os.path.basename(p)

        # ---- Mode A: keywords -> caption
        A_caption = None
        try:
            ok_kw, kws, _ = extract_keywords(p)
        except Exception:
            # Best effort: one bad file must not abort the whole batch.
            ok_kw, kws = False, []

        if ok_kw:
            A_caption = _caption_of(caption_with_keywords(p, kws))
        rows_a.append({"file": file_name, "caption": A_caption})

        time.sleep(SLEEP_BETWEEN_CALLS)

        # ---- Mode B: direct caption
        rows_b.append({"file": file_name, "caption": _caption_of(caption_direct(p))})

        time.sleep(SLEEP_BETWEEN_CALLS)

        # ---- Mode C: in-prompt keywords, then caption
        rows_c.append({"file": file_name, "caption": _caption_of(caption_in_prompt_keywords(p))})

        time.sleep(SLEEP_BETWEEN_CALLS)

    # One DataFrame per mode: file name + caption only.
    df_A = pd.DataFrame(rows_a, columns=["file", "caption"])
    df_B = pd.DataFrame(rows_b, columns=["file", "caption"])
    df_C = pd.DataFrame(rows_c, columns=["file", "caption"])

    # Save each mode to its own workbook.
    for frame, out_path, sheet in (
        (df_A, out_xlsx_A, "A_kw_to_cap"),
        (df_B, out_xlsx_B, "B_direct"),
        (df_C, out_xlsx_C, "C_inprompt"),
    ):
        with pd.ExcelWriter(out_path, engine="openpyxl") as w:
            frame.to_excel(w, index=False, sheet_name=sheet)

    return df_A, df_B, df_C


In [None]:
# =============================
# 6) Entry point
# =============================

def main():
    """Caption the first 500 audio files of the test folder with all three modes."""
    # Change this to the directory where your audio files are stored.
    audio_root = r"D:\sogang\AF3_captioning\Clotho_data\test"

    run_dir(
        audio_dir=audio_root,
        out_xlsx_A="af3_A_kw_to_cap.xlsx",
        out_xlsx_B="af3_B_direct_cap.xlsx",
        out_xlsx_C="af3_C_inprompt_cap.xlsx",
        limit=500,
    )
    print("Done")


if __name__ == "__main__":
    main()


AF3 - 3 modes (A:kw->cap, B:direct, C:prompt-kw):   0%|                     | 0/102 [00:00<?, ?it/s]

In [16]:
import os
p = r"D:\sogang\AF3_captioning\Clotho_data_development"
print(os.path.isdir(p))        # should print True when the directory exists
print(os.listdir(r"D:\sogang"))  # list the parent folder's contents


False
['AF3_captioning', 'computer_vision_ver']


생성한 각 excel파일에 GT 추가하는 코드

In [98]:
import os
import re
import random
import pandas as pd

def _normalize_basename(p: str) -> str:
    """경로 → 파일명(확장자 포함), 소문자/공백정리"""
    b = os.path.basename(str(p)).strip()
    return re.sub(r"\s+", " ", b.lower())

def _detect_file_col(df: pd.DataFrame):
    candidates = ["file_name", "file", "audio_file", "fname", "filename"]
    for c in candidates:
        if c in df.columns:
            return c
    return df.columns[0]

def _extract_captions_columns(df: pd.DataFrame):
    """caption 열(예: caption, caption_1..5)을 최대한 자동으로 찾는다."""
    caption_cols = [c for c in df.columns if re.match(r"^caption(_\d+)?$", str(c), flags=re.I)]
    if caption_cols:
        return caption_cols
    # 흔한 변형
    return [c for c in df.columns if "caption" in str(c).lower()]

def _build_gt_map_random_one(clotho_csv_path: str, seed: int | None = None) -> dict:
    """Read a Clotho captions CSV and map each file name to one randomly chosen caption.

    - When captions are spread over several columns, each row contributes
      its non-empty candidates.
    - When the same file name appears on several rows, the candidates are
      merged and one is picked at random.

    Args:
        clotho_csv_path: Path of the captions CSV.
        seed: Seed for a reproducible choice. None draws from the module-level
            ``random`` generator without reseeding it.

    Returns:
        Dict of normalized file name -> caption string ("" when a file has no
        usable caption); an empty dict when no caption column is found.
    """
    # A private generator gives the same picks as seeding the module-level
    # one, without clobbering the global random state of the caller.
    rng = random.Random(seed) if seed is not None else random

    def _clean(values) -> list:
        # Keep non-empty captions; "nan"/"none" are what str() makes of missing cells.
        cleaned = []
        for v in values:
            s = str(v).strip()
            if s and s.lower() not in ("nan", "none"):
                cleaned.append(s)
        return cleaned

    def _pick_one(candidates) -> str:
        return rng.choice(candidates) if candidates else ""

    gt = pd.read_csv(clotho_csv_path)
    file_col = _detect_file_col(gt)
    gt["_file_norm"] = gt[file_col].astype(str).map(_normalize_basename)

    # Look for caption columns.
    caps_cols = _extract_captions_columns(gt)

    if caps_cols:  # multi-column layout such as caption_1..caption_5
        gt["_caps_list"] = gt.apply(lambda row: _clean(row[c] for c in caps_cols), axis=1)
        # Merge every candidate that belongs to the same file name.
        agg = gt.groupby("_file_norm")["_caps_list"].sum().reset_index()
        # Pick one candidate per file.
        agg["clotho_gt"] = agg["_caps_list"].apply(_pick_one)
        return dict(zip(agg["_file_norm"], agg["clotho_gt"]))

    # Single-column layout under another name.
    for candidate in ["caption", "text", "transcript", "reference"]:
        if candidate in gt.columns:
            cap_col = candidate
            break
    else:
        return {}

    grouped = gt.groupby("_file_norm")[cap_col].apply(list).reset_index()
    # Apply the same missing-value filter as the multi-column path so that a
    # literal "nan" is never returned as a caption.
    grouped["clotho_gt"] = grouped[cap_col].apply(lambda xs: _pick_one(_clean(xs)))

    return dict(zip(grouped["_file_norm"], grouped["clotho_gt"]))

def append_random_gt_to_excels(
    clotho_csv_path: str,
    excel_A_path: str,
    excel_B_path: str,
    excel_C_path: str,
    out_A_path: str | None = None,
    out_B_path: str | None = None,
    out_C_path: str | None = None,
    filename_col_in_excel: str = "file",
    seed: int | None = 42,   # fixed seed for reproducible picks; None for non-reproducible
):
    """Add one randomly chosen ground-truth caption to each mode's Excel file.

    Each workbook (A/B/C) is loaded, matched to the GT captions by file name,
    and saved with a single extra column ``clotho_gt`` ("" when no match).
    Final columns: <filename column>, caption, clotho_gt.

    Args:
        clotho_csv_path: Path of the Clotho captions CSV.
        excel_A_path, excel_B_path, excel_C_path: Input workbooks.
        out_A_path, out_B_path, out_C_path: Output workbooks; when omitted,
            the input path with its extension replaced by
            ``_with_random_gt.xlsx`` is used.
        filename_col_in_excel: Name of the file-name column in the workbooks.
        seed: Seed passed on for the random caption choice.

    Returns:
        (out_A_path, out_B_path, out_C_path) as actually written.

    Raises:
        KeyError: When a workbook lacks the file-name or "caption" column.
    """
    gt_map = _build_gt_map_random_one(clotho_csv_path, seed=seed)

    def _default_out(in_path: str) -> str:
        # Build the name from the path stem rather than with str.replace: a
        # replace that finds no ".xlsx" would return the input path and make
        # the function overwrite its own input.
        stem, _ = os.path.splitext(in_path)
        return f"{stem}_with_random_gt.xlsx"

    def _process_one(in_path: str, out_path: str):
        df = pd.read_excel(in_path)
        if filename_col_in_excel not in df.columns:
            raise KeyError(f"{in_path}: '{filename_col_in_excel}' 컬럼을 찾을 수 없습니다.")
        if "caption" not in df.columns:
            raise KeyError(f"{in_path}: 'caption' 컬럼을 찾을 수 없습니다.")

        df["_file_norm"] = df[filename_col_in_excel].astype(str).map(_normalize_basename)
        df["clotho_gt"] = df["_file_norm"].map(gt_map).fillna("")

        final_df = df[[filename_col_in_excel, "caption", "clotho_gt"]].copy()
        with pd.ExcelWriter(out_path, engine="openpyxl") as w:
            final_df.to_excel(w, index=False, sheet_name="with_random_gt")

    out_A_path = out_A_path or _default_out(excel_A_path)
    out_B_path = out_B_path or _default_out(excel_B_path)
    out_C_path = out_C_path or _default_out(excel_C_path)

    _process_one(excel_A_path, out_A_path)
    _process_one(excel_B_path, out_B_path)
    _process_one(excel_C_path, out_C_path)

    return out_A_path, out_B_path, out_C_path





# User paths (example)
CLOTHO_CSV = r"D:\sogang\AF3_captioning\Clotho_data\clotho_captions_development.csv"

EXCEL_A = r"D:\sogang\AF3_captioning\af3_A_kw_to_cap.xlsx"
EXCEL_B = r"D:\sogang\AF3_captioning\af3_B_direct_cap.xlsx"
EXCEL_C = r"D:\sogang\AF3_captioning\af3_C_inprompt_cap.xlsx"

append_random_gt_to_excels(
    clotho_csv_path=CLOTHO_CSV,
    excel_A_path=EXCEL_A,
    excel_B_path=EXCEL_B,
    excel_C_path=EXCEL_C,
    # Optional: when no output path is given, results are saved as *_with_random_gt.xlsx.
    # out_A_path="af3_A_kw_to_cap_with_gt.xlsx",
    # out_B_path="af3_B_direct_cap_with_gt.xlsx",
    # out_C_path="af3_C_inprompt_cap_with_gt.xlsx",
     seed=42,
)

('D:\\sogang\\AF3_captioning\\af3_A_kw_to_cap_with_random_gt.xlsx',
 'D:\\sogang\\AF3_captioning\\af3_B_direct_cap_with_random_gt.xlsx',
 'D:\\sogang\\AF3_captioning\\af3_C_inprompt_cap_with_random_gt.xlsx')