In [3]:
!pip install -q "transformers>=4.43.0" accelerate sentencepiece librosa soundfile scikit-learn seaborn


In [4]:
from google.colab import drive
drive.mount('/content/drive')

import os

BASE_DIR      = "/content/drive/MyDrive/adsp/downloads"
ESD_ROOT      = os.path.join(BASE_DIR, "esd")
IEMOCAP_ROOT  = os.path.join(BASE_DIR, "iemocap")

print("ESD root     :", ESD_ROOT, " ->", os.path.exists(ESD_ROOT))
print("IEMOCAP root :", IEMOCAP_ROOT, " ->", os.path.exists(IEMOCAP_ROOT))


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
ESD root     : /content/drive/MyDrive/adsp/downloads/esd  -> True
IEMOCAP root : /content/drive/MyDrive/adsp/downloads/iemocap  -> True


In [5]:
import torch
from transformers import AutoProcessor, Qwen2AudioForConditionalGeneration

device = "cuda" if torch.cuda.is_available() else "cpu"
print("Device:", device)

QWEN_AUDIO_ID = "Qwen/Qwen2-Audio-7B-Instruct"

# Processor handles both audio + text
processor = AutoProcessor.from_pretrained(QWEN_AUDIO_ID)

# Large model: use half precision + accelerate device_map
model = Qwen2AudioForConditionalGeneration.from_pretrained(
    QWEN_AUDIO_ID,
    torch_dtype=torch.float16 if device == "cuda" else torch.float32,
    device_map="auto",          # let Accelerate shard across GPU/CPU if needed
)

print("Qwen2-Audio loaded.")


Device: cuda


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.
`torch_dtype` is deprecated! Use `dtype` instead!


Loading checkpoint shards:   0%|          | 0/5 [00:00<?, ?it/s]



Qwen2-Audio loaded.


In [6]:
import librosa
import numpy as np

EMO_LABELS = ["Neutral", "Happy", "Angry", "Sad", "Surprise"]

def build_prompt(labels, transcript=None, with_text=True):
    """
    Build an instruction prompt for zero-shot classification.
    """
    label_str = ", ".join(labels)
    base = (
        "<|audio_bos|><|AUDIO|><|audio_eos|>\n"
        "User: You will hear one short utterance. "
    )
    if with_text and transcript is not None and transcript != "":
        base += f"The transcript of the utterance is: \"{transcript}\". "

    base += (
        f"Classify the speaker's emotion into one of the following labels: "
        f"[{label_str}]. "
        "Answer with exactly **one** of those words and nothing else.\n"
        "Assistant:"
    )
    return base


def extract_label(generated_text, labels=EMO_LABELS):
    """
    Make the output robust: find first label mentioned in the generation.
    Fallback to 'Neutral' if nothing matches.
    """
    text = generated_text.lower()
    for lab in labels:
        if lab.lower() in text:
            return lab
    # as a very safe fallback:
    return "Neutral"


def classify_one(audio_path, transcript=None, with_text=True, max_new_tokens=8):
    """
    Zero-shot classification of a single sample.
    with_text = False  -> audio-only
    with_text = True   -> audio + transcript
    """
    # load audio at model's required sampling rate
    target_sr = processor.feature_extractor.sampling_rate
    audio, sr = librosa.load(audio_path, sr=target_sr)

    prompt = build_prompt(EMO_LABELS, transcript, with_text=with_text)

    inputs = processor(
        text=prompt,
        audios=audio,
        return_tensors="pt",
    )

    # DO NOT .to('cuda') here – accelerate will move chunks itself
    with torch.no_grad():
        gen_ids = model.generate(
            **inputs,
            max_new_tokens=max_new_tokens,
        )

    gen_text = processor.batch_decode(
        gen_ids, skip_special_tokens=True, clean_up_tokenization_spaces=True
    )[0]

    label = extract_label(gen_text, EMO_LABELS)
    return label, gen_text


In [7]:
import pandas as pd

def load_esd(esd_root):
    rows = []
    speakers = sorted(d for d in os.listdir(esd_root) if d.isdigit())

    for spk in speakers:
        spk_dir = os.path.join(esd_root, spk)

        # --- read transcript file (e.g. 0001.txt) ---
        transcript_path = None
        for fn in os.listdir(spk_dir):
            if fn.endswith(".txt"):
                transcript_path = os.path.join(spk_dir, fn)
                break

        rel2text = {}
        if transcript_path is not None:
            with open(transcript_path, encoding="utf-8") as f:
                for line in f:
                    line = line.strip()
                    if not line:
                        continue
                    # Very generic parse: "Angry/xxx.wav    transcript..."
                    parts = line.split(maxsplit=1)
                    if len(parts) == 1:
                        continue
                    rel, text = parts
                    rel2text[rel.strip()] = text.strip()

        # --- collect wav files per emotion ---
        for emo in EMO_LABELS:
            emo_dir = os.path.join(spk_dir, emo)
            if not os.path.isdir(emo_dir):
                continue

            for fn in os.listdir(emo_dir):
                if not fn.lower().endswith(".wav"):
                    continue
                rel1 = f"{emo}/{fn}"
                rel2 = fn
                text = rel2text.get(rel1) or rel2text.get(rel2) or ""
                rows.append(
                    {
                        "path": os.path.join(emo_dir, fn),
                        "text": text,
                        "label": emo,
                    }
                )

    df = pd.DataFrame(rows)
    return df


df_esd = load_esd(ESD_ROOT)
print("ESD samples:", len(df_esd))
df_esd.head()


ESD samples: 35013


Unnamed: 0,path,text,label
0,/content/drive/MyDrive/adsp/downloads/esd/0001...,,Neutral
1,/content/drive/MyDrive/adsp/downloads/esd/0001...,,Neutral
2,/content/drive/MyDrive/adsp/downloads/esd/0001...,,Neutral
3,/content/drive/MyDrive/adsp/downloads/esd/0001...,,Neutral
4,/content/drive/MyDrive/adsp/downloads/esd/0001...,,Neutral


In [33]:
import os, re
import pandas as pd

# نگاشت لیبل‌های IEMOCAP به 5 کلاس نهایی
EMO_MAP = {
    "ang": "Angry",
    "hap": "Happy",
    "exc": "Happy",      # excited -> happy
    "neu": "Neutral",
    "sad": "Sad",
    "fru": "Angry",      # frustration -> angry-ish
    "sur": "Surprise",
    # بقیه احساسات (fea, dis, oth, xxx, ...) را نادیده می‌گیریم
}

def load_iemocap(iemocap_root):
    rows = []

    # Session1 ... Session5
    sessions = sorted(
        d for d in os.listdir(iemocap_root)
        if d.startswith("Session")
    )
    print("Sessions found:", sessions)

    for sess in sessions:
        sess_dir = os.path.join(iemocap_root, sess)

        emo_dir  = os.path.join(sess_dir, "dialog", "EmoEvaluation")
        tran_dir = os.path.join(sess_dir, "dialog", "transcriptions")
        wav_dir  = os.path.join(sess_dir, "dialog", "wav")

        if not os.path.isdir(emo_dir):
            print(f"[WARN] Missing EmoEvaluation in {sess}: {emo_dir}")
            continue

        # ---------------------- 1) خواندن لیبل‌ها ----------------------
        utt2emo = {}
        for fn in os.listdir(emo_dir):
            if not fn.endswith(".txt") or fn.startswith("._"):
                continue
            with open(os.path.join(emo_dir, fn),
                      encoding="utf-8", errors="ignore") as f:
                for line in f:
                    line = line.strip()
                    # خطوط اصلی این شکل‌اند:
                    # [2.4400 - 5.1953]  Ses01F_impro01_F000  angry  [3.5, ...]
                    if not (line.startswith("[") and "Ses" in line):
                        continue
                    m = re.search(r"\]\s+(Ses\w+)\s+([a-zA-Z]+)\s", line)
                    if not m:
                        continue
                    utt_id = m.group(1)
                    emo    = m.group(2).lower()
                    if emo not in EMO_MAP:
                        continue
                    utt2emo[utt_id] = EMO_MAP[emo]

        print(f"{sess}: {len(utt2emo)} labelled utterances (after emo filter)")

        if not utt2emo:
            # اگر حتی یکی هم پیدا نشد برو سراغ سشن بعدی
            continue

        # ---------------------- 2) خواندن متن‌ها -----------------------
        utt2text = {}
        if os.path.isdir(tran_dir):
            for fn in os.listdir(tran_dir):
                if not fn.endswith(".txt") or fn.startswith("._"):
                    continue
                with open(os.path.join(tran_dir, fn),
                          encoding="utf-8", errors="ignore") as f:
                    for line in f:
                        line = line.strip()
                        # مثال:
                        # Ses01F_impro01_F000 [2.4400 - 5.1953]:  I remember ...
                        m = re.match(r"(Ses\w+)\s+\[.*\]:\s*(.*)", line)
                        if not m:
                            continue
                        utt_id = m.group(1)
                        text   = m.group(2).strip()
                        utt2text[utt_id] = text

        # ---------------- 3) مپ کردن شناسه‌ها به فایل wav ----------------
        wav_index = {}
        if os.path.isdir(wav_dir):
            for root, dirs, files in os.walk(wav_dir):
                for fn in files:
                    if not fn.endswith(".wav") or fn.startswith("._"):
                        continue
                    utt_id = os.path.splitext(fn)[0]  # Ses01F_impro01_F000
                    wav_index[utt_id] = os.path.join(root, fn)

        # ---------------- 4) ساختن ردیف‌های دیتا ------------------------
        for utt_id, label in utt2emo.items():
            audio_path = wav_index.get(utt_id)
            if audio_path is None:
                # برای دیباگ اگر دوست داشتی چاپ کن
                # print(f"[NO WAV] {utt_id}")
                continue
            text = utt2text.get(utt_id, "")
            rows.append({
                "path":  audio_path,
                "text":  text,
                "label": label,
            })

    print("Raw IEMOCAP rows with mapped emotions:", len(rows))

    df = pd.DataFrame(rows)
    # اگر دوست داری فقط 5 لیبل اصلی را نگه داریم (در حال حاضر فقط همین‌ها را می‌سازیم)
    return df.reset_index(drop=True)


In [34]:
IEMOCAP_ROOT = "/content/drive/MyDrive/adsp/downloads/iemocap"  # مسیر خودت

df_iemo = load_iemocap(IEMOCAP_ROOT)

print("\nIEMOCAP head:")
print(df_iemo.head())

if not df_iemo.empty:
    print("\nIEMOCAP label counts:")
    print(df_iemo["label"].value_counts())
else:
    print("\n⚠️ IEMOCAP DataFrame is still empty – دوباره ساختار یا regex را باید چک کنیم.")


Sessions found: ['Session1', 'Session2', 'Session3', 'Session4', 'Session5']
Session1: 1390 labelled utterances (after emo filter)
Session2: 1365 labelled utterances (after emo filter)
Session3: 1561 labelled utterances (after emo filter)
Session4: 1531 labelled utterances (after emo filter)
Session5: 1640 labelled utterances (after emo filter)
Raw IEMOCAP rows with mapped emotions: 0

IEMOCAP head:
Empty DataFrame
Columns: []
Index: []

⚠️ IEMOCAP DataFrame is still empty – دوباره ساختار یا regex را باید چک کنیم.


In [28]:
IEMOCAP_ROOT = "/content/drive/MyDrive/adsp/downloads/iemocap"

df_iemo = load_iemocap(IEMOCAP_ROOT)

print("\nIEMOCAP head:")
print(df_iemo.head())

if not df_iemo.empty:
    print("\nIEMOCAP label counts:")
    print(df_iemo["label"].value_counts())


Sessions found: ['Session1', 'Session2', 'Session3', 'Session4', 'Session5']
Raw IEMOCAP rows: 0
⚠️  IEMOCAP DataFrame is empty! Check paths/structure.

IEMOCAP head:
Empty DataFrame
Columns: []
Index: []


In [29]:
MAX_SAMPLES = 200  # برای تست، بعداً می‌تونی None بذاری

print(">>> ESD - AUDIO ONLY")
esd_audio_only = classify_batch(df_esd, with_text=False, max_samples=MAX_SAMPLES)
acc_esd_a, cm_esd_a = evaluate_predictions(esd_audio_only, title="ESD - audio only")

print("\n>>> ESD - AUDIO + TRANSCRIPT")
esd_audio_text = classify_batch(df_esd, with_text=True, max_samples=MAX_SAMPLES)
acc_esd_at, cm_esd_at = evaluate_predictions(esd_audio_text, title="ESD - audio + transcript")

print("\n>>> IEMOCAP - AUDIO ONLY")
iemo_audio_only = classify_batch(df_iemo, with_text=False, max_samples=MAX_SAMPLES)
acc_iemo_a, cm_iemo_a = evaluate_predictions(iemo_audio_only, title="IEMOCAP - audio only")

print("\n>>> IEMOCAP - AUDIO + TRANSCRIPT")
iemo_audio_text = classify_batch(df_iemo, with_text=True, max_samples=MAX_SAMPLES)
acc_iemo_at, cm_iemo_at = evaluate_predictions(iemo_audio_text, title="IEMOCAP - audio + transcript")


>>> ESD - AUDIO ONLY


  0%|          | 0/200 [00:00<?, ?it/s]

  audio, sr = librosa.load(audio_path, sr=target_sr)
	Deprecated as of librosa version 0.10.0.
	It will be removed in librosa version 1.0.
  y, sr_native = __audioread_load(path, offset, duration, dtype)


Error on /content/drive/MyDrive/adsp/downloads/esd/0015/Sad/0015_001077.wav -> EOFError()
Error on /content/drive/MyDrive/adsp/downloads/esd/0019/Sad/0019_001258.wav -> EOFError()
Error on /content/drive/MyDrive/adsp/downloads/esd/0015/Sad/0015_001239.wav -> EOFError()
Error on /content/drive/MyDrive/adsp/downloads/esd/0018/Happy/0018_000701.wav -> EOFError()


KeyboardInterrupt: 