In [5]:
import torch
import torchaudio
from transformers import AutoModelForAudioClassification, AutoProcessor

# Configuration: input clip, local checkpoint directory, target sample rate
# and the device used for inference.
AUDIO_PATH = "sample.wav"
MODEL_PATH = "./ast-finetuned-model"
TARGET_SAMPLE_RATE = 16000  # Hz; single source of truth for resampling and the processor call
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

# Load the processor and the classifier from the local checkpoint directory.
processor = AutoProcessor.from_pretrained(MODEL_PATH)
model = AutoModelForAudioClassification.from_pretrained(MODEL_PATH)
model.to(DEVICE)
model.eval()  # evaluation mode: this cell only runs inference

# Load the clip and down-mix to mono by averaging over dim 0 (the channel axis).
waveform, sr = torchaudio.load(AUDIO_PATH)
waveform = waveform.mean(dim=0)

# Resample after the down-mix so only one channel goes through the resampler.
# Averaging and resampling are both linear, so the result matches the
# resample-then-average order up to floating-point rounding.
if sr != TARGET_SAMPLE_RATE:
    resampler = torchaudio.transforms.Resample(orig_freq=sr, new_freq=TARGET_SAMPLE_RATE)
    waveform = resampler(waveform)
waveform = waveform.numpy()  # the processor is fed a NumPy array

# Turn the raw samples into model inputs and move every tensor to the model's device.
inputs = processor(
    waveform,
    sampling_rate=TARGET_SAMPLE_RATE,
    return_tensors="pt",
    padding=True,
)
inputs = {k: v.to(model.device) for k, v in inputs.items()}

# Forward pass without gradient tracking, then map the arg-max class index
# to its label through the checkpoint's id2label table.
with torch.no_grad():
    logits = model(**inputs).logits
pred_id = logits.argmax(-1).item()
label = model.config.id2label[pred_id]

print(f"\n🎙️ Detected Emotion: **{label}**")


Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.



🎙️ Detected Emotion: **DIS**


In [6]:
# Scratch cell: prints the constant 1.
# NOTE(review): looks like a leftover kernel sanity check — consider removing
# this cell before sharing the notebook.
print(1)

1


In [4]:
!pip install transformers

Collecting transformers
  Downloading transformers-4.53.1-py3-none-any.whl.metadata (40 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m40.9/40.9 kB[0m [31m994.2 kB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.30.0 (from transformers)
  Downloading huggingface_hub-0.33.2-py3-none-any.whl.metadata (14 kB)
Collecting regex!=2019.12.17 (from transformers)
  Downloading regex-2024.11.6-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (40 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m40.5/40.5 kB[0m [31m3.7 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers<0.22,>=0.21 (from transformers)
  Downloading tokenizers-0.21.2-cp39-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.8 kB)
Collecting safetensors>=0.4.3 (from transformers)
  Downloading safetensors-0.5.3-cp38-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (3.8 kB)
Collecting hf-xet<2.0.0,>=1.1.2 (from huggingface-hub<1.0,>=0.3

In [2]:
!pip list

Package                       Version
----------------------------- ------------
alembic                       1.12.0
altair                        5.1.2
anyio                         4.0.0
argon2-cffi                   23.1.0
argon2-cffi-bindings          21.2.0
arrow                         1.3.0
asttokens                     2.4.0
async-generator               1.10
async-lru                     2.0.4
attrs                         23.1.0
audioread                     3.0.1
Babel                         2.13.0
backcall                      0.2.0
backports.functools-lru-cache 1.6.5
beautifulsoup4                4.12.2
bleach                        6.1.0
blinker                       1.6.3
bokeh                         3.3.0
boltons                       23.0.0
Bottleneck                    1.3.7
Brotli                        1.1.0
cached-property               1.5.2
cachetools                    6.1.0
certifi                       2023.7.22
certipy                       0.1.3
cffi     

In [5]:
!pip install streamlit



In [4]:
import torch
import torchaudio
import streamlit as st
from transformers import AutoModelForAudioClassification, AutoProcessor

# Page header: app title followed by a one-line usage hint.
st.title(body="🎧 Emotion Detection from Audio")
st.markdown(body="Upload a `.wav` file (16kHz recommended) to detect the emotion.")

# Upload widget restricted to the "wav" file type; the value it returns is
# kept in `uploaded_file` for the prediction step further down.
uploaded_file = st.file_uploader(
    label="Upload an audio file",
    type=["wav"],
)

# Load model and processor
@st.cache_resource
def load_model(model_path: str = "./ast-finetuned-model"):
    """Load the audio-classification model and its processor.

    Decorated with ``st.cache_resource`` so the returned objects are cached
    and reused instead of being reloaded on every call; the cache is keyed on
    the arguments, so each distinct ``model_path`` is loaded separately.

    Args:
        model_path: Directory (or model identifier) passed to
            ``from_pretrained``. Defaults to the local fine-tuned checkpoint,
            so existing zero-argument calls behave as before.

    Returns:
        A ``(processor, model)`` tuple. The model is in evaluation mode and
        has been moved to CUDA when available, otherwise it stays on the CPU.
    """
    device = "cuda" if torch.cuda.is_available() else "cpu"
    processor = AutoProcessor.from_pretrained(model_path)
    model = AutoModelForAudioClassification.from_pretrained(model_path)
    model.to(device)
    model.eval()  # evaluation mode: the app only runs inference
    return processor, model

processor, model = load_model()

# Sample rate (Hz) the audio is brought to before it is handed to the processor.
TARGET_SAMPLE_RATE = 16000

# Run prediction if file is uploaded
if uploaded_file is not None:
    st.audio(uploaded_file, format="audio/wav")

    # Decode the upload. Rewind first in case an earlier reader advanced the
    # stream position. Any decoding failure is reported in the page instead
    # of surfacing as a raw traceback; the exception type depends on the
    # torchaudio backend, hence the broad catch at this UI boundary.
    try:
        uploaded_file.seek(0)
        waveform, sr = torchaudio.load(uploaded_file)
    except Exception as exc:
        st.error(f"Could not read the uploaded audio file: {exc}")
        st.stop()

    # A file that decodes to zero samples cannot be classified.
    if waveform.numel() == 0:
        st.error("The uploaded audio file contains no samples.")
        st.stop()

    # Down-mix to mono (average over dim 0, the channel axis) and then
    # resample, so only one channel goes through the resampler. Both steps
    # are linear, so the result matches resample-then-average up to
    # floating-point rounding.
    waveform = waveform.mean(dim=0)
    if sr != TARGET_SAMPLE_RATE:
        resampler = torchaudio.transforms.Resample(orig_freq=sr, new_freq=TARGET_SAMPLE_RATE)
        waveform = resampler(waveform)
    waveform = waveform.numpy()  # the processor is fed a NumPy array

    # Preprocess: build model inputs and move every tensor to the model's device.
    inputs = processor(
        waveform,
        sampling_rate=TARGET_SAMPLE_RATE,
        return_tensors="pt",
        padding=True,
    )
    inputs = {k: v.to(model.device) for k, v in inputs.items()}

    # Predict: forward pass without gradient tracking, then map the arg-max
    # class index to its label through the checkpoint's id2label table.
    with torch.no_grad():
        logits = model(**inputs).logits
    pred_id = logits.argmax(-1).item()
    label = model.config.id2label[pred_id]

    # Display result
    st.success(f"🎙️ Detected Emotion: **{label}**")


2025-07-08 18:18:25.296 
  command:

    streamlit run /opt/conda/lib/python3.11/site-packages/ipykernel_launcher.py [ARGUMENTS]
Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.
