In [None]:
!mkdir  -p ~/.kaggle
!cp kaggle.json ~/.kaggle/

# 🎯 Problem Statement: AI Mental Wellness Companion (Prototype)

Mental health issues like stress and depression are becoming more common these days.  
A lot of people either don’t realize they’re struggling or avoid going to professionals because of stigma, cost, or just not knowing they need help.  
One thing that’s interesting is that our voice can actually give away how we’re feeling — things like tone, pitch, speed, and energy often change when we’re stressed.

For this project, I want to build an *AI-powered companion* that can listen to a short voice sample and figure out whether the person sounds stressed or not stressed.  
It’s still a prototype, but the idea is that in the future, the AI could check in with people daily, give them tips to relax, or even suggest talking to a professional if it notices worrying patterns.

# Why it fits MumbaiHacks 2025
This goes under the *Healthtech* track and matches the *Agentic AI* theme because:
- It’s not just making predictions — the agent could actually start the conversation with the user and offer help when needed.
- It’s directly linked to health by supporting early detection and stress management.

# How I’m planning to do it
1. *Dataset* – I’m using the RAVDESS Emotional Speech dataset to train and test the prototype.
2. *Features* – I’ll extract MFCC features (basically numbers that capture how the voice sounds) from the audio.
3. *Model* – Train a simple XGBoost model to classify stressed vs non-stressed.
4. *Demo* – Make a small Gradio app where I can upload/record speech and see the result instantly.
5. *Future* – Turn it into a mobile or web app that runs in real-time.

This way, even at the prototype stage, I can show both the *tech* and the *use case* clearly.

In [None]:
# Download the RAVDESS emotional speech archive (a .zip) into the working directory via the Kaggle CLI.
!kaggle datasets download -d uwrfkaggler/ravdess-emotional-speech-audio


Dataset URL: https://www.kaggle.com/datasets/uwrfkaggler/ravdess-emotional-speech-audio
License(s): CC-BY-NC-SA-4.0
ravdess-emotional-speech-audio.zip: Skipping, found more recently modified local copy (use --force to force download)


In [None]:
!unzip ravdess-emotional-speech-audio.zip -d ravdess_data

Archive:  ravdess-emotional-speech-audio.zip
replace ravdess_data/Actor_01/03-01-01-01-01-01-01.wav? [y]es, [n]o, [A]ll, [N]one, [r]ename: A
  inflating: ravdess_data/Actor_01/03-01-01-01-01-01-01.wav  
  inflating: ravdess_data/Actor_01/03-01-01-01-01-02-01.wav  
  inflating: ravdess_data/Actor_01/03-01-01-01-02-01-01.wav  
  inflating: ravdess_data/Actor_01/03-01-01-01-02-02-01.wav  
  inflating: ravdess_data/Actor_01/03-01-02-01-01-01-01.wav  
  inflating: ravdess_data/Actor_01/03-01-02-01-01-02-01.wav  
  inflating: ravdess_data/Actor_01/03-01-02-01-02-01-01.wav  
  inflating: ravdess_data/Actor_01/03-01-02-01-02-02-01.wav  
  inflating: ravdess_data/Actor_01/03-01-02-02-01-01-01.wav  
  inflating: ravdess_data/Actor_01/03-01-02-02-01-02-01.wav  
  inflating: ravdess_data/Actor_01/03-01-02-02-02-01-01.wav  
  inflating: ravdess_data/Actor_01/03-01-02-02-02-02-01.wav  
  inflating: ravdess_data/Actor_01/03-01-03-01-01-01-01.wav  
  inflating: ravdess_data/Actor_01/03-01-03-01-01-02-

In [None]:
import os
import re
import pandas as pd

# Path to the folder that contains Actor_01 ... Actor_24
ROOT = "/content/ravdess_data"

# Keep only the Actor_XX sub-folders. Sorting gives a stable order because
# os.listdir() returns entries in arbitrary, filesystem-dependent order.
actor_dirs = sorted(
    os.path.join(ROOT, name)
    for name in os.listdir(ROOT)
    if re.fullmatch(r"Actor_\d{2}", name) and os.path.isdir(os.path.join(ROOT, name))
)
print(f"Found {len(actor_dirs)} actor folders:", [os.path.basename(d) for d in actor_dirs])

# Emotion mapping (third field of the file name -> emotion name)
emotion_map = {
    '01': 'neutral',
    '02': 'calm',
    '03': 'happy',
    '04': 'sad',
    '05': 'angry',
    '06': 'fearful',
    '07': 'disgust',
    '08': 'surprised'
}

rows = []
for actor_dir in actor_dirs:
    # Sort the file names as well: the row order of `df` feeds the seeded
    # sampling / train-test split later on, so it must not depend on the
    # directory listing order if results are to be reproducible.
    for fname in sorted(os.listdir(actor_dir)):
        if not fname.lower().endswith(".wav"):
            continue

        # File names consist of seven dash-separated two-digit fields;
        # anything else is not a dataset clip and is skipped.
        stem = os.path.splitext(fname)[0]
        parts = stem.split('-')
        if len(parts) != 7:
            continue

        # Only emotion, intensity and actor are used; the other fields are ignored.
        _, _, emotion_id, intensity_id, statement_id, repetition_id, actor_id = parts
        emotion = emotion_map.get(emotion_id, "unknown")
        intensity = "strong" if intensity_id == "02" else "normal"

        rows.append({
            "path": os.path.join(actor_dir, fname),
            "actor": f"Actor_{int(actor_id):02d}",
            "emotion": emotion,
            "intensity": intensity
        })

df = pd.DataFrame(rows)
print(f"\nTotal audio files indexed: {len(df)}")
print("\nCounts by emotion:")
print(df["emotion"].value_counts())
df.head()


Found 24 actor folders: ['Actor_01', 'Actor_02', 'Actor_03', 'Actor_04', 'Actor_05', 'Actor_06', 'Actor_07', 'Actor_08', 'Actor_09', 'Actor_10', 'Actor_11', 'Actor_12', 'Actor_13', 'Actor_14', 'Actor_15', 'Actor_16', 'Actor_17', 'Actor_18', 'Actor_19', 'Actor_20', 'Actor_21', 'Actor_22', 'Actor_23', 'Actor_24']

Total audio files indexed: 1440

Counts by emotion:
emotion
calm         192
sad          192
disgust      192
happy        192
surprised    192
angry        192
fearful      192
neutral       96
Name: count, dtype: int64


Unnamed: 0,path,actor,emotion,intensity
0,/content/ravdess_data/Actor_01/03-01-02-01-02-...,Actor_01,calm,normal
1,/content/ravdess_data/Actor_01/03-01-04-02-01-...,Actor_01,sad,strong
2,/content/ravdess_data/Actor_01/03-01-04-01-01-...,Actor_01,sad,normal
3,/content/ravdess_data/Actor_01/03-01-07-01-01-...,Actor_01,disgust,normal
4,/content/ravdess_data/Actor_01/03-01-04-02-02-...,Actor_01,sad,strong


The RAVDESS Emotional Speech dataset contains speech recordings from 24 different actors.
Each actor folder (Actor_01 … Actor_24) contains .wav files, where the filename itself encodes important metadata such as emotion, intensity, and actor ID.

We parsed these filenames to extract:

Path → The complete location of the audio file (used later for loading audio).

Actor → The ID of the speaker.

Emotion → The emotional category (neutral, calm, happy, sad, angry, fearful, disgust, surprised).

Intensity → Whether the emotion is spoken with normal or strong intensity.


From our parsing, we found:

Total audio files: 1440

Each emotion has 192 samples except neutral, which has 96 samples (dataset design choice).

The dataset is evenly distributed across 24 actors.


This structured DataFrame will now allow us to filter emotions, group samples, and prepare labels for our binary classification task (stressed vs non-stressed).

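Filename example: 03-01-06-01-01-02-19.wav
03 = modality (03 = audio-only)
01 = vocal channel (01 = speech, 02 = song)
06 = emotion (06 = fearful)
01 = emotional intensity (01 = normal, 02 = strong)
01 = statement (01 or 02)
02 = repetition (01 or 02)
19 = actor ID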


In [None]:
# Emotions that this prototype treats as signs of stress
stressed_set = {"sad", "angry", "fearful", "disgust", "surprised"}

# Binary target: an emotion inside stressed_set maps to "stressed",
# every other emotion maps to "non-stressed"
df["label"] = df["emotion"].isin(stressed_set).map({True: "stressed", False: "non-stressed"})

# Class balance of the new target
print("Counts by label:")
print(df["label"].value_counts())

# Show three reproducibly sampled file paths per class
for heading, wanted in (
    ("\nSample stressed files:", "stressed"),
    ("\nSample non-stressed files:", "non-stressed"),
):
    print(heading)
    print(df[df["label"] == wanted].sample(3, random_state=42)["path"].tolist())

Counts by label:
label
stressed        960
non-stressed    480
Name: count, dtype: int64

Sample stressed files:
['/content/ravdess_data/Actor_21/03-01-08-02-01-02-21.wav', '/content/ravdess_data/Actor_12/03-01-04-02-01-01-12.wav', '/content/ravdess_data/Actor_09/03-01-07-02-01-01-09.wav']

Sample non-stressed files:
['/content/ravdess_data/Actor_04/03-01-03-01-02-01-04.wav', '/content/ravdess_data/Actor_21/03-01-02-01-02-01-21.wav', '/content/ravdess_data/Actor_20/03-01-03-01-02-01-20.wav']


In [None]:
import os

# Smoke test: take the first indexed clip and confirm it is present on disk
test_path = df["path"].iloc[0]
print("Test file path:", test_path)
print("File exists:", os.path.exists(test_path))

import librosa

# Decode the clip with librosa's default settings and report its size
audio, sr = librosa.load(test_path)
print("Audio length (samples):", audio.shape[0])
print("Sample rate:", sr)

Test file path: /content/ravdess_data/Actor_01/03-01-02-01-02-02-01.wav
File exists: True
Audio length (samples): 76517
Sample rate: 22050


In [None]:
import importlib.util
if importlib.util.find_spec("resampy") is None:
    !pip install resampy --quiet

import librosa
import numpy as np
from tqdm import tqdm

def extract_features(file_path, n_mfcc=30, max_pad_len=150):
    """Turn one audio file into a fixed-length 1-D feature vector.

    The clip is loaded with librosa, its MFCCs are computed, and the first-
    and second-order deltas of those MFCCs are stacked underneath them. The
    time axis is then zero-padded on the right, or cut, to exactly
    ``max_pad_len`` frames before the matrix is flattened.

    Args:
        file_path: Path of the audio file to load.
        n_mfcc: Number of MFCC coefficients per frame.
        max_pad_len: Number of time frames kept in the output.

    Returns:
        A flat array of ``3 * n_mfcc * max_pad_len`` values, or ``None`` when
        any step fails (the error is printed, not raised).
    """
    try:
        signal, rate = librosa.load(file_path, res_type='kaiser_fast')
        base = librosa.feature.mfcc(y=signal, sr=rate, n_mfcc=n_mfcc)

        # Rows: MFCCs, then delta, then delta-delta -> (n_mfcc * 3, time_steps)
        stacked = np.vstack([
            base,
            librosa.feature.delta(base),
            librosa.feature.delta(base, order=2),
        ])

        n_frames = stacked.shape[1]
        if n_frames >= max_pad_len:
            # Long clip: keep only the leading frames
            fixed = stacked[:, :max_pad_len]
        else:
            # Short clip: append zero-valued frames at the end
            fixed = np.pad(
                stacked,
                pad_width=((0, 0), (0, max_pad_len - n_frames)),
                mode='constant',
            )
        return fixed.flatten()

    except Exception as e:
        # Best effort: report the failing file and let the caller skip it
        print(f"Error processing {file_path}: {e}")
        return None

# Collect one feature vector and one label per clip that could be processed
features, labels = [], []

print("Extracting richer MFCC features...")
for clip_path, clip_label in tqdm(zip(df["path"], df["label"]), total=len(df)):
    vector = extract_features(clip_path)
    if vector is None:
        # Extraction failed for this clip (already reported); leave it out
        continue
    features.append(vector)
    labels.append(clip_label)

# Stack into the design matrix and the matching label array
X = np.array(features)
y = np.array(labels)

print("Feature matrix shape:", X.shape)
print("Labels shape:", y.shape)

Extracting richer MFCC features...


100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 1440/1440 [02:22<00:00, 10.12it/s]


Feature matrix shape: (1440, 13500)
Labels shape: (1440,)


To train a machine learning model, raw audio must first be converted into a numerical form that algorithms can understand. We use Mel-Frequency Cepstral Coefficients (MFCCs) for this task.

Here’s how it works:

1. Load Audio – Each .wav file is loaded as a waveform (a sequence of numbers representing sound pressure over time).


2. Frequency Analysis – We break the audio into tiny segments (frames) and analyze the frequency content of each.


3. Mel Scale Conversion – Human hearing perceives pitch non-linearly. The Mel scale transforms actual frequencies into a scale that better matches our ear’s sensitivity.


4. Logarithmic Compression – We take the log of the Mel energies to reduce the effect of very loud sounds and highlight subtle details.


5. Discrete Cosine Transform (DCT) – This compacts the information into a small set of numbers called coefficients.


6. Padding/Truncation – Since audio clips vary in length, we pad short clips and trim long ones to make all feature vectors the same size.



The result is a fixed-length feature vector for each audio file.
These vectors, along with their labels (stressed / non-stressed), are now ready for training a classification model.

In [None]:
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
from xgboost import XGBClassifier

# 1. Encode labels
le = LabelEncoder()
# LabelEncoder sorts the class names: "non-stressed" -> 0, "stressed" -> 1
y_encoded = le.fit_transform(y)

# 2. Train-test split (stratified to preserve class balance)
# NOTE(review): this is a random per-clip split, so clips from the same actor can
# land in both train and test. A split grouped by actor would give a stricter,
# speaker-independent estimate -- consider it before quoting the accuracy.
X_train, X_test, y_train, y_test = train_test_split(
    X, y_encoded, test_size=0.2, random_state=42, stratify=y_encoded
)

# 3. Define and train the XGBoost model
model = XGBClassifier(
    n_estimators=300,       # more trees
    max_depth=7,           # deeper trees
    learning_rate=0.05,    # smaller steps
    subsample=0.8,         # row sampling
    colsample_bytree=0.8,  # feature sampling
    reg_lambda=1.5,        # regularization
    eval_metric='logloss',
    n_jobs=-1,
    # Row/feature subsampling is random; fix the seed so reruns give the same model.
    random_state=42
    # use_label_encoder was dropped: XGBoost reported it as an unused parameter.
)

print("Training the model...")
model.fit(X_train, y_train)

# 4. Predictions
y_pred = model.predict(X_test)

# 5. Evaluation
print("\nAccuracy:", accuracy_score(y_test, y_pred))
# target_names gives the report rows their original text labels (index 0, 1 -> le.classes_)
print("\nClassification Report:\n", classification_report(y_test, y_pred, target_names=le.classes_))

Training the model...


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)



Accuracy: 0.7777777777777778

Classification Report:
               precision    recall  f1-score   support

non-stressed       0.77      0.48      0.59        96
    stressed       0.78      0.93      0.85       192

    accuracy                           0.78       288
   macro avg       0.77      0.70      0.72       288
weighted avg       0.78      0.78      0.76       288



1. Librosa's load

Opened each WAV file and converted it into a waveform (array of amplitudes) and a sample rate (number of samples per second).

We used res_type='kaiser_fast' for faster processing with minimal quality loss.



2. MFCC Extraction

For each short slice of the audio, we computed MFCC (Mel-Frequency Cepstral Coefficients).

MFCCs describe the shape of the sound spectrum in a way that’s closer to how humans hear.

Parameters:

n_mfcc = number of coefficients per frame (30 in our prototype)

max_pad_len = fixed number of time frames (150), so every audio file becomes the same size.


If an audio file was shorter, we padded it with zeros (silence).

If it was longer, we truncated extra frames.



3. Flattening

MFCCs (stacked with their delta and delta-delta features) are 2D (3·n_mfcc × time_frames).

We flattened them into a single long vector so XGBoost can use them as input features.



4. Label Encoding

Converted our text labels into numeric form ('non-stressed' → 0, 'stressed' → 1) since XGBoost works with numbers.



5. XGBoost Training

We split our data into training and testing sets (80% train, 20% test).

XGBoost built decision trees to separate the two classes based on MFCC patterns.

Parameters used:

n_estimators = number of trees

max_depth = maximum depth of each tree

learning_rate = step size for model learning


Finally, we evaluated accuracy and printed a classification report.




This step completed the core pipeline:
Raw Audio → MFCC Feature Vectors → XGBoost Classifier → Accuracy: ~78%

In [None]:
import gradio as gr
import numpy as np
import librosa
import tempfile
import soundfile as sf

# Make sure your trained model and label encoder are already loaded:
# model = <your_trained_xgboost_model>
# le = <your_label_encoder>

# Use your existing feature extraction function
def extract_features(file_path, n_mfcc=30, max_pad_len=150):
    """Compute the flattened MFCC + delta + delta-delta vector for one audio file.

    The output of this function is what ``predict_stress`` feeds to the
    trained model, so its defaults and processing steps have to stay the same
    as the ones used to build the training features.

    Args:
        file_path: Path of the audio file to load.
        n_mfcc: Number of MFCC coefficients per frame.
        max_pad_len: Number of time frames in the output (zero-padded on the
            right or truncated).

    Returns:
        A flat array of ``3 * n_mfcc * max_pad_len`` values, or ``None`` if
        anything goes wrong (the error is printed instead of raised).
    """
    try:
        waveform, sr_hz = librosa.load(file_path, res_type='kaiser_fast')

        coeffs = librosa.feature.mfcc(y=waveform, sr=sr_hz, n_mfcc=n_mfcc)
        first_order = librosa.feature.delta(coeffs)
        second_order = librosa.feature.delta(coeffs, order=2)

        # (n_mfcc * 3, time_steps): coefficients on top, deltas below
        feature_map = np.vstack([coeffs, first_order, second_order])

        # Positive -> frames to add; zero or negative -> clip is long enough
        missing = max_pad_len - feature_map.shape[1]
        if missing > 0:
            feature_map = np.pad(feature_map, pad_width=((0, 0), (0, missing)), mode='constant')
        else:
            feature_map = feature_map[:, :max_pad_len]

        return feature_map.flatten()

    except Exception as err:
        print(f"Error processing {file_path}: {err}")
        return None


def predict_stress(audio_filepath):
    """Gradio callback: classify one recording and return a display string.

    Args:
        audio_filepath: Path of the uploaded or recorded audio file, or
            ``None`` when nothing was provided.

    Returns:
        ``"Prediction: <label>"`` on success, otherwise a short message that
        says what went wrong. Exceptions are turned into ``"Error: ..."``
        text and never propagate to the caller.
    """
    # Nothing uploaded or recorded yet
    if audio_filepath is None:
        return "Please provide an audio input."

    try:
        vector = extract_features(audio_filepath)
        if vector is None:
            return "Error: Could not process audio."

        # The model expects a 2-D batch, so wrap the single vector as one row
        batch = np.array(vector).reshape(1, -1)
        encoded = model.predict(batch)[0]

        # Map the numeric class back to its text label
        decoded = le.inverse_transform([encoded])[0]
        return f"Prediction: {decoded}"

    except Exception as e:
        return f"Error: {e}"

# Gradio Interface
# UI components: an audio widget that hands the callback a file path,
# and a text box that shows the returned message
voice_input = gr.Audio(type="filepath", label="Upload or Record your voice")
prediction_box = gr.Textbox(label="Stress Prediction")

# Gradio Interface wiring the components to predict_stress
demo = gr.Interface(
    fn=predict_stress,
    inputs=voice_input,
    outputs=prediction_box,
    title="Depression & Stress Detection from Speech",
    description="Record a short voice sample. The model predicts stress level.",
)

# share=True also requests a temporary public URL for the app
demo.launch(share=True)

Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://6c2c1e896e75a139e0.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)


