# Speech Emotion Detection (Machine Learning)

Author: Yogesh (Yogesh6126)

This notebook builds a simple ML pipeline for speech emotion detection using synthetic audio samples. It is designed to run in Google Colab.

In [None]:
# Install dependencies in Colab/local environment (run once)
!pip install --quiet -r requirements.txt


In [None]:
import os
import numpy as np
import pandas as pd
import soundfile as sf
import matplotlib.pyplot as plt

# Create folder for audio samples
os.makedirs('audio_samples', exist_ok=True)

# Sampling params
sr = 16000      # sample rate in Hz
duration = 1.5  # seconds

def make_tone(freq, sr=sr, duration=duration, amplitude=0.5, phase=0.0):
    """Return a pure sine tone as a 1-D float array.

    Parameters
    ----------
    freq : float
        Tone frequency in Hz.
    sr : int
        Sample rate in Hz. The default is captured from the module-level
        ``sr`` when this function is defined.
    duration : float
        Tone length in seconds. The default is captured from the module-level
        ``duration`` when this function is defined.
    amplitude : float
        Peak amplitude of the sine.
    phase : float
        Initial phase in radians.

    Returns
    -------
    numpy.ndarray
        ``round(sr * duration)`` samples of
        ``amplitude * sin(2*pi*freq*t + phase)``.
    """
    # Round instead of truncating: floating-point error can leave sr*duration
    # a hair below the intended integer (100 * 0.29 == 28.999999999999996),
    # in which case a bare int() silently drops the last sample.
    n_samples = int(round(sr * duration))
    t = np.linspace(0, duration, n_samples, endpoint=False)
    return amplitude * np.sin(2*np.pi*freq*t + phase)

# Generate synthetic audio for emotions.
# Each recipe is: (pitch range in Hz, amplitude range,
#                  optional (frequency ratio, gain) of an extra partial,
#                  optional std-dev of additive Gaussian noise).
emotions = ['happy','sad','angry','neutral']
recipes = {
    'happy':   ((300, 600), (0.4, 0.8), (1.01, 0.02), None),  # higher pitch + faint detuned partial
    'sad':     ((120, 220), (0.1, 0.4), None,         None),  # lower pitch, quiet, no noise
    'angry':   ((250, 450), (0.6, 1.0), None,         0.15),  # high energy with strong noise
    'neutral': ((180, 300), (0.2, 0.5), None,         0.01),  # mid pitch with slight noise
}
samples_per_emotion = 15

np.random.seed(42)
rows = []
for emo in emotions:
    pitch_range, amp_range, partial, noise_std = recipes[emo]
    for i in range(samples_per_emotion):
        # The draw order (pitch, amplitude, then noise) fixes the seeded output.
        pitch = np.random.uniform(*pitch_range)
        level = np.random.uniform(*amp_range)
        wave = make_tone(pitch, amplitude=level)
        if partial is not None:
            ratio, gain = partial
            wave = wave + gain * make_tone(pitch * ratio, amplitude=1.0)
        if noise_std is not None:
            wave = wave + noise_std * np.random.randn(len(wave))

        # NOTE(review): 'angry' waveforms can exceed +/-1.0 (amplitude up to 1.0
        # plus noise); confirm how sf.write treats out-of-range samples.
        fname = f'audio_samples/{emo}_{i}.wav'
        sf.write(fname, wave, sr)
        rows.append({'filename': fname, 'emotion': emo})

# Persist the filename -> label manifest that the later cells read back.
df = pd.DataFrame(rows)
df.to_csv('metadata.csv', index=False)
print('Created', len(df), 'synthetic audio samples and metadata.csv')


In [None]:
# Feature extraction using librosa
import librosa
import numpy as np
import pandas as pd

def extract_features(path):
    """Compute one fixed-length feature vector for an audio file.

    Parameters
    ----------
    path : str
        Path of the audio file to load.

    Returns
    -------
    numpy.ndarray
        16 values: the per-coefficient time-average of 13 MFCCs, followed by
        the mean spectral centroid, mean zero-crossing rate and mean RMS energy.
    """
    # sr=None keeps the file's native sample rate instead of resampling.
    signal, rate = librosa.load(path, sr=None)
    mfcc_mean = librosa.feature.mfcc(y=signal, sr=rate, n_mfcc=13).mean(axis=1)
    spec_cent = librosa.feature.spectral_centroid(y=signal, sr=rate).mean()
    zcr = librosa.feature.zero_crossing_rate(y=signal).mean()
    # `y` must be passed by keyword: it is keyword-only in librosa >= 0.10,
    # where a positional call raises TypeError.
    rms = librosa.feature.rms(y=signal).mean()
    return np.concatenate([mfcc_mean, [spec_cent, zcr, rms]])


meta = pd.read_csv('metadata.csv')
# Local names inside extract_features keep the waveform from clobbering the
# notebook-level `sr` and the label vector `y` defined below.
features = [extract_features(path) for path in meta['filename']]

feat_names = [f'mfcc_{i+1}' for i in range(13)] + ['spec_centroid','zcr','rms']
X = pd.DataFrame(features, columns=feat_names)
y = meta['emotion']
print('Feature matrix shape:', X.shape)
X.head()


In [None]:
# Train-test split and model training (RandomForest)
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix

# Hold out 30% of the samples; stratifying on the labels keeps the class
# proportions the same in both splits.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
    stratify=y,
)

# Fit a 100-tree random forest on the training portion.
clf = RandomForestClassifier(n_estimators=100, random_state=42).fit(X_train, y_train)

# Score the held-out portion: per-class metrics, then the confusion matrix
# with rows/columns ordered by clf.classes_.
y_pred = clf.predict(X_test)
report = classification_report(y_test, y_pred)
print(report)
cm = confusion_matrix(y_test, y_pred, labels=clf.classes_)
print('Confusion matrix (rows=true, cols=pred):', cm, sep='\n')


In [None]:
# Save predictions.csv
# Attach the true and predicted labels to a copy of the test features
# (X_test itself is left untouched) and write the table to disk.
pred_df = X_test.assign(true=y_test.values, predicted=y_pred)
pred_df.to_csv('predictions.csv', index=False)
print('Saved predictions.csv with', len(pred_df), 'rows')


In [None]:
# Simple plot of feature importance
import matplotlib.pyplot as plt
import numpy as np
importances = clf.feature_importances_

# Rank features from most to least important and keep the first ten.
ranking = np.argsort(importances)[::-1]
inds = ranking[:10]
positions = range(len(inds))
tick_labels = [X.columns[i] for i in inds]

plt.figure(figsize=(8,5))
plt.bar(positions, importances[inds])
plt.title('Top 10 Feature importances')
plt.xticks(positions, tick_labels, rotation=45)
plt.tight_layout()
plt.show()


## Notes
- The notebook generates its own synthetic audio, so once the dependencies are installed it runs end to end (in Colab or locally) without any external dataset.
- For production-level experiments, replace `audio_samples` and `metadata.csv` with a real dataset (e.g., RAVDESS or CREMA-D); `metadata.csv` must provide `filename` and `emotion` columns.
- This uses classical ML (RandomForest) for speed and simplicity; you can switch to deep learning easily.