In [1]:
import tensorflow as tf
import numpy as np
import librosa
import random

# Simulate audio features for 5 scene classes (e.g., siren, dog, drilling, engine, children)

In [2]:
def generate_urban_audio_data(classes=5, samples_per_class=40, max_len=100, n_mfcc=20):
    X, y = [], []
    for label in range(classes):
        for _ in range(samples_per_class):
            freq = random.uniform(200, 1000) + label * 50  # Vary frequency per class
            signal = np.sin(np.linspace(0, 2 * np.pi * freq, 22050))
            mfcc = librosa.feature.mfcc(y=signal.astype(np.float32), sr=22050, n_mfcc=n_mfcc)
            mfcc = mfcc.T[:max_len]
            if mfcc.shape[0] < max_len:
                pad = max_len - mfcc.shape[0]
                mfcc = np.pad(mfcc, ((0, pad), (0, 0)), mode='constant')
            X.append(mfcc)
            y.append(label)
    return np.array(X), tf.keras.utils.to_categorical(y, num_classes=classes)

# Create dataset

In [3]:
X, y = generate_urban_audio_data()
X = X[..., np.newaxis]  # Add channel dimension

# Split into train and test sets

In [4]:
split = int(0.8 * len(X))
X_train, X_test = X[:split], X[split:]
y_train, y_test = y[:split], y[split:]

# Build CNN model for scene classification

In [5]:
model = tf.keras.Sequential([
    tf.keras.layers.Conv2D(32, (3, 3), activation='relu', input_shape=X.shape[1:]),
    tf.keras.layers.MaxPooling2D((2, 2)),
    tf.keras.layers.Conv2D(64, (3, 3), activation='relu'),
    tf.keras.layers.GlobalAveragePooling2D(),
    tf.keras.layers.Dense(64, activation='relu'),
    tf.keras.layers.Dense(y.shape[1], activation='softmax')
])

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


# Compile and train

In [6]:
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
model.fit(X_train, y_train, epochs=10, validation_split=0.2)

Epoch 1/10
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 96ms/step - accuracy: 0.1823 - loss: 2.1172 - val_accuracy: 0.0000e+00 - val_loss: 4.1326
Epoch 2/10
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 65ms/step - accuracy: 0.3417 - loss: 1.5581 - val_accuracy: 0.0000e+00 - val_loss: 4.9868
Epoch 3/10
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 40ms/step - accuracy: 0.2823 - loss: 1.4189 - val_accuracy: 0.0000e+00 - val_loss: 4.1563
Epoch 4/10
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 52ms/step - accuracy: 0.3385 - loss: 1.3333 - val_accuracy: 0.0000e+00 - val_loss: 3.4131
Epoch 5/10
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 51ms/step - accuracy: 0.3885 - loss: 1.2466 - val_accuracy: 0.0000e+00 - val_loss: 2.5640
Epoch 6/10
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 43ms/step - accuracy: 0.3604 - loss: 1.3088 - val_accuracy: 0.0000e+00 - val_loss: 2.0541
Epoch 7/10
[1m4/4[0m

<keras.src.callbacks.history.History at 0x1b6af302000>

# Evaluate model

In [7]:
loss, acc = model.evaluate(X_test, y_test, verbose=0)
print(f"🏙️ Urban Sound Scene Classification Accuracy: {acc:.2f}")

🏙️ Urban Sound Scene Classification Accuracy: 0.00
