In [None]:
# Smoke test: emitted before the heavy imports below so it is visible that the
# kernel is executing this cell at all.
print("✅ Kernel started")

import torch
from datasets import load_dataset, Audio
from transformers import pipeline

print("torch:", torch.__version__)

# 1) Load a small slice of the test split (NOT streaming), so rows can be
#    indexed with ds[i] and counted with len(ds).
# NOTE(review): a head slice of a split is not guaranteed to be class-balanced;
# consider shuffling before slicing if the accuracy should be representative.
ds = load_dataset("superb", "ks", split="test[:20]")

# 2) Have `datasets` decode the audio column when a row is accessed
ds = ds.cast_column("audio", Audio(decode=True))

print("rows:", len(ds))
print("keys:", ds.column_names)

# Label-id -> label-name table, looked up once instead of on every row.
label_names = ds.features["label"].names

# 3) Load pre-trained audio classifier
clf = pipeline("audio-classification", model="superb/hubert-base-superb-ks")

# 4) Predict on 1 sample (demo)
sample = ds[0]
audio = sample["audio"]
true_label = label_names[sample["label"]]

# A fresh dict is passed so the dataset row itself is never handed to the pipeline.
preds = clf({"array": audio["array"], "sampling_rate": audio["sampling_rate"]})
top = preds[0]  # first entry of the pipeline output is used as the prediction

print("\n--- Single sample ---")
print("True:", true_label)
print("Pred:", top["label"], "conf:", round(top["score"], 3))

# 5) Evaluate accuracy on every row that was loaded
correct = 0
for s in ds:
    a = s["audio"]
    true = label_names[s["label"]]
    pred = clf({"array": a["array"], "sampling_rate": a["sampling_rate"]})[0]["label"]
    if pred == true:
        correct += 1

# The denominator is the number of rows actually evaluated, so changing the
# slice size in step 1 can never produce a wrong accuracy. An empty slice
# reports 0.0 instead of raising ZeroDivisionError.
N = len(ds)
acc = correct / N if N else 0.0
print(f"✅ Accuracy on {N} samples: {acc:.2%} ({correct}/{N})")


✅ Kernel started


In [2]:
from datasets import load_dataset

# Open the test split lazily; rows are produced only while iterating.
ds = load_dataset("superb", "ks", split="test", streaming=True)

# Pull the first three rows and report each label id as a connectivity check.
# `n` starts at 0 so it stays defined even if the stream yields nothing.
n = 0
for n, sample in enumerate(ds.take(3), start=1):
    print("ok sample", n, "label_id:", sample["label"])
print("✅ done")


ok sample 1 label_id: 10
ok sample 2 label_id: 10
ok sample 3 label_id: 10
✅ done


In [1]:
import torch
# Report which PyTorch build this kernel imported.
print("torch version:", torch.__version__)


torch version: 2.2.2


In [None]:
import numpy as np
import torch
from datasets import load_dataset
from transformers import AutoFeatureExtractor, AutoModelForAudioClassification

# --- Step 1: stream the test split and take only its first example ---
ds = load_dataset("superb", "ks", split="test", streaming=True)

sample = next(iter(ds))
audio, label_id = sample["audio"], sample["label"]
true_label = ds.features["label"].names[label_id]

print("True label:", true_label)
print("Sampling rate:", audio["sampling_rate"])
print("Num samples:", len(audio["array"]))

# --- Step 2: fetch the keyword-spotting checkpoint and its feature extractor ---
model_id = "superb/hubert-base-superb-ks"
feature_extractor = AutoFeatureExtractor.from_pretrained(model_id)
model = AutoModelForAudioClassification.from_pretrained(model_id)
model.eval()  # inference mode for layers such as dropout

# --- Step 3: build the model input and run a single forward pass ---
sr = int(audio["sampling_rate"])
x = np.asarray(audio["array"], dtype=np.float32)
inputs = feature_extractor(x, sampling_rate=sr, return_tensors="pt")

# Gradients are not needed for prediction.
with torch.no_grad():
    outputs = model(**inputs)
logits = outputs.logits
# Index [0] selects the single example in the batch.
probs = torch.softmax(logits, dim=-1)[0]

top_idx = torch.argmax(probs).item()
pred_label = model.config.id2label[top_idx]
pred_conf = probs[top_idx].item()
is_match = pred_label == true_label

print("\nPrediction:", pred_label, "| confidence:", round(pred_conf, 3))
print("Match:", is_match)


True label: _silence_
Sampling rate: 16000
Num samples: 16000


Some weights of the model checkpoint at superb/hubert-base-superb-ks were not used when initializing HubertForSequenceClassification: ['hubert.encoder.pos_conv_embed.conv.weight_g', 'hubert.encoder.pos_conv_embed.conv.weight_v']
- This IS expected if you are initializing HubertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing HubertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of HubertForSequenceClassification were not initialized from the model checkpoint at superb/hubert-base-superb-ks and are newly initialized: ['hubert.encoder.pos_conv_embed.conv.parametrizations.weight.original0', 'hubert.encoder.pos_conv_embed.conv.parametriza

In [1]:
# Liveness check: confirms the kernel still responds at this point in the notebook.
print("still alive ✅")


still alive ✅


In [None]:
import numpy as np, torch
from datasets import load_dataset
from transformers import AutoFeatureExtractor, AutoModelForAudioClassification

# Stream the test split and take only its first example.
ds = load_dataset("superb", "ks", split="test", streaming=True)
sample = next(iter(ds))

audio = sample["audio"]
true_label = ds.features["label"].names[sample["label"]]

# Load the keyword-spotting checkpoint and its feature extractor on CPU.
model_id = "superb/hubert-base-superb-ks"
fe = AutoFeatureExtractor.from_pretrained(model_id)
model = AutoModelForAudioClassification.from_pretrained(model_id).to("cpu")
model.eval()

x = np.asarray(audio["array"], dtype=np.float32)
sr = int(audio["sampling_rate"])

inputs = fe(x, sampling_rate=sr, return_tensors="pt")
with torch.no_grad():
    # Index [0] selects the single example in the batch.
    probs = torch.softmax(model(**inputs).logits, dim=-1)[0]

# Show up to five highest-probability classes. k is clamped to the number of
# classes because torch.topk raises when k exceeds the size of the dimension.
top5 = torch.topk(probs, k=min(5, probs.numel()))
for score, idx in zip(top5.values.tolist(), top5.indices.tolist()):
    label = model.config.id2label[idx]
    print(f"{label:15s} {score:.3f}")

pred = model.config.id2label[int(torch.argmax(probs))]
print("\nTrue:", true_label)
print("Pred:", pred)


Some weights of the model checkpoint at superb/hubert-base-superb-ks were not used when initializing HubertForSequenceClassification: ['hubert.encoder.pos_conv_embed.conv.weight_g', 'hubert.encoder.pos_conv_embed.conv.weight_v']
- This IS expected if you are initializing HubertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing HubertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of HubertForSequenceClassification were not initialized from the model checkpoint at superb/hubert-base-superb-ks and are newly initialized: ['hubert.encoder.pos_conv_embed.conv.parametrizations.weight.original0', 'hubert.encoder.pos_conv_embed.conv.parametriza

In [None]:
import numpy as np
import torch
from datasets import load_dataset
from transformers import AutoFeatureExtractor, AutoModelForAudioClassification

# Load dataset (non-streaming is easiest for small N)
# NOTE(review): a head slice of a split is not guaranteed to be class-balanced;
# consider shuffling before slicing if the accuracy should be representative.
ds = load_dataset("superb", "ks", split="test[:50]")  # only first 50 to keep it fast

# Label-id -> label-name table, looked up once instead of on every row.
label_names = ds.features["label"].names

# NOTE(review): the recorded output of this cell warns that some
# pos_conv_embed weights were "newly initialized"; confirm with the installed
# transformers/torch versions that the checkpoint is fully loaded before
# trusting the accuracy printed below.
model_id = "superb/hubert-base-superb-ks"
fe = AutoFeatureExtractor.from_pretrained(model_id)
model = AutoModelForAudioClassification.from_pretrained(model_id)
model.eval()

# Evaluate at most 10 rows, but never more than were actually loaded, so
# ds[i] cannot raise IndexError if the slice above is made smaller.
N = min(10, len(ds))
correct = 0

for i in range(N):
    sample = ds[i]
    audio = sample["audio"]
    x = np.asarray(audio["array"], dtype=np.float32)
    sr = int(audio["sampling_rate"])

    inputs = fe(x, sampling_rate=sr, return_tensors="pt")
    with torch.no_grad():
        logits = model(**inputs).logits
    # One example per forward pass, so argmax yields a single class id.
    pred_id = int(torch.argmax(logits, dim=-1))

    pred_label = model.config.id2label[pred_id]
    true_label = label_names[sample["label"]]

    is_correct = (pred_label == true_label)
    correct += int(is_correct)

    print(f"{i+1:02d} | true={true_label:15s} pred={pred_label:15s} {'✓' if is_correct else '✗'}")

# An empty slice reports 0.0 instead of raising ZeroDivisionError.
acc = correct / N if N else 0.0
print(f"\n✅ Accuracy on {N} samples: {acc:.2%} ({correct}/{N})")


Some weights of the model checkpoint at superb/hubert-base-superb-ks were not used when initializing HubertForSequenceClassification: ['hubert.encoder.pos_conv_embed.conv.weight_g', 'hubert.encoder.pos_conv_embed.conv.weight_v']
- This IS expected if you are initializing HubertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing HubertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of HubertForSequenceClassification were not initialized from the model checkpoint at superb/hubert-base-superb-ks and are newly initialized: ['hubert.encoder.pos_conv_embed.conv.parametrizations.weight.original0', 'hubert.encoder.pos_conv_embed.conv.parametriza

In [1]:
# Marker print: shows that execution reached this cell.
print("✅ END OF CELL REACHED")


✅ END OF CELL REACHED


## Reflection – Lab M1.04
Build Simple Neural Network & Audio Classification

### Teaching Models to See (Images)
How did your neural network learn to distinguish dogs from cats?

The neural network learned to distinguish dogs from cats by adjusting its internal weights during training using backpropagation. Each image was flattened into numerical pixel values, and during training the model compared its predictions with the true labels. The error (loss) was used to update weights so that patterns associated with dogs or cats became more strongly represented in the network.

Over multiple epochs, the model gradually reduced its loss and improved accuracy by reinforcing features that helped separate the two classes.

What patterns do you think it learned?

The model likely learned coarse, low-level visual patterns such as:

- edges and contours
- color distributions
- texture differences
- shape density and contrast

Because the images were flattened, the model could not explicitly understand spatial structure, but it could still detect recurring pixel combinations that statistically correlate with cats or dogs.

Why did flattening the image work? What information might be lost?

Flattening worked because it converts each image into a one-dimensional vector, which is the input format a simple fully connected neural network expects. The model can still learn from the pixel values, but it loses spatial relationships, such as:

- the relative position of eyes, ears, or body parts
- local neighborhoods of pixels

This is why convolutional neural networks (CNNs) generally perform better on image data.

### Teaching Models to Understand (Audio)
How is audio different from images as input?

Audio is a time-based signal, not a spatial one. Instead of pixels arranged in two dimensions, audio consists of a waveform sampled over time. This means:

- order and timing matter
- patterns unfold sequentially
- context depends on surrounding samples

Because of this, models must understand temporal dependencies rather than spatial layouts.

What do you think the pre-trained audio model learned during training?

The pre-trained HuBERT model learned:

- phonetic and acoustic patterns
- the temporal structure of speech
- distinctions between silence, words, and sound events

During large-scale pretraining, the model learned general audio representations that can later be reused for downstream tasks such as keyword spotting or intent recognition.

Why do we use pre-trained models instead of training from scratch?

Training audio models from scratch requires:

- massive datasets
- high computational cost
- long training time

Pre-trained models allow us to:

- reuse learned representations
- achieve good performance with minimal data
- focus on inference and application rather than raw training

This is more efficient and practical for real-world use cases.

### Transfer Learning
Why does the audio model work on a dataset it was not trained on?

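The model learned general audio features rather than memorizing specific samples. These features (such as frequency patterns and temporal dynamics) transfer well across datasets, allowing the model to perform reasonably even on recordings it has never seen. Note that the checkpoint used in this lab, `superb/hubert-base-superb-ks`, is (as its name indicates) a HuBERT model already adapted to keyword spotting, so it is the evaluation clips rather than the task itself that are new to it.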

What knowledge was transferred?

Transferred knowledge includes:

- how speech sounds are structured
- how silence differs from speech
- how temporal patterns map to semantic meaning

This is similar to how humans learn language sounds once and reuse that knowledge in new contexts.

How is this similar to human learning?

Humans do not relearn how to hear or see from scratch for every task. Similarly, pre-trained models reuse foundational knowledge and adapt it to new problems, which makes learning faster and more robust.

### Model Architecture
Why are the image and audio models structured differently?

Image models focus on spatial relationships, while audio models focus on temporal relationships. This difference calls for different architectures:

- dense layers for simple image baselines
- transformers for sequence modeling in audio

What makes transformers good for audio and text?

Transformers:

- handle long-range dependencies
- process all positions of a sequence in parallel, which makes them efficient on long inputs
- use attention to focus on the most relevant parts of the input

This makes them especially effective for speech and language tasks.

How do convolutional layers differ from transformer layers?

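- Convolutional layers apply the same small filter across the input, so they focus on local patterns.
- Transformer layers use attention across the whole sequence, so they focus on global context.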

Both are powerful, but transformers are more flexible for sequence-based data like audio and text.