In [None]:
# Mount Google Drive at /content/drive; later cells read the input audio from,
# and write the adversarial result to, /content/drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
!pip install torchcodec

Collecting torchcodec
  Downloading torchcodec-0.8.1-cp312-cp312-manylinux_2_28_x86_64.whl.metadata (9.7 kB)
Downloading torchcodec-0.8.1-cp312-cp312-manylinux_2_28_x86_64.whl (2.0 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.0/2.0 MB[0m [31m27.7 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: torchcodec
Successfully installed torchcodec-0.8.1


In [None]:
import torch
import torchaudio
from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor

# Setup: pick the compute device and load the pretrained wav2vec2 CTC model.
device = "cuda" if torch.cuda.is_available() else "cpu"
model_name = "jonatasgrosman/wav2vec2-large-xlsr-53-english"

processor = Wav2Vec2Processor.from_pretrained(model_name)
model = Wav2Vec2ForCTC.from_pretrained(model_name).to(device)
model.eval()
# Only the input perturbation is optimized. Freezing the weights stops every
# backward pass from computing (and accumulating) a gradient for each model
# parameter, which costs memory and time without affecting the input gradient.
model.requires_grad_(False)

# Load Data
speech_array, sampling_rate = torchaudio.load("/content/drive/MyDrive/demo.mp3")

# torchaudio returns (channels, time). Downmix multi-channel audio to mono so
# the waveform is always a single utterance of shape (1, time); otherwise a
# stereo file would be interpreted as a batch of two utterances while there is
# only one target sequence, and the CTC loss would fail on the size mismatch.
if speech_array.ndim == 1:
    speech_array = speech_array.unsqueeze(0)
speech_mono = speech_array.mean(dim=0, keepdim=True)

# The model expects 16 kHz input.
resampler = torchaudio.transforms.Resample(orig_freq=sampling_rate, new_freq=16000)
waveform = resampler(speech_mono)

# Prepare Target
# The target is written in lowercase with numbers spelled out: per the original
# author's note, letter case is not distinguished in the model's vocabulary.
target_sentence = "wav two vec two a s r model is under attack"
target_ids = processor(
    text=target_sentence,
    return_tensors="pt"
).input_ids.to(device)

# Prepare Audio: shape (1, time) on the compute device.
waveform = waveform.to(device)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


preprocessor_config.json:   0%|          | 0.00/262 [00:00<?, ?B/s]

config.json: 0.00B [00:00, ?B/s]

vocab.json:   0%|          | 0.00/300 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/85.0 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.26G [00:00<?, ?B/s]

In [None]:
def differentiable_normalize(waveform, eps=1e-7):
    """Normalize a waveform to zero mean and unit variance along its last axis.

    This is a differentiable (pure-torch) stand-in for the normalization the
    Hugging Face wav2vec2 feature extractor applies in NumPy, so gradients can
    flow from the model back to the raw audio samples.

    Args:
        waveform: Tensor whose last dimension is time; statistics are computed
            independently for every leading index.
        eps: Small constant added to the variance before the square root to
            avoid division by zero. The default matches the 1e-7 used by
            ``Wav2Vec2FeatureExtractor``, so optimizing through this function
            sees the same input the processor produces at verification time.

    Returns:
        Tensor of the same shape as ``waveform``.
    """
    mean = waveform.mean(dim=-1, keepdim=True)
    # Population variance (no Bessel correction), matching numpy's default var().
    var = waveform.var(dim=-1, keepdim=True, unbiased=False)
    return (waveform - mean) / torch.sqrt(var + eps)

In [None]:
def compute_snr_db(original, perturbed):
    """Return the signal-to-noise ratio, in decibels, of ``perturbed`` vs ``original``.

    The noise is the element-wise difference ``perturbed - original``; signal and
    noise power are the mean of the squared samples over all elements. A tiny
    constant is added to the noise power so identical inputs do not divide by zero.
    """
    residual = perturbed - original
    power_of_signal = torch.mean(original ** 2)
    power_of_noise = torch.mean(residual ** 2)
    power_ratio = power_of_signal / (power_of_noise + 1e-12)
    return 10 * torch.log10(power_ratio)

In [None]:
# Perturbation budget: the noise power may not exceed the signal power divided
# by 10^(min_snr_db / 10), i.e. the SNR must stay at or above min_snr_db.
min_snr_db = 22.0
signal_power = waveform.pow(2).mean()
max_noise_power = signal_power / (10 ** (min_snr_db / 10))

# Optimization Setup
steps = 500
learning_rate = 0.005
log_every = 100        # progress is printed (and convergence checked) every N steps
loss_threshold = 0.05  # CTC loss below which the target is considered reached

# The additive perturbation is the only optimized variable.
delta = torch.zeros_like(waveform, requires_grad=True, device=device)
optimizer = torch.optim.Adam([delta], lr=learning_rate)

# Loop-invariant CTC arguments. The tokenizer's pad token id is used as the
# CTC blank index.
blank_id = processor.tokenizer.pad_token_id
target_lengths = torch.full((target_ids.shape[0],), target_ids.shape[1], dtype=torch.long, device=device)

print(f"Targeting: '{target_sentence}'")
print(f"Constraint: SNR must stay above {min_snr_db} dB")
print("Starting Attack...")

for step in range(steps):
    optimizer.zero_grad()

    adv_raw = waveform + delta

    # Loss: CTC loss between the model's output for the perturbed audio and the
    # target transcription.
    adv_normalized = differentiable_normalize(adv_raw)
    logits = model(adv_normalized).logits
    log_probs = torch.nn.functional.log_softmax(logits, dim=-1)

    input_lengths = torch.full((logits.shape[0],), logits.shape[1], dtype=torch.long, device=device)

    loss = torch.nn.functional.ctc_loss(
        log_probs.transpose(0, 1),  # ctc_loss expects (time, batch, classes)
        target_ids,
        input_lengths,
        target_lengths,
        blank=blank_id,
        zero_infinity=True
    )

    # Report and test for convergence BEFORE updating delta, so that the loss,
    # the printed SNR and the perturbation kept on early exit all refer to the
    # same delta. That delta was already projected at the end of the previous
    # step, so it satisfies both the SNR and the [-1, 1] constraints.
    if step % log_every == 0:
        current_snr = compute_snr_db(waveform, waveform + delta.detach())
        print(f"Step {step:04d} | CTC Loss: {loss.item():.4f} | SNR: {current_snr.item():.2f} dB")

        # Stop only if text is good
        if loss.item() < loss_threshold:
            print("Target reached with valid SNR.")
            break

    loss.backward()
    optimizer.step()

    # Project delta back onto the feasible set, modifying it in place under
    # no_grad (rather than through .data) so autograd is not bypassed silently.
    with torch.no_grad():
        # Check current noise power
        current_noise_power = delta.pow(2).mean()

        if current_noise_power > max_noise_power:
            # Rescale so the noise power equals the budget exactly.
            scale = torch.sqrt(max_noise_power / (current_noise_power + 1e-12))
            delta.mul_(scale)

        # Standard valid audio clip [-1, 1]
        delta.copy_((waveform + delta).clamp(-1.0, 1.0) - waveform)

print("Attack Finished.")

Targeting: 'wav two vec two a s r model is under attack'
Constraint: SNR must stay above 22.0 dB
Starting Attack...
Step 0000 | CTC Loss: 18.7194 | SNR: 32.22 dB
Step 0100 | CTC Loss: 1.1083 | SNR: 22.00 dB
Step 0200 | CTC Loss: 0.0081 | SNR: 22.00 dB
Target reached with valid SNR.
Attack Finished.


In [None]:
# --- Final Verification ---
# Re-run the adversarial audio through the standard processor + model pipeline
# (independent of the attack's own normalization) and compare the greedy CTC
# decoding against the target sentence.
print("\n--- Verification ---")
adv_audio_final = waveform.add(delta).detach()

# The processor performs its own input normalization, so it is given raw samples.
adv_numpy = adv_audio_final.squeeze().cpu().numpy()
inputs = processor(adv_numpy, sampling_rate=16000, return_tensors="pt", padding=True)
inputs = inputs.to(device)

with torch.no_grad():
    logits = model(inputs.input_values).logits

# Greedy decoding: most likely token per frame, collapsed by the tokenizer.
predicted_ids = logits.argmax(dim=-1)
transcription = processor.batch_decode(predicted_ids)[0]

final_snr_db = compute_snr_db(waveform, adv_audio_final).item()
print(f"Final SNR: {final_snr_db:.2f} dB")
print(f"Original Target: {target_sentence}")
print(f"Attack Result:   {transcription}")


--- Verification ---
Final SNR: 22.00 dB
Original Target: wav two vec two a s r model is under attack
Attack Result:   wav two vec two a s r model is under attack


In [None]:
# Write the adversarial waveform to Google Drive as a 16 kHz audio file.
save_path = "/content/drive/MyDrive/adv_result_CW.wav"

adv_audio_cpu = adv_audio_final.cpu()
torchaudio.save(save_path, adv_audio_cpu, sample_rate=16000)
print(f"Audio saved to: {save_path}")

Audio saved to: /content/drive/MyDrive/adv_result_CW.wav
