# Installing Whisper

The commands below will install the Python packages needed to use Whisper models and evaluate the transcription results.

In [2]:
! pip install git+https://github.com/openai/whisper.git
! pip install jiwer

Collecting git+https://github.com/openai/whisper.git
  Cloning https://github.com/openai/whisper.git to /tmp/pip-req-build-vapr3z7l
  Running command git clone --filter=blob:none --quiet https://github.com/openai/whisper.git /tmp/pip-req-build-vapr3z7l
  Resolved https://github.com/openai/whisper.git to commit 517a43ecd132a2089d85f4ebc044728a71d49f6e
  Installing build dependencies ... done
  Getting requirements to build wheel ... done
  Preparing metadata (pyproject.toml) ... done


chat code: transcribing a custom MP3 file with Whisper

In [8]:
# First, upload your MP3 file to Colab
import torch
import whisper
import pandas as pd
import jiwer
from whisper.normalizers import EnglishTextNormalizer
from tqdm import tqdm

# Path of the audio file to transcribe. The path is hard-coded, so the file
# must already be present under sample_data/ before this cell runs.
# Alternative sample, kept for quick switching:
# mp3_file = "sample_data/fireworks.mp3"
mp3_file = "sample_data/can-you-feel.mp3"
# Custom Audio Dataset Class for your MP3
class CustomAudioDataset(torch.utils.data.Dataset):
    """Single-item dataset exposing one audio file as a Whisper log-Mel spectrogram.

    The audio is loaded and converted once in ``__init__``; ``__getitem__``
    then hands back the cached spectrogram together with an empty reference
    transcript, so the dataset can be fed to a ``DataLoader`` in the same way
    as a labelled speech corpus.

    Args:
        audio_path: Path of the audio file, forwarded to ``whisper.load_audio``.
        device: Device on which the waveform and spectrogram tensors are
            placed. ``None`` (the default) selects ``"cuda"`` when a GPU is
            available and ``"cpu"`` otherwise, so the class also works on
            CPU-only runtimes.

    Attributes:
        audio_path: The path that was passed in.
        device: The device actually used.
        audio: Padded/trimmed waveform as a float tensor on ``device``.
        mel: Log-Mel spectrogram computed from ``audio``.
    """

    def __init__(self, audio_path, device=None):
        if device is None:
            # Do not assume a GPU: fall back to the CPU when CUDA is missing.
            device = "cuda" if torch.cuda.is_available() else "cpu"

        self.audio_path = audio_path
        self.device = device

        # Load and preprocess audio. pad_or_trim fits the waveform to the
        # fixed-length window the Whisper encoder expects, so audio beyond
        # that window is dropped here.
        audio = whisper.load_audio(audio_path)
        audio_tensor = torch.from_numpy(whisper.pad_or_trim(audio)).float().to(device)
        self.audio = audio_tensor
        self.mel = whisper.log_mel_spectrogram(audio_tensor)

    def __len__(self):
        """Return 1: the dataset wraps exactly one audio file."""
        return 1

    def __getitem__(self, index):
        """Return ``(mel, "")`` for the single item.

        Raises:
            IndexError: If ``index`` is anything other than ``0`` or ``-1``.
                Without this, plain iteration over the dataset (which relies
                on ``IndexError`` to stop) would never terminate.
        """
        if index not in (0, -1):
            raise IndexError(f"CustomAudioDataset holds a single item; got index {index}")
        return self.mel, ""  # Empty string stands in for the missing reference text

# Load the English-only "medium.en" checkpoint first, so the dataset tensors
# can be placed on whichever device the model was loaded onto.
model = whisper.load_model("medium.en")

# Create dataset and loader for your MP3 (tensors live on the model's device)
custom_dataset = CustomAudioDataset(mp3_file, device=model.device)
loader = torch.utils.data.DataLoader(custom_dataset, batch_size=1)

# Decoding options are loop-invariant, so build them once.
# - without_timestamps=True: plain-text decoding (only result.text is used below).
# - fp16 is enabled only on CUDA; on CPU the decode runs in fp32.
options = whisper.DecodingOptions(
    language="en",
    without_timestamps=True,
    fp16=model.device.type == "cuda",
)

# Modified processing loop for custom audio
hypotheses = []
references = []

for mels, texts in loader:
    results = model.decode(mels, options)

    hypotheses.extend(decoded.text for decoded in results)
    references.extend(texts)  # Will be empty strings

# decode() only sees the single window kept by pad_or_trim in the dataset,
# so this is the transcription of the start of the file, without timestamps.
print("Full Transcription:")
print(hypotheses[0])

# For segment-level timestamps over the whole file, use transcribe() instead:
result = model.transcribe(mp3_file)
for segment in result["segments"]:
    print(f"[{segment['start']:.2f}s - {segment['end']:.2f}s] {segment['text']}")

100%|█████████████████████████████████████| 1.42G/1.42G [00:24<00:00, 62.4MiB/s]


Full Transcription:
Hit, hit, hit, hit, hit, hit, hit, hit, hit, hit, hit, hit, hit, hit, hit, hit, hit, hit, hit, hit, hit, hit, hit, hit, hit, hit, hit, hit, hit, hit, hit, hit, hit, hit, hit, hit, hit, hit, hit, hit, hit, hit, hit, hit, hit, hit, hit, hit, hit, hit, hit, hit, hit, hit, hit, hit, hit, hit, hit, hit, hit, hit, hit, hit, hit, hit, hit, hit, hit, hit, hit, hit, hit, hit, hit, hit, hit, hit, hit, hit, hit, hit, hit, hit, hit, hit, hit, hit, hit, hit, hit, hit, hit, hit, hit, hit, hit, hit, hit, hit, hit, hit, hit, hit, hit, hit, hit, hit, hit, hit, hit, hit,
[30.00s - 48.00s]  Can you hear the silence? Can you see the dark? Can you fix the broken? Can you feel, can you feel the heart?
[60.00s - 74.00s]  Can you help the hopeless? Will I beg it on my knees? Can you save my bastard soul? Will you wait for me?
[75.00s - 82.00s]  I'm sorry brother, so sorry lover, forgive me father, I love you mother
[83.00s - 93.00s]  Can you hear the silence? Can you see the dark? Can you 

In [10]:
import torch
import whisper
import pandas as pd
import jiwer
from tqdm import tqdm
import re

# Custom normalizer for Polish text
class PolishTextNormalizer:
    """Callable that lower-cases text, removes punctuation and collapses whitespace.

    The ordered ``(pattern, replacement)`` rules are stored in
    ``self.replacements`` and applied one after another by ``__call__``.
    """

    def __init__(self):
        punctuation_rule = (r'[.,!?;:()„”"«»—\-–]', '')  # delete punctuation marks
        whitespace_rule = (r'\s+', ' ')  # squeeze whitespace runs into one space
        self.replacements = [punctuation_rule, whitespace_rule]

    def __call__(self, text):
        """Return ``text`` lower-cased, with every rule applied and the ends trimmed."""
        normalized = text.lower()
        for regex, substitute in self.replacements:
            normalized = re.compile(regex).sub(substitute, normalized)
        return normalized.strip()

# Audio file to transcribe (upload it to the runtime first)
mp3_file = "sample_data/7dam.mp3"

# Multilingual checkpoint, placed on the GPU when one is available
device = "cuda" if torch.cuda.is_available() else "cpu"
model = whisper.load_model("medium", device=device)

# Decoding settings for Polish, gathered in one place and forwarded to transcribe()
transcribe_kwargs = dict(
    language="pl",  # force Polish instead of relying on language detection
    task="transcribe",
    word_timestamps=True,
    beam_size=5,
    fp16=torch.cuda.is_available(),  # half precision only when running on CUDA
)
result = model.transcribe(mp3_file, **transcribe_kwargs)

# One hypothesis string per segment; no reference lyrics are loaded in this cell
hypotheses = [segment['text'].strip() for segment in result['segments']]

# Build the table in a single chain: raw text, NaN guard, then the normalised column
normalizer = PolishTextNormalizer()
data = (
    pd.DataFrame({'hypothesis': hypotheses})
    .fillna('')
    .assign(hypothesis_clean=lambda frame: frame["hypothesis"].apply(normalizer))
)

# Print each segment with its start and end time
print("\nTimestamped Segments:")
for seg in result['segments']:
    print(f"[{seg['start']:.2f}s - {seg['end']:.2f}s] {seg['text']}")


Timestamped Segments:
[5.06s - 12.02s]  I nie wiem, czy to błąd, że chcę je na chwilę, czy to w jedną stronę bilet, obawiam się, że tak
[12.02s - 18.58s]  Nie wiem, czy coś ze mną jest nie tak, że chcę siedem dami, tak na końcu zostanę tu sam
[18.58s - 25.06s]  I nie wiem, czy to błąd, że chcę ich aż tyle, ale nie, że pojedynczo, tylko wszystkie na rad
[25.06s - 31.98s]  Nie wiem, czy coś ze mną jest nie tak, że chcę siedem dami, w mojej głowie mam najlepszy świat
[33.32s - 36.80s]  Daj mi jedną szansę, ona mówi do mnie siedem dam, siebie dam
[37.46s - 40.18s]  Ale ja nie chcę ciebie, tylko więcej szans, żeby żyć
[40.18s - 43.46s]  Wymarzyłem sobie stan, którego nie spełnię, więc skupię nie mam na myśl
[43.46s - 46.78s]  Nikt nie zabierze dostępu do marzeń, jestem w wiarze, projektuję sny
[46.78s - 50.12s]  My chcemy żyć na poważnie, ej, nie chcemy się bawić, ej, ej
[50.12s - 53.42s]  A zaraz chcemy się bawić, nas już nie da się naprawić, ej, ej
[53.42s - 56.70s]  Cztery na cztery jes