# Installing Whisper

The commands below will install the Python packages needed to use Whisper models and evaluate the transcription results.

In [9]:
! pip install git+https://github.com/openai/whisper.git
! pip install jiwer

Collecting git+https://github.com/openai/whisper.git
  Cloning https://github.com/openai/whisper.git to /tmp/pip-req-build-e1v0mh_8
  Running command git clone --filter=blob:none --quiet https://github.com/openai/whisper.git /tmp/pip-req-build-e1v0mh_8
  Resolved https://github.com/openai/whisper.git to commit 517a43ecd132a2089d85f4ebc044728a71d49f6e
  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone


chat code

In [10]:
# First, upload your MP3 file to Colab


# Get the uploaded filename
mp3_file = "sample_data/one_time.mp3"

# Custom Audio Dataset Class for your MP3
class CustomAudioDataset(torch.utils.data.Dataset):
    def __init__(self, audio_path, device="cuda"):
        self.audio_path = audio_path
        self.device = device

        # Load and preprocess audio
        audio = whisper.load_audio(audio_path)
        audio_tensor = torch.from_numpy(whisper.pad_or_trim(audio)).float().to(device)
        self.audio = audio_tensor
        self.mel = whisper.log_mel_spectrogram(audio_tensor)

    def __len__(self):
        return 1  # Only one audio file

    def __getitem__(self, index):
        return self.mel, ""  # Return empty string as reference text

# Create dataset and loader for your MP3
custom_dataset = CustomAudioDataset(mp3_file)
loader = torch.utils.data.DataLoader(custom_dataset, batch_size=1)

# Load model (using base.en for English)
model = whisper.load_model("base.en")

# Modified processing loop for custom audio
hypotheses = []
references = []

for mels, texts in loader:
    # Decode with timestamps enabled
    options = whisper.DecodingOptions(language="en", without_timestamps=torch._functionalize_set_storage_changed)
    results = model.decode(mels, options)

    hypotheses.extend([result.text for result in results])
    references.extend(texts)  # Will be empty strings

# Show full transcription with timestamps
print("Full Transcription:")
print(hypotheses[0])

# For detailed word-level timestamps, use transcribe() instead:
result = model.transcribe(mp3_file)
for segment in result["segments"]:
    print(f"[{segment['start']:.2f}s - {segment['end']:.2f}s] {segment['text']}")

Full Transcription:
A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A,
[0.00s - 8.00s]  A, A, A, A, A, A, A, A, A, A, A, A, A, B, plus you
[8.00s - 20.00s]  I'ma tell you one time, one time, one time, one time, one time, one time, one time
[20.00s - 23.00s]  When a match got my heart went knock knock
[23.00s - 26.00s]  Now the butterflies in my stomach won't stop stop
[26.00s - 29.00s]  And even though it's a struggle of us all we got
[29.00s - 32.00s]  So we gon' keep keep climbing to the mountain top
[32.00s - 40.00s]  Your world is my world and my fight is your fight
[40.00s - 44.00s]  My breath is your breath when you're heard
[44.00s - 50.00s]  I'm gonna yell my one love, my one heart, my one life
[50

In [11]:
# import torch
# import whisper
# import pandas as pd
# import jiwer
# from whisper.normalizers import EnglishTextNormalizer
# from tqdm import tqdm

# # First, upload your MP3 file to Colab
# mp3_file = "sample_data/one_time.mp3"
# lyrics_file = "sample_data/one_time.txt"

# # Load the model with better configuration
# device = "cuda" if torch.cuda.is_available() else "cpu"
# model = whisper.load_model("small.en", device=device)  # Upgrade to small model

# # Transcribe with improved parameters for long audio
# result = model.transcribe(
#     mp3_file,
#     word_timestamps=True,
#     verbose=False,  # Disable default progress
#     fp16=False,  # More stable for long files
#     beam_size=5,  # Better accuracy
#     no_speech_threshold=0.3,  # Reduce early stopping
#     condition_on_previous_text=True  # Maintain context
# )

# # Show full duration processed
# audio = whisper.load_audio(mp3_file)
# duration = len(audio) / 16000  # Sample rate is 16kHz
# print(f"Processed {duration:.2f} seconds of audio")

# # Load original lyrics
# with open(lyrics_file, "r") as f:
#     original_lyrics = [line.strip() for line in f.readlines() if line.strip()]

# # Prepare hypotheses with time info
# segments = []
# for seg in result['segments']:
#     segments.append({
#         'start': seg['start'],
#         'end': seg['end'],
#         'text': seg['text'].strip()
#     })

# # Create DataFrame with timing info
# df = pd.DataFrame(segments)

# # Add original lyrics (assuming time-aligned)
# # df['reference'] = pd.Series(original_lyrics[:len(df)])  # Temporary alignment
# df['reference'] = original_lyrics[:len(df)]  # Positional alignment
# df['reference'] = df['reference'].fillna('')  # Critical fix

# # Normalize text
# normalizer = EnglishTextNormalizer()
# df['hypothesis_clean'] = df['text'].apply(normalizer)
# df['reference_clean'] = df['reference'].apply(normalizer)

# # Calculate Word Error Rate with alignment tolerance
# wer = jiwer.wer(
#     list(df['reference_clean'].dropna()),
#     list(df['hypothesis_clean'].dropna())
# )
# print(f"\nWord Error Rate: {wer * 100:.2f}%\n")

# # Improved alignment display
# print("Time-Aligned Comparison:")
# pd.set_option('display.max_rows', 100)
# display(df[['start', 'end', 'reference', 'text']])

# # Save full results
# df.to_csv("transcription_results.csv", index=False)
# print("Saved full results to transcription_results.csv")

In [13]:
import torch
import whisper
import pandas as pd
import jiwer
from whisper.normalizers import EnglishTextNormalizer
from tqdm import tqdm

# First, upload your MP3 file to Colab
mp3_file = "sample_data/one_time.mp3"
lyrics_file = "sample_data/one_time.txt"

# Load the model
device = "cuda" if torch.cuda.is_available() else "cpu"
model = whisper.load_model("base.en", device=device)

# Transcribe with timestamps
result = model.transcribe(mp3_file, word_timestamps=True)

# Load original lyrics
with open(lyrics_file, "r") as f:
    original_lyrics = [line.strip() for line in f.readlines() if line.strip()]

# Prepare hypotheses and references
hypotheses = [seg['text'].strip() for seg in result['segments']]
references = original_lyrics[:len(hypotheses)]  # Align by position

# Normalize text
normalizer = EnglishTextNormalizer()
data = pd.DataFrame({
    'reference': references,
    'hypothesis': hypotheses
})

data["hypothesis_clean"] = data["hypothesis"].apply(normalizer)
data["reference_clean"] = data["reference"].apply(normalizer)

# Calculate Word Error Rate
wer = jiwer.wer(
    list(data["reference_clean"]),
    list(data["hypothesis_clean"])
)
print(f"\nWord Error Rate: {wer * 100:.2f}%\n")

# Display alignment comparison
print("Alignment Comparison:")
data[['reference', 'hypothesis']]




Word Error Rate: 127.22%

Alignment Comparison:


Unnamed: 0,reference,hypothesis
0,"Ay ay ay ay ay ay ay, let's go","A, A, A, A, A, A, A, A, A, A, A, A, A, B, plus..."
1,Me plus you I'mma tell you one time,"I'ma tell you one time, one time, one time, on..."
2,I'mma tell you one time,When a match got my heart went knock knock
3,"I'mma tell you one time, one time",Now the butterflies in my stomach won't stop stop
4,When I met you girl my heart went knock knock,And even though it's a struggle of us all we got
5,Now them butterflies in my stomach won't stop ...,So we gon' keep keep climbing to the mountain ...
6,Even though it's a struggle love is all we got,Your world is my world and my fight is your fight
7,So we gon' keep keep climbing 'til the mountai...,My breath is your breath when you're hurt
8,Your world is my world,"I'm gonna yell my one love, my one heart, my o..."
9,And my fight is your fight,"Let me tell you one time, I'ma tell you one time"


In [14]:
# Show timestamped segments
print("\nTimestamped Segments:")
for seg in result['segments']:
    print(f"[{seg['start']:.2f}s - {seg['end']:.2f}s] {seg['text']}")




Timestamped Segments:
[0.00s - 7.56s]  A, A, A, A, A, A, A, A, A, A, A, A, A, B, plus you
[8.24s - 19.18s]  I'ma tell you one time, one time, one time, one time, one time, one time, one time
[20.00s - 23.18s]  When a match got my heart went knock knock
[23.18s - 26.40s]  Now the butterflies in my stomach won't stop stop
[26.40s - 29.30s]  And even though it's a struggle of us all we got
[29.30s - 32.60s]  So we gon' keep keep climbing to the mountain town
[32.60s - 39.60s]  Your world is my world and my fight is your fight
[39.60s - 44.60s]  My breath is your breath when you're hurt
[44.60s - 50.98s]  I'm gonna yell my one love, my one heart, my one life for sure
[50.98s - 56.16s]  Let me tell you one time, I'ma tell you one time
[56.16s - 61.64s]  And I'ma be a one guy, you be my number one girl
[61.64s - 65.92s]  Always making time for you, I'ma tell you one time
[65.92s - 69.80s]  I'ma tell you one time, I'ma tell you one time
[69.80s - 72.06s]  And I'ma tell you one time, you love

In [15]:

from difflib import ndiff
print("\nDetailed Differences:")
for idx, row in data.iterrows():
    ref = row['reference_clean'].split()
    hyp = row['hypothesis_clean'].split()
    diff = ndiff(ref, hyp)
    print(f"\nSegment {idx+1}:")
    print(' '.join([d[2:] if d.startswith('  ') else f'[{d}]' for d in diff]))


Detailed Differences:

Segment 1:
[- ay] [- ay] [- ay] [- ay] [- ay] [- ay] [- ay] [- let] [- us] [- go] [+ a] [+ a] [+ a] [+ a] [+ a] [+ a] [+ a] [+ a] [+ a] [+ a] [+ a] [+ a] [+ a] [+ b] [+ plus] [+ you]

Segment 2:
[- me] [- plus] [- you] i [- mma] [+ am] [+ going] [+ to] tell you one time [+ one] [+ time] [+ one] [+ time] [+ one] [+ time] [+ one] [+ time] [+ one] [+ time] [+ one] [+ time]

Segment 3:
[- i] [- mma] [- tell] [- you] [- one] [- time] [+ when] [+ a] [+ match] [+ got] [+ my] [+ heart] [+ went] [+ knock] [+ knock]

Segment 4:
[- i] [- mma] [- tell] [- you] [- one] [- time] [- one] [- time] [+ now] [+ the] [+ butterflies] [+ in] [+ my] [+ stomach] [+ will] [+ not] [+ stop] [+ stop]

Segment 5:
[- when] [- i] [- met] [- you] [- girl] [- my] [- heart] [- went] [- knock] [- knock] [+ and] [+ even] [+ though] [+ it] [+ is] [+ a] [+ struggle] [+ of] [+ us] [+ all] [+ we] [+ got]

Segment 6:
[- now] [+ so] [+ we] [+ gon] [+ keep] [+ keep] [+ climbing] [+ to] [- them] [?    -
]

In [19]:
import torch
import whisper
import pandas as pd
import jiwer
from tqdm import tqdm
import re

# Custom normalizer for Polish text
class PolishTextNormalizer:
    def __init__(self):
        self.replacements = [
            (r'[.,!?;:()„”"«»—\-–]', ''),  # Remove punctuation
            (r'\s+', ' '),  # Multiple whitespace to single
        ]

    def __call__(self, text):
        text = text.lower()
        for pattern, replacement in self.replacements:
            text = re.sub(pattern, replacement, text)
        text = text.strip()
        return text

# First, upload your files
mp3_file = "sample_data/oczy.m4a"
lyrics_file = "sample_data/one_time.txt"

# Load multilingual model
device = "cuda" if torch.cuda.is_available() else "cpu"
model = whisper.load_model("medium", device=device)  # Medium model supports Polish better

# Transcribe with Polish settings
result = model.transcribe(
    mp3_file,
    language="pl",  # Explicitly set to Polish
    task="transcribe",
    word_timestamps=True,
    beam_size=5,
    fp16=torch.cuda.is_available()
)

# Load original lyrics
with open(lyrics_file, "r", encoding='utf-8') as f:
    original_lyrics = [line.strip() for line in f.readlines() if line.strip()]

# Prepare hypotheses and references
hypotheses = [seg['text'].strip() for seg in result['segments']]
references = original_lyrics[:len(hypotheses)]

# Normalize text
normalizer = PolishTextNormalizer()
data = pd.DataFrame({
    'reference': references,
    'hypothesis': hypotheses
})

# Handle potential NaN values
data = data.fillna('')

# Apply normalization
data["hypothesis_clean"] = data["hypothesis"].apply(normalizer)
data["reference_clean"] = data["reference"].apply(normalizer)

# Calculate Word Error Rate
wer = jiwer.wer(
    list(data["reference_clean"]),
    list(data["hypothesis_clean"])
)
print(f"\nWord Error Rate: {wer * 100:.2f}%\n")

# Display alignment comparison
print("Alignment Comparison:")
data[['reference', 'hypothesis']]

# Show timestamped segments
print("\nTimestamped Segments:")
for seg in result['segments']:
    print(f"[{seg['start']:.2f}s - {seg['end']:.2f}s] {seg['text']}")


Word Error Rate: 163.79%

Alignment Comparison:

Timestamped Segments:
[0.00s - 29.28s]  Aha, 2016, Body Christ, DJ Club Banger, Rock For Life Studio, Akcent, sprawdzaj to, yo, 2016, ta, ta, ta, łapy w górę, ta, ta, ta, aha.
[30.00s - 33.98s]  Ta, ta, ta, lecimy.
[37.66s - 45.00s]  Odkąd zobaczyłem Ciebie, nie mogę jeść, nie mogę spać.
[46.40s - 53.00s]  Jak do tego doszło, nie wiem, miłość o sobie dała znać.
[53.42s - 70.42s]  Co poradzić mogę na to, że miłość przyszła właśnie dziś, że w sercu mym jest lato, a w moich myślach jesteś Ty.
[70.93s - 78.93s]  Przez Twoje oczy te oczy zielone oszalałem.
[80.19s - 89.89s]  Gwiazdy chyba Twym oczom oddały cały blask, a ja...
