In [55]:
import os
import whisper
import matplotlib.pyplot as plt
import matplotlib.patches as patches
from matplotlib.backends.backend_pdf import PdfPages

def generate_transcript(file_path, model):
    model = whisper.load_model(model)
    result = model.transcribe(file_path, word_timestamps=True)
    return result['segments'][0]['words']

def read_words(path, convert_to_seconds=True, sample_rate=16000):
    words = []
    with open(path) as f:
        for line in f:
            if line == '':
                break
            words.append({
                'word': line.split()[-1].strip(),
                'start': line.split()[0],
                'end': line.split()[1]
            })
    if convert_to_seconds:
        for word in words:
            word['start'] = float(word['start']) / sample_rate
            word['end'] = float(word['end']) / sample_rate
    return words

def plot_transcript(transcript, ax, y_pos, label):
    for word_info in transcript:
        start = word_info['start']
        end = word_info['end']
        word = word_info['word']
        duration = end - start
        rect = patches.Rectangle((start, y_pos), duration, 0.4, edgecolor='black', facecolor='skyblue')
        ax.add_patch(rect)
        ax.text(start + duration / 2, y_pos + 0.2, word, ha='center', va='center', fontsize=8)
    ax.text(0, y_pos + 0.2, label, ha='right', va='center', fontsize=10, fontweight='bold')

def visualize_transcripts(transcripts, labels):
    fig, ax = plt.subplots(figsize=(15, len(transcripts)))
    for i, transcript, label in zip(range(len(transcripts)), transcripts, labels):
        plot_transcript(transcript, ax, i * 0.5, label)
    max_duration = max([t[-1]['end'] for t in transcripts])
    ax.set_xlim([0, max_duration])
    ax.set_yticks([])
    ax.set_xlabel('Time (s)')
    ax.set_title('Transcripts Comparison')
    return fig

def generate_visualizations(directory_path, model='tiny.en'):
    transcripts = []
    labels = []
    file_names = [n for n in os.listdir(directory_path) if n.endswith('.WAV')]
    for i, file_name in enumerate(file_names):
        print(f'[{i+1}/{len(file_names)}] Processing {file_name}')
        file_path = os.path.join(directory_path, file_name)
        transcript = generate_transcript(file_path, model)
        wrd_file_path = os.path.join(directory_path, file_name.replace('.WAV', '.WRD'))
        words = read_words(wrd_file_path)
        transcripts.append(transcript)
        transcripts.append(words)
        labels.append(file_name + ' (whisper)')
        labels.append(file_name.replace('.WAV', '.WRD') + ' (ground truth)')
    print('Generating visualizations...')
    figs = []
    for i in range(0, len(transcripts), 2):
        fig = visualize_transcripts([transcripts[i], transcripts[i+1]], [labels[i], labels[i+1]])
        figs.append(fig)
    with PdfPages(f'transcripts_comparison-{model}.pdf') as pdf:
        for fig in figs:
            pdf.savefig(fig)
    plt.close('all')

generate_visualizations('TIMIT/TRAIN/DR1/FCJF0/') # replace with the actual directory path


[1/10] Processing SI1027.WAV
[2/10] Processing SI1657.WAV
[3/10] Processing SX307.WAV
[4/10] Processing SX397.WAV
[5/10] Processing SA1.WAV
[6/10] Processing SA2.WAV
[7/10] Processing SX217.WAV
[8/10] Processing SX37.WAV
[9/10] Processing SI648.WAV
[10/10] Processing SX127.WAV
Generating visualizations...


In [56]:
MODELS = [
    'tiny.en',
    'base.en',
    'small.en',
    'medium',
    'large-v3'
]

for model in MODELS:
    generate_visualizations('../TIMIT/TRAIN/DR1/FCJF0/', model)

[1/10] Processing SI1027.WAV
[2/10] Processing SI1657.WAV
[3/10] Processing SX307.WAV
[4/10] Processing SX397.WAV
[5/10] Processing SA1.WAV
[6/10] Processing SA2.WAV
[7/10] Processing SX217.WAV
[8/10] Processing SX37.WAV
[9/10] Processing SI648.WAV
[10/10] Processing SX127.WAV
Generating visualizations...
[1/10] Processing SI1027.WAV


100%|███████████████████████████████████████| 139M/139M [00:03<00:00, 44.4MiB/s]


[2/10] Processing SI1657.WAV
[3/10] Processing SX307.WAV
[4/10] Processing SX397.WAV
[5/10] Processing SA1.WAV
[6/10] Processing SA2.WAV
[7/10] Processing SX217.WAV
[8/10] Processing SX37.WAV
[9/10] Processing SI648.WAV
[10/10] Processing SX127.WAV
Generating visualizations...
[1/10] Processing SI1027.WAV


100%|███████████████████████████████████████| 461M/461M [00:08<00:00, 53.8MiB/s]


[2/10] Processing SI1657.WAV
[3/10] Processing SX307.WAV
[4/10] Processing SX397.WAV
[5/10] Processing SA1.WAV
[6/10] Processing SA2.WAV
[7/10] Processing SX217.WAV
[8/10] Processing SX37.WAV
[9/10] Processing SI648.WAV
[10/10] Processing SX127.WAV
Generating visualizations...
[1/10] Processing SI1027.WAV


100%|█████████████████████████████████████| 1.42G/1.42G [00:23<00:00, 64.1MiB/s]


[2/10] Processing SI1657.WAV
[3/10] Processing SX307.WAV
[4/10] Processing SX397.WAV
[5/10] Processing SA1.WAV
[6/10] Processing SA2.WAV
[7/10] Processing SX217.WAV
[8/10] Processing SX37.WAV
[9/10] Processing SI648.WAV
[10/10] Processing SX127.WAV
Generating visualizations...
[1/10] Processing SI1027.WAV


100%|█████████████████████████████████████| 2.88G/2.88G [00:35<00:00, 86.9MiB/s]


[2/10] Processing SI1657.WAV
[3/10] Processing SX307.WAV
[4/10] Processing SX397.WAV
[5/10] Processing SA1.WAV
[6/10] Processing SA2.WAV
[7/10] Processing SX217.WAV
[8/10] Processing SX37.WAV
[9/10] Processing SI648.WAV
[10/10] Processing SX127.WAV
Generating visualizations...
