In [21]:
import whisperx
import gc

device = "cpu"
audio_file = "/Users/anton/Downloads/audio_trim.wav"
batch_size = 16 # reduce if low on GPU mem
compute_type = "int8" # change to "int8" if low on GPU mem (may reduce accuracy)

# 1. Transcribe with original whisper (batched)
model = whisperx.load_model("large-v2", device, compute_type=compute_type)

# save model to local path (optional)
# model_dir = "/path/"
# model = whisperx.load_model("large-v2", device, compute_type=compute_type, download_root=model_dir)

audio = whisperx.load_audio(audio_file)
result = model.transcribe(audio, batch_size=batch_size)
print(result["segments"]) # before alignment

# delete model if low on GPU resources
# import gc; import torch; gc.collect(); torch.cuda.empty_cache(); del model

# 2. Align whisper output
result["segments"]["text"] = "我喜欢机器学习"
model_a, metadata = whisperx.load_align_model(language_code=result["language"], device=device)
result = whisperx.align(result["segments"], model_a, metadata, audio, device, return_char_alignments=False)

print(result["segments"]) # after alignment

# delete model if low on GPU resources
# import gc; import torch; gc.collect(); torch.cuda.empty_cache(); del model_a

# 3. Assign speaker labels
diarize_model = whisperx.diarize.DiarizationPipeline(use_auth_token=YOUR_HF_TOKEN, device=device)

# add min/max number of speakers if known
diarize_segments = diarize_model(audio)
# diarize_model(audio, min_speakers=min_speakers, max_speakers=max_speakers)

result = whisperx.assign_word_speakers(diarize_segments, result)
print(diarize_segments)
print(result["segments"]) # segments are now assigned speaker IDs

Lightning automatically upgraded your loaded checkpoint from v1.5.4 to v2.5.2. To apply the upgrade to your files permanently, run `python -m pytorch_lightning.utilities.upgrade_checkpoint ../.venv/lib/python3.11/site-packages/whisperx/assets/pytorch_model.bin`


No language specified, language will be first be detected for each audio file (increases inference time).
>>Performing voice activity detection using Pyannote...
Model was trained with pyannote.audio 0.0.1, yours is 3.3.2. Bad things might happen unless you revert pyannote.audio to 0.x.
Model was trained with torch 1.10.0+cu102, yours is 2.7.1. Bad things might happen unless you revert torch to 1.x.
Detected language: ru (0.85) in first 30s of audio...
[{'text': ' Во сих ван, свати!', 'start': 0.031, 'end': 1.499}]


TypeError: list indices must be integers or slices, not str

In [22]:
result

{'segments': [{'text': ' Во сих ван, свати!', 'start': 0.031, 'end': 1.499}],
 'language': 'ru'}

In [23]:
result["segments"][0]["text"] = "我喜欢机器学习"
model_a, metadata = whisperx.load_align_model(language_code="zh", device=device)
result = whisperx.align(result["segments"], model_a, metadata, audio, device, return_char_alignments=False)

print(result["segments"]) # after alignment



[{'start': 0.031, 'end': 1.52, 'text': '我喜欢机器学习', 'words': [{'word': '我', 'start': np.float64(0.031), 'end': np.float64(1.394), 'score': np.float64(0.908)}, {'word': '喜', 'start': np.float64(1.394), 'end': np.float64(1.415), 'score': np.float64(0.0)}, {'word': '欢', 'start': np.float64(1.415), 'end': np.float64(1.436), 'score': np.float64(0.0)}, {'word': '机', 'start': np.float64(1.436), 'end': np.float64(1.457), 'score': np.float64(0.0)}, {'word': '器', 'start': np.float64(1.457), 'end': np.float64(1.478), 'score': np.float64(0.0)}, {'word': '学', 'start': np.float64(1.478), 'end': np.float64(1.499), 'score': np.float64(0.0)}, {'word': '习', 'start': np.float64(1.499), 'end': np.float64(1.52), 'score': np.float64(0.0)}]}]


In [None]:
import torch
import torchaudio
from torchaudio.models import wav2vec2_model
from transformers import Wav2Vec2Processor, Wav2Vec2ForCTC
from torchaudio.transforms import Resample
from torchaudio.utils import download_asset

import librosa
import numpy as np

# 📌 Example: Mandarin Wav2Vec2 CTC model (use a multilingual model if no Chinese-specific)
model_name = "jonatasgrosman/wav2vec2-large-xlsr-53-chinese-zh-cn"

processor = Wav2Vec2Processor.from_pretrained(model_name)
model = Wav2Vec2ForCTC.from_pretrained(model_name)
model.eval()

# 🧩 Load sample audio (replace with your own file)
# Example: Load with librosa
wav_path = "/Users/anton/Downloads/audio_trim.wav"
waveform, sr = librosa.load(wav_path, sr=16000)
input_values = processor(waveform, return_tensors="pt", sampling_rate=16000).input_values

# 🧩 Inference: Get logits
with torch.no_grad():
    logits = model(input_values).logits

# 🧩 Get predicted tokens
predicted_ids = torch.argmax(logits, dim=-1)
transcription = processor.decode(predicted_ids[0])
print("Transcript:", transcription)

# 🧩 Forced alignment: use CTC segmentation
from torchaudio.models import RNNT

from torchaudio.models.decoder import ctc_decoder
from torchaudio.models.decoder import download_pretrained_files

# Torchaudio has CTC forced alignment utils
import torchaudio.functional as F

# 👇 Extract log_probs for alignment
log_probs = torch.nn.functional.log_softmax(logits, dim=-1)
log_probs = log_probs[0].cpu().detach().numpy()

# 👇 Generate alignment with torchaudio CTC forced alignment utilities
from torchaudio.backend.common import get_audio_backend

print("log_probs shape:", log_probs.shape)
print("First log_probs frame:", log_probs[0][:5])

# For full forced alignment, use torchaudio's `ctc_segmentation` helper:
from torchaudio.models.ctc_segmentation import ctc_segmentation, CtcSegmentationParameters, prepare_token_list

# Tokenize target text: same vocab as processor
target_text = "我喜欢机器学习"
tokens = processor.tokenizer.convert_tokens_to_ids(processor.tokenizer.tokenize(target_text))
print("Target text tokens:", tokens)

# CTC parameters
params = CtcSegmentationParameters()
params.char_list = processor.tokenizer.convert_ids_to_tokens(range(len(processor.tokenizer)))
params.blank = processor.tokenizer.pad_token_id

# Compute segmentation
# 1) Prepare ground truth
ground_truth_mat, utt_begin_indices = prepare_token_list(params, [target_text])

# 2) Run segmentation
timings, char_probs, state_list = ctc_segmentation(
    params,
    log_probs,
    ground_truth_mat
)

print("Alignment timings:", timings)



Transcript: 我喜款随起


ImportError: cannot import name 'get_audio_backend' from 'torchaudio.backend.common' (/Users/anton/mlx/src/week5/.venv/lib/python3.11/site-packages/torchaudio/backend/common.py)

In [None]:
import torch
import torchaudio
from transformers import Wav2Vec2Processor, Wav2Vec2ForCTC
import librosa

# 📌 Load Wav2Vec2 Chinese model
model_name = "jonatasgrosman/wav2vec2-large-xlsr-53-chinese-zh-cn"
processor = Wav2Vec2Processor.from_pretrained(model_name)
model = Wav2Vec2ForCTC.from_pretrained(model_name)
model.eval()

# 🧩 Load audio
wav_path = "/Users/anton/Downloads/audio_trim.wav"
waveform, sr = librosa.load(wav_path, sr=16000)
input_values = processor(waveform, return_tensors="pt", sampling_rate=16000).input_values

# 🧩 Run CTC
with torch.no_grad():
    logits = model(input_values).logits

# 📌 Decode text
predicted_ids = torch.argmax(logits, dim=-1)
transcription = processor.decode(predicted_ids[0])
print("Transcript:", transcription)

# 🧩 Forced alignment
from torchaudio.models.ctc_segmentation import (
    ctc_segmentation,
    CtcSegmentationParameters,
    prepare_token_list,
)

# 📌 Prepare target text
target_text = "我喜欢机器学习"
tokens = processor.tokenizer.convert_tokens_to_ids(
    processor.tokenizer.tokenize(target_text)
)
print("Target tokens:", tokens)

# 📌 Log probs
log_probs = torch.nn.functional.log_softmax(logits, dim=-1)
log_probs = log_probs[0].cpu().numpy()

# 📌 CTC params
params = CtcSegmentationParameters()
params.char_list = processor.tokenizer.convert_ids_to_tokens(
    range(len(processor.tokenizer))
)
params.blank = processor.tokenizer.pad_token_id

# 📌 Prepare alignment input
ground_truth_mat, utt_begin_indices = prepare_token_list(params, [target_text])

# 📌 Do segmentation
timings, char_probs, state_list = ctc_segmentation(
    params,
    log_probs,
    ground_truth_mat
)

print("CTC timings:\n", timings)

Transcript: 我喜款随起


ModuleNotFoundError: No module named 'torchaudio.models.ctc_segmentation'

In [14]:
import torch
import torchaudio
from transformers import Wav2Vec2Processor, Wav2Vec2ForCTC
import librosa

from ctc_segmentation import ctc_segmentation, CtcSegmentationParameters, prepare_text

# Load model
model_name = "jonatasgrosman/wav2vec2-large-xlsr-53-chinese-zh-cn"
processor = Wav2Vec2Processor.from_pretrained(model_name)
model = Wav2Vec2ForCTC.from_pretrained(model_name)
model.eval()

# Load audio
wav_path = "/Users/anton/Downloads/audio_trim.wav"
waveform, sr = librosa.load(wav_path, sr=16000)
input_values = processor(waveform, return_tensors="pt", sampling_rate=16000).input_values

# Get logits
with torch.no_grad():
    logits = model(input_values).logits

# Decode
log_probs = torch.nn.functional.log_softmax(logits, dim=-1)
log_probs = log_probs[0].cpu().numpy()

# Target text
ground_truth = ["我喜欢机器学习"]

# CTC seg params
params = CtcSegmentationParameters()
char_list = list(processor.tokenizer.get_vocab().keys())

ground_truth_mat, utt_begin_indices = prepare_text(params, char_list, ground_truth)

timings, char_probs, state_list = ctc_segmentation(params, log_probs, ground_truth_mat)

print("Timings:", timings)


A module that was compiled using NumPy 1.x cannot be run in
NumPy 2.2.0 as it may crash. To support both 1.x and 2.x
versions of NumPy, modules must be compiled with NumPy 2.0.
Some module may need to rebuild instead e.g. with 'pybind11>=2.12'.

If you are a user of the module, the easiest solution will be to
downgrade to 'numpy<2' or try to upgrade the affected module.
We expect that some modules will need time to support NumPy 2.

Traceback (most recent call last):  File "<frozen runpy>", line 198, in _run_module_as_main
  File "<frozen runpy>", line 88, in _run_code
  File "/Users/anton/mlx/src/week5/.venv/lib/python3.11/site-packages/ipykernel_launcher.py", line 18, in <module>
    app.launch_new_instance()
  File "/Users/anton/mlx/src/week5/.venv/lib/python3.11/site-packages/traitlets/config/application.py", line 1075, in launch_instance
    app.start()
  File "/Users/anton/mlx/src/week5/.venv/lib/python3.11/site-packages/ipykernel/kernelapp.py", line 739, in start
    self.io_lo

ImportError: numpy.core.multiarray failed to import (auto-generated because you didn't call 'numpy.import_array()' after cimporting numpy; use '<void>numpy._import_array' to disable if you are certain you don't need it).

In [None]:
import numpy as np
np.__version___

AttributeError: module 'numpy' has no attribute '__version___'

In [16]:
import torch, librosa, numpy as np
from transformers import Wav2Vec2Processor, Wav2Vec2ForCTC
import torchaudio

# 1️⃣ Load pre‑trained Wav2Vec2 model & processor
model_name = "jonatasgrosman/wav2vec2-large-xlsr-53-chinese-zh-cn"
processor = Wav2Vec2Processor.from_pretrained(model_name)
model = Wav2Vec2ForCTC.from_pretrained(model_name).eval()

# 2️⃣ Load and preprocess audio
waveform, sr = librosa.load("/Users/anton/Downloads/audio_trim.wav", sr=16000)
input_values = processor(waveform, return_tensors="pt", sampling_rate=16000).input_values

# 3️⃣ Compute logits and take softmax
with torch.no_grad():
    logits = model(input_values).logits
log_probs = torch.nn.functional.log_softmax(logits, dim=-1)[0].cpu().numpy()

# 4️⃣ Generate whispered transcript (for reference)
pred_ids = logits.argmax(dim=-1)[0]
whispered = processor.decode(pred_ids)
print("Whisper transcript:", whispered)

# 5️⃣ Prepare for forced-alignment
from torchaudio.models.ctc_segmentation import (
    CtcSegmentationParameters,
    ctc_segmentation,
    prepare_token_list,
)

target_text = "我喜欢机器学习"
# Token sequences accepted by CTC:
ground_truth_mat, utt_begin_indices = prepare_token_list(
    CtcSegmentationParameters(), [target_text]
)
# Character list for mapping:
params = CtcSegmentationParameters()
params.char_list = list(processor.tokenizer.get_vocab().keys())
params.blank = processor.tokenizer.pad_token_id

# 6️⃣ Run forced-alignment
timings, _, _ = ctc_segmentation(params, log_probs, ground_truth_mat)

# 📌 Convert frame indices to seconds
frame_duration = model.config.conv_stride[-1] / sr
word_times = [(start*frame_duration, end*frame_duration, target_text[i])
              for i, (start, end, _) in enumerate(timings)]

print("Word timings:", word_times)

Whisper transcript: 我喜款随起


ModuleNotFoundError: No module named 'torchaudio.models.ctc_segmentation'

In [24]:
import torch
import torchaudio
from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor
import numpy as np
from dataclasses import dataclass
from typing import List

# --- Core Alignment Functions (Simplified from whisperx) ---

@dataclass
class Point:
    token_index: int
    time_index: int
    score: float

@dataclass
class Segment:
    label: str
    start: int
    end: int
    score: float

def get_trellis(emission, tokens, blank_id=0):
    """Calculates the alignment trellis for CTC."""
    num_frame = emission.size(0)
    num_tokens = len(tokens)
    trellis = torch.full((num_frame, num_tokens), -float("inf"))

    trellis[0, 0] = emission[0, blank_id]
    trellis[0, 1] = emission[0, tokens[1]]

    for t in range(1, num_frame):
        trellis[t, 0] = trellis[t - 1, 0] + emission[t, blank_id]
        for j in range(1, num_tokens):
            trellis[t, j] = max(trellis[t - 1, j], trellis[t - 1, j - 1]) + emission[t, tokens[j]]
            if j > 1 and tokens[j-1] == tokens[j]:
                 trellis[t,j] = max(trellis[t,j], trellis[t-1,j-2] + emission[t,tokens[j]])
    return trellis


def backtrack(trellis, emission, tokens, blank_id=0):
    """Backtracks through the trellis to find the most likely alignment path."""
    j = trellis.size(1) - 1
    t = trellis.size(0) - 1
    path = [Point(j, t, emission[t, tokens[j]].exp().item())]

    while j > 0 and t > 0:
        # 1. Look for a diagonal move (token change)
        if trellis[t-1, j-1] > trellis[t-1,j] and tokens[j-1] != tokens[j]:
            j = j-1
        
        # 2. Otherwise, stay at the same token (horizontal move)
        t = t - 1
        
        # Use the token from the path's last point if it's a non-blank token, else use the new 'j'
        current_token_idx = path[-1].token_index
        token_id = tokens[current_token_idx] if tokens[current_token_idx] != blank_id else tokens[j]
        prob = emission[t, token_id].exp().item()
        path.append(Point(j, t, prob))


    while t > 0:
        t -= 1
        prob = emission[t, blank_id].exp().item()
        path.append(Point(0, t, prob))
        
    return path[::-1]


def merge_repeats(path, transcript):
    """Merges consecutive repeated tokens in the alignment path."""
    i1, i2 = 0, 0
    segments = []
    while i1 < len(path):
        while i2 < len(path) and path[i1].token_index == path[i2].token_index:
            i2 += 1
        
        # Correctly get the character for the segment
        char_idx = path[i1].token_index
        # In the modified trellis, tokens are padded, so we need to adjust the index
        label = transcript[(char_idx - 1) // 2]
        
        score = sum(p.score for p in path[i1:i2]) / (i2 - i1)
        segments.append(
            Segment(
                label,
                path[i1].time_index,
                path[i2 - 1].time_index + 1,
                score,
            )
        )
        i1 = i2
    return segments

# --- Main Alignment Logic ---

# 1. Setup: Define file paths, transcription, and model details
audio_path = "/Users/anton/Downloads/audio_trim.wav"
transcription = "我喜欢机器学习"
model_name = "jonatasgrosman/wav2vec2-large-xlsr-53-chinese-zh-cn"
device = "cuda" if torch.cuda.is_available() else "cpu"

# 2. Load Model and Processor
processor = Wav2Vec2Processor.from_pretrained(model_name)
model = Wav2Vec2ForCTC.from_pretrained(model_name).to(device)

# 3. Load and Preprocess Audio
waveform, sample_rate = torchaudio.load(audio_path)
if sample_rate != 16000:
    resampler = torchaudio.transforms.Resample(orig_freq=sample_rate, new_freq=16000)
    waveform = resampler(waveform)
audio = waveform.squeeze().to(device)

# 4. Prepare Transcription Tokens
# The characters are the "words" for Chinese
words = list(transcription)
with processor.as_target_processor():
    labels = processor(transcription, return_tensors="pt").input_ids

# 5. Get Model Emissions
with torch.no_grad():
    logits = model(audio.unsqueeze(0)).logits
emissions = torch.log_softmax(logits, dim=-1)[0].cpu()

# 6. Generate Alignment
blank_id = processor.tokenizer.pad_token_id
# Insert blanks between characters for CTC alignment
tokens = []
for l in labels[0]:
    tokens.append(blank_id)
    tokens.append(l.item())
tokens.append(blank_id)


trellis = get_trellis(emissions, tokens, blank_id)
path = backtrack(trellis, emissions, tokens, blank_id)

# Filter out blank tokens from the path before merging repeats
# Path indices need to correspond to the non-blank tokens in `transcript`
filtered_path = [p for p in path if p.token_index % 2 != 0]

segments = merge_repeats(filtered_path, transcription)


# 7. Format Output
word_segments = []
# Ratio to convert frame indices to seconds
ratio = audio.shape[0] / emissions.shape[0] / 16000

for seg in segments:
    word_segments.append(
        {
            "word": seg.label,
            "start": round(seg.start * ratio, 3),
            "end": round(seg.end * ratio, 3),
            "score": round(seg.score, 3),
        }
    )

final_result = [{
    "start": word_segments[0]["start"],
    "end": word_segments[-1]["end"],
    "text": transcription,
    "words": word_segments,
}]

# 8. Print the final result
import json
print(json.dumps(final_result, indent=2, ensure_ascii=False))



TypeError: Wav2Vec2CTCTokenizer(name_or_path='jonatasgrosman/wav2vec2-large-xlsr-53-chinese-zh-cn', vocab_size=3503, model_max_length=1000000000000000019884624838656, is_fast=False, padding_side='right', truncation_side='right', special_tokens={'bos_token': '<s>', 'eos_token': '</s>', 'unk_token': '<unk>', 'pad_token': '<pad>'}, clean_up_tokenization_spaces=False, added_tokens_decoder={
	0: AddedToken("<pad>", rstrip=True, lstrip=True, single_word=False, normalized=False, special=False),
	1: AddedToken("<s>", rstrip=True, lstrip=True, single_word=False, normalized=False, special=False),
	2: AddedToken("</s>", rstrip=True, lstrip=True, single_word=False, normalized=False, special=False),
	3: AddedToken("<unk>", rstrip=True, lstrip=True, single_word=False, normalized=False, special=False),
}
) got multiple values for keyword argument 'return_tensors'

In [25]:
import torch
import torchaudio
from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor
import numpy as np

def align_audio_to_text(audio_path, transcription):
    """
    Performs forced alignment of an audio file to its transcription using a Wav2Vec2 model.

    Args:
        audio_path (str): Path to the audio file (e.g., .wav).
        transcription (str): The text transcription corresponding to the audio.

    Returns:
        list: A list containing a dictionary with the full segment and word-level (character-level for Chinese) timestamps.
    """
    # --- 1. Setup Model and Audio ---
    device = "cuda" if torch.cuda.is_available() else "cpu"
    model_name = "jonatasgrosman/wav2vec2-large-xlsr-53-chinese-zh-cn"
    
    # Load model and processor
    processor = Wav2Vec2Processor.from_pretrained(model_name)
    model = Wav2Vec2ForCTC.from_pretrained(model_name).to(device)
    
    # Load and resample audio
    waveform, sample_rate = torchaudio.load(audio_path)
    if sample_rate != 16000:
        resampler = torchaudio.transforms.Resample(orig_freq=sample_rate, new_freq=16000)
        waveform = resampler(waveform)
    
    # --- 2. Get Model Predictions ---
    # Process audio and text
    input_values = processor(waveform.squeeze(), return_tensors="pt", sampling_rate=16000).input_values.to(device)
    labels = processor(text=transcription, return_tensors="pt").input_ids.to(device)

    # Get emission probabilities
    with torch.no_grad():
        logits = model(input_values).logits
    emissions = torch.log_softmax(logits, dim=-1)[0].cpu()
    
    # --- 3. Core Alignment Algorithm (CTC Forced Alignment) ---
    # Create the trellis (dynamic programming table)
    blank_id = processor.tokenizer.pad_token_id
    tokens = labels[0].tolist()
    
    # Insert blank tokens between characters for CTC
    token_path = [blank_id] + [val for t in tokens for val in (t, blank_id)]
    
    trellis = torch.full((emissions.shape[0], len(token_path)), -float("inf"))
    trellis[0, 0] = emissions[0, blank_id]
    trellis[0, 1] = emissions[0, token_path[1]]

    for t in range(1, emissions.shape[0]):
        for j in range(len(token_path)):
            # Case 1: Stay at the same token (can be blank or a character)
            prev_trellis = trellis[t - 1, j]
            # Case 2: Move from the previous token
            if j > 0:
                prev_trellis = max(prev_trellis, trellis[t-1, j-1])
            # Case 3: Skip a blank token
            if j > 1 and token_path[j] != blank_id and token_path[j-2] == token_path[j]:
                prev_trellis = max(prev_trellis, trellis[t-1, j-2])
            
            trellis[t, j] = prev_trellis + emissions[t, token_path[j]]

    # Backtrack to find the most likely path
    path = []
    j = trellis.shape[1] - 1
    for t in range(trellis.shape[0] - 1, -1, -1):
        # Find the index of the maximum predecessor in the trellis
        if j > 1 and token_path[j] != blank_id and token_path[j-2] == token_path[j] and trellis[t-1,j-2] >= trellis[t-1,j-1] and trellis[t-1,j-2] >= trellis[t-1,j]:
             path.append((token_path[j], t, emissions[t, token_path[j]].exp().item()))
             j = j-2
        elif j > 0 and trellis[t-1, j-1] >= trellis[t-1, j]:
            path.append((token_path[j], t, emissions[t, token_path[j]].exp().item()))
            j = j - 1
        else:
            path.append((token_path[j], t, emissions[t, token_path[j]].exp().item()))

    path.reverse()
    
    # --- 4. Merge Segments and Format Output ---
    # Filter out blank tokens and merge repeated characters
    char_segments = []
    for token, time_idx, score in path:
        if token != blank_id:
            char = processor.decode(token)
            if not char_segments or char_segments[-1]['char'] != char:
                char_segments.append({'char': char, 'start_frame': time_idx, 'end_frame': time_idx, 'scores': [score]})
            else:
                char_segments[-1]['end_frame'] = time_idx
                char_segments[-1]['scores'].append(score)
                
    # Convert frame indices to seconds
    ratio = waveform.shape[1] / emissions.shape[0] / 16000
    word_segs = []
    for seg in char_segments:
        word_segs.append({
            "word": seg['char'],
            "start": round(seg['start_frame'] * ratio, 3),
            "end": round((seg['end_frame'] + 1) * ratio, 3),
            "score": round(np.mean(seg['scores']), 3)
        })

    return [{
        "start": word_segs[0]["start"],
        "end": word_segs[-1]["end"],
        "text": transcription,
        "words": word_segs
    }]

# --- Example Usage ---
audio_file = "/Users/anton/Downloads/audio_trim.wav"
transcript = "我喜欢机器学习"

# Get the alignment
result = align_audio_to_text(audio_file, transcript)

# Print the final result
import json
print(json.dumps(result, indent=2, ensure_ascii=False))

[
  {
    "start": 0.06,
    "end": 1.27,
    "text": "我喜欢机器学习",
    "words": [
      {
        "word": "我",
        "start": 0.06,
        "end": 0.081,
        "score": 0.993
      },
      {
        "word": "喜",
        "start": 0.423,
        "end": 0.444,
        "score": 0.99
      },
      {
        "word": "欢",
        "start": 0.605,
        "end": 0.625,
        "score": 0.004
      },
      {
        "word": "机",
        "start": 0.847,
        "end": 0.867,
        "score": 0.0
      },
      {
        "word": "器",
        "start": 0.948,
        "end": 0.968,
        "score": 0.0
      },
      {
        "word": "学",
        "start": 1.109,
        "end": 1.129,
        "score": 0.0
      },
      {
        "word": "习",
        "start": 1.25,
        "end": 1.27,
        "score": 0.0
      }
    ]
  }
]
