In [10]:
import torch
from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, pipeline
from datasets import load_dataset


# Use the first CUDA GPU with half precision when CUDA is available;
# otherwise stay on the CPU with full precision.
device, torch_dtype = (
    ("cuda:0", torch.float16)
    if torch.cuda.is_available()
    else ("cpu", torch.float32)
)

model_id = "distil-whisper/distil-large-v3"

# Load the checkpoint in the selected dtype and place it on the selected device.
model = AutoModelForSpeechSeq2Seq.from_pretrained(
    model_id,
    torch_dtype=torch_dtype,
    low_cpu_mem_usage=True,
    use_safetensors=True,
).to(device)

# The processor supplies both the tokenizer and the feature extractor
# that are handed to the pipeline below.
processor = AutoProcessor.from_pretrained(model_id)

pipe = pipeline(
    task="automatic-speech-recognition",
    model=model,
    tokenizer=processor.tokenizer,
    feature_extractor=processor.feature_extractor,
    torch_dtype=torch_dtype,
    device=device,
    chunk_length_s=25,
    batch_size=16,
    max_new_tokens=128,
)

# Take the audio entry of the first row of the validation split as the test input.
dataset = load_dataset("distil-whisper/librispeech_long", "clean", split="validation")
sample = dataset[0]["audio"]

# Ask for timestamps as well, so `result` carries "chunks" next to "text".
result = pipe(sample, return_timestamps=True)

print(result["text"])


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


 Mr. Quilter is the apostle of the middle classes, and we are glad to welcome his gospel. Nor is Mr. Quilter's manner less interesting than his matter. He tells us that at this festive season of the year, with Christmas and roast beef looming before us, similes drawn from eating and its results occur most readily to the mind. He has grave doubts whether Sir Frederick Leighton's work is really Greek after all, and can discover in it but little of Rocky Ithaca. Linnell's pictures are a sort of up-ards and Adam paintings, and Mason's exquisite idylls are as national as a jingo poem. Mr. Burkett Foster's landscapes smile at one much in the same way that Mr. Carcker used to flash his teeth, and Mr. John Collier gives his sitter a cheerful slap on the back before he says like a shampooer and a turkish bath next man


In [11]:
# Display the per-segment entries of the transcription: each one pairs a
# 'timestamp' tuple with the 'text' recognised for that span.
# (presumably the timestamps are in seconds — TODO confirm)
result["chunks"]

[{'timestamp': (0.0, 6.6),
  'text': ' Mr. Quilter is the apostle of the middle classes, and we are glad to welcome his gospel.'},
 {'timestamp': (6.6, 11.32),
  'text': " Nor is Mr. Quilter's manner less interesting than his matter."},
 {'timestamp': (11.32, 16.86),
  'text': ' He tells us that at this festive season of the year, with Christmas and roast beef looming'},
 {'timestamp': (16.86, 23.67),
  'text': ' before us, similes drawn from eating and its results occur most readily to the mind.'},
 {'timestamp': (23.67, 29.67),
  'text': " He has grave doubts whether Sir Frederick Leighton's work is really Greek after all, and can"},
 {'timestamp': (29.67, 39.77),
  'text': " discover in it but little of Rocky Ithaca. Linnell's pictures are a sort of up-ards and Adam paintings, and Mason's exquisite"},
 {'timestamp': (39.77, 44.75),
  'text': ' idylls are as national as a jingo poem.'},
 {'timestamp': (44.75, 50.37),
  'text': " Mr. Burkett Foster's landscapes smile at one much in th

In [7]:
import torch
from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, pipeline
from datasets import load_dataset


# NOTE(review): this cell repeats the setup and transcription of the first
# cell of the notebook verbatim (same model id, pipeline options, dataset and
# sample). Running the notebook top to bottom therefore loads the model twice;
# consider dropping one of the two copies.

# Use the first CUDA GPU with half precision when CUDA is available;
# otherwise stay on the CPU with full precision.
device, torch_dtype = (
    ("cuda:0", torch.float16)
    if torch.cuda.is_available()
    else ("cpu", torch.float32)
)

model_id = "distil-whisper/distil-large-v3"

# Load the checkpoint in the selected dtype and place it on the selected device.
model = AutoModelForSpeechSeq2Seq.from_pretrained(
    model_id,
    torch_dtype=torch_dtype,
    low_cpu_mem_usage=True,
    use_safetensors=True,
).to(device)

# The processor supplies both the tokenizer and the feature extractor
# that are handed to the pipeline below.
processor = AutoProcessor.from_pretrained(model_id)

pipe = pipeline(
    task="automatic-speech-recognition",
    model=model,
    tokenizer=processor.tokenizer,
    feature_extractor=processor.feature_extractor,
    torch_dtype=torch_dtype,
    device=device,
    chunk_length_s=25,
    batch_size=16,
    max_new_tokens=128,
)

# Take the audio entry of the first row of the validation split as the test input.
dataset = load_dataset("distil-whisper/librispeech_long", "clean", split="validation")
sample = dataset[0]["audio"]

# Ask for timestamps as well, so `result` carries "chunks" next to "text".
result = pipe(sample, return_timestamps=True)
print(result["text"])

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


 Mr. Quilter is the apostle of the middle classes, and we are glad to welcome his gospel. Nor is Mr. Quilter's manner less interesting than his matter. He tells us that at this festive season of the year, with Christmas and roast beef looming before us, similes drawn from eating and its results occur most readily to the mind. He has grave doubts whether Sir Frederick Leighton's work is really Greek after all, and can discover in it but little of Rocky Ithaca. Linnell's pictures are a sort of up-ards and Adam paintings, and Mason's exquisite idylls are as national as a jingo poem. Mr. Burkett Foster's landscapes smile at one much in the same way that Mr. Carcker used to flash his teeth, and Mr. John Collier gives his sitter a cheerful slap on the back before he says like a shampooer and a turkish bath next man


In [9]:
# Display the per-segment entries of the transcription: each one pairs a
# 'timestamp' tuple with the 'text' recognised for that span.
result["chunks"]

[{'timestamp': (0.0, 6.6),
  'text': ' Mr. Quilter is the apostle of the middle classes, and we are glad to welcome his gospel.'},
 {'timestamp': (6.6, 11.32),
  'text': " Nor is Mr. Quilter's manner less interesting than his matter."},
 {'timestamp': (11.32, 16.86),
  'text': ' He tells us that at this festive season of the year, with Christmas and roast beef looming'},
 {'timestamp': (16.86, 23.67),
  'text': ' before us, similes drawn from eating and its results occur most readily to the mind.'},
 {'timestamp': (23.67, 29.67),
  'text': " He has grave doubts whether Sir Frederick Leighton's work is really Greek after all, and can"},
 {'timestamp': (29.67, 39.77),
  'text': " discover in it but little of Rocky Ithaca. Linnell's pictures are a sort of up-ards and Adam paintings, and Mason's exquisite"},
 {'timestamp': (39.77, 44.75),
  'text': ' idylls are as national as a jingo poem.'},
 {'timestamp': (44.75, 50.37),
  'text': " Mr. Burkett Foster's landscapes smile at one much in th

In [17]:
# Transcribe a local recording with the `pipe` object built earlier in the notebook.
# NOTE(review): this is an absolute, machine-specific path — consider deriving it
# from a configurable data directory so the notebook runs elsewhere.
file_name = (
    "/data/yinzi/babyview_20240507/audios/Babyview_Main/00220001_GX010011_03.11.2024-03.17.2024_03.06.2024-1:59pm.mp3"
)

# Overwrites `result` from the earlier sample run; timestamps are requested again
# so the segment list is available afterwards.
result = pipe(file_name, return_timestamps=True)

In [None]:
# Display the timestamped segments for the local recording transcribed in the
# previous cell.
# NOTE(review): in the recorded output some spans look implausibly long for
# their text (e.g. 40.33-79.67 for a few words), and there is a gap between
# 8.44 and 24.67 — verify the timestamps before relying on them.
result["chunks"]

[{'timestamp': (0.0, 1.9), 'text': " Okay, let's get right up to by three."},
 {'timestamp': (2.24, 3.82), 'text': " So I think we've grown away."},
 {'timestamp': (4.1, 6.32), 'text': ' This one is a bottle-up and up.'},
 {'timestamp': (6.72, 6.88), 'text': ' Yeah.'},
 {'timestamp': (8.08, 8.44), 'text': ' Yeah.'},
 {'timestamp': (24.67, 25.67),
  'text': ' Wow. that price. So. Oh, wow. Are you my Tristan love? Whoa.'},
 {'timestamp': (25.67, 26.67), 'text': ' Whoa.'},
 {'timestamp': (26.67, 27.67), 'text': ' Blue.'},
 {'timestamp': (27.67, 38.33),
  'text': ' Do you need a blue? Happy birthday! Oh!'},
 {'timestamp': (38.33, 39.33), 'text': ' Oh!'},
 {'timestamp': (39.33, 40.33), 'text': ' Oh!'},
 {'timestamp': (40.33, 79.67), 'text': " Yeah! Get it. It's a ball..."},
 {'timestamp': (79.67, 83.67), 'text': ' It happened today!'},
 {'timestamp': (83.67, 84.67), 'text': ' Yeah, look at that.'},
 {'timestamp': (84.67, 86.67), 'text': " Oh, it's a snowman!"},
 {'timestamp': (86.67, 90.33)