## Whisper

Транскрипции звонков получены тремя частями с помощью модели openai/whisper-large-v3. В этом ноутбуке обрабатывается срез записей `[START, END)`, результат сохраняется в `part_1.csv`.

In [None]:
# Attach Google Drive to the Colab filesystem so later cells can read from
# and write to paths under /content/drive.
from google.colab import drive

drive.mount(mountpoint='/content/drive')

Mounted at /content/drive


In [None]:
!cp "/content/drive/MyDrive/Colab Notebooks/hac_rtk_2024/records.zip" /content/records.zip
!unzip /content/records.zip

!cp "/content/drive/MyDrive/Colab Notebooks/hac_rtk_2024/add_materials.zip" /content/add_materials.zip
!unzip  /content/add_materials.zip

In [None]:
!pip install --upgrade pip
!pip install --upgrade git+https://github.com/huggingface/transformers.git accelerate datasets[audio]

In [None]:
from os import listdir
from os.path import isfile, join

# Directory holding the call recordings.
path_records = '/content/records'

# Scan the directory once and keep regular files only, sorted by file name.
# `audio_files` is derived from `names`, so audio_files[i] is always the full
# path of names[i] (two independent scans could drift apart if the directory
# changed in between, and did the listing/isfile work twice).
names = sorted(file for file in listdir(path_records) if isfile(join(path_records, file)))
audio_files = [join(path_records, file) for file in names]

In [None]:
import torch
from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, pipeline
from transformers.pipelines.pt_utils import KeyDataset
from tqdm.auto import tqdm

# Run on the first CUDA GPU in half precision when CUDA is available;
# otherwise fall back to the CPU in float32.
if torch.cuda.is_available():
    device, torch_dtype = "cuda:0", torch.float16
else:
    device, torch_dtype = "cpu", torch.float32

model_id = "openai/whisper-large-v3"

# Load the checkpoint in the selected dtype and move it to the selected device.
model = AutoModelForSpeechSeq2Seq.from_pretrained(
    model_id,
    torch_dtype=torch_dtype,
    low_cpu_mem_usage=True,
    use_safetensors=True,
).to(device)

# The processor provides the tokenizer and feature extractor handed to the
# pipeline below.
processor = AutoProcessor.from_pretrained(model_id)

# Speech-recognition pipeline: audio is handled in 30-second chunks, 16 per
# batch, with the language passed to generation as "russian".
# NOTE(review): max_new_tokens=128 caps the generated tokens (presumably per
# chunk) — confirm that dense 30-second chunks are not being cut short.
pipe = pipeline(
    "automatic-speech-recognition",
    model=model,
    feature_extractor=processor.feature_extractor,
    tokenizer=processor.tokenizer,
    device=device,
    torch_dtype=torch_dtype,
    chunk_length_s=30,
    batch_size=16,
    max_new_tokens=128,
    generate_kwargs=dict(language="russian"),
)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [None]:
# Bounds of the slice of recordings handled in this run: START is inclusive,
# END is exclusive (they are used as audio_files[START:END]).
START = 0
END = 1100

In [None]:
# Transcribe the recordings in the [START, END) slice of the file list;
# each returned item is later read through its 'text' key.
results = pipe(audio_files[slice(START, END)])

In [None]:
import pandas as pd

# Build a two-column table: the file names of the processed slice next to the
# transcribed text of each recording, then write it to CSV without the index.
df = pd.DataFrame(
    {
        'names': names[START:END],
        'texts': [item['text'] for item in results],
    }
)
df.to_csv('part_1.csv', encoding='utf-8', index=False)

In [None]:
# Display the table of file names and transcriptions as this cell's output.
df

Unnamed: 0,names,texts
0,12446285_2024-03-05_13.19.30.mp3,"Пожалуйста, секундочку. Интересно, ни разу не..."
1,12446291_2024-03-26_13.07.27.mp3,Магазин «Велоспор» Добрый день. Добрый день. ...
2,12446291_2024-03-26_13.09.04.mp3,"Алло. Здравствуйте. Василий Федорович? Да, зд..."
3,12446317_2024-03-27_12.06.52.mp3,Вызываемый абонент не отвечает. Звонок был пе...
4,12446317_2024-03-27_12.08.01.mp3,"Отдел шок. Добрый день, Марина. Здравствуйте...."
...,...,...
995,12699709_2024-04-01_11.52.08.mp3,Здравствуйте! Вы за...
996,12699709_2024-04-01_11.52.56.mp3,"Алло. Алло, здравствуйте. Здравствуйте. Подск..."
997,12699709_2024-04-01_11.53.43.mp3,"Здравствуйте, салон преображения Наталья. Ой,..."
998,12699709_2024-04-01_13.11.08.mp3,"Пока, до свидания. Салон преображения Наталья..."


In [None]:
# Copy the CSV from the local Colab disk to the project folder on Google Drive.
# NOTE(review): assumes the working directory is /content, so that this is the
# file written by the earlier to_csv('part_1.csv') call — confirm.
!cp /content/part_1.csv "/content/drive/MyDrive/Colab Notebooks/hac_rtk_2024/part_1.csv"