## Диаризация

In [None]:
# Mount Google Drive inside the Colab VM; the following cells copy the
# dataset archives from /content/drive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Copy the records archive from Drive to the local disk and unpack it.
!cp "/content/drive/MyDrive/Colab Notebooks/hac_rtk_2024/records.zip" /content/records.zip
!unzip /content/records.zip

# Same for the additional-materials archive.
!cp "/content/drive/MyDrive/Colab Notebooks/hac_rtk_2024/add_materials.zip" /content/add_materials.zip
!unzip  /content/add_materials.zip

In [None]:
!pip install --upgrade pyannote.audio accelerate

In [None]:
import torch
import torchaudio
import json

from IPython.display import Audio
from os import listdir
from os.path import isfile, join
from pyannote.audio import Pipeline
from pyannote.audio.pipelines.utils.hook import ProgressHook

In [None]:
# Directory the records archive was unpacked into.
path_records = '/content/records'

# Scan the directory only once so that `names` and `audio_files` are guaranteed
# to stay index-aligned (audio_files[i] is the full path of names[i]).
# Entries that are not regular files are skipped; order is alphabetical.
names = [file for file in sorted(listdir(path_records)) if isfile(join(path_records, file))]
audio_files = [join(path_records, name) for name in names]

In [None]:
import os

# The Hugging Face access token is read from the HF_TOKEN environment variable
# when it is set, so the secret does not have to be stored in the notebook.
# The original placeholder literal is kept as a fallback for backward
# compatibility.
diarization_pipeline = Pipeline.from_pretrained(
    "pyannote/speaker-diarization-3.1",
    use_auth_token=os.environ.get("HF_TOKEN") or '**KEY**'
)

# Run on the GPU when one is attached, otherwise fall back to the CPU
# (the same policy the ASR set-up cell applies to `device`).
diarization_pipeline.to(torch.device("cuda" if torch.cuda.is_available() else "cpu"))

<pyannote.audio.pipelines.speaker_diarization.SpeakerDiarization at 0x7ca8d23b2500>

In [None]:
# Bounds of the `audio_files[START:END]` slice handled by this diarization run.
START, END = 1600, 3100

In [None]:
# Diarize the current batch file by file, keeping the raw pipeline outputs in
# the same order as the input files.
diarization_pipeline_result = []
batch_files = audio_files[START:END]
for index in range(len(batch_files)):
  audio_file = batch_files[index]
  print(index, audio_file)
  # The pipeline is fed the decoded waveform rather than the file path.
  waveform, sample_rate = torchaudio.load(audio_file)
  output = diarization_pipeline(dict(waveform=waveform, sample_rate=sample_rate))
  diarization_pipeline_result += [output]

In [None]:
# Convert every pipeline output into a list of plain dicts (start/end times,
# track name, speaker label) so the results can be dumped with json later on.
diarization_results = [
    [
        {'segment': {'start': turn.start, 'end': turn.end},
         'track': track_name,
         'label': speaker}
        for turn, track_name, speaker in annotation.itertracks(yield_label=True)
    ]
    for annotation in diarization_pipeline_result
]

In [None]:
# Peek at the converted segments of the first file in this batch.
# (`diarization` is only created further down, in the Speechbox section, so
# referencing it here fails on a fresh kernel.)
diarization_results[0]

[{'segment': {'start': 1.36409375, 'end': 1.71846875},
  'track': 'A',
  'label': 'SPEAKER_02'},
 {'segment': {'start': 2.98409375, 'end': 9.075968750000001},
  'track': 'B',
  'label': 'SPEAKER_02'},
 {'segment': {'start': 9.599093750000002, 'end': 10.51034375},
  'track': 'C',
  'label': 'SPEAKER_02'},
 {'segment': {'start': 10.797218750000003, 'end': 18.660968750000002},
  'track': 'D',
  'label': 'SPEAKER_02'},
 {'segment': {'start': 18.812843750000003, 'end': 24.60096875},
  'track': 'E',
  'label': 'SPEAKER_02'},
 {'segment': {'start': 21.25971875, 'end': 21.52971875},
  'track': 'F',
  'label': 'SPEAKER_00'},
 {'segment': {'start': 25.191593750000003, 'end': 34.06784375},
  'track': 'G',
  'label': 'SPEAKER_02'},
 {'segment': {'start': 35.06346875, 'end': 38.38784375},
  'track': 'H',
  'label': 'SPEAKER_01'},
 {'segment': {'start': 38.556593750000005, 'end': 39.38346875},
  'track': 'I',
  'label': 'SPEAKER_01'},
 {'segment': {'start': 39.38346875, 'end': 39.400343750000005},
 

In [None]:
# Persist this batch's diarization segments as JSON text.
# The file can be read back later with json.load().
with open('diarization_results_1600_3100.txt', 'w') as out_file:
  out_file.write(json.dumps(diarization_results))

In [None]:
# Back the diarization result file up to Google Drive.
# NOTE(review): the Speechbox section reads this file from ".../hac_rtk_2024/data/",
# while this copy lands one folder above — confirm the file is moved there.
!cp /content/diarization_results_1600_3100.txt "/content/drive/MyDrive/Colab Notebooks/hac_rtk_2024/diarization_results_1600_3100.txt"

## Транскрибация

In [None]:
from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, pipeline

In [None]:
# Use the first GPU with half precision when CUDA is available; otherwise run
# on the CPU in float32.
device = "cuda:0" if torch.cuda.is_available() else "cpu"
torch_dtype = torch.float16 if torch.cuda.is_available() else torch.float32

model_id = "openai/whisper-large-v3"

model = AutoModelForSpeechSeq2Seq.from_pretrained(
    model_id,
    torch_dtype=torch_dtype,
    low_cpu_mem_usage=True,
    use_safetensors=True
)
model.to(device)

processor = AutoProcessor.from_pretrained(model_id)

# Hand the already-loaded `model` to the pipeline. Passing `model_id` here made
# the pipeline load a second copy of the checkpoint and left the instance
# created above unused.
asr_pipeline = pipeline(
    "automatic-speech-recognition",
    model=model,
    tokenizer=processor.tokenizer,
    feature_extractor=processor.feature_extractor,
    max_new_tokens=256,
    chunk_length_s=30,
    batch_size=16,
    return_timestamps=True,
    torch_dtype=torch_dtype,
    device=device,
    # Transcribe as Russian rather than relying on language auto-detection.
    generate_kwargs={"language": "russian"}
)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [None]:
# Bounds of the `audio_files[START:END]` slice handled by this transcription run.
START, END = 2100, 3100

In [None]:
# Transcribe the whole batch in a single call; the pipeline receives the list of file paths.
asr_results = asr_pipeline(audio_files[START:END])

In [None]:
# Persist this batch's ASR output as JSON text.
# The file can be read back later with json.load().
with open('asr_results_2100_3100.txt', 'w') as out_file:
  out_file.write(json.dumps(asr_results))

In [None]:
# Back the ASR result file up to Google Drive.
# NOTE(review): the Speechbox section reads this file from ".../hac_rtk_2024/data/",
# while this copy lands one folder above — confirm the file is moved there.
!cp /content/asr_results_2100_3100.txt "/content/drive/MyDrive/Colab Notebooks/hac_rtk_2024/asr_results_2100_3100.txt"

## Speechbox

In [None]:
# Make locale.getpreferredencoding() report UTF-8 from here on.
# NOTE(review): presumably the usual Colab workaround for `!` shell commands failing
# with "A UTF-8 locale is required" — confirm it is still needed.
import locale
locale.getpreferredencoding = lambda: "UTF-8"

In [None]:
# Bring the saved ASR result parts (three batches) back from Drive.
!cp "/content/drive/MyDrive/Colab Notebooks/hac_rtk_2024/data/asr_results_0_1100.txt" /content/asr_results_0_1100.txt
!cp "/content/drive/MyDrive/Colab Notebooks/hac_rtk_2024/data/asr_results_1100_2100.txt" /content/asr_results_1100_2100.txt
!cp "/content/drive/MyDrive/Colab Notebooks/hac_rtk_2024/data/asr_results_2100_3100.txt" /content/asr_results_2100_3100.txt

# Bring the saved diarization result parts (two batches) back from Drive.
!cp "/content/drive/MyDrive/Colab Notebooks/hac_rtk_2024/data/diarization_results_0_1600.txt" /content/diarization_results_0_1600.txt
!cp "/content/drive/MyDrive/Colab Notebooks/hac_rtk_2024/data/diarization_results_1600_3100.txt" /content/diarization_results_1600_3100.txt

In [None]:
# Install speechbox directly from its GitHub repository (no version/commit pin).
# NOTE(review): later cells pass `asr_array`, `diar_array` and `index` to
# ASRDiarizationPipeline; I could not confirm these exist in upstream speechbox —
# verify which version or patch of the package is expected here.
!pip install "git+https://github.com/huggingface/speechbox"

In [None]:
import pandas as pd
from speechbox import ASRDiarizationPipeline

In [None]:
# Stitch the saved ASR batches back into one list, in batch order.
asr = []
for part_file in ('asr_results_0_1100.txt',
                  'asr_results_1100_2100.txt',
                  'asr_results_2100_3100.txt'):
  with open(part_file, 'r') as part_handle:
    asr = asr + json.load(part_handle)

# Sanity check: the combined list must hold exactly 3100 entries.
assert len(asr) == 3100

In [None]:
# Stitch the saved diarization batches back into one list, in batch order.
diarization = []
for part_file in ('diarization_results_0_1600.txt',
                  'diarization_results_1600_3100.txt'):
  with open(part_file, 'r') as part_handle:
    diarization = diarization + json.load(part_handle)

# Sanity check: the combined list must hold exactly 3100 entries.
assert len(diarization) == 3100

In [None]:
# Build the combined ASR + diarization pipeline, handing it both model pipelines
# and the previously saved per-file results (`asr`, `diarization`).
# NOTE(review): this rebinds `pipeline`, shadowing the transformers factory imported
# in the transcription section — re-running the ASR set-up cell after this one
# would call this object instead of the factory.
# NOTE(review): `asr_array` / `diar_array` look like non-upstream arguments — confirm
# against the installed speechbox version.
pipeline = ASRDiarizationPipeline(
    asr_pipeline=asr_pipeline,
    diarization_pipeline=diarization_pipeline,
    asr_array=asr,
    diar_array=diarization
)

In [None]:
def format_as_transcription(raw_segments):
    """Render speaker-labelled segments as a plain-text transcript.

    Args:
        raw_segments: iterable of mappings, each with a "speaker" and a
            "text" entry.

    Returns:
        One line per segment in the form "<speaker> <text>", joined with
        newlines; an empty string when there are no segments.
    """
    lines = []
    for segment in raw_segments:
        speaker, text = segment["speaker"], segment["text"]
        lines.append(f'{speaker} {text}')
    return "\n".join(lines)

In [None]:
# Bounds of the `audio_files[START:END]` slice merged into transcripts below (all 3100 files).
START, END = 0, 3100

In [None]:
# Build one speaker-labelled transcript per recording.
results = []
# Enumerate from START so that `index` is the file's absolute position in
# `audio_files` rather than its offset inside the slice. The `asr` and
# `diarization` lists given to the pipeline cover the files from position 0,
# so an offset would select the wrong entries whenever START != 0 (this
# assumes the pipeline uses `index` to look those lists up). With START == 0
# the behaviour is unchanged.
for index, audio_file in enumerate(audio_files[START:END], start=START):
  print(index, audio_file)
  outputs = pipeline(audio_file, index=index)
  results.append(format_as_transcription(outputs))

In [None]:
# Pair every file name with its transcript and write the table to CSV
# (UTF-8, without the index column).
table_columns = {'names': names[START:END], 'texts': list(results)}
df = pd.DataFrame(data=table_columns)
df.to_csv('td_data.csv', index=False, encoding='utf-8')

In [None]:
# Show the resulting table (the notebook renders a truncated view).
df

Unnamed: 0,names,texts
0,12352375_2024-03-14_16.04.49.mp3,"SPEAKER_02 Здравствуйте. Расскажите, пожалуйс..."
1,12352375_2024-03-14_16.11.08.mp3,"SPEAKER_03 Добрый день. Скажи, пожалуйста, эт..."
2,12352472_2024-03-20_11.25.20.mp3,"SPEAKER_00 Алло. Добрый день. Подскажите, пож..."
3,12352472_2024-03-20_15.19.45.mp3,"SPEAKER_01 Алло. Здравствуйте еще раз, это Ма..."
4,12352472_2024-03-20_15.21.22.mp3,SPEAKER_00 Алло. Добрый день. Добрый. День чи...
...,...,...
3095,12742797_2024-04-23_15.28.55.mp3,SPEAKER_01 Алло. Добрый день. Здравствуйте. К...
3096,12742797_2024-05-02_10.56.34.mp3,SPEAKER_02 Алло. Добрый день. Здравствуйте. К...
3097,12742797_2024-05-07_14.06.33.mp3,SPEAKER_02 Добрый день.\nSPEAKER_00 Спасибо....
3098,12742797_2024-05-13_09.55.51.mp3,SPEAKER_00 Здравствуйте! Вызываемый абонент с...


In [None]:
# Spot-check one transcript (row 7); print() renders the "\n" separators as one speaker turn per line.
print(df.iloc[7]['texts'])

SPEAKER_02  Привет! На связи Лера и Артём.
SPEAKER_00  Основатель бренда LeTik Cosmetics.
SPEAKER_02  Обрати внимание, мы ведём запись разговора, чтобы иметь возможность ещё раз услышать твой прекрасный голос.
SPEAKER_00  Сейчас поможем с навигацией. Поехали!
SPEAKER_02  Хочешь оформить заказ по телефону или узнать о новинках? Жми 1.
SPEAKER_00  Есть вопрос по оформленным заказам? Жми 2.
SPEAKER_02  Чтобы узнать о сотрудничестве с нашим брендом, жми 3.
SPEAKER_00  Нужно послушать еще раз? Жми 4.
SPEAKER_02  Звонишь с другим вопросом?
SPEAKER_00  Просто оставайся на линии. Первый освободившийся сотрудник поможет тебе с решением. Продолжение следует... С вами была Евгения Косметик.
SPEAKER_02  С Евгением я. Добрый день. Чем могу помочь?
SPEAKER_04  Добрый день. Подскажите, пожалуйста, с кем могу переговорить по поводу коммерческого предстоящего?
SPEAKER_01  Подскажите, пожалуйста, как могу к могу переговорить по поводу коммерческого предприятия? Подскажите, пожалуйста, как могу к вам обр

In [None]:
# Save the final CSV to the project's data folder on Google Drive.
!cp /content/td_data.csv "/content/drive/MyDrive/Colab Notebooks/hac_rtk_2024/data/td_data.csv"