# Voice Model

References:

[Speaker Recognition - pyannote-audio](https://github.com/pyannote/pyannote-audio)

[Building a Speaker Identification System](https://medium.com/analytics-vidhya/building-a-speaker-identification-system-from-scratch-with-deep-learning-f4c4aa558a56)

[상담사 통화녹음 화자분리](https://youngseo-computerblog.tistory.com/120)

_

Implementations:

[Audio Embedding - wespeaker-voxceleb-resnet34-LM](https://huggingface.co/pyannote/wespeaker-voxceleb-resnet34-LM)

[whisper_streaming](https://github.com/ufal/whisper_streaming)

In [None]:
!pip install speechbrain==0.5.16

In [None]:
!pip install -q git+https://github.com/openai/whisper.git
!pip install -q git+https://github.com/pyannote/pyannote-audio

In [None]:
# Upload an audio file through google.colab's `files.upload()` (this cell only
# works where the `google.colab` package is importable).
from google.colab import files
uploaded = files.upload()
# Take the first item yielded by iterating `uploaded` as the audio path
# (presumably the uploaded file's name -- confirm against the Colab docs).
path = next(iter(uploaded))

In [7]:
# Use a fixed local file; this replaces any `path` set by the upload cell.
path = "sample.wav"

In [8]:
# Notebook parameters (the trailing ``#@param`` annotations are Colab form hints).
num_speakers = 2 #@param {type:"integer"}

language = 'English' #@param ['any', 'English']

model_size = 'large' #@param ['tiny', 'base', 'small', 'medium', 'large']


# Use the ".en" variant of the chosen size for English audio, except when the
# 'large' size is selected, which is used under its plain name.
model_name = (
    model_size + '.en'
    if language == 'English' and model_size != 'large'
    else model_size
)

In [9]:
import whisper
import datetime

import subprocess

import torch
import pyannote.audio
from pyannote.audio.pipelines.speaker_verification import PretrainedSpeakerEmbedding
# Speaker-embedding model used to turn each audio segment into a vector.
# Run it on the GPU when one is available and fall back to the CPU otherwise,
# so the notebook also works on machines without CUDA.
embedding_model = PretrainedSpeakerEmbedding(
    "speechbrain/spkrec-ecapa-voxceleb",
    device=torch.device("cuda" if torch.cuda.is_available() else "cpu"))

from pyannote.audio import Audio
from pyannote.core import Segment

import wave
import contextlib

from sklearn.cluster import AgglomerativeClustering
import numpy as np

OSError: [WinError 1314] 클라이언트가 필요한 권한을 가지고 있지 않습니다: 'C:\\Users\\IRACK\\.cache\\huggingface\\hub\\models--speechbrain--spkrec-ecapa-voxceleb\\snapshots\\eac27266f68caa806381260bd44ace38b136c76a\\hyperparams.yaml' -> 'C:\\Users\\IRACK\\.cache\\torch\\pyannote\\speechbrain\\hyperparams.yaml'

<img src="https://raw.githubusercontent.com/b-re-w/2024-1_BPL_STalk_Model_Research/main/SpeakerDiarization/res/1_V6kstNiDGG3knzsZ-DcFyw.jpg"/>

In [None]:
# A later cell reads the file with the `wave` module, so anything that is not
# a .wav file is converted with ffmpeg first.
if not path.lower().endswith('.wav'):
    # `-y` (overwrite without asking) is a global option, so it goes before
    # the input. check=True raises CalledProcessError if ffmpeg fails instead
    # of silently continuing with a missing or stale audio.wav.
    subprocess.run(['ffmpeg', '-y', '-i', path, 'audio.wav'], check=True)
    path = 'audio.wav'

In [None]:
# Load the checkpoint selected in the parameter cell. `model_name` carries the
# ".en" suffix when an English-only variant was requested; loading
# `model_size` directly would ignore that choice.
model = whisper.load_model(model_name)

In [None]:
# Transcribe the audio file and keep the per-segment entries; later cells read
# each segment's "start", "end" and "text" fields.
result = model.transcribe(path)
segments = result["segments"]

In [None]:
# Measure the clip length in seconds from the WAV header:
# number of frames divided by frames per second.
with contextlib.closing(wave.open(path, 'r')) as wav_file:
    rate = wav_file.getframerate()
    frames = wav_file.getnframes()
duration = frames / float(rate)

In [None]:
audio = Audio()


def segment_embedding(segment):
    """Compute the speaker embedding for one transcribed segment.

    Args:
        segment: Mapping with "start" and "end" times (read below).

    Returns:
        Whatever `embedding_model` returns for the cropped waveform with one
        extra leading axis added.
    """
    # Whisper can report an end time past the end of the file for the last
    # segment, so clamp it to the measured duration.
    clip = Segment(segment["start"], min(duration, segment["end"]))
    waveform, _sample_rate = audio.crop(path, clip)
    # waveform[None] adds a leading axis (presumably the batch axis -- confirm
    # against the embedding model's expected input).
    return embedding_model(waveform[None])

In [None]:
# Compute one embedding per transcribed segment and stack them into a
# (num_segments, embedding_dim) float64 matrix. The width comes from the model
# output instead of a hard-coded 192, so switching to an embedding model with a
# different dimensionality does not break this cell.
rows = [
    np.asarray(segment_embedding(segment), dtype=np.float64).reshape(-1)
    for segment in segments
]
if not rows:
    # Nothing to stack (and nothing to cluster later): fail with a clear message.
    raise ValueError("no transcribed segments to embed")
embeddings = np.vstack(rows)

# Replace NaN/inf entries with finite numbers before the embeddings are used.
embeddings = np.nan_to_num(embeddings)

<img src="https://raw.githubusercontent.com/b-re-w/2024-1_BPL_STalk_Model_Research/main/SpeakerDiarization/res/1_cGMVhv0dNZTM6gPua4uzAA.jpg"/>

<img src="https://raw.githubusercontent.com/b-re-w/2024-1_BPL_STalk_Model_Research/main/SpeakerDiarization/res/1_yzq0c8tEruvTEf1UlVezSA.jpg"/>

In [None]:
# Cluster the segment embeddings into `num_speakers` groups and tag every
# segment with a 1-based speaker name derived from its cluster label.
clustering = AgglomerativeClustering(num_speakers).fit(embeddings)
labels = clustering.labels_
for segment, label in zip(segments, labels):
    segment["speaker"] = f"SPEAKER {label + 1}"

In [None]:
def time(secs):
    """Return *secs*, rounded to whole seconds, as a ``datetime.timedelta``."""
    return datetime.timedelta(seconds=round(secs))


def _segment_text(segment):
    """Return the segment text without its single leading space, if present."""
    text = segment["text"]
    # Segment text normally begins with one space; strip it only when it is
    # really there so a genuine first character is never dropped.
    return text[1:] if text.startswith(" ") else text


# Build two views of the transcript:
#   * transcript.txt -- a speaker/timestamp header only when the speaker changes
#   * x              -- a speaker/timestamp header in front of every segment
grouped_parts = []
detailed_parts = []
for i, segment in enumerate(segments):
    header = "\n" + segment["speaker"] + ' ' + str(time(segment["start"])) + '\n'
    body = _segment_text(segment) + ' '
    if i == 0 or segments[i - 1]["speaker"] != segment["speaker"]:
        grouped_parts.append(header)
    grouped_parts.append(body)
    detailed_parts.append(header)
    detailed_parts.append(body)

# The context manager guarantees the file is closed even if writing fails.
# NOTE(review): this uses the platform default encoding, like the cell that
# reads the file back; consider encoding="utf-8" in both for non-ASCII text.
with open("transcript.txt", "w") as f:
    f.write("".join(grouped_parts))
x = "".join(detailed_parts)

In [None]:
# Read the transcript back inside a context manager so the file handle is
# closed instead of being left to the garbage collector.
with open('transcript.txt') as transcript_file:
    print(transcript_file.read())

In [None]:
# Show the per-segment transcript string built in the transcript cell.
print(x)

In [None]:
# Dump the raw object returned by model.transcribe for inspection.
print(result)

In [None]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA

# Project the segment embeddings onto their first two principal components so
# the speaker clusters can be inspected visually.
pca = PCA(n_components=2, random_state=42)
embeddings_2d = pca.fit_transform(embeddings)

# Draw one scatter series per speaker. A separate scatter call per segment
# would give every point its own colour from the colour cycle and its own
# legend entry, so colours would not identify speakers and the legend would
# repeat each speaker once per segment.
plt.figure(figsize=(10, 8))
segment_labels = np.asarray(labels)
for label in np.unique(segment_labels):
    points = embeddings_2d[segment_labels == label]
    plt.scatter(points[:, 0], points[:, 1], label=f'SPEAKER {label + 1}')

plt.title("Speaker Diarization Clusters (PCA Visualization)")
plt.xlabel("Principal Component 1")
plt.ylabel("Principal Component 2")
plt.legend()
plt.show()

In [None]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
import matplotlib.cm as cm

# Project the segment embeddings onto their first two principal components.
pca = PCA(n_components=2, random_state=42)
embeddings_2d = pca.fit_transform(embeddings)

# Distinct cluster labels, sorted ascending, and how many there are.
segment_labels = np.asarray(labels)
speaker_labels = np.unique(segment_labels)
num_unique_speakers = len(speaker_labels)

# One evenly spaced colour from the tab20b colormap per speaker.
colors = cm.tab20b(np.linspace(0, 1, num_unique_speakers))

# Draw one scatter series per speaker so the legend lists each speaker exactly
# once instead of once per segment.
plt.figure(figsize=(10, 8))
for color, label in zip(colors, speaker_labels):
    points = embeddings_2d[segment_labels == label]
    plt.scatter(points[:, 0], points[:, 1],
                label=f'SPEAKER {label + 1}', color=tuple(color))

plt.title("Speaker Diarization Clusters (PCA Visualization)")
plt.xlabel("Principal Component 1")
plt.ylabel("Principal Component 2")
plt.legend()
plt.show()



In [None]:
import numpy as np
import plotly.graph_objects as go
from sklearn.decomposition import PCA
import matplotlib.cm as cm
from matplotlib.colors import to_hex

# Project the segment embeddings onto their first three principal components.
pca = PCA(n_components=3, random_state=42)
embeddings_3d = pca.fit_transform(embeddings)

# Distinct cluster labels, sorted ascending, and how many there are.
segment_labels = np.asarray(labels)
speaker_labels = np.unique(segment_labels)
num_unique_speakers = len(speaker_labels)

# One evenly spaced colour from the tab20b colormap per speaker.
colors = cm.tab20b(np.linspace(0, 1, num_unique_speakers))

# Build one 3D scatter trace per speaker so the legend lists each speaker once
# instead of once per segment.
data = []
for color, label in zip(colors, speaker_labels):
    points = embeddings_3d[segment_labels == label]
    trace = go.Scatter3d(
        x=points[:, 0], y=points[:, 1], z=points[:, 2], mode='markers',
        # Plotly reads a numeric array in marker.color as values for a
        # colorscale, not as one RGBA colour, so pass a hex colour string.
        marker=dict(size=5, color=to_hex(color)),
        name=f'SPEAKER {label + 1}')
    data.append(trace)

# Layout for the 3D scatter plot
layout = go.Layout(
    title="Speaker Diarization Clusters (3D Visualization)",
    scene=dict(
        xaxis_title="Principal Component 1",
        yaxis_title="Principal Component 2",
        zaxis_title="Principal Component 3"
    )
)

# Create the figure and show the 3D scatter plot
fig = go.Figure(data=data, layout=layout)
fig.show()
