# STalk Model Architecture

## Preparation

In [None]:
USE_CUDA = False

In [None]:
!pip install huggingface_hub transformers

In [None]:
!pip install intel-npu-acceleration-library

In [None]:
if USE_CUDA:
    !pip uninstall torch torchvision torchaudio -y
    !pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu121
else:
    !pip uninstall torch torchvision torchaudio -y
    !pip install torch torchvision torchaudio

In [None]:
if USE_CUDA:
    !pip uninstall llama-cpp-python -y
    !pip install llama-cpp-python --extra-index-url https://abetlen.github.io/llama-cpp-python/whl/cu121
else:
    !pip uninstall llama-cpp-python -y
    !pip install llama-cpp-python --extra-index-url https://abetlen.github.io/llama-cpp-python/whl/cpu

In [None]:
!pip install pyannote-audio

In [None]:
!pip install faster-whisper

In [None]:
!pip install git+https://github.com/wenet-e2e/wespeaker.git

## Initialize Pretrained Model

In [1]:
import intel_npu_acceleration_library

import gc

import torch
import torchaudio

from llama_cpp import Llama

from pyannote.audio import Audio
from pyannote.core import Segment

from faster_whisper import WhisperModel

from huggingface_hub import hf_hub_download
import wespeaker

DEVICE = "cuda"
if not torch.cuda.is_available():
    DEVICE = "cpu"
    print("INFO: CUDA is diabled on this machine.\n\n")

print("PyTorch:", torch.__version__)
print("TorchAudio:", torchaudio.__version__)
print("Uses Device:", DEVICE.upper())

INFO: CUDA is diabled on this machine.


PyTorch: 2.3.1+cpu
TorchAudio: 2.3.1+cpu
Uses Device: CPU


In [2]:
class ChatHistory(list):
    messages = []
    
    @classmethod
    def add_messages(cls, role, content):
        if isinstance(content, str):
            cls.messages.append({ 'role': role, 'content': content })
        else:
            for r, c in zip(role, content):
                cls.messages.append({ 'role': r, 'content': c })
    
    @classmethod
    def create_prompt(cls, system_prompt: str, user_prompt: str = ""):
        return [
            {
                "role": "system",
                "content": system_prompt
            },
            *cls.messages,
            {
                "role": "user",
                "content": user_prompt
            }
        ]

In [3]:
def token_stream(token):
    delta = token["choices"][0]["delta"]
    if "content" not in delta:
        return ""
    else:
        return delta["content"]

In [4]:
def get_llama3():
    model_id = "lmstudio-community/Meta-Llama-3-8B-Instruct-GGUF"

    chat = Llama.from_pretrained(
        repo_id=model_id,
        filename="*Q4_K_M.gguf",
        #chat_format="llama-3",
        verbose=False
    ).create_chat_completion
    
    def llama3(system_prompt, user_prompt, temp=0.5):
        prompt = ChatHistory.create_prompt(system_prompt, user_prompt)

        print("PROMPT:")
        for line in prompt:
            print(line)
        print()
        
        return chat(prompt, temperature=temp, stream=True)
    
    return llama3

In [5]:
def get_whisper():
    model_size = "medium"  #@param ['tiny', 'base', 'small', 'medium', 'large', 'large-v2', 'large-v3']
    compute_type = "int8"  #@param ['float16', 'int8']

    return WhisperModel(model_size, device=DEVICE, compute_type=compute_type).transcribe

In [6]:
def extract_embedding(model, pcm, sample_rate):
    pcm = pcm.to(torch.float)
    if sample_rate != model.resample_rate:
        pcm = torchaudio.transforms.Resample(
            orig_freq=sample_rate, new_freq=model.resample_rate)(pcm)
    feats = model.compute_fbank(
        pcm,
        sample_rate=model.resample_rate,
        cmn=True
    )
    feats = feats.unsqueeze(0)
    feats = feats.to(model.device)
    model.model.eval()
    with torch.no_grad():
        outputs = model.model(feats)
        outputs = outputs[-1] if isinstance(outputs, tuple) else outputs
    embedding = outputs[0].to(torch.device('cpu'))
    return embedding

In [7]:
def recognize(model, pcm, sample_rate):
    q = extract_embedding(model, pcm, sample_rate)
    best_score = 0.0
    best_name = ''
    for name, e in model.table.items():
        score = model.cosine_similarity(q, e)
        if best_score < score:
            best_score = score
            best_name = name
        del score
        gc.collect()
    return {'name': best_name, 'confidence': best_score}

In [8]:
def get_resnet152():
    model_id = "Wespeaker/wespeaker-voxceleb-resnet152-LM"
    model_name = model_id.replace("Wespeaker/wespeaker-", "").replace("-", "_")
    
    root_dir = hf_hub_download(model_id, filename=model_name+".onnx").replace(model_name+".onnx", "")
    
    import os
    if not os.path.isfile(root_dir+"avg_model.pt"):
        os.rename(hf_hub_download(model_id, filename=model_name+".pt"), root_dir+"avg_model.pt")
    if not os.path.isfile(root_dir+"config.yaml"):
        os.rename(hf_hub_download(model_id, filename=model_name+".yaml"), root_dir+"config.yaml")

    resnet = wespeaker.load_model_local(root_dir)

    #print("Compile model for the NPU")
    #resnet.model = intel_npu_acceleration_library.compile(resnet.model)
    
    def resnet152(pcm, sample_rate):
        return recognize(resnet, pcm, sample_rate)
    
    resnet152.__dict__['register'] = lambda *args, **kwargs: resnet.register(*args, **kwargs)
    
    return resnet152

In [9]:
llama3 = get_llama3()
print("INFO: Llama3 Ready -", llama3)

INFO: Llama3 Ready - <function get_llama3.<locals>.llama3 at 0x000002A4A1939DA0>


In [10]:
whisper = get_whisper()
print("INFO: Whisper Ready -", whisper)

INFO: Whisper Ready - <bound method WhisperModel.transcribe of <faster_whisper.transcribe.WhisperModel object at 0x000002A4B6E09150>>


In [11]:
audio = Audio()
resnet152 = get_resnet152()
print("INFO: ResNet152 Ready -", resnet152)



Compile model for the NPU
INFO: ResNet152 Ready - <function get_resnet152.<locals>.resnet152 at 0x000002A4A198CEA0>


## Model Ready

#### Insert System Chat Template to Llama3

In [12]:
system_prompt = "You are a helpful, smart, kind, and efficient Conversation Analysis and Recommendation AI System. You always fulfill the user's requests to the best of your ability. You need to keep listen to the conversations. Please answer in Korean language."

In [13]:
for chunk in llama3(system_prompt, ""):
    print(token_stream(chunk), end="", flush=True)
print()

PROMPT:
{'role': 'system', 'content': "You are a helpful, smart, kind, and efficient Conversation Analysis and Recommendation AI System. You always fulfill the user's requests to the best of your ability. You need to keep listen to the conversations. Please answer in Korean language."}
{'role': 'user', 'content': ''}

안녕하세요! 😊

저는 Chatbot입니다. 당신의 요청을 최선을 다해 응대하겠습니다. 무엇이 필요한가요? 🤔


#### Speaker Registration to ResNet293

In [14]:
speaker1 = "민서", "./SpeakerDiarization/sample_conversation/real/sentence_F.wav"
speaker2 = "연우", "./SpeakerDiarization/sample_conversation/real/sentence_M.wav"
speaker1, speaker2

(('민서', './SpeakerDiarization/sample_conversation/real/sentence_F.wav'),
 ('연우', './SpeakerDiarization/sample_conversation/real/sentence_M.wav'))

In [15]:
resnet152.register(*speaker1)
resnet152.register(*speaker2)

In [16]:
user_prompt = f"Based on the conversations between {speaker1[0]} and {speaker2[0]}, on be half of {speaker2[0]}, do recommend a new topic sentence related the current situation or their personal interests."

## Test

In [17]:
audio_path = "./SpeakerDiarization/sample_conversation/real/conversation_0530.wav"

In [18]:
segments, info = whisper(audio_path, beam_size=5, word_timestamps=False)

In [19]:
for segment, _ in zip(segments, range(5)):
    embedding = audio.crop(audio_path, Segment(segment.start, segment.end))
    speaker = resnet152(*embedding)
    print("[%s] [%.2fs -> %.2fs] %s" % (speaker['name'], segment.start, segment.end, segment.text))
    ChatHistory.add_messages(speaker['name'], segment.text)
    del embedding, speaker
    gc.collect()

[민서] [0.00s -> 3.00s]  요즘 가족들이 잘 지내고 계신가요?
[연우] [3.00s -> 6.00s]  네, 가족들은 잘 지내고 있어요
[민서] [6.00s -> 10.00s]  저는 아이들과 스포츠를 즐기는 걸 좋아해요
[민서] [10.00s -> 13.00s]  어떤 종류의 스포츠를 함께 하시나요?
[연우] [13.00s -> 16.00s]  주로 축구와 테니스를 함께 하고 있어요


In [20]:
for chunk in llama3(system_prompt, user_prompt):
    print(token_stream(chunk), end="", flush=True)
print()

PROMPT:
{'role': 'system', 'content': "You are a helpful, smart, kind, and efficient Conversation Analysis and Recommendation AI System. You always fulfill the user's requests to the best of your ability. You need to keep listen to the conversations. Please answer in Korean language."}
{'role': '민서', 'content': ' 요즘 가족들이 잘 지내고 계신가요?'}
{'role': '연우', 'content': ' 네, 가족들은 잘 지내고 있어요'}
{'role': '민서', 'content': ' 저는 아이들과 스포츠를 즐기는 걸 좋아해요'}
{'role': '민서', 'content': ' 어떤 종류의 스포츠를 함께 하시나요?'}
{'role': '연우', 'content': ' 주로 축구와 테니스를 함께 하고 있어요'}
{'role': 'user', 'content': 'Based on the conversations between 민서 and 연우, on be half of 연우, do recommend a new topic sentence related the current situation or their personal interests.'}

Based on the conversation, I can see that 민서 and 연우 share an interest in sports, particularly football (soccer) and tennis. Considering this, I'll recommend a new topic sentence to continue the conversation:

"저는 최근에 축구 월드컵이 얼마나 재미있는지 알아요?" (I'm curious about how excit

In [21]:
for segment, _ in zip(segments, range(5)):
    embedding = audio.crop(audio_path, Segment(segment.start, segment.end))
    speaker = resnet152(*embedding)
    print("[%s] [%.2fs -> %.2fs] %s" % (speaker['name'], segment.start, segment.end, segment.text))
    ChatHistory.add_messages(speaker['name'], segment.text)
    del embedding, speaker
    gc.collect()

[민서] [20.00s -> 25.00s]  축구와 테니스는 정말 가족끼리 함께하기 좋은 스포츠죠
[민서] [25.00s -> 31.00s]  활동적인 시간을 보내면서 가족 간의 유대감도 높일 수 있을 거예요
[민서] [31.00s -> 38.00s]  다음에는 함께 스포츠를 즐기며 가족끼리 더 많은 시간을 보내는 건 어떨까요?
[연우] [38.00s -> 40.00s]  네, 좋아요
[연우] [40.00s -> 45.00s]  함께 활동을 하면서 가족 간의 유대감을 높이는 건 정말 좋은 생각이에요


In [22]:
for chunk in llama3(system_prompt, user_prompt):
    print(token_stream(chunk), end="", flush=True)
print()

PROMPT:
{'role': 'system', 'content': "You are a helpful, smart, kind, and efficient Conversation Analysis and Recommendation AI System. You always fulfill the user's requests to the best of your ability. You need to keep listen to the conversations. Please answer in Korean language."}
{'role': '민서', 'content': ' 요즘 가족들이 잘 지내고 계신가요?'}
{'role': '연우', 'content': ' 네, 가족들은 잘 지내고 있어요'}
{'role': '민서', 'content': ' 저는 아이들과 스포츠를 즐기는 걸 좋아해요'}
{'role': '민서', 'content': ' 어떤 종류의 스포츠를 함께 하시나요?'}
{'role': '연우', 'content': ' 주로 축구와 테니스를 함께 하고 있어요'}
{'role': '민서', 'content': ' 축구와 테니스는 정말 가족끼리 함께하기 좋은 스포츠죠'}
{'role': '민서', 'content': ' 활동적인 시간을 보내면서 가족 간의 유대감도 높일 수 있을 거예요'}
{'role': '민서', 'content': ' 다음에는 함께 스포츠를 즐기며 가족끼리 더 많은 시간을 보내는 건 어떨까요?'}
{'role': '연우', 'content': ' 네, 좋아요'}
{'role': '연우', 'content': ' 함께 활동을 하면서 가족 간의 유대감을 높이는 건 정말 좋은 생각이에요'}
{'role': 'user', 'content': 'Based on the conversations between 민서 and 연우, on be half of 연우, do recommend a new topic sentence related the current situ