In [12]:
import os
import random
import subprocess

import datasets
import IPython.display as ipd
import librosa
import numpy as np
import pandas as pd
import soundfile as sf
import torch
import torchaudio
from ctcdecode import CTCBeamDecoder
from datasets import load_metric
from IPython.display import HTML, display
from ipywebrtc import AudioRecorder, CameraStream
from transformers import (
    AutoModelForSequenceClassification,
    AutoTokenizer,
    Wav2Vec2CTCTokenizer,
    Wav2Vec2FeatureExtractor,
    Wav2Vec2ForCTC,
    Wav2Vec2Processor,
)

In [3]:
# Pin GPU enumeration to PCI bus order and expose only device index 1
# to this process.
os.environ.update(
    CUDA_DEVICE_ORDER="PCI_BUS_ID",
    CUDA_VISIBLE_DEVICES="1",
)

In [4]:
# Character-level CTC tokenizer built from the local vocabulary file;
# "|" is the token that marks word boundaries.
tokenizer = Wav2Vec2CTCTokenizer(
    "./ASR_vocab/vocab.json",
    unk_token="[UNK]",
    pad_token="[PAD]",
    word_delimiter_token="|",
)

# Feature extractor with the library defaults, bundled with the tokenizer
# into a single processor object.
feature_extractor = Wav2Vec2FeatureExtractor()
processor = Wav2Vec2Processor(
    feature_extractor=feature_extractor,
    tokenizer=tokenizer,
)

In [5]:
# Token strings ordered by id, so list position == token id.
vocab = processor.tokenizer.convert_ids_to_tokens(
    range(processor.tokenizer.vocab_size)
)

# Swap the word-delimiter token for a real space so decoded text (and the
# language model) sees ordinary word boundaries.
space_ix = vocab.index('|')
vocab[space_ix] = ' '

# Beam-search CTC decoder that also scores hypotheses with a KenLM model.
# The CTC blank symbol is the tokenizer's padding token.
ctcdecoder = CTCBeamDecoder(
    vocab,
    model_path=os.path.join('kenlm_model/kenlm_vi', "lm.binary"),
    alpha=1.2359788738676971,
    beta=4.966395194796742,
    cutoff_top_n=40,
    cutoff_prob=1.0,
    beam_width=100,
    num_processes=4,
    blank_id=processor.tokenizer.pad_token_id,
    log_probs_input=True,
)

In [6]:
# Load the CTC speech-recognition model from the local ./ASR_model directory.
ASR_model = Wav2Vec2ForCTC.from_pretrained("./ASR_model")

In [48]:
def ASR_predict(wav_file):
    """Transcribe an audio file using greedy (argmax) CTC decoding.

    Args:
        wav_file: Path to an audio file that ``librosa.load`` can read.

    Returns:
        str: The transcript decoded from the most likely token at each frame.
    """
    # Use the GPU when one is available, otherwise run on the CPU.
    device = "cuda" if torch.cuda.is_available() else "cpu"
    ASR_model.to(device)

    # librosa resamples to 16 kHz on load; with its default settings the
    # result is a 1-D mono NumPy array, so no flattening is needed.
    sig, _ = librosa.load(wav_file, sr=16000)

    # The feature extractor works on host (NumPy) data, so the raw signal
    # stays on the CPU; only the resulting model input is moved to the device.
    input_values = processor(
        sig, sampling_rate=16000, return_tensors="pt"
    ).input_values.to(device)

    with torch.no_grad():
        logits = ASR_model(input_values).logits

    pred_ids = torch.argmax(logits, dim=-1)
    return processor.batch_decode(pred_ids)[0]

In [16]:
def ASR_predict_beam_search(wav_file):
    """Transcribe an audio file using beam search with language-model scoring.

    Args:
        wav_file: Path to an audio file that ``librosa.load`` can read.

    Returns:
        str: The text of the top-ranked beam, with surrounding whitespace
        stripped.
    """
    # Use the GPU when one is available, otherwise run on the CPU.
    device = "cuda" if torch.cuda.is_available() else "cpu"
    ASR_model.to(device)

    # librosa resamples to 16 kHz on load; with its default settings the
    # result is a 1-D mono NumPy array, so no flattening is needed.
    sig, _ = librosa.load(wav_file, sr=16000)

    # The feature extractor works on host (NumPy) data, so the raw signal
    # stays on the CPU; only the resulting model input is moved to the device.
    input_values = processor(
        sig, sampling_rate=16000, return_tensors="pt"
    ).input_values.to(device)

    with torch.no_grad():
        logits = ASR_model(input_values).logits

    # The decoder is configured with log_probs_input=True, so it must be fed
    # log-probabilities. Raw logits are unnormalised, which would put the
    # acoustic scores on the wrong scale relative to the language-model
    # weights (alpha/beta).
    log_probs = torch.log_softmax(logits, dim=-1).cpu()

    beam_results, beam_scores, timesteps, out_lens = ctcdecoder.decode(log_probs)

    # First batch item, best beam; only the first out_lens entries are valid.
    best_ids = beam_results[0][0][:out_lens[0][0]]
    return "".join(vocab[n] for n in best_ids).strip()

In [26]:
# Tokenizer for the text classifier, loaded by its "vinai/phobert-base" identifier.
SA_tokenizer = AutoTokenizer.from_pretrained("vinai/phobert-base")

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [27]:
# Load the sequence-classification model from the local ./SA_model directory.
SA_model = AutoModelForSequenceClassification.from_pretrained("./SA_model")

In [28]:
def SA_predict(text):
    """Classify a piece of text and return the predicted class index.

    Args:
        text: The input string to classify.

    Returns:
        int: Index of the highest-scoring class in the model's output logits.
    """
    # Use the GPU when one is available, otherwise run on the CPU.
    device = "cuda" if torch.cuda.is_available() else "cpu"
    SA_model.to(device)

    # Truncate long inputs instead of letting them overflow the model's
    # positional embeddings. 256 is the PhoBERT sequence limit; this assumes
    # ./SA_model was fine-tuned from PhoBERT -- TODO confirm.
    input_values = SA_tokenizer(
        text, return_tensors="pt", truncation=True, max_length=256
    ).to(device)

    with torch.no_grad():
        logits = SA_model(**input_values).logits

    result = torch.argmax(logits, dim=-1)
    return result.item()

In [29]:
# Smoke test: classify one sample comment; the value shown is the predicted class index.
SA_predict("đừng bảo cảm_ơn mà bị chửi nếu đéo biết cái giải này nó trao để tôn_vinh đối_tượng như nào nha còn là nói_thẳng mấy con vô nói chỉ cảm_ơn mà bị chửi ấy thì là ngu thật là giả ngu vô kháy vài phát cảm_ơn teddy thì viết hoa tên ngta đàng_hoàng chút với cả lựa đúng bài mà vô_cảm ơn ha cmt ngu ngục đi kháy mà ko dám nhận bị ngta chửi kêu chỉ cảm_ơn")

2

In [49]:
# Human-readable label for each class index returned by SA_predict.
dic = ["CLEAN", "OFFENSIVE BUT NOT HATE", "HATE"]


def ASR2SA_predict(wav_file):
    """Transcribe an audio file, classify the transcript, and print both.

    Args:
        wav_file: Path to the audio file, passed straight to ASR_predict.

    Prints the transcript first and the predicted label second; nothing is
    returned.
    """
    transcript = ASR_predict(wav_file)
    print("text:", transcript)

    # The classifier is given the lower-cased transcript.
    label_id = SA_predict(transcript.lower())
    print("SA:", dic[label_id])

In [40]:
# Media stream that requests audio only (no video), wrapped in a recorder
# widget.
camera = CameraStream(
    constraints={
        'audio': True,
        'video': False,
    }
)
recorder = AudioRecorder(stream=camera)

# Bare last expression so the notebook renders the recorder widget.
recorder

AudioRecorder(audio=Audio(value=b'', format='webm'), stream=CameraStream(constraints={'audio': True, 'video': …

In [70]:
# Save the recording captured by the recorder widget, convert it to mono WAV
# with ffmpeg, then run the speech-to-label pipeline on it.
audio_bytes = recorder.audio.value
if not audio_bytes:
    # Without this guard ffmpeg would fail quietly and a stale file.wav from
    # an earlier run could be classified instead.
    raise ValueError("No audio recorded: record a clip with the recorder widget first.")

os.makedirs('./temp', exist_ok=True)
with open('./temp/recording.webm', 'wb') as f:
    f.write(audio_bytes)

# check=True raises CalledProcessError if the conversion fails; the original
# shell call discarded ffmpeg's exit status and its log output.
subprocess.run(
    [
        "ffmpeg", "-i", "./temp/recording.webm",
        "-ac", "1", "-f", "wav", "./temp/file.wav",
        "-y", "-hide_banner", "-loglevel", "panic",
    ],
    check=True,
)
ASR2SA_predict('./temp/file.wav')

text: ANH THƯƠNG EM MÀ
SA: CLEAN
