<a href="https://colab.research.google.com/github/bygreencn/SenseVoice_Colab/blob/main/SenseVoice_Colab.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Clone the SenseVoice repository into ./SenseVoice under the current working directory.
!git clone https://github.com/FunAudioLLM/SenseVoice.git

In [None]:
# Install the SenseVoice requirements with the %pip magic so the packages are
# installed into the environment of the running kernel (a `!pip` call runs in a
# subshell and may resolve to a different interpreter).
%pip install -r SenseVoice/requirements.txt

In [None]:
# Upgrade funasr and funasr-onnx via the %pip magic so the install targets the
# running kernel's environment.
%pip install -U funasr funasr-onnx

In [None]:
# Clone the FSMN-VAD repository into ./FSMN-VAD; later cells read
# ./FSMN-VAD/example/vad_example.wav from this checkout.
!git clone https://huggingface.co/funasr/FSMN-VAD

In [None]:
from funasr import AutoModel
from funasr.utils.postprocess_utils import rich_transcription_postprocess

# Model identifier handed to AutoModel.
model_dir = "iic/SenseVoiceSmall"

# Build SenseVoiceSmall with the fsmn-vad model attached for segmentation.
model = AutoModel(
    model=model_dir,
    trust_remote_code=True,
    # The repository is cloned into ./SenseVoice and the kernel's working
    # directory is never changed (`!cd` only affects its own subshell), so the
    # model code lives at ./SenseVoice/model.py, not ./model.py.
    remote_code="./SenseVoice/model.py",
    vad_model="fsmn-vad",
    vad_kwargs={"max_single_segment_time": 30000},  # presumably milliseconds - TODO confirm
    device=None,
)

# Transcribe the bundled English sample, then the Chinese one, using the same
# generation settings for both. `res` and `text` keep the values of the last
# sample (zh), exactly as when the two calls were written out separately.
for sample_name in ("en.mp3", "zh.mp3"):
    res = model.generate(
        input=f"{model.model_path}/example/{sample_name}",
        cache={},
        language="auto",  # "zh", "en", "yue", "ja", "ko", "nospeech"
        use_itn=True,
        batch_size_s=60,
        merge_vad=True,
        merge_length_s=15,
    )
    # Post-process the raw text of the first result entry before printing it.
    text = rich_transcription_postprocess(res[0]["text"])
    print(text)

**Non-streaming**

In [None]:
from funasr import AutoModel
# paraformer-zh is a multi-functional asr model
# use vad, punc, spk or not as you need
model = AutoModel(model="paraformer-zh", model_revision="v2.0.4",
                  vad_model="fsmn-vad", vad_model_revision="v2.0.4",
                  punc_model="ct-punc-c", punc_model_revision="v2.0.4",
                  # spk_model="cam++", spk_model_revision="v2.0.2",
                  )
res = model.generate(input=f"{model.model_path}/example/asr_example.wav",
                     batch_size_s=300,
                     hotword='魔搭')
print(res)

**Streaming**

In [None]:
import os

import soundfile
from funasr import AutoModel

chunk_size = [0, 10, 5]  # [0, 10, 5] 600ms, [0, 8, 4] 480ms
encoder_chunk_look_back = 4  # number of chunks to lookback for encoder self-attention
decoder_chunk_look_back = 1  # number of encoder chunks to lookback for decoder cross-attention

# Streaming Paraformer only; the optional companion models stay disabled.
model = AutoModel(
    model="paraformer-zh-streaming",
    model_revision="v2.0.4",
    # vad_model="fsmn-vad", vad_model_revision="v2.0.4",
    # punc_model="ct-punc-c", punc_model_revision="v2.0.4",
    # spk_model="cam++", spk_model_revision="v2.0.2",
)

# wav_file = os.path.join(model.model_path, "example/asr_example.wav")
wav_file = "./FSMN-VAD/example/vad_example.wav"
speech, sample_rate = soundfile.read(wav_file)
chunk_stride = chunk_size[1] * 960  # samples per chunk; 600ms per the note above

# `cache` is passed to every generate() call so state can carry across chunks.
cache = {}
# Ceiling division of the sample count by the stride. The previous expression
# `len((speech)-1)` subtracted 1 from the array instead of from its length, so
# an exact multiple of the stride produced one extra, empty final chunk.
total_chunk_num = (len(speech) - 1) // chunk_stride + 1
for i in range(total_chunk_num):
    speech_chunk = speech[i * chunk_stride:(i + 1) * chunk_stride]
    is_final = i == total_chunk_num - 1  # flag the last chunk for the model
    res = model.generate(
        input=speech_chunk,
        cache=cache,
        is_final=is_final,
        chunk_size=chunk_size,
        encoder_chunk_look_back=encoder_chunk_look_back,
        decoder_chunk_look_back=decoder_chunk_look_back,
    )
    print(res)

In [None]:
# Show the model_path attribute of the `model` currently held by the kernel
# (the streaming Paraformer when the cells are run top to bottom).
print(model.model_path)

**Voice Activity Detection (Non-Streaming)**

In [None]:
from funasr import AutoModel

# Load the fsmn-vad model at a pinned revision.
model = AutoModel(model="fsmn-vad", model_revision="v2.0.4")

# Alternative input: f"{model.model_path}/example/asr_example.wav"
wav_file = "./FSMN-VAD/example/vad_example.wav"

# Process the whole file in a single call and print the raw result.
res = model.generate(
    input=wav_file,
    data_type="sound",
)
print(res)

**Voice Activity Detection (Streaming)**

In [None]:
import soundfile
from funasr import AutoModel

chunk_size = 200  # ms

model = AutoModel(model="fsmn-vad", model_revision="v2.0.4")

wav_file = f"{model.model_path}/example/vad_example.wav"
speech, sample_rate = soundfile.read(wav_file)
# Convert the chunk duration in milliseconds to a number of samples.
chunk_stride = int(chunk_size * sample_rate / 1000)

# `cache` is passed to every generate() call so state can carry across chunks.
cache = {}
# Ceiling division of the sample count by the stride. The previous expression
# `len((speech)-1)` subtracted 1 from the array instead of from its length, so
# an exact multiple of the stride produced one extra, empty final chunk.
total_chunk_num = (len(speech) - 1) // chunk_stride + 1
for i in range(total_chunk_num):
    speech_chunk = speech[i * chunk_stride:(i + 1) * chunk_stride]
    is_final = i == total_chunk_num - 1  # flag the last chunk for the model
    res = model.generate(
        input=speech_chunk,
        cache=cache,
        is_final=is_final,
        chunk_size=chunk_size,
    )
    # Only print chunks whose first result entry carries a non-empty "value".
    if len(res[0]["value"]):
        print(res)

**Run with ONNX (offline test)**

In [None]:
from funasr_onnx import Paraformer
from pathlib import Path

model_dir = "damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch"
# Alternative model with VAD and punctuation:
# model_dir = "damo/speech_paraformer-large-vad-punc_asr_nat-zh-cn-16k-common-vocab8404-pytorch"

# Quantized model, one utterance per batch.
model = Paraformer(
    model_dir,
    batch_size=1,
    quantize=True,
    disable_update=True,
)
# GPU variant:
# model = Paraformer(model_dir, batch_size=1, device_id=0)

# With the paraformer-large-vad-punc model, plot_timestamp_to="./xx.png" also
# saves an alignment figure next to the timestamps:
# model = Paraformer(model_dir, batch_size=1, plot_timestamp_to="test.png")

# Example recording inside the model's directory under the user's modelscope
# cache; the model is called with a one-element list of paths.
wav_path = [
    "{}/.cache/modelscope/hub/{}/example/asr_example.wav".format(
        Path.home(), model_dir
    )
]

result = model(wav_path)
print(result)

In [None]:
# List the modelscope cache: the damo/ directory, the Paraformer model directory,
# and its example/ subdirectory (newest entries first).
!ls -alt ~/.cache/modelscope/hub/damo/
!ls -lat ~/.cache/modelscope/hub/damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch
!ls -lat ~/.cache/modelscope/hub/damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch/example

**FSMN-VAD**

In [None]:
from funasr_onnx import Fsmn_vad
from pathlib import Path

model_dir = "damo/speech_fsmn_vad_zh-cn-16k-common-pytorch"

# Quantized offline VAD model.
model = Fsmn_vad(model_dir, quantize=True)

# Example recording inside the model's directory under the user's modelscope cache.
wav_path = "{}/.cache/modelscope/hub/damo/speech_fsmn_vad_zh-cn-16k-common-pytorch/example/vad_example.wav".format(
    Path.home()
)

# Run detection on the file and print the result.
result = model(wav_path)
print(result)

In [None]:
# List the modelscope cache: the damo/ directory, the FSMN-VAD model directory,
# and its example/ subdirectory (newest entries first).
!ls -alt ~/.cache/modelscope/hub/damo/
!ls -lat ~/.cache/modelscope/hub/damo/speech_fsmn_vad_zh-cn-16k-common-pytorch
!ls -lat ~/.cache/modelscope/hub/damo/speech_fsmn_vad_zh-cn-16k-common-pytorch/example

**FSMN-VAD-online**

In [None]:
from funasr_onnx import Fsmn_vad_online
import soundfile
from pathlib import Path

model_dir = "damo/speech_fsmn_vad_zh-cn-16k-common-pytorch"
wav_path = '{}/.cache/modelscope/hub/damo/speech_fsmn_vad_zh-cn-16k-common-pytorch/example/vad_example.wav'.format(Path.home())

model = Fsmn_vad_online(model_dir)


## online vad
speech, sample_rate = soundfile.read(wav_path)
speech_length = speech.shape[0]

step = 1600  # samples fed to the model per call
# The same dict is passed to every call so the model can keep state in it.
param_dict = {'in_cache': []}

# Advance by the length of the chunk actually sent. With the former
# `range(0, speech_length, step)` the stride was frozen at loop start, so when
# exactly `step + 1` samples remained the final chunk was followed by a second,
# one-sample "final" call; zero-length audio made range() raise ValueError.
sample_offset = 0
while sample_offset < speech_length:
    # The last chunk absorbs whatever is left (including a single trailing
    # sample) and is flagged as final.
    is_final = sample_offset + step >= speech_length - 1
    chunk_len = speech_length - sample_offset if is_final else step
    param_dict['is_final'] = is_final
    segments_result = model(audio_in=speech[sample_offset: sample_offset + chunk_len],
                            param_dict=param_dict)
    # Print only when the call returned something non-empty.
    if segments_result:
        print(segments_result)
    sample_offset += chunk_len

In [None]:
# List the modelscope cache again after the online VAD run: the damo/ directory,
# the FSMN-VAD model directory, and its example/ subdirectory.
!ls -alt ~/.cache/modelscope/hub/damo/
!ls -lat ~/.cache/modelscope/hub/damo/speech_fsmn_vad_zh-cn-16k-common-pytorch
!ls -lat ~/.cache/modelscope/hub/damo/speech_fsmn_vad_zh-cn-16k-common-pytorch/example

**CT-Transformer**

In [None]:
from funasr_onnx import CT_Transformer

model_dir = "damo/punc_ct-transformer_zh-cn-common-vocab272727-pytorch"

# Unpunctuated Chinese passage used as the model input.
text_in = "跨境河流是养育沿岸人民的生命之源长期以来为帮助下游地区防灾减灾中方技术人员在上游地区极为恶劣的自然条件下克服巨大困难甚至冒着生命危险向印方提供汛期水文资料处理紧急事件中方重视印方在跨境河流问题上的关切愿意进一步完善双方联合工作机制凡是中方能做的我们都会去做而且会做得更好我请印度朋友们放心中国在上游的任何开发利用都会经过科学规划和论证兼顾上下游的利益"

# Quantized offline punctuation model.
model = CT_Transformer(model_dir, quantize=True)

# Only the first element of the returned result is printed.
result = model(text_in)
print(result[0])

In [None]:
# List the modelscope cache: the damo/ directory, the CT-Transformer model
# directory, and its example/ subdirectory (newest entries first).
!ls -alt ~/.cache/modelscope/hub/damo/
!ls -lat ~/.cache/modelscope/hub/damo/punc_ct-transformer_zh-cn-common-vocab272727-pytorch
!ls -lat ~/.cache/modelscope/hub/damo/punc_ct-transformer_zh-cn-common-vocab272727-pytorch/example

**CT-Transformer-online**

In [None]:
from funasr_onnx import CT_Transformer_VadRealtime

model_dir = "damo/punc_ct-transformer_zh-cn-common-vad_realtime-vocab272727"
model = CT_Transformer_VadRealtime(model_dir, quantize=True)

# Unpunctuated passage pre-cut into segments with "|" as the separator. A stray
# ">" that had slipped into the text (absent from the same sentence in the
# offline CT-Transformer cell) has been removed so it is no longer fed to the model.
text_in = "跨境河流是养育沿岸|人民的生命之源长期以来为帮助下游地区防灾减灾中方技术人员|在上游地区极为恶劣的自然条件下克服巨大困难甚至冒着生命危险|向印方提供汛期水文资料处理紧急事件中方重视印方在跨境河流问题上的关切|愿意进一步完善双方联合工作机制|凡是|中方能做的我们|都会去做而且会做得更好我请印度朋友们放心中国在上游的|任何开发利用都会经过科学|规划和论证兼顾上下游的利益"

vads = text_in.split("|")
rec_result_all = ""
# The same dict is passed to every call so the model can keep state in it
# between segments.
param_dict = {"cache": []}
for vad in vads:
    result = model(vad, param_dict=param_dict)
    # Append the first element of each segment's result to the running output.
    rec_result_all += result[0]

print(rec_result_all)

In [None]:
# List the modelscope cache: the damo/ directory, the realtime CT-Transformer
# model directory, and its example/ subdirectory (newest entries first).
!ls -alt ~/.cache/modelscope/hub/damo/
!ls -lat ~/.cache/modelscope/hub/damo/punc_ct-transformer_zh-cn-common-vad_realtime-vocab272727
!ls -lat ~/.cache/modelscope/hub/damo/punc_ct-transformer_zh-cn-common-vad_realtime-vocab272727/example

In [None]:
# Mount Google Drive at /content/gd/ so the cells below can copy the cached
# models under /content/gd/MyDrive/.
from google.colab import drive
drive.mount('/content/gd/')

In [None]:
# Create the destination folder on Drive; -p makes the cell safe to re-run when
# the folder already exists.
!mkdir -p /content/gd/MyDrive/SenseVoice

In [None]:
# Show the current contents of the Drive destination folder.
!ls -alt /content/gd/MyDrive/SenseVoice

In [None]:
# WARNING: destructive - removes everything matched by the glob inside the Drive
# destination folder before the fresh copy below.
!rm -rf /content/gd/MyDrive/SenseVoice/*

In [None]:
# Copy the four model directories from the modelscope cache to the Drive
# folder: Paraformer ASR, FSMN-VAD, CT-Transformer and realtime CT-Transformer.
!cp -rf ~/.cache/modelscope/hub/damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch /content/gd/MyDrive/SenseVoice/
!cp -rf ~/.cache/modelscope/hub/damo/speech_fsmn_vad_zh-cn-16k-common-pytorch /content/gd/MyDrive/SenseVoice/
!cp -rf ~/.cache/modelscope/hub/damo/punc_ct-transformer_zh-cn-common-vocab272727-pytorch /content/gd/MyDrive/SenseVoice/
!cp -rf ~/.cache/modelscope/hub/damo/punc_ct-transformer_zh-cn-common-vad_realtime-vocab272727 /content/gd/MyDrive/SenseVoice/