In [1]:
import pyaudio
import numpy as np
import librosa
import torch
import whisper
import sys

common_path = '/Users/yiding/personal_projects/ML/github_repo/littleSeven/'
if common_path not in sys.path:
    sys.path.append(common_path)

from common.config import cfg

In [2]:
FORMAT = pyaudio.paInt16

p = pyaudio.PyAudio()

# open the audio streaming
stream = p.open(
    format=FORMAT,
    channels=cfg.audio_channels,
    rate=cfg.sample_rate,
    input=True,
    frames_per_buffer=cfg.frame_chunk,
)

print("start recording...")

# record voice
frames = []
for i in range(0, int(cfg.sample_rate / cfg.frame_chunk * cfg.voice_duration)):
    data = stream.read(cfg.frame_chunk)
    frames.append(data)

print("recording completed")

# stop audio streaming
stream.stop_stream()
stream.close()
p.terminate()

# convert audio to NumPy array（16-bit PCM format）
audio_data = np.frombuffer(b"".join(frames), dtype=np.int16)

start recording...
recording completed


In [3]:
audio_data

array([  32,   74,   90, ..., -199, -191, -194], dtype=int16)

In [4]:
# convert int-typed audio data into float-type and do normalization to [-1,1]
audio_data_float = audio_data.astype(np.float32) / np.max(np.abs(audio_data))

# adjust the audio to 16000HZ sample rate
audio_data_resampled = librosa.resample(
    audio_data_float, orig_sr=cfg.sample_rate, target_sr=16000
)

# normalize the audio to [-1,1]
audio_data_resampled = audio_data_resampled / np.max(np.abs(audio_data_resampled))

# prepare data as a tensor
input_audio = torch.tensor(audio_data_resampled, dtype=torch.float32)

In [5]:
input_audio

tensor([ 0.0091,  0.0242,  0.0211,  ..., -0.0557, -0.0594, -0.0446])

In [6]:
input_audio.shape

torch.Size([79877])

In [7]:
whisper.available_models()

['tiny.en',
 'tiny',
 'base.en',
 'base',
 'small.en',
 'small',
 'medium.en',
 'medium',
 'large-v1',
 'large-v2',
 'large-v3',
 'large',
 'large-v3-turbo',
 'turbo']

In [8]:
## transcribe speech to text

# load whisper model
model = whisper.load_model(
    name=cfg.whisper_model_name, download_root=cfg.whisper_model_path
)



  checkpoint = torch.load(fp, map_location=device)


In [9]:
torch.backends.mps.is_available()

True

In [10]:
# model = model.to("mps")

In [11]:
result = model.transcribe(input_audio)

# print output
print("text:", result['text'])



text: Hello Hello Hello How are you?


In [12]:
## IF using whisper package, it would be quite slow when runing model.
# So try the transformer package and send the model to MPS for acceleration

from transformers import WhisperProcessor, WhisperForConditionalGeneration
# load model and processor
processor = WhisperProcessor.from_pretrained("openai/whisper-large",cache_dir='./whisper_model')
model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-large",cache_dir='./whisper_model')
model.config.forced_decoder_ids = None

  from .autonotebook import tqdm as notebook_tqdm


In [13]:
input_features = processor(input_audio, sampling_rate=16000, return_tensors="pt").input_features

In [14]:
# acceleration

model=model.to('mps')


In [15]:
input_features=input_features.to("mps")

In [16]:
predicted_ids = model.generate(input_features)


Due to a bug fix in https://github.com/huggingface/transformers/pull/28687 transcription using a multilingual Whisper will default to language detection followed by transcription instead of translation to English.This might be a breaking change for your use case. If you want to instead always translate your audio to English, make sure to pass `language='en'`.
Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.43.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.
The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.


In [17]:
transcription = processor.batch_decode(predicted_ids, skip_special_tokens=True)


In [18]:
transcription[0]

'Hello, hello, hello, how are you?'

In [19]:
# # for ease of use, add transformer pipeline here

# from transformers import pipeline

# transformer_pipe=pipeline(task="automatic-speech-recognition",processor=processor,model=model,tokenizer=processor.tokenizer,feature_extractor=processor.feature_extractor)

In [20]:
# next i would add Voice Activity Detection (VAD) so that the process could recevice custom length voice

In [21]:
# # Import the py-webrtcvad library
# import webrtcvad

# # Initialize a vad object
# vad = webrtcvad.Vad()

# # Run the VAD on 10 ms of silence and 16000 sampling rate 
# sample_rate = 16000
# frame_duration = 10  # in ms

# # Creating an audio frame of silence
# frame = b'\x00\x00' * int(sample_rate * frame_duration / 1000)

# # Detecting speech
# print(f'Contains speech: {vad.is_speech(frame, sample_rate)}')

In [23]:
import pyaudio
import webrtcvad
import numpy as np
import collections
import wave
import time

# set up parameters
SAMPLE_RATE = 16000  
FRAME_DURATION = 10  # duration tome for each frame，/ms
FRAME_SIZE = int(SAMPLE_RATE * FRAME_DURATION / 1000) 
VAD_MODE = 1  # VAD mode
SILENCE_LIMIT = 100  # voice tolerance
SPEECH_TIMEOUT = 1 

# initialize WebRTC VAD
vad = webrtcvad.Vad(VAD_MODE)

# initialize PyAudio
p = pyaudio.PyAudio()

stream = p.open(format=pyaudio.paInt16,
                channels=1,
                rate=SAMPLE_RATE,
                input=True,
                frames_per_buffer=FRAME_SIZE)

print("start recording...")


frames = collections.deque(maxlen=SAMPLE_RATE * 100)  
no_speech_count = 0 
start_time = time.time()  

while True:
    audio_frame = stream.read(FRAME_SIZE)  
    frames.append(audio_frame)  
    
    audio_data = np.frombuffer(audio_frame, dtype=np.int16)
    
    is_speech = vad.is_speech(audio_data.tobytes(), SAMPLE_RATE)
    
    if is_speech:
        print("detect voice...")
        no_speech_count = 0  
        start_time = time.time() 
    else:
        print("with no voice...")
        no_speech_count += 1 
    
    if no_speech_count >= SILENCE_LIMIT:
        print("end recording...")
        break  

# stop audio streaming
stream.stop_stream()
stream.close()
p.terminate()

# save audio as wav file
filename = "./output.wav"
with wave.open(filename, 'wb') as wf:
    wf.setnchannels(1)  # 单声道
    wf.setsampwidth(p.get_sample_size(pyaudio.paInt16))  
    wf.setframerate(SAMPLE_RATE)  
    wf.writeframes(b''.join(frames))  

print(f"save audio as {filename}")

start recording...
with no voice...
detect voice...
detect voice...
detect voice...
detect voice...
detect voice...
detect voice...
detect voice...
detect voice...
detect voice...
detect voice...
detect voice...
detect voice...
detect voice...
detect voice...
detect voice...
detect voice...
detect voice...
detect voice...
detect voice...
detect voice...
detect voice...
detect voice...
detect voice...
detect voice...
detect voice...
detect voice...
detect voice...
detect voice...
detect voice...
detect voice...
detect voice...
detect voice...
detect voice...
detect voice...
detect voice...
detect voice...
detect voice...
detect voice...
detect voice...
detect voice...
detect voice...
detect voice...
detect voice...
detect voice...
detect voice...
detect voice...
detect voice...
detect voice...
detect voice...
detect voice...
detect voice...
detect voice...
detect voice...
detect voice...
detect voice...
detect voice...
detect voice...
detect voice...
detect voice...
detect voice...
dete

In [24]:
import pyaudio
import webrtcvad
import numpy as np
import collections
import wave
import io
import whisper
import time
import librosa


SAMPLE_RATE = 16000  
FRAME_DURATION = 10  
FRAME_SIZE = int(SAMPLE_RATE * FRAME_DURATION / 1000) 
VAD_MODE = 1  # VAD mode
SILENCE_LIMIT = 100 

# initilization WebRTC VAD
vad = webrtcvad.Vad(VAD_MODE)

# initilization PyAudio
p = pyaudio.PyAudio()

# open audio streaming
stream = p.open(format=pyaudio.paInt16,
                channels=1,
                rate=SAMPLE_RATE,
                input=True,
                frames_per_buffer=FRAME_SIZE)

print("start recording...")

frames = collections.deque(maxlen=SAMPLE_RATE * 100)  
no_speech_count = 0  
start_time = time.time() 

while True:
    audio_frame = stream.read(FRAME_SIZE)  
    frames.append(audio_frame)  
    
    audio_data = np.frombuffer(audio_frame, dtype=np.int16)
    
    is_speech = vad.is_speech(audio_data.tobytes(), SAMPLE_RATE)
    
    if is_speech:
        print("detect voice...")
        no_speech_count = 0  
        start_time = time.time()  
    else:
        print("with no voice...")
        no_speech_count += 1 
    
    if no_speech_count >= SILENCE_LIMIT:
        print("end recording...")
        break  


stream.stop_stream()
stream.close()
p.terminate()

audio_in_memory = io.BytesIO()
with wave.open(audio_in_memory, 'wb') as wf:
    wf.setnchannels(1)  
    wf.setsampwidth(p.get_sample_size(pyaudio.paInt16))  
    wf.setframerate(SAMPLE_RATE)  
    wf.writeframes(b''.join(frames))  

# save audio data into memory
audio_in_memory.seek(0)
with wave.open(audio_in_memory, 'rb') as wf:
    audio_data = np.frombuffer(wf.readframes(wf.getnframes()), dtype=np.int16)

model = whisper.load_model(name="base")  

# convert audio data type to the format whisper model request
input_audio = np.float32(audio_data) / 32768.0  # normalization to [-1, 1]

result = model.transcribe(input_audio)

# output
print("text:", result['text'])

start recording...
detect voice...
detect voice...
detect voice...
detect voice...
detect voice...
detect voice...
detect voice...
detect voice...
detect voice...
detect voice...
detect voice...
detect voice...
detect voice...
detect voice...
detect voice...
detect voice...
detect voice...
detect voice...
detect voice...
detect voice...
detect voice...
detect voice...
detect voice...
detect voice...
detect voice...
detect voice...
detect voice...
detect voice...
detect voice...
detect voice...
detect voice...
detect voice...
detect voice...
detect voice...
detect voice...
detect voice...
detect voice...
detect voice...
detect voice...
detect voice...
detect voice...
detect voice...
detect voice...
detect voice...
detect voice...
detect voice...
detect voice...
detect voice...
detect voice...
detect voice...
detect voice...
detect voice...
detect voice...
detect voice...
detect voice...
detect voice...
detect voice...
detect voice...
detect voice...
detect voice...
detect voice...
detec

  checkpoint = torch.load(fp, map_location=device)


text:  Hello, hello, hello, how's going?


In [25]:
import torch
# convert int-typed audio data into float-type and do normalization to [-1,1]
audio_data_float = audio_data.astype(np.float32) / np.max(np.abs(audio_data))

# adjust the audio to 16000HZ sample rate
audio_data_resampled = librosa.resample(
    audio_data_float, orig_sr=16000, target_sr=16000
)

# normalize the audio to [-1,1]
audio_data_resampled = audio_data_resampled / np.max(np.abs(audio_data_resampled))

# prepare data as a tensor
input_audio = torch.tensor(audio_data_resampled, dtype=torch.float32)

In [26]:
input_audio

tensor([0.0028, 0.0080, 0.0080,  ..., 0.0242, 0.0211, 0.0193])

In [27]:
## transcribe speech to text

# load whisper model
model = whisper.load_model(
    name=cfg.whisper_model_name, download_root=cfg.whisper_model_path
)

In [28]:
result = model.transcribe(input_audio)

# print output
print("text:", result['text'])

text:  Hello hello hello how's it going


In [29]:
from transformers import WhisperProcessor, WhisperForConditionalGeneration
# load model and processor
processor = WhisperProcessor.from_pretrained("openai/whisper-large",cache_dir='./whisper_model')
model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-large",cache_dir='./whisper_model')
model.config.forced_decoder_ids = None

In [30]:
input_features = processor(input_audio, sampling_rate=16000, return_tensors="pt").input_features

In [31]:
# acceleration

model=model.to('mps')

In [32]:
input_features=input_features.to("mps")

In [33]:
predicted_ids = model.generate(input_features)

In [34]:
transcription = processor.batch_decode(predicted_ids, skip_special_tokens=True)

In [35]:
transcription

[" hello hello hello how's going"]