# Part I

This cell captures five seconds of speech from the microphone with the pyaudio library, converts the captured bytes into 16-bit integer samples with numpy, and renders an audio player for the recording with IPython.display.

In [3]:
#Use pyaudio for speech capture, numpy for processing data and IPython for playback
import pyaudio
import numpy as np
import IPython.display as ipd

#Initialize parameters of pyaudio speech capture
CHUNK = 1024                  #frames requested from the stream per read
resolution = pyaudio.paInt16  #sample format passed to the stream
CHANNELS = 1
sampling_rate = 16000         #samples per second
record_time = 5               #seconds to record
#Number of whole chunks that fit into record_time (a trailing partial chunk is dropped)
Expected_num_of_chunks = int(sampling_rate / CHUNK * record_time)

p = pyaudio.PyAudio()
stream = None
frames = []
try:
    stream = p.open(format=resolution, channels=CHANNELS, rate=sampling_rate, input=True, frames_per_buffer=CHUNK)
    #Get data of chunks
    for _ in range(Expected_num_of_chunks):
        frames.append(stream.read(CHUNK))
finally:
    #Stop speech capture and release the audio device even if a read fails
    if stream is not None:
        stream.stop_stream()
        stream.close()
    p.terminate()

#Decode the raw byte chunks into one array of 16-bit integer samples
intframes = np.frombuffer(b''.join(frames), dtype=np.int16)

#Display the captured speech signal using the IPython package, at the capture sampling rate
ipd.Audio(intframes, rate=sampling_rate)

# Part II

The program waits for the 's' key to be pressed and then records audio until the energy-based endpoint detector has gone about 1.5 seconds without classifying a frame as speech, at which point the recording is saved as the WAV file `1.wav`.

In [7]:
import keyboard
import time
import wave
import pyaudio
import numpy as np
import IPython.display as ipd

#Define hit-to-talk function; 's' is the default start key
def hit_to_talk(key='s', delay=1):
    """Block until the start key is pressed, pause, then prompt the speaker.

    Parameters
    ----------
    key : str, optional
        Key name passed to ``keyboard.wait``. Defaults to ``'s'``.
    delay : float, optional
        Seconds to sleep between the key press and the prompt. Defaults to 1.

    Returns
    -------
    None
    """
    keyboard.wait(key)
    #Pause between the key press and the prompt
    time.sleep(delay)
    print("Please talk")
#Define the function to calculate the energy of a frame in decibel
def EnergyPerSampleInDecibel(audioframe):
    """Return the energy of a frame of 16-bit audio samples in decibels.

    The result is ``10 * log10(sum(sample ** 2))`` over the whole frame.
    Despite the function name, the sum is not divided by the number of
    samples, so the value grows with the frame length.

    Parameters
    ----------
    audioframe : bytes-like or array-like
        Raw buffer that is reinterpreted as native-endian ``int16`` samples.

    Returns
    -------
    numpy.float64
        Frame energy in dB. A frame with zero total energy (all-zero or
        empty) yields ``0.0`` instead of ``-inf``.

    Raises
    ------
    ValueError
        If the buffer size is not a multiple of two bytes.
    """
    if isinstance(audioframe, (bytes, bytearray, memoryview)):
        raw = audioframe
    else:
        #Other inputs are reinterpreted through a contiguous array buffer
        raw = np.ascontiguousarray(audioframe)
    #Widen to int64 before squaring so the squares of int16 samples cannot overflow
    samples = np.frombuffer(raw, dtype=np.int16).astype(np.int64)
    total_energy = int(np.sum(np.square(samples)))
    #Clamp to 1 so that digital silence maps to 0 dB rather than log10(0)
    return 10 * np.log10(max(total_energy, 1))

#Define the function to calculate the initial background energy
def find_ini_background(audioframe):
    """Return the initial background value used by the endpoint detector.

    The value is the decibel energy of ``audioframe`` as computed by
    ``EnergyPerSampleInDecibel``, divided by 10.
    """
    #NOTE(review): this scales the dB figure itself by 1/10 rather than
    #averaging energies; the speech threshold elsewhere looks tuned to it -
    #confirm before changing.
    return EnergyPerSampleInDecibel(audioframe) / 10

#Define the classifyFrame function to find the endpointing
def classifyFrame(audioframe, background, level, First_run,
                  forgetfactor=1, threshold=55, adjustment=0.05, verbose=True):
    """Classify one audio frame as speech or non-speech from its energy.

    Parameters
    ----------
    audioframe : bytes-like
        Frame handed to ``EnergyPerSampleInDecibel``.
    background : float
        Background energy estimate (dB) to compare against.
    level : float
        Smoothed signal level (dB) returned by the previous call; ignored
        when ``First_run`` is true.
    First_run : bool
        When true, the level is seeded with the energy of this frame.
    forgetfactor : float, optional
        Weight of the previous level in the smoothing average. Defaults to 1.
    threshold : float, optional
        Minimum ``level - background`` (dB) for a speech decision.
        Defaults to 55.
    adjustment : float, optional
        Fraction by which the background moves towards a louder frame.
        Defaults to 0.05.
    verbose : bool, optional
        When true (default), print ``level - background`` for every frame.

    Returns
    -------
    tuple
        ``(level, isSpeech)``: the updated smoothed level and a bool.
    """
    current = EnergyPerSampleInDecibel(audioframe)
    if First_run:
        #Seed the smoothed level with the energy of the first frame
        level = current
    else:
        #Weighted average of the previous level and the current energy
        level = ((level * forgetfactor) + current) / (1 + forgetfactor)

    #Background drops immediately to a quieter frame and rises slowly otherwise.
    #NOTE: the adapted background is local to this call - only level and
    #isSpeech are returned, so the caller's background value is not updated.
    if current < background:
        background = current
    else:
        background += (current - background) * adjustment

    if verbose:
        print('level-background', level - background)
    #The level is never allowed to fall below the background
    if level < background:
        level = background
    isSpeech = bool(level - background > threshold)
    return level, isSpeech


def record(filename='1.wav', stop_record_time=1.5):
    """Record audio until speech has stopped for long enough, then save it as a WAV file.

    The first read is ten chunks long and seeds the background estimate via
    ``find_ini_background``; every later read is one chunk. Each frame is
    classified with ``classifyFrame``; recording ends once consecutive
    non-speech iterations add up to ``stop_record_time``.

    Parameters
    ----------
    filename : str, optional
        Path of the WAV file to write. Defaults to ``'1.wav'``.
    stop_record_time : float, optional
        Seconds of consecutive non-speech that end the recording.
        Defaults to 1.5.

    Returns
    -------
    None
    """
    #parameters for recording
    CHUNK = 1600
    resolution = pyaudio.paInt16
    CHANNELS = 1
    sampling_rate = 16000
    #Time credited to one loop iteration: 1600 / 16000 = 0.1 s
    frame_duration = CHUNK / sampling_rate
    #Consecutive non-speech iterations; an integer count avoids float accumulation drift
    silent_iterations = 0
    frames = []
    level = 0
    First_run = True

    p = pyaudio.PyAudio()
    stream = None
    try:
        stream = p.open(format=resolution, channels=CHANNELS, rate=sampling_rate, input=True, frames_per_buffer=CHUNK)
        while True:
            if First_run:
                #First run: read 10 chunks and derive the initial background from them
                audioframe = stream.read(CHUNK * 10)
                background = find_ini_background(audioframe)
            else:
                #Otherwise read a single chunk for classification
                audioframe = stream.read(CHUNK)
            #Save the data of each read in the list 'frames'
            frames.append(audioframe)
            level, isSpeech = classifyFrame(audioframe, background, level, First_run)
            #After the first iteration set First_run as False
            First_run = False
            #Every iteration is credited one frame_duration, including the longer first read
            if isSpeech:
                silent_iterations = 0
            else:
                silent_iterations += 1
            #Stop once the energy has stayed below the threshold for long enough
            if silent_iterations * frame_duration >= stop_record_time:
                break
        sample_width = p.get_sample_size(resolution)
    finally:
        #Release the audio device even when recording is interrupted or fails
        if stream is not None:
            stream.stop_stream()
            stream.close()
        p.terminate()

    #Save the speech to a wav file; the context manager closes it on error too
    with wave.open(filename, 'wb') as wf:
        wf.setnchannels(CHANNELS)
        wf.setsampwidth(sample_width)
        wf.setframerate(sampling_rate)
        wf.writeframes(b''.join(frames))
    print("Audio data saved as {}".format(filename))

def start():
    """Wait for the start key via hit_to_talk(), then run record().

    Has no return statement, so it returns None.
    """
    hit_to_talk()
    record()

In [6]:
#Run the hit-to-talk recorder; start() has no return statement, so `a` is None
a=start()

Please talk
level-background 86.99626996851549
level-background 86.57531087416335
level-background 84.90117898300409
level-background 85.39811851976592
level-background 85.50452600450167
level-background 85.48728623424984
level-background 83.95574772738945
level-background 84.77301747412527
level-background 82.7916963263139
level-background 84.027967705345
level-background 83.43264025250824
level-background 80.42108077348315
level-background 73.3064739973488
level-background 64.18811636880639
level-background 54.83326283660253
level-background 47.86604660981043
level-background 49.95699583384781
level-background 63.37989114035571
level-background 73.44681017211244
level-background 74.03970953621476
level-background 74.54685860241095
level-background 77.44012052903992
level-background 77.94739310933863
level-background 76.4801922721118
level-background 77.39498637152667
level-background 74.25006757503962
level-background 70.9726454878202
level-background 61.23900408522678
level-backgrou