Enhance recording utility (#35)

* Refine recording utility

* Update README with recording utility
daemon committed Nov 10, 2017
1 parent aebac6a commit 359f4dbed1969527f5d300705dc3511029b24a81
Showing with 151 additions and 79 deletions.
  1. +8 −6 README.md
  2. +14 −11 server.py
  3. +2 −62 utils/manage_audio.py
  4. +127 −0 utils/record.py
@@ -115,18 +115,20 @@ There are command options available:
### Recording audio

You may do the following to record audio snippets sequentially and save them in the same format as the speech commands dataset:
```bash
-python manage_audio.py record
+python -m utils.record
```
-Input any key (return is fastest) to open the microphone. After one second of silence, recording automatically halts.
+Press return to record, the up arrow to undo the last recording, and "q" to finish. After one second of silence, recording automatically halts.

Several options are available:
```
--min-sound-lvl: Minimum sound level at which audio is not considered silent
--timeout-seconds: Duration of silence after which recording halts
--output-begin-index: Starting sequence number
--output-prefix: Prefix of the output audio sequence
--post-process: How the audio samples should be post-processed. One or more of "trim" and "discard_true".
```
Post-processing consists of trimming or discarding "useless" audio. Trimming clips each recording to its loudest window of *x* milliseconds, where *x* is given by `--cutoff-ms`. Discarding "useless" audio (`discard_true`) runs a pre-trained model over the samples and discards the correctly labeled ones, keeping only those that confuse the model. The pre-trained model and correct label are specified by `--config` and `--correct-label`, respectively.

For example, consider `python -m utils.record --post-process trim discard_true --correct-label no --config config.json`. In this case, the utility records a sequence of speech snippets, trims each one to one second, and finally discards those the model in `config.json` correctly labels "no", keeping only the mislabeled ones.

### Listening to sound level

@@ -116,18 +116,8 @@ def make_abspath(rel_path):
rel_path = os.path.join(os.path.dirname(os.path.realpath(__file__)), rel_path)
return rel_path

-def start(config):
-    cherrypy.config.update({
-        "environment": "production",
-        "log.screen": True
-    })
-    cherrypy.config.update(config["server"])
-    rest_config = {"/": {
-        "request.dispatch": cherrypy.dispatch.MethodDispatcher()
-    }}
+def load_service(config):
    model_path = make_abspath(config["model_path"])
-    train_script = make_abspath(config["train_script"])
-    speech_dataset_path = make_abspath(config["speech_dataset_path"])
    commands = ["__silence__", "__unknown__"]
    commands.extend(config["commands"].split(","))

@@ -138,7 +128,20 @@ def start(config):
lbl_service = TorchLabelService(model_path, labels=commands, no_cuda=config["model_options"]["no_cuda"])
else:
raise ValueError("Backend {} not supported!".format(backend))
+    return lbl_service

+def start(config):
+    cherrypy.config.update({
+        "environment": "production",
+        "log.screen": True
+    })
+    cherrypy.config.update(config["server"])
+    rest_config = {"/": {
+        "request.dispatch": cherrypy.dispatch.MethodDispatcher()
+    }}
+    train_script = make_abspath(config["train_script"])
+    speech_dataset_path = make_abspath(config["speech_dataset_path"])
+    lbl_service = load_service(config)
train_service = TrainingService(train_script, speech_dataset_path, config["model_options"])
cherrypy.tree.mount(ListenEndpoint(lbl_service), "/listen", rest_config)
cherrypy.tree.mount(DataEndpoint(train_service), "/data", rest_config)
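
Splitting `load_service` out of `start` lets other entry points build a label service without booting the CherryPy server; the new `utils/record.py` relies on exactly this for its `discard_true` post-processing. A minimal sketch of that reuse, assuming it runs from the repository root against a `config.json` with the keys this diff reads (`model_path`, `commands`, `model_options`, and a backend selector):

```python
import json

from server import load_service
from utils.manage_audio import AudioSnippetGenerator

# Build just the labeling service from the same config the server uses.
with open("config.json") as f:
    config = json.load(f)
lbl_service = load_service(config)

# Grab a single chunk of microphone audio and label it, no HTTP involved.
with AudioSnippetGenerator() as generator:
    snippet = next(iter(generator))
label, _ = lbl_service.label(snippet.byte_data)
print("Model heard: {}".format(label))
```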
@@ -209,42 +209,6 @@ def print_sound_level():
for audio in generator:
print("Sound level: {}".format(audio.amplitude_rms()), end="\r")

def record_speech_sequentially(file_name_prefix="output", min_sound_lvl=0.01, speech_timeout_secs=1., i=0):
"""Records audio in sequential audio files.
Args:
file_name_prefix: The prefix of the output filenames
min_sound_lvl: The minimum sound level as measured by root mean square
speech_timeout_secs: Timeout of audio after that duration of silence as measured by min_sound_lvl
i: The beginning index of sequence
"""
while True:
input("Input any key to record: ")
with AudioSnippetGenerator() as generator:
timeout_len = int(speech_timeout_secs * generator.sr / generator.chunk_size)
active_count = timeout_len
curr_snippet = None
for audio in generator:
if curr_snippet:
curr_snippet.append(audio)
else:
curr_snippet = audio
if audio.amplitude_rms() < min_sound_lvl:
active_count -= 1
else:
active_count = timeout_len
print("Time left: {:<10}".format(active_count), end="\r")
if active_count == 0:
output_name = "{}.{}.wav".format(file_name_prefix, i)
i += 1
with wave.open(output_name, "w") as f:
f.setnchannels(1)
f.setsampwidth(generator.audio.get_sample_size(generator.fmt))
f.setframerate(generator.sr)
f.writeframes(curr_snippet.byte_data)
print("Saved to {}".format(output_name))
break

def generate_dir(directory):
for filename in os.listdir(directory):
fullpath = os.path.join(os.path.abspath(directory), filename)
@@ -290,7 +254,7 @@ def clean_dir(directory=".", cutoff_ms=1000):

def main():
parser = argparse.ArgumentParser()
-    commands = dict(record=record_speech_sequentially, trim=clean_dir, listen=print_sound_level)
+    commands = dict(trim=clean_dir, listen=print_sound_level)
commands["generate-contrastive"] = generate_dir
parser.add_argument("subcommand")
def print_sub_commands():
@@ -299,31 +263,7 @@ def print_sub_commands():
print_sub_commands()
return
subcommand = sys.argv[1]
if subcommand == "record":
parser.add_argument(
"--output-prefix",
type=str,
default="output",
help="Prefix of the output audio sequence")
parser.add_argument(
"--min-sound-lvl",
type=float,
default=0.01,
help="Minimum sound level at which audio is not considered silent")
parser.add_argument(
"--timeout-seconds",
type=float,
default=1.,
help="Duration of silence after which recording halts")
parser.add_argument(
"--output-begin-index",
type=int,
default=0,
help="Starting sequence number")
flags, _ = parser.parse_known_args()
record_speech_sequentially(file_name_prefix=flags.output_prefix, i=flags.output_begin_index,
min_sound_lvl=flags.min_sound_lvl, speech_timeout_secs=flags.timeout_seconds)
elif subcommand == "generate-contrastive":
if subcommand == "generate-contrastive":
parser.add_argument(
"directory",
type=str,
@@ -0,0 +1,127 @@
import argparse
import enum
import json
import wave

from .manage_audio import AudioSnippetGenerator
from server import load_service
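# NOTE: "from server import load_service" resolves when the script is run from
# the repository root, which the documented "python -m utils.record" invocation guarantees.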

class KeyInput(enum.Enum):
    QUIT = b"q"
    REDO = b"\x1b[A"  # raw ANSI escape the up-arrow key sends (assumes readline doesn't swallow it)

def record_speech_sequentially(min_sound_lvl=0.01, speech_timeout_secs=1.):
    """Records audio snippets sequentially until the user quits.
    Args:
        min_sound_lvl: The minimum root-mean-square amplitude below which audio is considered silent
        speech_timeout_secs: Recording of a snippet halts after this many seconds of silence
    Returns:
        The list of recorded audio snippets.
    """
samples = []
i = 0
while True:
cmd = input("> ").encode()
if cmd == KeyInput.QUIT.value:
return samples
        elif cmd == KeyInput.REDO.value:
            # step the index back before reporting it, then drop the last recording
            i = max(i - 1, 0)
            print("Index now at {}.".format(i))
try:
samples.pop()
except IndexError:
pass
continue
with AudioSnippetGenerator() as generator:
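            # convert the silence timeout from seconds into a count of audio chunks;
            # e.g. with sr=16000 and chunk_size=1024 (assumed generator defaults), 1 s is 15 chunks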
timeout_len = int(speech_timeout_secs * generator.sr / generator.chunk_size)
active_count = timeout_len
curr_snippet = None
for audio in generator:
if curr_snippet:
curr_snippet.append(audio)
else:
curr_snippet = audio
if audio.amplitude_rms() < min_sound_lvl:
active_count -= 1
else:
active_count = timeout_len
print("Time left: {:<10}".format(active_count), end="\r")
if active_count == 0:
i += 1
samples.append(curr_snippet)
print("Recorded #{:<10}".format(i))
break

def trim_sequence(samples, cutoff_ms):
    # at 16 kHz, cutoff_ms of audio spans n_samples frames; 16-bit frames are two
    # bytes each, hence the factor of two when trimming the underlying byte buffer
    n_samples = int((cutoff_ms / 1000) * 16000)
    for sample in samples:
        sample.trim_window(n_samples * 2)
    return samples

def do_record_sequence():
parser = argparse.ArgumentParser()
parser.add_argument(
"--min-sound-lvl", type=float, default=0.01,
help="Minimum sound level at which audio is not considered silent")
parser.add_argument(
"--timeout-seconds", type=float, default=1.,
help="Duration of silence after which recording halts")
flags, _ = parser.parse_known_args()
return record_speech_sequentially(min_sound_lvl=flags.min_sound_lvl, speech_timeout_secs=flags.timeout_seconds)

def do_trim(audio_samples):
parser = argparse.ArgumentParser()
parser.add_argument("--cutoff-ms", type=int, default=1000)
flags, _ = parser.parse_known_args()
trim_sequence(audio_samples, flags.cutoff_ms)

def do_discard_true(audio_samples):
parser = argparse.ArgumentParser()
parser.add_argument("--config", type=str, default="config.json")
parser.add_argument("--correct-label", type=str)
flags, _ = parser.parse_known_args()
    with open(flags.config) as f:
        config = json.load(f)
lbl_service = load_service(config)
false_samples = []
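    # keep only the snippets the model mislabels; correctly recognized ones are
    # discarded, since the confusing samples are the ones worth saving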
for i, snippet in enumerate(audio_samples):
label, _ = lbl_service.label(snippet.byte_data)
if label != flags.correct_label:
action = "Keep"
false_samples.append(snippet)
else:
action = "Discard"
print("#{:<5} Action: {:<8} Label: {}".format(i, action, label))
return false_samples

def main():
parser = argparse.ArgumentParser()
record_choices = ["sequence"]
process_choices = ["discard_true", "trim"]
parser.add_argument("--mode", type=str, default="sequence", choices=record_choices)
parser.add_argument("--output-begin-index", type=int, default=0)
parser.add_argument("--output-prefix", type=str, default="output")
parser.add_argument("--post-process", nargs="+", type=str, choices=process_choices, default=[])
args, _ = parser.parse_known_args()

if args.mode == "sequence":
audio_samples = do_record_sequence()
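    # post-processing steps run in the order given on the command line, so
    # "--post-process trim discard_true" trims first, then filters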
for choice in args.post_process:
if choice == "discard_true":
audio_samples = do_discard_true(audio_samples)
elif choice == "trim":
do_trim(audio_samples)

    # save as 16-bit mono PCM at 16 kHz, the speech commands dataset format,
    # numbering files from --output-begin-index so earlier takes aren't clobbered
    for i, snippet in enumerate(audio_samples, start=args.output_begin_index):
        fullpath = "{}.{}.wav".format(args.output_prefix, i)
        with wave.open(fullpath, "w") as f:
            f.setnchannels(1)
            f.setsampwidth(2)  # two bytes per sample
            f.setframerate(16000)
            f.writeframes(snippet.byte_data)
print("Saved {}.".format(fullpath))

if __name__ == "__main__":
main()
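
`AudioSnippet.trim_window` is defined in `utils/manage_audio.py` and does not appear in this diff. For readers of this commit, here is a minimal sketch of the loudest-window trim it presumably performs, assuming `byte_data` holds 16-bit little-endian mono PCM; the function name and details below are illustrative, not the actual implementation:

```python
import array

def trim_window_sketch(byte_data, window_bytes):
    """Return the contiguous window of window_bytes with the highest energy."""
    samples = array.array("h", byte_data)  # signed 16-bit samples
    window = window_bytes // 2  # byte count -> sample count
    if len(samples) <= window:
        return byte_data
    # Slide the window one sample at a time, updating the energy incrementally.
    energy = sum(s * s for s in samples[:window])
    best_energy, best_start = energy, 0
    for start in range(1, len(samples) - window + 1):
        energy += samples[start + window - 1] ** 2 - samples[start - 1] ** 2
        if energy > best_energy:
            best_energy, best_start = energy, start
    return samples[best_start:best_start + window].tobytes()
```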
