From 04833bfe58ed26c1a534d6c3a093b5cc9542eeac Mon Sep 17 00:00:00 2001
From: Jaejun Lee
Date: Sun, 12 Jan 2020 12:01:02 -0600
Subject: [PATCH 1/4] cleaning up and adding working version of keyword generator

---
 .gitmodules                                   |   3 -
 keyword_spotting_data_generator/README.md     |  99 +---
 keyword_spotting_data_generator/drop_audio.py |  42 --
 .../evaluation/evaluate.py                    | 114 -----
 .../evaluation/evaluation_audio_generator.py  |  92 ----
 .../evaluation/evaluation_data_generator.py   | 238 ---------
 .../evaluation/extractor/__init__.py          |   1 -
 .../evaluation/extractor/base_extractor.py    |   9 -
 .../extractor/edit_distance_extractor.py      |  65 ---
 .../evaluation/kws-gen-data                   |   1 -
 .../evaluation/url_fetcher/__init__.py        |   2 -
 .../evaluation/url_fetcher/url_file_reader.py |  19 -
 .../evaluation/url_file_generator.py          |  96 ----
 .../evaluation/utils/__init__.py              |   4 -
 .../evaluation/utils/color_print.py           |  35 --
 .../evaluation/utils/csv_writer.py            |  19 -
 .../evaluation/utils/util.py                  |  42 --
 .../evaluation/utils/youtube_crawler.py       |  31 --
 .../extractor/__init__.py                     |   1 +
 .../extractor/base_extractor.py               |  10 +
 .../extractor/sphinx_stt_extractor.py         |  40 ++
 .../keyword_data_generator.py                 | 465 ++++++++++--------
 .../requirements.txt                          |   6 +-
 keyword_spotting_data_generator/sample.tar.gz | Bin 586420 -> 0 bytes
 keyword_spotting_data_generator/search.py     |  47 --
 .../url_fetcher => }/url_fetcher.py           |   0
 .../utils/__init__.py                         |   8 +
 .../utils/color_print.py                      |  39 ++
 .../utils/file_utils.py                       |  12 +
 .../youtube_processor.py                      | 106 ++++
 .../url_fetcher => }/youtube_searcher.py      |   3 +-
 31 files changed, 481 insertions(+), 1168 deletions(-)
 delete mode 100644 .gitmodules
 delete mode 100644 keyword_spotting_data_generator/drop_audio.py
 delete mode 100644 keyword_spotting_data_generator/evaluation/evaluate.py
 delete mode 100644 keyword_spotting_data_generator/evaluation/evaluation_audio_generator.py
 delete mode 100644 keyword_spotting_data_generator/evaluation/evaluation_data_generator.py
 delete mode 100644 keyword_spotting_data_generator/evaluation/extractor/__init__.py
 delete mode 100644 keyword_spotting_data_generator/evaluation/extractor/base_extractor.py
 delete mode 100644 keyword_spotting_data_generator/evaluation/extractor/edit_distance_extractor.py
 delete mode 160000 keyword_spotting_data_generator/evaluation/kws-gen-data
 delete mode 100644 keyword_spotting_data_generator/evaluation/url_fetcher/__init__.py
 delete mode 100644 keyword_spotting_data_generator/evaluation/url_fetcher/url_file_reader.py
 delete mode 100644 keyword_spotting_data_generator/evaluation/url_file_generator.py
 delete mode 100644 keyword_spotting_data_generator/evaluation/utils/__init__.py
 delete mode 100644 keyword_spotting_data_generator/evaluation/utils/color_print.py
 delete mode 100644 keyword_spotting_data_generator/evaluation/utils/csv_writer.py
 delete mode 100644 keyword_spotting_data_generator/evaluation/utils/util.py
 delete mode 100644 keyword_spotting_data_generator/evaluation/utils/youtube_crawler.py
 create mode 100644 keyword_spotting_data_generator/extractor/__init__.py
 create mode 100644 keyword_spotting_data_generator/extractor/base_extractor.py
 create mode 100644 keyword_spotting_data_generator/extractor/sphinx_stt_extractor.py
 delete mode 100644 keyword_spotting_data_generator/sample.tar.gz
 delete mode 100644 keyword_spotting_data_generator/search.py
 rename keyword_spotting_data_generator/{evaluation/url_fetcher => }/url_fetcher.py (100%)
 create mode 100644 keyword_spotting_data_generator/utils/__init__.py
 create mode 100644 keyword_spotting_data_generator/utils/color_print.py
 create mode 100644 keyword_spotting_data_generator/utils/file_utils.py
 create mode 100644 keyword_spotting_data_generator/youtube_processor.py
 rename keyword_spotting_data_generator/{evaluation/url_fetcher => }/youtube_searcher.py (97%)

diff --git a/.gitmodules b/.gitmodules
deleted file mode 100644
index ed4e943..0000000
--- a/.gitmodules
+++ /dev/null
@@ -1,3 +0,0 @@
-[submodule "keyword_spotting_data_generator/evaluation/kws-gen-data"]
-	path = keyword_spotting_data_generator/evaluation/kws-gen-data
-	url = https://github.com/castorini/kws-gen-data
diff --git a/keyword_spotting_data_generator/README.md b/keyword_spotting_data_generator/README.md
index 08e927d..1a7c426 100644
--- a/keyword_spotting_data_generator/README.md
+++ b/keyword_spotting_data_generator/README.md
@@ -1,101 +1,30 @@
 # Keyword Spotting Data Generator
 ---
-In order to add flexibility of keyword spotting, we are working on dataset generator using youtube videos. Key idea is to decrease the search space by utilizing subtitles.
-
-This is still in development but it's possible to generate some dataset.
-Note that current version has precision of ~ 0.5.
+In order to add flexibility to keyword spotting, we are working on a dataset generator that uses YouTube videos.
+The key idea is to decrease the search space by utilizing subtitles and to extract the target audio using [PocketSphinx](https://github.com/cmusphinx/pocketsphinx).
 
 ## < Preparation >
-___
-1. Current version is implemented with technique called [forced alignment](https://github.com/pettarin/forced-alignment-tools#definition-of-forced-alignment). Install [Aeneas](https://github.com/readbeyond/aeneas#system-requirements-supported-platforms-and-installation). If it complains about `/usr/bin/ld: cannot find -lespeak`, this [page](https://github.com/readbeyond/aeneas/issues/189) may help.
-2. Instal necessary packages by running `pip install -r requirements.txt`
-3. [Obtain a Google API key](https://support.google.com/googleapi/answer/6158862?hl=en), and set `API_KEY = google_api_key` in `search.py`
+- Necessary Python packages can be installed with `pip install -r requirements.txt`
+- [ffmpeg](https://www.ffmpeg.org/) and [SoX](http://sox.sourceforge.net/) must be available as well.
+- YouTube Data API - follow [these instructions](https://developers.google.com/youtube/v3/getting-started) to obtain a new API key
 
 ## < Usage >
-___
-### Generating Dataset
-
 ```
-python keyword_data_generator.py -k < keywords to search > -s < size of keyword >
+python keyword_data_generator.py
+    -a < youtube data v3 API key >
+    -k < list of keywords to search >
+    -s < number of samples to collect per keyword (default: 10) >
+    -o < output path (default: "./generated_keyword_audios") >
 ```
-### Filtering Correct Audios
-by running `drop_audio.py` script, user can manually drop false positive audios. This script plays the audio in the folder and asks whether the audio file contains target keyword.
 
+example:
 ```
-python3 drop_audio.py < folder_name >
+python keyword_data_generator.py -a $YOUTUBE_API_KEY -k google slack -s 20 -o ./generated
 ```
 
 ## < Improvements >
 ___
 - filtering non-english videos
-- ffmpeg handling more dynamic vidoe types : mov,mp4,m4a,3gp,3g2,mj2
-- if video contains any of target words, generate a block
+- adjust the ffmpeg command to handle different types of video: mov, mp4, m4a, 3gp, 3g2, mj2
 - dynamic handling of long videos (currently simple filter)
-- increase the number of youtube videos retrieved from search (ex. searching similar words)
-- increase rate of finding target term by stemming words
-
-## Evaluation of Improvements
-In order to quantify the improvements, we are working on evaluation framework which measures the quality of selected audio. We are hoping that this helps us to develop robust keyword spotting data generator.
-
-Evaluation process involves following steps:
-
-1. `python url_file_generator.py` : collect urls which contains target keyword in the audio and store it in a single .txt file (url file)
-2. `evaluation_data_generator.py` : for each audio block containing target keyword, record how many times the target keyword actually appear; csv file is generated summarizing details of each audio block (summary file)
-3. `evaluation_audio_generator.py` : generate audio dataset from summary file
-4. `evaluate.py` : measure the quality of the specified similar audio extraction algorithm on given summary file
-
-##### Setting up Experiment
-After cloning this repo, run following command to clone submodule [kws-gen-data](https://github.com/castorini/kws-gen-data)
-`git submodule update --init --recursive`
-
-##### `url_file_generator.py`
-Collect urls of videos which subtitle contains target keywords
-
-```
-python url_file_generator.py
-    -a < youtube data v3 API key >
-    -k < keywords to search >
-    -s < number of urls >
-```
-
-##### `evaluation_data_generator.py`
-For each audio block with keyword, allow users to record how many times the target keyword actually appear. This is the ground truth for measuring quality.
-A csv file generated is called a summary file where each column represents `url`, `start_ms`, `end_ms`, `cc_count`, `audio_count`
-- url - unique id of youtube video
-- start_ms - start time of the given subtitle section
-- end_ms - end time of the given subtitle section
-- cc_count - how many keyword appeared in subtitle
-- audio_count - how many time keyword appeared in the audio (user input)
-
-```
-python evaluation_data_generator.py
-    -a < youtube data v3 API key >
-    -k < keywords to search >
-    -s < number of urls >
-    -f < url file name (when unspecified, directly search youtube) >
-    -c < url in url file to start from >
-    -l < length of maximum length for a video (s) >
-    -o < output csv file to append output to >
-```
-
-##### `evaluation_data_generator.py`
-Generate set of `.wav` files from the provided summary file
-
-```
-python evaluation_audio_generator.py
-    -a < youtube data v3 API key >
-    -k < keywords to search >
-    -f < summary file >
-```
-
-##### `evaluate.py`
-Measure the quality of the specified similar audio retrieval process on given summary file
-
-```
-python evaluation_audio_generator.py
-    -k < keywords to search >
-    -f < summary file >
-    -r < type of extraction algorithm to use >
-    -th < threshold for retrieving a window >
-```
+- improve throughput by parallelizing the process
diff --git a/keyword_spotting_data_generator/drop_audio.py b/keyword_spotting_data_generator/drop_audio.py
deleted file mode 100644
index 2c83537..0000000
--- a/keyword_spotting_data_generator/drop_audio.py
+++ /dev/null
@@ -1,42 +0,0 @@
-import os
-import subprocess
-import sys
-from tqdm import tqdm
-
-if len(sys.argv) < 2:
-    print("usage: python3 drop_audio.py ")
-    sys.exit()
-
-DIR_NAME = sys.argv[1]
-
-def play_audio(file):
-    subprocess.check_output(["ffplay", "-nodisp", "-autoexit", file])
-
-FILE_LIST = os.listdir(DIR_NAME)
-TOTAL_COUNT = len(FILE_LIST)
-
-DELETE_COUNT = 0
-for file_name in tqdm(FILE_LIST):
-    keep = ''
-    path = os.path.join(DIR_NAME, file_name)
-    print(path)
-    if not file_name.endswith("wav"):
-        os.remove(path)
-        continue
-
-    while keep != "s" and keep != "d":
-        play_audio(path)
-        keep = input("\n\n> keep? (yes = s / no = d)\n")
-
-    if keep == "d":
-        print("deleting audio ...")
-        DELETE_COUNT += 1
-        os.remove(path)
-
-REMAINING_COUNT = TOTAL_COUNT - DELETE_COUNT
-
-print("deleted : " + str(DELETE_COUNT))
-print("false positive : " + str(round(100*DELETE_COUNT/TOTAL_COUNT)) + " %")
-print("\nremaining : " + str(REMAINING_COUNT))
-print("true positive : " + str(round(100*REMAINING_COUNT/TOTAL_COUNT)) + " %")
-print("\ntotal : " + str(TOTAL_COUNT))
diff --git a/keyword_spotting_data_generator/evaluation/evaluate.py b/keyword_spotting_data_generator/evaluation/evaluate.py
deleted file mode 100644
index 655e277..0000000
--- a/keyword_spotting_data_generator/evaluation/evaluate.py
+++ /dev/null
@@ -1,114 +0,0 @@
-"""
-For each audio entry in the evaluation data,
-find where the target target keyword would likely occur by computing measure of similarity
-"""
-import argparse
-import csv
-import os
-import librosa
-
-from utils import color_print as cp
-
-from extractor import EditDistanceExtractor
-
-SAMPLE_RATE = 16000
-
-def main():
-    parser = argparse.ArgumentParser()
-
-    parser.add_argument(
-        "-k",
-        "--keyword",
-        type=str,
-        required=True,
-        help="keyword for the given evaluation data list")
-
-    parser.add_argument(
-        "-f",
-        "--summary_file",
-        type=str,
-        help="file containing list of evaluation data to be generated")
-
-    parser.add_argument(
-        "-e",
-        "--extractor",
-        type=str,
-        default="edit_distance_extractor",
-        help="type of extraction algorithm to use")
-
-    parser.add_argument(
-        "-th",
-        "--threshold",
-        type=float,
-        default=0.95,
-        help="threshold for retrieving a window")
-
-    args = parser.parse_args()
-    data_folder_path = "./kws-gen-data"
-    if not os.path.exists(data_folder_path):
-        cp.print_error("please clone kws-gen-data folder using git submodule")
-        exit()
-
-    keyword = args.keyword.lower()
-    audio_dir = os.path.join(data_folder_path, "audio_data/"+keyword)
-
-    if not os.path.exists(audio_dir):
-        cp.print_error("audio data is missing - ", audio_dir)
-        exit()
-
-    total = sum([1 for i in open(args.summary_file, "r").readlines() if i.strip()])
-
-    cp.print_progress("evaluation data file - ", args.summary_file)
-
-    # load pre recorded target audios
-    target_audios = []
-    target_audio_dir = os.path.join(data_folder_path, "target_audio/"+keyword)
-
-    if not os.path.exists(target_audio_dir):
-        cp.print_error("target audio data is missing - ", target_audio_dir)
-        exit()
-
-    for file_name in os.listdir(target_audio_dir):
-        target_audios.append(librosa.core.load(os.path.join(target_audio_dir, file_name))[0])
-
-    # instantiate extractor
-    extractor = None
-    if args.extractor == "edit_distance_extractor":
-        cp.print_progress("extractor type :", args.extractor, "( threshold :", args.threshold, ", number of target audios : ", len(target_audios), ")")
-        extractor = EditDistanceExtractor(target_audios, args.threshold)
-
-    # extract similar audio from each audio blocks
-    with open(args.summary_file, "r") as file:
-        reader = csv.reader(file, delimiter=",")
-
-        for i, line in enumerate(reader):
-            vid = line[0]
-            start_time = line[1]
-            end_time = line[2]
-            wav_file = os.path.join(audio_dir, vid + "~" + start_time + "~" + end_time + ".wav")
-
-            start_time = int(start_time)
-            end_time = int(end_time)
-
-            cp.print_progress(i + 1, " / ", total, " - ", wav_file)
-
-            if not os.path.exists(wav_file):
-                cp.print_warning("audio file is missing - ", wav_file)
-                continue
-
-            data = librosa.core.load(wav_file, SAMPLE_RATE)[0]
-
-            extracted_audio_times = extractor.extract_keywords(data)
-
-            # TODO :: count how many window has been extracted and compare it against true count
-
-            # TODO :: might be good idea to update threshold if the accuracy is way too low
-
-    cp.print_progress("evaluation is completed for ", keyword, " - ", total)
-
-    # TODO :: calculate accuracy and report metrics
-
-    # TODO :: if we update threshold, report threshold as well
-
-if __name__ == "__main__":
-    main()
diff --git a/keyword_spotting_data_generator/evaluation/evaluation_audio_generator.py b/keyword_spotting_data_generator/evaluation/evaluation_audio_generator.py
deleted file mode 100644
index 1eb8db5..0000000
--- a/keyword_spotting_data_generator/evaluation/evaluation_audio_generator.py
+++ /dev/null
@@ -1,92 +0,0 @@
-"""
-For each audio entry in the evaluation data, generate .wav file
-store data under data/, with following naming convention: