Skip to content
Permalink
Browse files

generate audio data set from the evaluation data list (#68)

* keyword-generator

* cleaning up code

* evaluation data generator base

* script for generating evaluation data

* removing api key from codebase

* locating evaluation files together

* handling input failure

* updated readme and removed api keys

* url files for initial evaluations

* taking in api_key from command arguments

* fixing url_file_generator filter condition

* fixing url_file_generator filter condition

* turning subtitles to lowercase

* url files updated

* using inflect for plural

* utilizing library for plural

* support continuation of evaluation data generation

* dropping redundant videos

* volume evaluation data file

* utilizing set for catching duplicate & bug fix for url_file_reader

* evaluation audio data generator

* handling merge issue
  • Loading branch information
ljj7975 committed Feb 8, 2019
1 parent 21a7068 commit 2aa9b3cebc570bb0e9b444c1e3935255e771188e
@@ -105,7 +105,7 @@ venv.bak/
.mypy_cache/

# keyword generator
keyword_spotting_data_generator/data
keyword_spotting_data_generator/evaluation/audio_data

.DS_STORE
training_data/
@@ -0,0 +1,96 @@
"""
For each audio entry in the evaluation data file, generate a .wav file.
Files are stored under <output folder>/<keyword> with the following naming convention:
<video id>~<start time (ms)>~<end time (ms)>.wav
"""
import argparse
import csv
import os
import librosa

import color_print as cp

from youtube_crawler import YoutubeCrawler

# Sampling rate passed to librosa.output.write_wav when saving each clip.
SAMPLE_RATE = 16000

def main():
    """Generate one .wav clip per row of the evaluation data file.

    Command-line arguments:
        -k / --keyword: keyword the evaluation list belongs to; it is
            lower-cased and used as the output sub-folder name.
        -f / --evaluation_data_file: CSV file whose rows start with
            ``<video id>,<start time>,<end time>``.
        -a / --api_key: required by the parser but not read in this function.
        -o / --output_folder: root folder for the clips (default
            ``audio_data``).

    Clips are written to ``<output_folder>/<keyword>/`` as
    ``<video id>~<start time>~<end time>.wav``. Rows whose output file
    already exists are skipped, so an interrupted run can be resumed.
    Blank rows in the evaluation data file are ignored.
    """
    parser = argparse.ArgumentParser()

    parser.add_argument(
        "-k",
        "--keyword",
        type=str,
        required=True,
        help="keyword for the given evaluation data list")

    # The file is opened unconditionally below, so a missing value must be
    # rejected by argparse instead of surfacing later as open(None).
    parser.add_argument(
        "-f",
        "--evaluation_data_file",
        type=str,
        required=True,
        help="file containing list of evaluation data to be generated")

    parser.add_argument(
        "-a",
        "--api_key",
        type=str,
        required=True,
        help="API key for youtube data v3 API")

    parser.add_argument(
        "-o",
        "--output_folder",
        type=str,
        default="audio_data",
        help="folder to store the audio data")

    args = parser.parse_args()
    keyword = args.keyword.lower()

    directory = os.path.join(args.output_folder, keyword)

    cp.print_progress("location of audio data - ", directory)

    # exist_ok avoids the check-then-create race of os.path.exists().
    os.makedirs(directory, exist_ok=True)

    # Count the non-blank rows up front for the progress display; the
    # context manager makes sure this extra handle is closed.
    with open(args.evaluation_data_file, "r") as file:
        total = sum(1 for row in file if row.strip())

    cp.print_progress("evaluation data file - ", args.evaluation_data_file)

    with open(args.evaluation_data_file, "r") as file:
        reader = csv.reader(file, delimiter=",")

        prev_vid = None
        audio_data = None
        processed = 0
        for line in reader:
            # Blank rows are excluded from `total`, so skip them here too
            # instead of failing on line[0].
            if not any(field.strip() for field in line):
                continue
            processed += 1

            curr_vid = line[0]
            start_time = line[1]
            end_time = line[2]
            # The file name uses the raw strings exactly as they appear in
            # the evaluation data file.
            wav_file = os.path.join(
                directory,
                curr_vid + "~" + start_time + "~" + end_time + ".wav")

            start_time = int(start_time)
            end_time = int(end_time)

            cp.print_progress(processed, " / ", total, " - ", wav_file)

            if os.path.exists(wav_file):
                cp.print_warning(wav_file, "already exist")
                continue

            # Download only when the video changes; consecutive rows of the
            # same video reuse the audio already held in memory.
            if prev_vid != curr_vid:
                try:
                    crawler = YoutubeCrawler(curr_vid)
                    audio_data = crawler.get_audio()
                except Exception as exception:
                    cp.print_error("failed to download audio file for video ", curr_vid)
                    cp.print_warning(exception)
                    # prev_vid is left untouched, so the next row of this
                    # video triggers another download attempt.
                    continue

            # NOTE(review): the naming convention describes the times as
            # milliseconds, yet they are used directly as indices into
            # audio_data - confirm what get_audio() returns.
            # NOTE(review): librosa.output was removed in librosa 0.8 -
            # confirm the librosa version this script is pinned to.
            librosa.output.write_wav(wav_file, audio_data[start_time:end_time], SAMPLE_RATE)

            prev_vid = curr_vid

    cp.print_progress("audio file generation is completed for ", keyword, " - ", total)

# Run the generator only when this file is executed as a script.
if __name__ == "__main__":
    main()
@@ -123,7 +123,8 @@ def main():
try:
video = PyTube(utils.get_youtube_url(url))
except Exception as exception:
cp.print_error("failed to generate PyTube representation for vidoe ", url)
cp.print_error("failed to generate PyTube representation for video ", url)
cp.print_error(exception)
continue
if int(video.length) > args.video_length:
continue
@@ -136,7 +137,8 @@ def main():
try:
srt_captions = caption.generate_srt_captions().split('\n\n')
except Exception as exception:
cp.print_error("failed to retrieve for vidoe - ", url)
cp.print_error("failed to retrieve srt for video - ", url)
cp.print_error(exception)
continue

translator = str.maketrans('', '', string.punctuation) # to remove punctuation
@@ -156,6 +158,7 @@ def main():
crawler = YoutubeCrawler(url)
audio_data = crawler.get_audio()
except Exception as exception:
cp.print_error("failed to download audio file for video ", url)
cp.print_warning(exception)
continue

0 comments on commit 2aa9b3c

Please sign in to comment.
You can’t perform that action at this time.