In [None]:

import matplotlib.pyplot as plt
import IPython.display as ipd


import os
import csv
import math
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import DataLoader

import commons
import utils
from data_utils import TextAudioLoader, TextAudioCollate, TextAudioSpeakerLoader, TextAudioSpeakerCollate
from models import SynthesizerTrn
from text.symbols import symbols
from text import text_to_sequence

from scipy.io.wavfile import write

import random

from phonemizer.backend.espeak.wrapper import EspeakWrapper
# Location of the espeak shared library handed to phonemizer's EspeakWrapper.
# The PHONEMIZER_ESPEAK_LIBRARY environment variable takes precedence so the
# notebook can run on machines where espeak is installed elsewhere; when it is
# not set, the original Homebrew path is used unchanged.
_ESPEAK_LIBRARY = os.environ.get(
    "PHONEMIZER_ESPEAK_LIBRARY",
    '/opt/homebrew/Cellar/espeak/1.48.04_1/lib/libespeak.1.1.48.dylib',
)
EspeakWrapper.set_library(_ESPEAK_LIBRARY)

In [None]:
def get_text(text, hps):
    """Encode `text` as a `torch.LongTensor` of symbol ids.

    The text is passed to `text_to_sequence` together with the cleaners listed
    in `hps.data.text_cleaners`. When `hps.data.add_blank` is truthy, the
    resulting sequence is run through `commons.intersperse` with 0 as the
    filler value before being wrapped in a tensor.
    """
    sequence = text_to_sequence(text, hps.data.text_cleaners)
    sequence = commons.intersperse(sequence, 0) if hps.data.add_blank else sequence
    return torch.LongTensor(sequence)

In [None]:
# Hyperparameters loaded from ./configs/ljs_base.json; used below to size and build net_g.
hps = utils.get_hparams_from_file("./configs/ljs_base.json")

In [None]:
# Build the synthesizer described by the ljs_base hyperparameters (`hps`).
net_g = SynthesizerTrn(
    len(symbols),  # number of entries in the text symbol table
    hps.data.filter_length // 2 + 1,  # presumably the spectrogram bin count for this filter length — TODO confirm
    hps.train.segment_size // hps.data.hop_length,  # segment size divided by hop length
    **hps.model)
# Switch the model to eval mode; the return value is discarded into `_`.
_ = net_g.eval()

# Load the pretrained checkpoint into net_g. The third argument is None —
# presumably the optimizer slot, which is not needed here; confirm against utils.load_checkpoint.
_ = utils.load_checkpoint("pretrained_ljs.pth", net_g, None)

In [None]:
# Hyperparameters for the multi-speaker model (net_g_ms), loaded from ./configs/vctk_base.json.
hps_ms = utils.get_hparams_from_file("./configs/vctk_base.json")

In [None]:
# Build the multi-speaker synthesizer. Every size argument must come from
# `hps_ms` (this model's own config), not from `hps`, so the segment length is
# consistent with the hop length the model was configured with.
net_g_ms = SynthesizerTrn(
    len(symbols),
    hps_ms.data.filter_length // 2 + 1,
    hps_ms.train.segment_size // hps_ms.data.hop_length,
    n_speakers=hps_ms.data.n_speakers,
    **hps_ms.model)
# Put *this* model (net_g_ms, the one used for generation below) into eval mode.
# Assuming SynthesizerTrn is a torch nn.Module, this disables training-only
# behaviour such as dropout during inference.
_ = net_g_ms.eval()

# Load the pretrained multi-speaker checkpoint into net_g_ms; None is passed as
# the third argument (presumably the optimizer slot — confirm against utils.load_checkpoint).
_ = utils.load_checkpoint("pretrained_vctk.pth", net_g_ms, None)

In [None]:
def generate_speech(text, speaker_id, output_path,
                    noise_scale=.667, noise_scale_w=0.8, length_scale=1):
    """Synthesize `text` with the multi-speaker model and write it to a WAV file.

    Args:
        text: Text to speak. It is encoded with `get_text` using `hps_ms`.
        speaker_id: Integer speaker index, passed to the model as `sid`.
        output_path: Destination path of the WAV file.
        noise_scale: Forwarded to `net_g_ms.infer`. Defaults to the value this
            notebook previously hard-coded.
        noise_scale_w: Forwarded to `net_g_ms.infer`. Same default as before.
        length_scale: Forwarded to `net_g_ms.infer`. Same default as before.

    Returns:
        None. The audio is written to `output_path` at `hps_ms.data.sampling_rate`.
    """
    text_input = get_text(text, hps_ms)
    with torch.no_grad():  # inference only; no gradients needed
        x_tst = text_input.unsqueeze(0)  # add a leading batch dimension
        x_tst_lengths = torch.LongTensor([text_input.size(0)])
        sid = torch.LongTensor([speaker_id])
        # infer(...)[0][0, 0] selects the waveform of the single batch item
        # (presumably output is (batch, channel, time) — TODO confirm).
        audio = net_g_ms.infer(
            x_tst,
            x_tst_lengths,
            sid=sid,
            noise_scale=noise_scale,
            noise_scale_w=noise_scale_w,
            length_scale=length_scale,
        )[0][0, 0].data.float().numpy()
        write(output_path, hps_ms.data.sampling_rate, audio)

In [None]:
# One output folder per dataset category. exist_ok=True keeps this cell safe
# to re-run when the folders are already there.
for category_dir in (
    "dataset/positive",
    "dataset/negative_wrong_speaker",
    "dataset/negative_wrong_text",
    "dataset/negative_random",
):
    os.makedirs(category_dir, exist_ok=True)

In [None]:
# Positive class: the target speaker (id 0) saying the wake word, 100 clips.
for i in range(100):
    generate_speech(
        text="Alexa,",
        speaker_id=0,
        output_path=f"dataset/positive/wake_{i}.wav",
    )
    # "\r" keeps the progress message on a single, continuously updated line.
    print(f"Generated positive wake word {i}", end="\r")

In [None]:
# Other speakers: speaker count taken from the multi-speaker model. The
# negative-sample loops draw ids from range(1, num_speakers), i.e. every
# speaker except id 0 (the target speaker).
num_speakers = net_g_ms.n_speakers

In [None]:
# Negatives (wrong speaker): the wake word spoken by a randomly chosen
# non-target speaker (any id from 1 to num_speakers - 1), 300 clips.
for i in range(300):
    speaker = random.choice(range(1, num_speakers))
    generate_speech(
        text="Alexa,",
        speaker_id=speaker,
        output_path=f"dataset/negative_wrong_speaker/wake_{i}.wav",
    )
    print(f"Generated negative wake word {i}", end="\r")

In [None]:
# Phrases that are NOT the wake word. The negative-sample loops pick from this
# list at random to synthesize "wrong text" clips.
non_wake_words = [
    "Hello,",
    "Goodbye,",
    "How are you?",
    "What is your name?",
    "Tell me a joke.",
    "Play some music.",
    "Set a timer for 10 minutes.",
    "Turn on the lights.",
    "What is the weather like?",
    "Remind me to call mom.",
    "Add milk to the shopping list.",
    "Play my favorite song.",
    "Set an alarm for 7 AM.",
    "Tell me the news.",
    "Find a recipe for pasta.",
    "Translate 'hello' to Spanish.",
    "What is the capital of France?",
]    

In [None]:
# Negatives (wrong text): the target speaker (id 0) saying a random phrase
# that is not the wake word, 300 clips.
for i in range(300):
    text = random.choice(non_wake_words)
    generate_speech(
        text=text,
        speaker_id=0,
        output_path=f"dataset/negative_wrong_text/wake_{i}.wav",
    )
    print(f"Generated negative wake word {i}", end="\r")
    

In [None]:
# Negatives (wrong text AND wrong speaker): a random non-wake phrase spoken by
# a random non-target speaker, 500 clips.
for i in range(500):
    text = random.choice(non_wake_words)
    speaker = random.choice(range(1, num_speakers))
    generate_speech(
        text=text,
        speaker_id=speaker,
        output_path=f"dataset/negative_random/wake_{i}.wav",
    )
    print(f"Generated negative wake word {i}", end="\r")

In [None]:
# Dataset folder name -> binary label. Only the "positive" folder is labelled 1;
# all three negative folders share label 0.
label_map = {
    "positive": 1,
    **dict.fromkeys(
        ("negative_wrong_speaker", "negative_wrong_text", "negative_random"),
        0,
    ),
}

In [None]:
# Collect one (file_path, label) pair per .wav file found in each labelled folder.
data_entries = []
dataset_root = "./dataset"
csv_path = "./dataset.csv"

for folder_name, label in label_map.items():
    folder_path = os.path.join(dataset_root, folder_name)
    # os.listdir returns names in arbitrary order; sorting makes the manifest
    # row order reproducible across runs and machines.
    for file_name in sorted(os.listdir(folder_path)):
        if file_name.endswith(".wav"):
            file_path = os.path.join(folder_path, file_name)
            data_entries.append((file_path, label))

In [None]:
# Write the manifest: a "path,label" header followed by one row per collected clip.
# newline='' lets the csv module control line endings itself.
with open(csv_path, mode='w', newline='') as csv_file:
    writer = csv.writer(csv_file)
    writer.writerows([["path", "label"], *data_entries])

In [None]:
# Report where the manifest was written and how many samples it lists.
# The check-mark is written as the intended character rather than its mis-decoded bytes.
print(f"✅ CSV saved to: {csv_path}")
print(f"Total samples: {len(data_entries)}")