# Voice Recognition.

## Install dependencies

In [7]:
!pip install -q transformers datasets torchaudio

import os
from google.colab import userdata

# Copy each Kaggle credential from the Colab secret store into the process
# environment under the same name.
for secret_name in ("KAGGLE_USERNAME", "KAGGLE_KEY"):
    os.environ[secret_name] = userdata.get(secret_name)

## Load and preprocess the dataset

In [6]:
import torch
from datasets import load_dataset, Audio
from transformers import Wav2Vec2FeatureExtractor

# 1. Load the dataset (only a slice is used for the proof of concept).
# NOTE(review): the recorded output of this cell shows a DatasetNotFoundError
# for this dataset id -- confirm the correct Hub repository id (or load a
# local copy) before re-running.
dataset = load_dataset("fluent_speech_commands", split="train[:2000]")
# Resample every clip to 16 kHz, the rate passed to the feature extractor below.
dataset = dataset.cast_column("audio", Audio(sampling_rate=16_000))

# 2. Build the label dictionaries.
# We predict 'speakerId' and 'intent' (the action+object+location combination).
speaker_labels = dataset.unique("speakerId")
# Sort the intents so the label -> id mapping is reproducible: iterating a bare
# set of strings can yield a different order on every interpreter start (string
# hash randomization), which would silently reshuffle the class ids.
intent_labels = sorted(
    {
        f"{action}_{obj}_{location}"
        for action, obj, location in zip(
            dataset["action"], dataset["object"], dataset["location"]
        )
    }
)

label2id_spk = {label: i for i, label in enumerate(speaker_labels)}
label2id_int = {label: i for i, label in enumerate(intent_labels)}

# 3. Audio processor.
feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained("facebook/wav2vec2-base")

def preprocess_function(examples):
    """Turn a batch of raw examples into model inputs plus integer labels.

    Reads the waveform array of every entry in ``examples["audio"]`` and runs
    the module-level ``feature_extractor`` on them (16 kHz, padded, truncated
    to at most 160000 samples, i.e. 10 seconds at that rate). Two label lists
    are then attached to the extractor output:

    * ``speaker_labels`` -- ``label2id_spk`` id of each ``speakerId``.
    * ``intent_labels`` -- ``label2id_int`` id of each
      ``"{action}_{object}_{location}"`` key.

    Returns the extractor output with both label lists added.
    """
    waveforms = [sample["array"] for sample in examples["audio"]]
    inputs = feature_extractor(
        waveforms,
        sampling_rate=16000,
        padding=True,
        max_length=160000,
        truncation=True,
    )

    speaker_ids = []
    for speaker in examples["speakerId"]:
        speaker_ids.append(label2id_spk[speaker])
    inputs["speaker_labels"] = speaker_ids

    intent_ids = []
    intent_parts = zip(examples["action"], examples["object"], examples["location"])
    for action, obj, location in intent_parts:
        # The key must match the format used when label2id_int was built.
        intent_ids.append(label2id_int[f"{action}_{obj}_{location}"])
    inputs["intent_labels"] = intent_ids

    return inputs

# Run the preprocessing in batches and drop every original column, so only
# the fields returned by preprocess_function remain.
encoded_dataset = dataset.map(
    preprocess_function,
    batched=True,
    remove_columns=dataset.column_names,
)

DatasetNotFoundError: Dataset 'fluent_speech_commands' doesn't exist on the Hub or cannot be accessed.