In [None]:
# Remove the preinstalled TensorFlow. `-y` skips the confirmation prompt so the
# cell cannot hang, and `%pip` acts on the environment of the running kernel.
%pip uninstall -y tensorflow

In [None]:
# Install the pinned TensorFlow version into the kernel's own environment.
# NOTE(review): the runtime may need a restart after this cell so the new
# version is the one that gets imported - confirm on the target platform.
%pip install tensorflow==2.12.0

In [None]:
# Extract the dataset into audios/. `-o` overwrites without prompting so the
# cell can be re-run safely; `-q` keeps the per-file listing out of the output.
!unzip -o -q 'audios.zip' -d audios/

## Ejemplo de visualización de audio

In [None]:
from tensorflow.python.ops.logging_ops import audio_summary
import tensorflow as tf
import numpy as np
import matplotlib.pyplot as plt

def plot_audio(filepath):
  """Plot the waveform of a WAV file.

  Args:
    filepath: Path of the WAV file to read with `tf.io.read_file`.

  The plot is shown immediately with `plt.show()`; nothing is returned.
  """
  # Load the raw bytes of the file
  audio_binary = tf.io.read_file(filepath)

  # Decode the bytes into a sequence of sample magnitudes. Requesting a single
  # channel keeps the squeeze below valid for multi-channel files as well
  # (extra channels are ignored). The sample rate is not needed here because
  # the x axis is expressed in sample indices.
  audio, _ = tf.audio.decode_wav(audio_binary, desired_channels=1)

  # Work with a single axis: (samples, 1) -> (samples,)
  audio = tf.squeeze(audio, axis=-1)

  t = np.arange(len(audio))
  plt.plot(t, audio, 'g')
  plt.xlabel("Muestra")
  plt.ylabel("Amplitud")
  plt.show()

In [None]:
# Plot the waveform of one sample file from the extracted dataset
filepath='/content/audios/on/on_0.wav'
plot_audio(filepath)

## Reproducción de audio

In [None]:
from IPython.display import Audio, display

def play_audio(filepath):
  """Show an inline audio player for a WAV file.

  Args:
    filepath: Path of the WAV file to read with `tf.io.read_file`.

  The player is rendered with `IPython.display.display`; nothing is returned.
  """
  # Load the raw bytes of the file
  audio_binary = tf.io.read_file(filepath)

  # Decode the bytes into sample magnitudes plus the sampling rate. Requesting
  # a single channel keeps the squeeze below valid for multi-channel files.
  audio, audioSR = tf.audio.decode_wav(audio_binary, desired_channels=1)

  # (samples, 1) -> (samples,)
  audio = tf.squeeze(audio, axis=-1)

  # Hand plain Python / NumPy values to IPython instead of TensorFlow tensors,
  # so the player does not depend on NumPy behavior being enabled on tensors.
  sr = int(audioSR.numpy())
  display(Audio(audio.numpy(), rate=sr, autoplay=False))


In [None]:
# Switch on NumPy-style behavior for TensorFlow tensors, then play one sample.
# NOTE(review): `np_config` is imported from a private TensorFlow module path;
# the public equivalent appears to be
# `tf.experimental.numpy.experimental_enable_numpy_behavior()` - confirm before
# relying on this import across TensorFlow versions.
from tensorflow.python.ops.numpy_ops import np_config
np_config.enable_numpy_behavior()
play_audio('/content/audios/on/on_0.wav')



## Uso de STFT para obtener los espectros de frecuencia y magnitudes

In [None]:
import tensorflow as tf
import numpy as np

def audioToTensor(filepath, frame_length=512, frame_step=128):
  """Convert a WAV file into a magnitude spectrogram.

  Args:
    filepath: Path of the WAV file to read with `tf.io.read_file`.
    frame_length: STFT window length passed to `tf.signal.stft`
      (default 512, the value previously hard-coded).
    frame_step: STFT hop size passed to `tf.signal.stft`
      (default 128, the value previously hard-coded).

  Returns:
    A real-valued tensor holding the absolute value of the STFT of the first
    audio channel. Its number of rows depends on the length of the recording.
  """
  # Load the raw bytes of the file
  audio_binary = tf.io.read_file(filepath)

  # Decode the bytes into a sequence of sample magnitudes. Requesting a single
  # channel keeps the squeeze below valid for multi-channel files as well; the
  # sampling rate is not used by the transform.
  audio, _ = tf.audio.decode_wav(audio_binary, desired_channels=1)

  # (samples, 1) -> (samples,)
  audio = tf.squeeze(audio, axis=-1)

  # Short-time Fourier transform; keep only the magnitude of each complex bin
  spectro = tf.signal.stft(audio, frame_length=frame_length, frame_step=frame_step)
  return tf.abs(spectro)

In [None]:
# Compute the spectrogram of one sample file and show its shape.
# NOTE: `s` is a notebook-level variable; later cells may depend on it.
s = audioToTensor('/content/audios/off/off_0.wav')
print(s.shape)

## Visualizar el espectrograma

In [None]:
def plot_spectrogram(spectrogram):
  """Display a spectrogram as a colour mesh on a logarithmic magnitude scale.

  Args:
    spectrogram: 2-D array-like of magnitudes; its first axis is drawn along
      the x axis and its second axis along the y axis.
  """
  n_first, n_second = spectrogram.shape[0], spectrogram.shape[1]

  # Machine epsilon keeps log() finite where the magnitude is exactly zero
  tiny = np.finfo(float).eps
  transposed = tf.transpose(spectrogram)
  log_magnitude = np.log(transposed + tiny)

  plt.xlabel("Tiempo")
  plt.ylabel("Frecuencia")
  plt.pcolormesh(range(n_first), range(n_second), log_magnitude)
  plt.show()

In [None]:
# Draw the spectrogram stored in `s`
plot_spectrogram(s)

## Crear generador de datos

In [None]:
# Crear las listas de archivos de audio y sus etiquetas correspondientes

from tensorflow.keras.utils import to_categorical
import glob
import os

def getExamples(datafolder):
  """Collect audio file paths and one-hot labels from a folder of commands.

  Every entry found directly under `datafolder` is treated as one command
  (class); its position in the glob result is its class index. The `*.wav`
  files inside each entry become the examples of that class.

  Args:
    datafolder: Root folder of the dataset, with or without a trailing path
      separator.

  Returns:
    A tuple `(X_audio, Y_command)` of NumPy arrays: the file paths, and one
    float32 one-hot row per file with as many columns as commands.
  """
  X_audio = []
  Y_command = []

  # Commands. os.path.join builds the right pattern whether or not the folder
  # ends with a separator (plain concatenation would glob "<folder>*").
  commands = [os.path.basename(x) for x in glob.glob(os.path.join(datafolder, '*'))]
  print(commands)

  # Row i of the identity matrix is the one-hot encoding of class i
  one_hot = np.eye(len(commands), dtype=np.float32)

  for i, command in enumerate(commands):
    for file in glob.glob(os.path.join(datafolder, command, '*.wav')):
      X_audio.append(file)
      Y_command.append(one_hot[i])
  return np.asarray(X_audio), np.asarray(Y_command)

In [None]:
# Build the list of audio file paths and their one-hot labels from the dataset folder
datafolder= '/content/audios/'
X_audio, Y_command = getExamples(datafolder)

In [None]:
# Sanity check: number of file paths and number of labels collected
print(len(X_audio), len(Y_command))

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 25% of the examples for validation. A fixed seed makes the split
# (and therefore the reported metrics) reproducible across runs.
# NOTE: this cell rebinds X_audio / Y_command to the training subset, so
# re-running it without first re-running the getExamples cell splits the
# already-reduced training set again.
X_audio, X_audio_test, Y_command, Y_command_test = train_test_split(
    X_audio, Y_command, test_size=0.25, random_state=42)
print(len(X_audio))
print(len(Y_command))

In [None]:
# Implementar el generador de Datos

class MySequence(tf.keras.utils.Sequence):
  """Keras data generator that turns audio file paths into spectrogram batches.

  Each batch is built on demand by calling `audioToTensor` on the paths that
  belong to it, so the spectrograms are never all held in memory at once.
  """

  def __init__(self, x_audio, y_command, batch_size):
    """Store the example paths, their labels and the batch size.

    Args:
      x_audio: Sequence of audio file paths.
      y_command: Sequence of labels, aligned index-by-index with `x_audio`.
      batch_size: Maximum number of examples per batch (must be positive).

    Raises:
      ValueError: If the two sequences differ in length or `batch_size` is
        not positive.
    """
    super().__init__()
    if len(x_audio) != len(y_command):
      raise ValueError("x_audio and y_command must have the same length")
    if batch_size <= 0:
      raise ValueError("batch_size must be a positive integer")
    self.x_audio = x_audio
    self.y_command = y_command
    self.batch_size = batch_size

  def __len__(self):
    """Return the number of batches, counting a trailing partial batch."""
    # Ceiling division: flooring would silently drop the last examples and
    # yield zero batches when there are fewer examples than batch_size.
    return (len(self.x_audio) + self.batch_size - 1) // self.batch_size

  def __getitem__(self, idx):
    """Return batch number `idx` as a `(batch_x, batch_y)` pair.

    `batch_x` stacks the spectrograms of the batch along a new first axis, so
    every file in a batch must produce a spectrogram of the same shape
    (`np.stack` raises ValueError otherwise). The last batch may hold fewer
    than `batch_size` examples.

    Raises:
      IndexError: If `idx` is outside `range(len(self))`.
    """
    if not 0 <= idx < len(self):
      raise IndexError("batch index out of range")

    start = idx * self.batch_size
    stop = start + self.batch_size

    batch_y = self.y_command[start:stop]
    # The batch shape comes from the data itself, not from a notebook global
    batch_x = np.stack([np.asarray(audioToTensor(path))
                        for path in self.x_audio[start:stop]])
    return batch_x, batch_y



In [None]:
# Check the shape of the input batch and of the expected output

# Build a generator with batches of 16 and pull its first batch
mS=MySequence(X_audio, Y_command, 16)
i=iter(mS)
bx, by = next(i)
print(bx.shape, by.shape)

In [None]:
# Ejemplo de modelo basado en redes neuronales convolucionales (CNN)

from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Dense, Conv2D, BatchNormalization, Reshape
from tensorflow.keras.layers import MaxPooling2D, Dropout, Flatten, Resizing

# Shape of one input spectrogram, as fed by the data generator.
# NOTE(review): presumably this must equal the shape printed for the sample
# spectrogram `s` - confirm if the audio length or STFT settings change.
INPUT_SHAPE = (247, 257)
# Width of the softmax output; with categorical cross-entropy it has to match
# the number of columns of the one-hot labels.
NUM_CLASSES = 4

input_tensor = Input(shape=INPUT_SHAPE)
# Add a trailing channel axis so the 2-D layers below can consume the input
x = Reshape(INPUT_SHAPE + (1,))(input_tensor)
# Downscale to 32x32 before the convolutions
x = Resizing(32,32)(x)
x = BatchNormalization()(x)
x = Conv2D(32, 3, activation='relu')(x)
x = Conv2D(64, 3, activation='relu')(x)
x = MaxPooling2D()(x)
x = Dropout(0.25)(x)
x = Flatten()(x)
x = Dense(64, activation='relu')(x)
x = Dropout(0.5)(x)
output_tensor = Dense(NUM_CLASSES, activation='softmax')(x)

model = Model(inputs=input_tensor, outputs=output_tensor)
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['acc'])

In [None]:
# Print the layer-by-layer summary of the model
model.summary()

In [None]:
# Write a diagram of the model to model.png, including the tensor shapes
tf.keras.utils.plot_model(model, to_file="model.png", show_shapes=True)

In [None]:
batch_size = 1
epochs = 5
# The batch size is set through the MySequence generators. It is not passed to
# fit() as well: the Keras documentation says not to specify `batch_size` when
# the input is a keras.utils.Sequence, since the sequence already yields batches.
h = model.fit(MySequence(X_audio, Y_command, batch_size),
              shuffle=True,
              epochs=epochs,
              validation_data=MySequence(X_audio_test, Y_command_test, batch_size))

## Reconocimiento de comandos

In [None]:
# Spectrogram of the file to classify
tensor = audioToTensor('/content/audios/on/on_0.wav')
# Add only the leading batch axis. The model's Input has no channel axis (its
# own Reshape layer adds one), so the previous hard-coded rank-4 reshape did
# not match the declared input shape.
tensor = tf.expand_dims(tensor, axis=0)
print(tensor.shape)

In [None]:
# Rebuild the list of command names from the dataset folder.
# NOTE(review): the index -> name mapping is only valid if this glob returns
# the entries in the same order as the one used in getExamples when the labels
# were created - confirm, or reuse a single shared list.
datafolder='/content/audios/'
commands = [os.path.basename(x) for x in glob.glob(datafolder + '*')]

# Prediction: print the model output for the single input, then the name of
# the command with the highest score
r = model.predict(tensor)
print(r[0])
print(commands[r[0].argmax()])

## Guardar, cargar y verificar el modelo

In [None]:
# Save the model to /content/modelAudioRecognition.h5
model.save('/content/modelAudioRecognition.h5')

In [None]:
from keras.models import load_model
# Reload the saved model through tf.keras, the same Keras namespace that was
# used to build and save it, rather than through the standalone `keras` package.
smodel = tf.keras.models.load_model('/content/modelAudioRecognition.h5')

In [None]:
# Convert the reloaded Keras model to TensorFlow Lite with the default
# optimization set enabled
converter = tf.lite.TFLiteConverter.from_keras_model(smodel)
converter.optimizations = [tf.lite.Optimize.DEFAULT]
tflite_model = converter.convert()

# Write the converted model bytes to audio_cnn.tflite in the working directory
with open('audio_cnn.tflite', 'wb') as f:
  f.write(tflite_model)