In [12]:
import torch
import tensorflow as tf
from tensorflow.keras import layers, models
import numpy as np
# Number of time frames per spectrogram; inputs are zero-padded or cut to this width before inference.
MAX_LENGTH = 250

In [20]:
# Load the pre-computed feature tensor and expose it as a NumPy array.
# NOTE(security): torch.load unpickles the file, so only load .pt files from a trusted source.
# map_location="cpu" lets a tensor that was saved from a GPU load on a CPU-only machine.
processed_waveforms = torch.load("processed_waveforms.pt", map_location="cpu")
# detach() drops any autograd history, which .numpy() would otherwise refuse to convert.
processed_waveforms_np = processed_waveforms.detach().numpy()

In [25]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder

# Class names for the samples, read from the 'label' column.
labels_df = pd.read_parquet('labels.parquet')

# Map each class name to an integer id; the fitted encoder is kept so that
# predictions can be translated back with inverse_transform().
label_encoder = LabelEncoder()
input_labels = label_encoder.fit_transform(labels_df['label'].values)

# The fitted encoder already holds the distinct classes, so no second pass
# over the encoded labels is needed to count them.
num_classes = len(label_encoder.classes_)

In [28]:
len(input_labels)

14835

In [30]:
processed_waveforms_np.shape

(14835, 1, 40, 250)

In [54]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the samples for evaluation; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    processed_waveforms_np,
    input_labels,
    test_size=0.2,
    random_state=42,
)

In [58]:

num_classes = len(np.unique(input_labels))

class KeywordSpottingModel(tf.keras.Model):
    """Small 2-D CNN that classifies a spectrogram into one of `num_classes` keywords.

    Architecture: two Conv2D(3x3, ReLU) + MaxPooling2D(2) stages with 32 and 64
    filters, followed by Flatten -> Dense(128, ReLU) -> Dropout(0.5) ->
    Dense(num_classes, softmax).
    """

    def __init__(self, num_classes):
        """Create the layers.

        Args:
            num_classes: Width of the final softmax layer (one unit per class).
        """
        super().__init__()
        self.conv1 = layers.Conv2D(32, kernel_size=3, activation='relu')
        self.pool1 = layers.MaxPooling2D(pool_size=2)
        self.conv2 = layers.Conv2D(64, kernel_size=3, activation='relu')
        self.pool2 = layers.MaxPooling2D(pool_size=2)
        self.flatten = layers.Flatten()
        self.fc1 = layers.Dense(128, activation='relu')
        self.dropout = layers.Dropout(0.5)
        self.fc2 = layers.Dense(num_classes, activation='softmax')

    def call(self, x, training=None):
        """Run the forward pass.

        Args:
            x: Input batch.
                NOTE(review): this notebook feeds float32 tensors shaped
                (batch, 40, 250, 1); confirm before reusing with other shapes.
            training: Forwarded to the dropout layer. ``None`` (the default)
                lets Keras pick the mode from the calling context, so existing
                callers (fit / predict / direct call) behave as before.

        Returns:
            Softmax class probabilities with shape (batch, num_classes).
        """
        x = self.conv1(x)
        x = self.pool1(x)
        x = self.conv2(x)
        x = self.pool2(x)
        x = self.flatten(x)
        x = self.fc1(x)
        # Dropout is only active in training mode; pass the flag through explicitly.
        x = self.dropout(x, training=training)
        return self.fc2(x)

# Per-sample input shape (batch axis excluded): 40 feature rows x MAX_LENGTH frames x 1 channel.
# The channel axis is included so the value matches what the model is actually fed.
input_shape = (40, MAX_LENGTH, 1)
model = KeywordSpottingModel(num_classes)

model.compile(
    # Plain SGD with a small step size plus weight decay.
    optimizer=tf.keras.optimizers.SGD(learning_rate=0.0001, weight_decay=1e-4),
    # The targets are integer class ids (LabelEncoder output), hence the sparse loss.
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy']
)
print("Model and optimizer initialized.")


Model and optimizer initialized.


In [59]:
X_train.shape

(11868, 1, 40, 250)

In [60]:
y_train.shape

(11868,)

In [65]:
# Hyper-parameters consumed by model.fit.
num_epochs = 100
batch_size = 256

# The arrays are stored channels-first, (N, 1, 40, 250); the model takes
# channels-last, (N, 40, 250, 1), so move axis 1 to the end.
X_train_t = np.moveaxis(X_train, 1, -1)
X_test_t = np.moveaxis(X_test, 1, -1)


X_train_t.shape

(11868, 40, 250, 1)

In [71]:
print(X_train_t[0][0])

[[-310.17117 ]
 [-302.32654 ]
 [-287.1179  ]
 [-274.86465 ]
 [-265.47418 ]
 [-258.0353  ]
 [-250.823   ]
 [-241.94958 ]
 [-230.24306 ]
 [-214.72646 ]
 [-189.89095 ]
 [-159.40329 ]
 [-127.76908 ]
 [-100.01271 ]
 [ -77.012314]
 [ -59.00333 ]
 [ -45.45096 ]
 [ -35.68926 ]
 [ -28.722294]
 [ -23.49182 ]
 [ -19.44215 ]
 [ -16.928602]
 [ -16.61527 ]
 [ -19.050562]
 [ -24.706367]
 [ -33.24147 ]
 [ -42.829643]
 [ -52.685272]
 [ -63.11634 ]
 [ -73.5384  ]
 [ -83.231895]
 [ -91.68851 ]
 [ -98.61966 ]
 [-105.63673 ]
 [-114.3607  ]
 [-125.00774 ]
 [-132.39197 ]
 [-132.48988 ]
 [-126.91919 ]
 [-113.65961 ]
 [ -93.74307 ]
 [ -74.63668 ]
 [ -59.072422]
 [ -47.25016 ]
 [ -39.32996 ]
 [ -35.142258]
 [ -33.951862]
 [ -34.81804 ]
 [ -36.36661 ]
 [ -38.702465]
 [ -42.77099 ]
 [ -48.14795 ]
 [ -53.874588]
 [ -58.98803 ]
 [ -62.85082 ]
 [ -65.80484 ]
 [ -68.826164]
 [ -72.586494]
 [ -77.7146  ]
 [ -84.73167 ]
 [ -94.17538 ]
 [-106.89946 ]
 [-123.535255]
 [-143.99301 ]
 [-168.33667 ]
 [-195.9803  ]
 [-221.378

In [66]:

# Fit on the channels-last training set; the held-out split is scored after every epoch.
model.fit(
    X_train_t,
    y_train,
    batch_size=batch_size,
    epochs=num_epochs,
    validation_data=(X_test_t, y_test),
)

Epoch 1/100
[1m47/47[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m17s[0m 354ms/step - accuracy: 0.8881 - loss: 0.3143 - val_accuracy: 0.8669 - val_loss: 0.3552
Epoch 2/100
[1m47/47[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m17s[0m 357ms/step - accuracy: 0.8894 - loss: 0.3155 - val_accuracy: 0.8675 - val_loss: 0.3569
Epoch 3/100
[1m47/47[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m17s[0m 363ms/step - accuracy: 0.8885 - loss: 0.3171 - val_accuracy: 0.8662 - val_loss: 0.3633
Epoch 4/100
[1m47/47[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m17s[0m 371ms/step - accuracy: 0.8895 - loss: 0.3128 - val_accuracy: 0.8750 - val_loss: 0.3467
Epoch 5/100
[1m47/47[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m18s[0m 373ms/step - accuracy: 0.8922 - loss: 0.3047 - val_accuracy: 0.8584 - val_loss: 0.3732
Epoch 6/100
[1m47/47[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m18s[0m 378ms/step - accuracy: 0.8921 - loss: 0.3058 - val_accuracy: 0.8733 - val_loss: 0.3421
Epoch 7/100
[1m

<keras.src.callbacks.history.History at 0x1a22aa64b60>

In [82]:
import tensorflow as tf


def export_tflite(keras_model, path, quantize=False):
    """Convert a Keras model to TFLite and write the serialized model to `path`.

    Args:
        keras_model: Model handed to ``tf.lite.TFLiteConverter.from_keras_model``.
        path: Destination file; it is overwritten if it already exists.
        quantize: When True, set ``tf.lite.Optimize.DEFAULT`` on the converter
            (post-training quantization) before converting.

    Returns:
        The serialized TFLite model as returned by ``converter.convert()``.
    """
    converter = tf.lite.TFLiteConverter.from_keras_model(keras_model)
    if quantize:
        converter.optimizations = [tf.lite.Optimize.DEFAULT]
    tflite_bytes = converter.convert()
    with open(path, "wb") as f:
        f.write(tflite_bytes)
    return tflite_bytes


# 1. Save in the native Keras format (a single .keras file, not a SavedModel directory).
model.save("keyword_spotting_model.keras")

# 2. Plain conversion to TFLite, no optimization applied.
export_tflite(model, "keyword_spotting_model.tflite")

# 3. Same model converted with the default optimization (this is the quantized variant).
tflite_model = export_tflite(model, "keyword_spotting_model-q.tflite", quantize=True)

INFO:tensorflow:Assets written to: C:\Users\alexa\AppData\Local\Temp\tmphn24ud6k\assets


INFO:tensorflow:Assets written to: C:\Users\alexa\AppData\Local\Temp\tmphn24ud6k\assets


Saved artifact at 'C:\Users\alexa\AppData\Local\Temp\tmphn24ud6k'. The following endpoints are available:

* Endpoint 'serve'
  args_0 (POSITIONAL_ONLY): TensorSpec(shape=(None, 40, 250, 1), dtype=tf.float32, name=None)
Output Type:
  TensorSpec(shape=(None, 3), dtype=tf.float32, name=None)
Captures:
  1796014112016: TensorSpec(shape=(), dtype=tf.resource, name=None)
  1796014112592: TensorSpec(shape=(), dtype=tf.resource, name=None)
  1796014110096: TensorSpec(shape=(), dtype=tf.resource, name=None)
  1796014110480: TensorSpec(shape=(), dtype=tf.resource, name=None)
  1796014111632: TensorSpec(shape=(), dtype=tf.resource, name=None)
  1796014112400: TensorSpec(shape=(), dtype=tf.resource, name=None)
  1796014112208: TensorSpec(shape=(), dtype=tf.resource, name=None)
  1796014916432: TensorSpec(shape=(), dtype=tf.resource, name=None)
INFO:tensorflow:Assets written to: C:\Users\alexa\AppData\Local\Temp\tmp5gknyi_a\assets


INFO:tensorflow:Assets written to: C:\Users\alexa\AppData\Local\Temp\tmp5gknyi_a\assets


Saved artifact at 'C:\Users\alexa\AppData\Local\Temp\tmp5gknyi_a'. The following endpoints are available:

* Endpoint 'serve'
  args_0 (POSITIONAL_ONLY): TensorSpec(shape=(None, 40, 250, 1), dtype=tf.float32, name=None)
Output Type:
  TensorSpec(shape=(None, 3), dtype=tf.float32, name=None)
Captures:
  1796014112016: TensorSpec(shape=(), dtype=tf.resource, name=None)
  1796014112592: TensorSpec(shape=(), dtype=tf.resource, name=None)
  1796014110096: TensorSpec(shape=(), dtype=tf.resource, name=None)
  1796014110480: TensorSpec(shape=(), dtype=tf.resource, name=None)
  1796014111632: TensorSpec(shape=(), dtype=tf.resource, name=None)
  1796014112400: TensorSpec(shape=(), dtype=tf.resource, name=None)
  1796014112208: TensorSpec(shape=(), dtype=tf.resource, name=None)
  1796014916432: TensorSpec(shape=(), dtype=tf.resource, name=None)


In [79]:
output=model.predict(X_test_t[2:3])
output

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 29ms/step


array([[0.01933353, 0.00496708, 0.9756993 ]], dtype=float32)

In [80]:
label_encoder.inverse_transform([np.argmax(output[0])])

array(['stop'], dtype=object)

In [81]:
label_encoder.classes_

array(['alfred', 'other', 'stop'], dtype=object)

In [83]:
import socket
import json

# Address of the local MFCC service.
_JDETECT_HOST = '127.0.0.1'
_JDETECT_PORT = 5555

# Connection shared by all jdetect_mfcc() calls: opened lazily, dropped after any error.
_jdetect_socket = None


def _get_jdetect_socket():
    """Return the shared TCP connection, opening it on first use."""
    global _jdetect_socket
    if _jdetect_socket is None:
        _jdetect_socket = socket.create_connection((_JDETECT_HOST, _JDETECT_PORT))
    return _jdetect_socket


def _reset_jdetect_socket():
    """Close and forget the shared connection so that the next call reconnects."""
    global _jdetect_socket
    if _jdetect_socket is not None:
        try:
            _jdetect_socket.close()
        except OSError:
            # Best effort: the connection is being discarded anyway.
            pass
        _jdetect_socket = None


def jdetect_mfcc(double_array, sample_rate=16000):
    """Send a waveform to the MFCC service on localhost:5555 and return its reply.

    The request is one JSON object ``{"waveform": [...], "sample_rate": ...}``
    followed by two newlines. The reply is read until a received chunk
    contains a newline (or the peer closes the connection) and is then parsed
    as JSON.

    Args:
        double_array: Samples to send. Either a JSON-serializable sequence of
            numbers or any object exposing ``tolist()`` (e.g. a NumPy array or
            ``array.array``), which is converted first.
        sample_rate: Sample rate sent along with the waveform.

    Returns:
        The parsed JSON reply, or ``None`` if the exchange failed. On failure
        the error is printed and the shared connection is dropped so that the
        next call starts with a fresh one.

    Raises:
        TypeError: If the waveform cannot be serialized to JSON.
    """
    # Array types are not JSON-serializable as-is; turn them into plain lists.
    if hasattr(double_array, "tolist"):
        double_array = double_array.tolist()
    input_json = json.dumps({"waveform": double_array, "sample_rate": sample_rate})
    try:
        sock = _get_jdetect_socket()
        sock.sendall(input_json.encode('utf-8') + b'\n' + b'\n')

        received = bytearray()  # bytearray: appending chunks stays linear in reply size
        peer_closed = False
        while True:
            chunk = sock.recv(4096)
            if not chunk:
                peer_closed = True
                break
            received += chunk
            if b'\n' in chunk:
                break

        if peer_closed:
            # The server hung up, so this connection can never be reused.
            _reset_jdetect_socket()
            if not received.strip():
                raise ConnectionError(
                    "connection closed by the server before any response was received"
                )

        return json.loads(bytes(received).strip().decode('utf-8'))
    except Exception as e:
        print(f"Error using persistent TCP socket: {e}")
        # Reset socket on error
        _reset_jdetect_socket()
        return None


In [87]:
wav_path = "samples/hello_alfred6.wav"
import numpy as np

# Reading starts 78 bytes into the file: 44 bytes (the usual minimal WAV header size)
# plus 17 further 16-bit samples.
# NOTE(review): the reason for skipping the extra 17 samples is not documented here - confirm.
# NOTE(review): a fixed offset assumes a plain 44-byte header; files with extra chunks need real parsing.
with open(wav_path, "rb") as f:
    f.seek(44 + 17 * 2)
    pcm_data = f.read()

print(f"Read {len(pcm_data)} bytes of PCM data from {wav_path} (after skipping header)")
print(f"length of PCM data: {len(pcm_data)/2}")

# NOTE(review): fixed value, not read from the WAV header - presumably the sample rate in Hz.
sr = 16000

# Decode 16-bit little-endian signed PCM in one vectorised call instead of a
# per-sample Python loop. A dangling odd byte cannot form a sample, so it is dropped.
usable_bytes = len(pcm_data) - len(pcm_data) % 2
pcm_samples = np.frombuffer(pcm_data[:usable_bytes], dtype='<i2')
pcm_values = pcm_samples.tolist()  # plain ints, kept for the debug print below
print(f"First 10 PCM values: {pcm_values[:10]}")  # Print the first 10 PCM values for debugging

# Scale the int16 samples to float32 in [-1, 1).
waveform = pcm_samples.astype(np.float32) / 32768.0
# Compute MFCCs

Read 65494 bytes of PCM data from samples/hello_alfred6.wav (after skipping header)
length of PCM data: 32747.0
First 10 PCM values: [-30, -37, -33, -29, -26, -20, -16, -7, -3, 6]


In [95]:

print(f"waveform before jdetect_mfcc: {waveform} ")
mfcc_reply = jdetect_mfcc(waveform.tolist())
# jdetect_mfcc signals a failed exchange by returning None; stop here with a
# clear message instead of failing later on an obscure indexing error.
if mfcc_reply is None:
    raise RuntimeError("jdetect_mfcc returned no data - is the service on 127.0.0.1:5555 running?")
spectrogram = np.array(mfcc_reply)
# Peek at the first ten values of row 1.
spectrogram[1][0:10]

waveform before jdetect_mfcc: [-9.1552734e-04 -1.1291504e-03 -1.0070801e-03 ... -1.5258789e-04
  6.1035156e-05  1.2207031e-04] 


array([65.71024 , 64.899055, 62.475754, 58.75968 , 54.112823, 49.109306,
       44.226913, 39.52972 , 35.327145, 32.017933])

In [96]:
spectrogram.shape

(40, 205)

In [97]:
# Force the time axis (axis 1) to exactly MAX_LENGTH frames:
# cut anything beyond it, otherwise right-pad with zeros.
if spectrogram.shape[1] >= MAX_LENGTH:
    spectrogram = spectrogram[:, :MAX_LENGTH]
else:
    missing = MAX_LENGTH - spectrogram.shape[1]
    spectrogram = np.pad(spectrogram, ((0, 0), (0, missing)), mode='constant')

In [99]:
# Turn the 2-D spectrogram into a one-sample, one-channel batch for the model.
input_tensor = tf.convert_to_tensor(spectrogram, dtype=tf.float32)
# New leading axis = batch, new trailing axis = channel.
input_tensor = input_tensor[tf.newaxis, ..., tf.newaxis]
input_tensor.shape

TensorShape([1, 40, 250, 1])

In [102]:
model.predict(input_tensor)

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 30ms/step


array([[9.3416196e-01, 6.5771736e-02, 6.6332766e-05]], dtype=float32)

In [None]:
# Scratch cell intentionally left empty (the stray identifier raised NameError on Run All).