In [4]:
import os
import py7zr
import random
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow.keras import layers, models
import matplotlib.pyplot as plt
from tqdm import tqdm
import seaborn as sns

from IPython import display

ModuleNotFoundError: No module named 'tensorflow'

In [9]:
# Where the competition archives live (read-only Kaggle input) and where the
# training archive is unpacked (writable working directory).
data_dir = '/kaggle/input/tensorflow-speech-recognition-challenge/'
extract_dir = "/kaggle/working/extracted_data_train"

def extract_7z(filepath, dest_dir):
    """Unpack every member of the 7z archive at `filepath` into `dest_dir`.

    Args:
        filepath: Path to the `.7z` archive to read.
        dest_dir: Directory passed to `extractall` as the extraction target.
    """
    with py7zr.SevenZipFile(filepath, mode='r') as archive:
        archive.extractall(path=dest_dir)

# Unpack train.7z only once: an existing extraction directory is taken to mean
# the archive was already unpacked.
if os.path.exists(extract_dir):
    print(f"Data already extracted at {extract_dir}")
else:
    filepath = os.path.join(data_dir, "train.7z")
    print(f"Extracting files from {filepath} to {extract_dir}...")
    extract_7z(filepath, extract_dir)

In [10]:
# Load command labels: one class per sub-directory of train/audio, except the
# background-noise folder. The names are sorted so that the class-index <-> name
# mapping does not depend on the order in which the filesystem lists entries,
# and non-directory entries are skipped so stray files never become classes.
audio_root = os.path.join(extract_dir, "train/audio")
commands = np.array(sorted(
    d for d in tf.io.gfile.listdir(audio_root)
    if d != '_background_noise_' and tf.io.gfile.isdir(os.path.join(audio_root, d))
))
print('Commands:', commands)

# Collect the background-noise WAV files that are mixed into training audio
# as data augmentation.
noise_dir = os.path.join(extract_dir, "train/audio/_background_noise_")
noise_pattern = f"{noise_dir}/*.wav"
noise_files = tf.io.gfile.glob(noise_pattern)

# Function to decode audio files
def decode_audio(audio_binary):
    """Decode WAV bytes into a 1-D waveform tensor.

    Args:
        audio_binary: Scalar string tensor holding the raw contents of a WAV file.

    Returns:
        1-D tensor of samples as produced by `tf.audio.decode_wav`, with the
        channel axis removed. The sample rate returned by the decoder is
        discarded.
    """
    # Request exactly one channel so the squeeze below is always valid, even
    # if a file happens to contain more than one channel.
    audio, _ = tf.audio.decode_wav(contents=audio_binary, desired_channels=1)
    return tf.squeeze(audio, axis=-1)

# Function to extract label from file path
def get_label(file_path):
    """Return the name of the directory containing `file_path` as the label.

    Args:
        file_path: Scalar string tensor with the path of an audio file.

    Returns:
        Scalar string tensor: the parent directory name when it is one of
        `commands`, otherwise the string "unknown".
    """
    path_parts = tf.strings.split(input=file_path, sep=os.path.sep)
    parent_dir = path_parts[-2]
    is_known_command = tf.reduce_any(tf.equal(commands, parent_dir))

    def _known():
        return parent_dir

    def _unknown():
        return tf.constant("unknown", dtype=tf.string)

    return tf.cond(is_known_command, _known, _unknown)

# Function to convert waveform to spectrogram
def get_spectrogram(waveform):
    """Turn a waveform into a fixed-size magnitude spectrogram.

    The waveform is truncated to 16000 samples, zero-padded at the end up to
    16000 samples, and passed through an STFT (frame_length=255,
    frame_step=128).

    Args:
        waveform: 1-D tensor of audio samples.

    Returns:
        Float tensor holding the STFT magnitudes with a trailing axis of size 1
        appended.
    """
    target_samples = 16000
    clipped = waveform[:target_samples]

    # Number of zeros needed to bring the clip up to the target length.
    pad_shape = [target_samples] - tf.shape(clipped)
    tail = tf.zeros(pad_shape, dtype=tf.float32)

    fixed_length = tf.concat([tf.cast(clipped, dtype=tf.float32), tail], 0)

    magnitudes = tf.abs(
        tf.signal.stft(fixed_length, frame_length=255, frame_step=128)
    )
    return magnitudes[..., tf.newaxis]

# Function to add background noise for data augmentation
def add_background_noise(waveform, noise_files):
    """Mix a randomly chosen, randomly cropped background-noise clip into `waveform`.

    The noise file, the crop offset and the gain are all drawn with TensorFlow
    random ops, so they are re-sampled for every element when this function is
    traced inside `tf.data.Dataset.map`. (A Python-level `random.choice` would
    run only once, at trace time, and pin every example to one noise file.)

    Args:
        waveform: 1-D waveform tensor; expected to be float32 so it can be
            added to the decoded noise.
        noise_files: Sequence of paths to WAV files used as noise sources.

    Returns:
        A tensor with the same length as `waveform`: `waveform + gain * noise`
        clipped to [-1.0, 1.0], with `gain` drawn uniformly from [0.0, 0.5).
        If `noise_files` is empty, `waveform` is returned unchanged.
    """
    # Nothing to mix in; avoids sampling an index from an empty range.
    if len(noise_files) == 0:
        return waveform

    # Pick the noise file with a TF op so the choice is made per element.
    noise_paths = tf.convert_to_tensor(noise_files, dtype=tf.string)
    file_index = tf.random.uniform(
        shape=[], minval=0, maxval=tf.shape(noise_paths)[0], dtype=tf.int32
    )
    noise_waveform = decode_audio(tf.io.read_file(noise_paths[file_index]))

    waveform_len = tf.shape(waveform)[0]
    noise_len = tf.shape(noise_waveform)[0]

    # If the noise clip is shorter than the audio, zero-pad it at the end so a
    # window of `waveform_len` samples always exists.
    shortfall = tf.maximum(waveform_len - noise_len, 0)
    noise_waveform = tf.concat(
        [noise_waveform, tf.zeros([shortfall], dtype=tf.float32)], axis=0
    )

    # Random crop. `maxval` is exclusive, hence the +1 so the last valid window
    # can be selected (and so equal lengths yield offset 0).
    max_offset = tf.maximum(noise_len - waveform_len, 0)
    offset = tf.random.uniform(shape=[], minval=0, maxval=max_offset + 1, dtype=tf.int32)
    noise_waveform = noise_waveform[offset:offset + waveform_len]

    noise_factor = tf.random.uniform(shape=[], minval=0.0, maxval=0.5)
    augmented_waveform = waveform + noise_factor * noise_waveform
    return tf.clip_by_value(augmented_waveform, -1.0, 1.0)

# Function to load and preprocess audio data
def preprocess_dataset(files, augment=False):
    """Build a `tf.data` pipeline of (spectrogram, label index) pairs.

    Stages, each applied with `Dataset.map` and `tf.data.AUTOTUNE` parallelism:
    read file + derive label, decode WAV, optionally add background noise,
    convert to spectrogram and map the label to its position in `commands`.

    Args:
        files: File paths to turn into a dataset.
        augment: When true, mix background noise from the module-level
            `noise_files` into each waveform.

    Returns:
        An unbatched `tf.data.Dataset`.
    """
    def _read(file_path):
        return tf.io.read_file(file_path), get_label(file_path)

    def _decode(audio_binary, label):
        return decode_audio(audio_binary), label

    def _add_noise(waveform, label):
        return add_background_noise(waveform, noise_files), label

    def _to_features(waveform, label):
        return get_spectrogram(waveform), tf.argmax(label == commands)

    stages = [_read, _decode]
    if augment:
        stages.append(_add_noise)
    stages.append(_to_features)

    pipeline = tf.data.Dataset.from_tensor_slices(files)
    for stage in stages:
        pipeline = pipeline.map(stage, num_parallel_calls=tf.data.AUTOTUNE)
    return pipeline

In [11]:
filepath_data = os.path.join(extract_dir, "train/audio")

# Seed for the shuffle below, so the train/validation/test split is the same on
# every Restart & Run All.
SPLIT_SEED = 42

# Get all filenames, excluding noise files (only the command directories are
# globbed). Sorting first makes the starting order independent of the order in
# which the filesystem returns entries.
filenames = sorted(tf.io.gfile.glob([os.path.join(filepath_data, d, '*') for d in commands]))
filenames = tf.random.shuffle(filenames, seed=SPLIT_SEED)

total_samples = len(filenames)
train_size = int(0.8 * total_samples)  # 80% for training
val_size = int(0.1 * total_samples)  # 10% for validation
test_size = total_samples - train_size - val_size  # Remaining ~10% for testing

# Split dataset into train, validation, and test sets
train_files = filenames[:train_size]
val_files = filenames[train_size:train_size + val_size]
test_files = filenames[train_size + val_size:]

# The three slices must partition the shuffled list exactly.
assert len(train_files) + len(val_files) + len(test_files) == total_samples

print('Training set size:', len(train_files))
print('Validation set size:', len(val_files))
print('Test set size:', len(test_files))

In [None]:
train_ds = preprocess_dataset(train_files, augment=True)
val_ds = preprocess_dataset(val_files)
test_ds = preprocess_dataset(test_files)  # left unbatched

# Batch the datasets for training
batch_size = 32

# The training pipeline contains random augmentation, so it must NOT be cached:
# caching would store the first epoch's noisy examples and replay exactly the
# same ones in every later epoch.
train_ds = train_ds.batch(batch_size).prefetch(tf.data.AUTOTUNE)

# Validation data is deterministic, so caching it after the first pass is safe.
val_ds = val_ds.batch(batch_size).cache().prefetch(tf.data.AUTOTUNE)

In [None]:
# Peek at one training batch to learn the per-example spectrogram shape
# (the leading batch axis is dropped).
for spectrogram, _ in train_ds.take(1):
    input_shape = spectrogram.shape[1:]

num_labels = len(commands)
print('Input shape:', input_shape)

# Fit the normalization layer's statistics on the training spectrograms only
# (labels are stripped before adapting).
spectrograms_only = train_ds.map(lambda spec, label: spec)
norm_layer = layers.Normalization()
norm_layer.adapt(data=spectrograms_only)

# Number of time steps the convolutional features are split into for the GRU.
timesteps = 16

# Build the model: a small CNN front-end over the resized spectrogram, whose
# features are split into `timesteps` chunks and summarized by a GRU.
model = models.Sequential([
    layers.Input(shape=input_shape),
    layers.Resizing(32, 32),
    norm_layer,
    layers.Conv2D(32, 3, activation='relu'),
    layers.Conv2D(64, 3, activation='relu'),
    layers.Conv2D(128, 3, activation='relu'),
    layers.MaxPooling2D(),
    layers.Dropout(0.25),
    # Split the conv features into (timesteps, features_per_step) for the GRU.
    # With the 32x32 input above, the conv stack yields 13 * 13 * 128 = 21632
    # values, i.e. 1352 per step for 16 steps. Using -1 lets Keras infer that
    # size instead of hard-coding it, and a single Reshape replaces the
    # Flatten + Reshape((-1, ...)) + Lambda(squeeze) sequence, which produced
    # the same (batch, timesteps, features) tensor.
    layers.Reshape((timesteps, -1)),
    layers.GRU(64),
    layers.Dense(128, activation='relu'),
    layers.Dropout(0.5),
    # Raw logits; softmax is applied by the loss / at prediction time.
    layers.Dense(num_labels)
])

model.summary()

# The final Dense layer outputs logits and labels are integer class indices,
# hence sparse categorical cross-entropy with from_logits=True.
model.compile(
    optimizer=tf.keras.optimizers.Adam(),
    loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
    metrics=['accuracy']
)

EPOCHS = 20
history = model.fit(
    train_ds,
    validation_data=val_ds,
    epochs=EPOCHS,
    # `callbacks` is documented as a list of callbacks, so pass one explicitly.
    # Training stops once val_loss has not improved for 2 consecutive epochs.
    callbacks=[tf.keras.callbacks.EarlyStopping(verbose=1, patience=2)],
)

In [12]:
# Training curves: plot both losses against the epoch index explicitly.
metrics = history.history
plt.plot(history.epoch, metrics['loss'], history.epoch, metrics['val_loss'])
plt.legend(['loss', 'val_loss'])
plt.xlabel('Epoch')
plt.ylabel('Loss')
plt.show()

# Materialize the held-out test examples as NumPy arrays.
test_audio = []
test_labels = []

for audio, label in test_ds:
    test_audio.append(audio.numpy())
    test_labels.append(label.numpy())

test_audio = np.array(test_audio)
test_labels = np.array(test_labels)

y_pred = np.argmax(model.predict(test_audio), axis=1)
y_true = test_labels

# Fraction of correct predictions.
test_acc = np.mean(y_pred == y_true)
print(f'Test set accuracy: {test_acc:.0%}')

# Fix the matrix size to the number of classes: otherwise it is inferred from
# the largest index present in the data and may not line up with the
# `commands` tick labels below.
confusion_mtx = tf.math.confusion_matrix(y_true, y_pred, num_classes=len(commands))
plt.figure(figsize=(10, 8))
sns.heatmap(confusion_mtx, xticklabels=commands, yticklabels=commands, annot=True, fmt='g')
plt.xlabel('Prediction')
plt.ylabel('Label')
plt.show()

In [13]:
# Run the model on a single training clip and show the class probabilities.
# The path must point inside the directory the training archive was actually
# extracted to (`extracted_data_train`, not `extracted_data`).
sample_file = "/kaggle/working/extracted_data_train/train/audio/down/0a9f9af7_nohash_0.wav"
sample_ds = preprocess_dataset([str(sample_file)])

for spectrogram, label in sample_ds.batch(1):
    prediction = model(spectrogram)
    plt.figure(figsize=(20, 5))
    # The model outputs logits; softmax turns them into probabilities.
    plt.bar(commands, tf.nn.softmax(prediction[0]))
    plt.title(f'Predictions for "{commands[label[0]]}"')
    plt.show()

In [None]:
# Load the sample submission and preview its first rows.
sample_submission_path = "/kaggle/working/sample_submission.csv"
submission_sample = pd.read_csv(sample_submission_path)
submission_sample.head()

In [None]:
# Data and extraction directories for the competition test set.
# A dedicated name is used for the test extraction directory so that
# `extract_dir` keeps pointing at the training data; rebinding it here would
# make earlier cells (label listing, file globbing) read the wrong directory
# if they are ever re-run.
data_dir = "/kaggle/input/tensorflow-speech-recognition-challenge/"
test_extract_dir = '/kaggle/working/extracted_data_test/'

# Extract the data if it hasn't been already
if not os.path.exists(test_extract_dir):
    filepath = os.path.join(data_dir, "test.7z")
    print(f"Extracting files from {filepath} to {test_extract_dir}...")
    extract_7z(filepath, test_extract_dir)
else:
    print(f"Data already extracted at {test_extract_dir}")

In [None]:
# List the test clips; `filenames` keeps the bare file names and
# `filenames_path` the corresponding full paths, in the same order.
filepath_data = "/kaggle/working/extracted_data_test/test/audio/"
filenames = os.listdir(filepath_data)
filenames_path = [f"{filepath_data}{name}" for name in filenames]

In [None]:
# Predict a command for every competition test clip.
# Inference runs in batches (instead of one clip per model call) and uses its
# own dataset name so the held-out `test_ds` split is not overwritten.
submission_ds = (
    preprocess_dataset(filenames_path)
    .batch(256)
    .prefetch(tf.data.AUTOTUNE)
)

labels = []

# The pipeline yields (spectrogram, label) pairs; the label element carries no
# information for these files and is ignored.
for spectrograms, _ in tqdm(submission_ds):
    logits = model(spectrograms, training=False)
    # argmax over logits equals argmax over softmax probabilities.
    predicted_indices = tf.argmax(logits, axis=-1).numpy()
    labels.extend(commands[predicted_indices])

In [None]:
# Build the submission table: one row per test clip, in the same order the
# predictions were produced. The file-name column is "fname" (it was
# misspelled "fnane").
submission = pd.DataFrame({"fname": filenames, "label": labels})
submission.to_csv("submission.csv", index=False)
submission.head()