In [53]:
import numpy as np
import pandas as pd
import librosa
import librosa.display
import soundfile as sf
from scipy.io import wavfile
import os
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import GridSearchCV

In [None]:

########### PARAMETERS ###########
# DO NOT MODIFY
# Target sample rate in Hz (16000); passed as `sr` when the challenge audio is loaded
sample_rate = 16000
# Frame length in samples; the loaded audio is truncated to a whole number of frames
frame_length = 512

In [30]:
# Read the challenge recording, resampled to the configured rate.
# NOTE(review): later cells read this same file from 'wav_files/' - confirm where it lives.
audio_data, current_sample_rate = librosa.load(
    'audio_aaico_challenge.wav',
    sr=sample_rate,
)

# Scale the float samples to the int16 range and keep only whole frames
number_of_frames = len(audio_data) // frame_length
audio_data_int16 = (audio_data * 32767).astype(np.int16)[:number_of_frames * frame_length]
audio_duration = len(audio_data_int16) / sample_rate


In [9]:
# Wrap the int16 samples in a single-column DataFrame; extract_audio_sample slices it by row
audio_data_df = pd.DataFrame(audio_data_int16)

In [28]:
def extract_audio_sample(start, end, sample_name):
    """Cut a time window out of the loaded audio and save it as a WAV file.

    Reads the notebook-level ``audio_data_df`` and ``sample_rate``.

    Args:
        start: Window start in seconds (int or float).
        end: Window end in seconds, exclusive (int or float).
        sample_name: Output file name without the ``.wav`` extension.

    Returns:
        The selected rows of ``audio_data_df``.
    """
    # Convert seconds to integer row positions so fractional offsets slice
    # positionally instead of producing float slice bounds.
    first_row = int(start * sample_rate)
    last_row = int(end * sample_rate)
    audio_sample = audio_data_df.iloc[first_row:last_row]
    wavfile.write(f'{sample_name}.wav', sample_rate, audio_sample.to_numpy())
    return audio_sample

## Noiseless data

In [23]:
# Cut the three keyword windows (seconds) out of the clean recording, in order
galactic_temperature, galactic_battery, galactic_oxygene = (
    extract_audio_sample(clip_start, clip_end, clip_name)
    for clip_start, clip_end, clip_name in [
        (9, 10, 'galactic_temperature'),
        (21, 23, 'galactic_battery'),
        (38, 40, 'galactic_oxygene'),
    ]
)

## Data with noise

In [31]:
# Generate white noise from a seeded generator so re-runs produce the same augmented file
rng = np.random.default_rng(42)
noise = rng.standard_normal(len(audio_data))

# Mix the original audio with low-amplitude noise
noisy_audio = audio_data + 0.005 * noise

# Save the augmented audio where the next cell loads it from ('wav_files/'),
# creating the folder first so a fresh run does not fail
os.makedirs('wav_files', exist_ok=True)
sf.write('wav_files/augmented_audio.wav', noisy_audio, current_sample_rate)

In [33]:
# Reload the noise-augmented recording (resampled to sample_rate); this replaces
# the clean `audio_data` loaded earlier
audio_data, current_sample_rate = librosa.load('wav_files/augmented_audio.wav', sr=sample_rate)

In [34]:
# Same int16 conversion as for the clean audio: scale, then keep only whole frames
number_of_frames = len(audio_data) // frame_length
audio_data_int16 = (audio_data * 32767).astype(np.int16)[:number_of_frames * frame_length]
audio_duration = len(audio_data_int16) / sample_rate

In [35]:
# Rebind audio_data_df to the noisy samples so extract_audio_sample now cuts from them
audio_data_df = pd.DataFrame(audio_data_int16)

In [36]:
# Cut the same three keyword windows (seconds) out of the noisy recording, in order
galactic_temperature, galactic_battery, galactic_oxygene = (
    extract_audio_sample(clip_start, clip_end, clip_name)
    for clip_start, clip_end, clip_name in [
        (9, 10, 'noisy_galactic_temperature'),
        (21, 23, 'noisy_galactic_battery'),
        (38, 40, 'noisy_galactic_oxygene'),
    ]
)

In [42]:
def generate_samples_without_key_word(keyword_timestamps, audio_path):
    """Save the non-keyword parts of a recording as 2-second and 1-second clips.

    The audio is loaded at the notebook-level ``sample_rate``. Every gap before,
    between and after the keyword intervals is cut into consecutive 2-second
    clips; if at least 1 second is left at the end of a gap, that leftover is
    cut into 1-second clips. Anything shorter than 1 second is dropped.

    Args:
        keyword_timestamps: Iterable of ``(start, end)`` pairs in seconds that
            mark keyword occurrences. They may be unsorted or overlapping.
        audio_path: Path of the audio file to load. Its second '/'-separated
            component (or the whole string when there is no '/') becomes the
            prefix of the output file names.

    Side effects:
        Writes ``{prefix}extracted_segment_{i}.wav`` (``i`` starts at 1) into
        the current working directory.
    """
    y, sr = librosa.load(audio_path, sr=sample_rate)
    total_samples = len(y)

    def to_index(seconds):
        # Seconds -> sample index, clamped so intervals never reach outside the audio
        return min(max(int(seconds * sr), 0), total_samples)

    # Sorting lets the sweep below merge overlapping or unordered keyword intervals
    keyword_samples = sorted((to_index(start), to_index(end)) for start, end in keyword_timestamps)

    # Collect the gaps that are not covered by any keyword interval
    gaps = []
    last_end = 0
    for start, end in keyword_samples:
        if last_end < start:
            gaps.append((last_end, start))
        # max() keeps the furthest end seen so nested/overlapping intervals merge
        last_end = max(last_end, end)
    if last_end < total_samples:
        gaps.append((last_end, total_samples))

    one_sec_samples = 1 * sr
    two_sec_samples = 2 * sr

    def fixed_length_chunks(start_idx, end_idx, length):
        # Back-to-back chunks of exactly `length` samples that fit in [start_idx, end_idx)
        return [y[pos:pos + length] for pos in range(start_idx, end_idx - length + 1, length)]

    extracted_segments = []
    for start, end in gaps:
        extracted_segments.extend(fixed_length_chunks(start, end, two_sec_samples))

        # Leftover of THIS gap only. The previous formula used the running total
        # of all clips extracted so far, which dropped the 1-second leftover of
        # every gap after the first.
        remaining_samples = (end - start) % two_sec_samples
        if remaining_samples >= one_sec_samples:
            extracted_segments.extend(
                fixed_length_chunks(end - remaining_samples, end, one_sec_samples)
            )

    # NOTE(review): the prefix keeps the file extension (e.g. 'name.wav'), as before,
    # so existing output names are unchanged. A path without '/' no longer raises.
    path_parts = audio_path.split('/')
    category = path_parts[1] if len(path_parts) > 1 else path_parts[0]
    for i, segment in enumerate(extracted_segments, start=1):
        sf.write(f'{category}extracted_segment_{i}.wav', segment, sr)

In [44]:
# Keyword windows (seconds) to skip; the same windows apply to the clean and noisy files
keyword_windows = [(9, 10), (21, 23), (38, 40)]
for wav_path in ('wav_files/audio_aaico_challenge.wav', 'wav_files/augmented_audio.wav'):
    generate_samples_without_key_word(keyword_windows, wav_path)

In [45]:
# Dataset folders, one per class; files in dir_0 are labelled 0 and files in dir_1
# are labelled 1 by the feature-extraction loop.
# NOTE(review): presumably 0 = non-keyword clips and 1 = keyword clips - confirm.
dir_0 = 'wav_files/dataset/0'
dir_1 = 'wav_files/dataset/1'

In [46]:
# Accumulates one [feature_vector, label] entry per audio file
data = []

# MFCC-based feature extraction
def extract_features(file_path):
    """Return the time-averaged 13-coefficient MFCC vector of an audio file.

    The file is loaded at its native sample rate (``sr=None``).
    """
    signal, native_rate = librosa.load(file_path, sr=None)
    coefficients = librosa.feature.mfcc(y=signal, sr=native_rate, n_mfcc=13)
    # Average over time frames, leaving one value per coefficient
    return coefficients.T.mean(axis=0)


In [49]:
# Start from an empty list so re-running this cell does not append duplicate rows
data = []

# Extract features from each class folder; the folder determines the label
for folder, label in [(dir_0, 0), (dir_1, 1)]:
    # os.listdir returns entries in arbitrary order; sorting keeps the row order
    # (and therefore the later seeded train/test split) reproducible
    for file in sorted(os.listdir(folder)):
        if not file.endswith('.wav'):
            continue
        try:
            features = extract_features(os.path.join(folder, file))
        except Exception as e:
            # Best effort: report the unreadable file and carry on with the rest
            print(f"Error processing {file}: {e}")
        else:
            data.append([features, label])

In [50]:
# One row per audio file: the feature vector and its label
df = pd.DataFrame(data, columns=['Features', 'Label'])

# Spread each feature vector over its own numbered columns, then attach the labels
features_df = pd.DataFrame(df['Features'].to_list()).assign(Label=df['Label'])

# Persist the table without the index column
features_df.to_csv('audio_features.csv', index=False)

In [52]:

# Read the feature table back from disk
df = pd.read_csv('audio_features.csv')

# The 'Label' column is the target; every other column is a feature
y = df['Label']
X = df.drop(columns='Label')

# Hold out 20% of the rows for testing, with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

Accuracy: 0.8333333333333334


In [54]:
# Define the model; a fixed random_state makes the search result repeatable across runs
clf = RandomForestClassifier(random_state=42)

# Hyperparameter grid: 3 * 4 * 3 * 3 = 108 candidate combinations
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

# 5-fold cross-validated grid search using all available cores
grid_search = GridSearchCV(estimator=clf, param_grid=param_grid, cv=5, n_jobs=-1, verbose=2)

# Fit on the training split only; the test split stays untouched for the final check
grid_search.fit(X_train, y_train)

# Print the best parameters and the best mean cross-validation score
print(f"Best Parameters: {grid_search.best_params_}")
print(f"Best Score: {grid_search.best_score_}")

Fitting 5 folds for each of 108 candidates, totalling 540 fits




[CV] END max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time=   0.2s
[CV] END max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time=   0.2s
[CV] END max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time=   0.2s
[CV] END max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time=   0.2s
[CV] END max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time=   0.2s
[CV] END max_depth=None, min_samples_leaf=1, min_samples_split=5, n_estimators=100; total time=   0.2s
[CV] END max_depth=None, min_samples_leaf=1, min_samples_split=5, n_estimators=100; total time=   0.2s
[CV] END max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=200; total time=   0.4s
[CV] END max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=200; total time=   0.4s
[CV] END max_depth=None, min_samples_leaf=1, min_samples_split=5, n_estim

In [56]:
# Initialize the classifier with the chosen hyperparameters.
# NOTE(review): these values are hard-coded; confirm they still match grid_search.best_params_.
# random_state is fixed so the reported accuracy is reproducible.
final_clf = RandomForestClassifier(
    n_estimators=100,
    max_depth=None,
    min_samples_leaf=1,
    min_samples_split=2,
    random_state=42,
)

# Train the model on the full training split
final_clf.fit(X_train, y_train)

# Predict on the held-out test split
y_pred = final_clf.predict(X_test)

# Evaluate the classifier
accuracy = accuracy_score(y_test, y_pred)
print(f"Final Model Accuracy: {accuracy}")

Final Model Accuracy: 0.8333333333333334


In [57]:
from joblib import dump

# Persist the fitted `final_clf` to disk; dump returns the list of files it wrote
model_path = 'random_forest_classifier.joblib'
dump(final_clf, model_path)


['random_forest_classifier.joblib']