In [1]:
import numpy as np
import pandas as pd
import librosa
import librosa.display
import soundfile as sf
from scipy.io import wavfile
import os
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import GridSearchCV

Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


In [2]:

########### PARAMETERS ###########
# DO NOT MODIFY
# Desired sample rate 16000 Hz
sample_rate = 16000
# Frame length
frame_length = 512

In [3]:
# Load the audio file
audio_data, current_sample_rate = librosa.load('audio_aaico_challenge.wav', sr=sample_rate)

audio_data_int16 = (audio_data * 32767).astype(np.int16)
number_of_frames = len(audio_data_int16) // frame_length
audio_data_int16 = audio_data_int16[:number_of_frames * frame_length]
audio_duration = len(audio_data_int16) / sample_rate


In [4]:
audio_data_df = pd.DataFrame(audio_data_int16)

In [5]:
def extract_audio_sample(start, end, sample_name):
    # Extract the first frame from the audio data
    audio_sample = audio_data_df[start*sample_rate:end*sample_rate]
    audio_sample_numpy = audio_sample.to_numpy()
    wavfile.write(f'{sample_name}.wav', sample_rate, audio_sample_numpy)
    return audio_sample

## Noiseless data

In [6]:
galactic_temperature  = extract_audio_sample(9, 10, 'galactic_temperature')
galactic_battery  = extract_audio_sample(21, 23, 'galactic_battery')
galactic_oxygene  = extract_audio_sample(38, 40, 'galactic_oxygene')

## Data with noise

In [7]:
# Generate white noise
noise = np.random.randn(len(audio_data))

# Mix the original audio with noise
noisy_audio = audio_data + 0.005 * noise

# Save the augmented audio to a new file
sf.write('augmented_audio.wav', noisy_audio, current_sample_rate)

In [8]:
audio_data, current_sample_rate = librosa.load('wav_files/augmented_audio.wav', sr=sample_rate)

In [9]:
audio_data_int16 = (audio_data * 32767).astype(np.int16)
number_of_frames = len(audio_data_int16) // frame_length
audio_data_int16 = audio_data_int16[:number_of_frames * frame_length]
audio_duration = len(audio_data_int16) / sample_rate

In [10]:
audio_data_df = pd.DataFrame(audio_data_int16)

In [11]:
galactic_temperature  = extract_audio_sample(9, 10, 'noisy_galactic_temperature')
galactic_battery  = extract_audio_sample(21, 23, 'noisy_galactic_battery')
galactic_oxygene  = extract_audio_sample(38, 40, 'noisy_galactic_oxygene')

## Get Non-keyword samples

In [12]:
def generate_samples_without_key_word(keyword_timestamps, audio_path):
    y, sr = librosa.load(audio_path, sr=sample_rate)

    # Convert timestamps to sample indices
    keyword_samples = [(int(start * sample_rate), int(end * sample_rate)) for start, end in keyword_timestamps]

    # Sort and merge overlapping intervals if necessary
    # Assuming keywords_timestamps are sorted and non-overlapping for simplicity

    # Initialize the start of the first segment
    last_end = 0
    segments = []

    # Extract segments that are not covered by keyword timestamps
    for start, end in keyword_samples:
        if last_end < start:  # Check if there is a gap
            segments.append((last_end, start))
        last_end = end  # Move to the end of the current keyword segment

    # Don't forget the segment after the last keyword, if any
    if last_end < len(y):
        segments.append((last_end, len(y)))


    # Function to extract fixed length segments
    def extract_fixed_length_segments(start_idx, end_idx, length_in_samples, segments_list):
        current_start = start_idx
        while current_start + length_in_samples <= end_idx:
            segments_list.append(y[current_start:current_start + length_in_samples])
            current_start += length_in_samples

    # For storing extracted audio segments
    extracted_segments = []

    # Extract 1 or 2 second long non-keyword segments
    for start, end in segments:
        # Calculate segment length in samples for 1 and 2 seconds
        one_sec_samples = 1 * sr
        two_sec_samples = 2 * sr
        
        # First, try to extract 2-second segments
        extract_fixed_length_segments(start, end, two_sec_samples, extracted_segments)
        
        # Calculate remaining samples after extracting 2-second segments
        remaining_samples = end - start - len(extracted_segments) * two_sec_samples
        
        # If there's enough room, extract 1-second segments from the remainder
        if remaining_samples >= one_sec_samples:
            new_start = end - remaining_samples
            extract_fixed_length_segments(new_start, end, one_sec_samples, extracted_segments)

    # Save the extracted segments to files
    for i, segment in enumerate(extracted_segments):
        category = audio_path.split('/')[1]
        sf.write(f'{category}extracted_segment_{i+1}.wav', segment, sr)

In [13]:
generate_samples_without_key_word([(9, 10), (21, 23), (38, 40)], 'wav_files/audio_aaico_challenge.wav')
generate_samples_without_key_word([(9, 10), (21, 23), (38, 40)], 'wav_files/augmented_audio.wav')

## Parse Files|

In [14]:
# Define directories
dir_0 = 'wav_files/dataset/0'
dir_1 = 'wav_files/dataset/1'

In [15]:
def extract_frames(audio_path, frame_length=512, hop_length=160, sample_rate=16000):
    """
    Extract frames from an audio file.

    Parameters:
    - audio_path: Path to the audio file.
    - frame_length: The number of samples in each frame.
    - hop_length: The number of samples to skip between frames.
    - sample_rate: The audio sample rate.

    Returns:
    - A 2D NumPy array of frames.
    """
    # Load the audio file
    y, sr = librosa.load(audio_path, sr=sample_rate)
    
    # Use librosa.util.frame to extract frames
    frames = librosa.util.frame(y, frame_length=frame_length, hop_length=hop_length).T
    
    return frames



In [20]:
# Initialize a list to store feature vectors and labels
data = []

def frame_to_mfcc(frame, sample_rate=16000, n_mfcc=13):
    """
    Converts audio frames to MFCC features.

    Parameters:
    - frames: A 2D numpy array where each row represents an audio frame.
    - sample_rate: Sampling rate of the audio frames.
    - n_mfcc: Number of MFCC features to extract.

    Returns:
    - A 2D numpy array where each row is the MFCC features of a frame.
    """
    mfccs = librosa.feature.mfcc(y=frame, sr=sample_rate, n_mfcc=n_mfcc)
    return mfccs


In [22]:
# Extract features from each class and append them to the data list
for folder, label in [(dir_0, 0), (dir_1, 1)]:
    for file in os.listdir(folder):
        if file.endswith('.wav'):
            try:
                frames = extract_frames(os.path.join(folder, file))
                for frame in frames:
                    features = frame_to_mfcc(frame)
                    data.append((features, label)) 
            except Exception as e:
                print(f"Error processing {file}: {e}")

In [23]:
data

[(array([[-1.72463943e+02, -1.77131836e+02],
         [ 9.61676636e+01,  1.18728821e+02],
         [ 3.42446756e+00,  2.07169132e+01],
         [ 1.17423649e+01,  1.27321529e+01],
         [ 9.31788349e+00,  6.52729893e+00],
         [-8.50068855e+00, -1.08470554e+01],
         [ 7.11643219e+00,  1.30625677e+00],
         [-4.17798042e+00, -2.96877098e+00],
         [ 4.76658165e-01, -1.19660586e-01],
         [-1.61817646e+01, -1.35644169e+01],
         [-1.92942963e+01, -1.85071602e+01],
         [-6.10059881e+00, -7.15446663e+00],
         [-3.04654455e+00, -5.71125412e+00]], dtype=float32),
  0),
 (array([[-187.19926   , -191.49533   ],
         [ 128.67398   ,  139.83731   ],
         [  32.123787  ,   52.737602  ],
         [  14.112389  ,   16.824648  ],
         [   0.40429077,   -2.9257927 ],
         [ -18.566116  ,  -21.028984  ],
         [  -1.155217  ,   -7.3413515 ],
         [   5.361137  ,    5.536073  ],
         [   4.2214746 ,    6.018498  ],
         [  -5.302552  

In [44]:
# Assuming `data` is your list of tuples
features = [item[0].flatten() for item in data] # Extract MFCC features
labels = [item[1] for item in data]  # Extract labels


In [45]:

# Expanding the lists of features into their own columns
features_df = pd.DataFrame(features)
features_df['Label'] = labels


# Save to CSV
features_df.to_csv('audio_features.csv', index=False)

In [46]:

# Load the dataset
df = pd.read_csv('audio_features.csv')

# Separate features and labels
X = df.drop('Label', axis=1)
y = df['Label']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [47]:
# Define the model
clf = RandomForestClassifier()

# Define the parameter grid
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

# Initialize Grid Search
grid_search = GridSearchCV(estimator=clf, param_grid=param_grid, cv=5, n_jobs=-1, verbose=2)

# Fit Grid Search to the data
grid_search.fit(X_train, y_train)

# Print the best parameters and the best score
print(f"Best Parameters: {grid_search.best_params_}")
print(f"Best Score: {grid_search.best_score_}")

Fitting 5 folds for each of 108 candidates, totalling 540 fits


571.29s - pydevd: Sending message related to process being replaced timed-out after 5 seconds
571.59s - pydevd: Sending message related to process being replaced timed-out after 5 seconds
571.79s - pydevd: Sending message related to process being replaced timed-out after 5 seconds
572.01s - pydevd: Sending message related to process being replaced timed-out after 5 seconds
572.21s - pydevd: Sending message related to process being replaced timed-out after 5 seconds
572.41s - pydevd: Sending message related to process being replaced timed-out after 5 seconds
572.62s - pydevd: Sending message related to process being replaced timed-out after 5 seconds
572.82s - pydevd: Sending message related to process being replaced timed-out after 5 seconds
573.02s - pydevd: Sending message related to process being replaced timed-out after 5 seconds
573.22s - pydevd: Sending message related to process being replaced timed-out after 5 seconds
573.69s - pydevd: Sending message related to process being r

[CV] END max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time=   4.1s
[CV] END max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time=   4.6s
[CV] END max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time=   4.4s
[CV] END max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time=   4.5s
[CV] END max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time=   4.8s
[CV] END max_depth=None, min_samples_leaf=1, min_samples_split=5, n_estimators=100; total time=   4.7s
[CV] END max_depth=None, min_samples_leaf=1, min_samples_split=5, n_estimators=100; total time=   4.7s
[CV] END max_depth=None, min_samples_leaf=1, min_samples_split=5, n_estimators=100; total time=   4.6s
[CV] END max_depth=None, min_samples_leaf=1, min_samples_split=5, n_estimators=100; total time=   4.8s
[CV] END max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estim

In [48]:
# Initialize the classifier with the best hyperparameters
final_clf = RandomForestClassifier(n_estimators=100, max_depth=None, min_samples_leaf=1, min_samples_split=2)

# Train the model on the full training dataset
final_clf.fit(X_train, y_train)

# Predict on the test set
y_pred = final_clf.predict(X_test)

# Evaluate the classifier
accuracy = accuracy_score(y_test, y_pred)
print(f"Final Model Accuracy: {accuracy}")

Final Model Accuracy: 0.9352014010507881


In [None]:
from joblib import dump

# Assuming `final_clf` is your trained model
dump(final_clf, 'random_forest_classifier.joblib')
