In [1]:
import os
base_path = "SpeechCommand"
import pandas as pd
import os
from sklearn.model_selection import train_test_split


print(os.listdir(base_path))

['bed', 'bird', 'cat', 'dog', 'down', 'eight', 'five', 'four', 'go', 'happy', 'house', 'left', 'marvin', 'nine', 'no', 'off', 'on', 'one', 'right', 'seven', 'sheila', 'six', 'stop', 'three', 'tree', 'two', 'up', 'wow', 'yes', 'zero', '_background_noise_']


In [2]:
speech_command_train = r"SpeechCommand"
audio_path = []

for label in os.listdir(speech_command_train):
    audio_files = [(label+"/"+f,label) for f in os.listdir(os.path.join(speech_command_train,label)) if f.endswith(".wav")]
    audio_path += audio_files

print(audio_path[:10])

audio_df = pd.DataFrame(audio_path, columns=["path", "labels"])
audio_df.to_csv("train_audio.csv", index=False)
print(audio_df.head())

audio_df = audio_df.drop(audio_df[audio_df["labels"]== "background_noise"].index)

mapping_dict = {
    "right": 0, "eight": 1, "cat": 2, "tree": 3, "bed": 4,  "happy": 5, "go": 6, "dog": 7, "no": 8, "wow": 9, "nine": 10,
    "left": 11, "stop": 12, "three": 13, "sheila": 14, "one": 15, "bird": 16, "zero": 17, "seven": 18, "up": 19,
    "marvin": 20, "two": 21, "house": 22, "down": 23, "six": 24, "yes": 25,   "on": 26, "five": 27, "off": 28, "four": 29
}

# Check for NaN values in the target variable
nan_values = audio_df["labels"].isna().sum()
print(f"Number of NaN values in 'labels': {nan_values}")

# Check unique values in the target variable that do not have mappings
unique_values = audio_df[~audio_df["labels"].isin(mapping_dict.keys())]["labels"].unique()
print(f"Unique values in 'labels' without mappings: {unique_values}")


[('bed/00176480_nohash_0.wav', 'bed'), ('bed/004ae714_nohash_0.wav', 'bed'), ('bed/004ae714_nohash_1.wav', 'bed'), ('bed/00f0204f_nohash_0.wav', 'bed'), ('bed/00f0204f_nohash_1.wav', 'bed'), ('bed/012c8314_nohash_0.wav', 'bed'), ('bed/012c8314_nohash_1.wav', 'bed'), ('bed/0132a06d_nohash_0.wav', 'bed'), ('bed/0135f3f2_nohash_0.wav', 'bed'), ('bed/0137b3f4_nohash_0.wav', 'bed')]
                        path labels
0  bed/00176480_nohash_0.wav    bed
1  bed/004ae714_nohash_0.wav    bed
2  bed/004ae714_nohash_1.wav    bed
3  bed/00f0204f_nohash_0.wav    bed
4  bed/00f0204f_nohash_1.wav    bed
Number of NaN values in 'labels': 0
Unique values in 'labels' without mappings: ['_background_noise_']


In [3]:
audio_df = pd.DataFrame(audio_path,columns = ["path","labels"])
audio_df.to_csv("train_audio.csv",index=False)
# Drop rows with labels as '_background_noise_'
audio_df = audio_df.drop(audio_df[audio_df["labels"] == "_background_noise_"].index)

# Check the resulting DataFrame
print(audio_df.head())


                        path labels
0  bed/00176480_nohash_0.wav    bed
1  bed/004ae714_nohash_0.wav    bed
2  bed/004ae714_nohash_1.wav    bed
3  bed/00f0204f_nohash_0.wav    bed
4  bed/00f0204f_nohash_1.wav    bed


In [4]:
# Create a target column 'TARGET' based on the mapping_dict
audio_df['TARGET'] = audio_df["labels"].map(mapping_dict)

# Perform train-test split
x_train, x_val, y_train, y_val = train_test_split(
    audio_df[['path', 'labels']],
    audio_df['TARGET'],
    test_size=0.1,
    random_state=65
)

print(x_train.shape, y_train.shape, x_val.shape, y_val.shape)


(52174, 2) (52174,) (5798, 2) (5798,)


In [5]:
print(audio_df)

                             path labels  TARGET
0       bed/00176480_nohash_0.wav    bed       4
1       bed/004ae714_nohash_0.wav    bed       4
2       bed/004ae714_nohash_1.wav    bed       4
3       bed/00f0204f_nohash_0.wav    bed       4
4       bed/00f0204f_nohash_1.wav    bed       4
...                           ...    ...     ...
57967  zero/ffd2ba2f_nohash_1.wav   zero      17
57968  zero/ffd2ba2f_nohash_2.wav   zero      17
57969  zero/ffd2ba2f_nohash_3.wav   zero      17
57970  zero/ffd2ba2f_nohash_4.wav   zero      17
57971  zero/fffcabd1_nohash_0.wav   zero      17

[57972 rows x 3 columns]


In [6]:
print(x_train)

                               path  labels
23953    nine/28ce0c58_nohash_3.wav    nine
54696     yes/7799c9cd_nohash_1.wav     yes
44157    stop/f19c1390_nohash_0.wav    stop
49580     two/c9b653a0_nohash_0.wav     two
23133  marvin/ba59cab3_nohash_0.wav  marvin
...                             ...     ...
2773     bird/ccb1266b_nohash_0.wav    bird
296       bed/34ba417a_nohash_2.wav     bed
575       bed/66041c69_nohash_0.wav     bed
10357   eight/f2b8fc18_nohash_0.wav   eight
7982     down/d750966e_nohash_0.wav    down

[52174 rows x 2 columns]


In [7]:
print(x_val)

                               path  labels
13652    four/876c84d6_nohash_0.wav    four
9991    eight/c68cf200_nohash_0.wav   eight
44673   three/31d31fa0_nohash_0.wav   three
50356      up/28460a60_nohash_0.wav      up
57155    zero/a42a88ff_nohash_2.wav    zero
...                             ...     ...
1801     bird/30a09789_nohash_2.wav    bird
23566  marvin/f9af823e_nohash_0.wav  marvin
17768   happy/a8cb6dda_nohash_0.wav   happy
805       bed/8c7c9168_nohash_0.wav     bed
8617    eight/24befdb3_nohash_3.wav   eight

[5798 rows x 2 columns]


In [8]:
import os
import librosa
import csv
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.mixture import GaussianMixture


## value of 30 gave 86.33%

In [9]:
# csv_file_name = "train_audio.csv"  # Replace with your actual CSV file name
# data = pd.read_csv(csv_file_name)


# # Initialize a dictionary to store GMM models
# gmm_models = {}

# # Iterate over unique labels
# unique_labels = data['labels'].unique()

# for label in unique_labels:
#     # Filter data for the current label
#     label_data = data[data['labels'] == label]

#     # Initialize a list to store features
#     all_features = []

#     # Iterate over audio paths for the current label
#     for audio_path in label_data['path']:
#         # Load audio file
#         # Extract features (replace with your feature extraction function)
#         features = extract_features(audio_path)

#         # Append features to the list
#         all_features.append(features)

#     # Convert the list of features to a numpy array
#     X = np.vstack(all_features)
#     gmm = GaussianMixture(n_components=30, covariance_type='full', random_state=72, max_iter=250, init_params='kmeans', reg_covar=0.15)
#     gmm.fit(X)

#     # Save the GMM model in the dictionary
#     gmm_models[label] = gmm

#     print(f"GMM model for label '{label}' saved in the dictionary")
#     if gmm.converged_:
#         print(f"GMM for class {label}: Converged")
#         gmm_models[label] = gmm
#     else:
#         print(f"GMM for class {label}: Not converged")


In [10]:
import joblib

all_models_filename = "all_gmm_models.joblib"
# joblib.dump(gmm_models, all_models_filename)
print(f"All GMM models saved to {all_models_filename}")


All GMM models saved to all_gmm_models.joblib


In [11]:
# Load the models from the file
gmm_models = joblib.load(all_models_filename)


In [12]:
print(gmm_models)

{'bed': GaussianMixture(max_iter=250, n_components=30, random_state=72, reg_covar=0.15), 'bird': GaussianMixture(max_iter=250, n_components=30, random_state=72, reg_covar=0.15), 'cat': GaussianMixture(max_iter=250, n_components=30, random_state=72, reg_covar=0.15), 'dog': GaussianMixture(max_iter=250, n_components=30, random_state=72, reg_covar=0.15), 'down': GaussianMixture(max_iter=250, n_components=30, random_state=72, reg_covar=0.15), 'eight': GaussianMixture(max_iter=250, n_components=30, random_state=72, reg_covar=0.15), 'five': GaussianMixture(max_iter=250, n_components=30, random_state=72, reg_covar=0.15), 'four': GaussianMixture(max_iter=250, n_components=30, random_state=72, reg_covar=0.15), 'go': GaussianMixture(max_iter=250, n_components=30, random_state=72, reg_covar=0.15), 'happy': GaussianMixture(max_iter=250, n_components=30, random_state=72, reg_covar=0.15), 'house': GaussianMixture(max_iter=250, n_components=30, random_state=72, reg_covar=0.15), 'left': GaussianMixtur

In [13]:
def extract_features(audio_path):
    base_path = 'SpeechCommand'
    ap = os.path.join(base_path,audio_path)
    y, sr = librosa.load(ap)
    mfccs = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=13)
    deltas = librosa.feature.delta(mfccs)
    features = np.vstack([mfccs, deltas])
    return features.T

def predict_and_evaluate(x_val, gmm_models, mapping_dict):
    correct_predictions = 0
    total_samples = len(x_val)

    for index, row in x_val.iterrows():
        audio_path = row['path']
        true_label = row['labels']

        # Extract features
        features = extract_features(audio_path)

        if features is not None:
            # Predict class label
            predicted_class = predict_class(features, gmm_models, mapping_dict)
            mapped = mapping_dict[true_label]
            print(mapped)
            print(predicted_class)
            # Check if the predicted class matches the true label
            if predicted_class == mapped:
                correct_predictions += 1

    accuracy = correct_predictions / total_samples
    print(f"Accuracy on validation set: {accuracy * 100:.2f}%")

# Function to predict class label based on GMM models
def predict_class(features, gmm_models, mapping_dict):
    max_score = float('-inf')
    predicted_class = None

    for class_label, gmm_model in gmm_models.items():
        score = gmm_model.score(features)
        if score > max_score:
            max_score = score
            predicted_class = mapping_dict[class_label]

    return predicted_class

predict_and_evaluate(x_val, gmm_models, mapping_dict)


29
29
1
1
13
13
19
19
17
17
21
21
10
10
4
4
20
20
18
18
23
23
1
1
8
8
6
6
28
28
22
22
17
17
18
18
9
9
8
8
8
8
0
0
15
15
14
14
5
5
14
14
18
18
22
22
25
25
11
25
7
7
23
23
29
29
4
4
7
7
1
1
25
25
5
5
12
12
26
26
13
13
23
23
29
29
1
1
21
21
24
24
23
23
11
11
25
25
19
19
9
9
4
4
10
10
10
10
13
13
9
9
19
19
23
23
13
13
23
8
7
7
7
7
3
3
14
14
24
24
22
22
29
29
17
17
18
18
8
8
28
28
26
23
29
29
13
13
10
10
2
2
12
12
29
29
2
2
5
5
18
18
2
2
4
4
2
2
7
7
27
27
14
14
19
19
5
5
16
16
27
27
10
6
16
16
22
22
22
22
26
26
24
24
15
15
0
0
5
5
25
25
3
3
17
17
24
24
15
15
10
10
16
16
20
20
10
10
0
0
23
23
8
8
14
14
2
2
27
27
8
23
28
28
8
8
19
10
29
29
7
7
14
14
3
3
24
24
29
29
15
15
3
21
11
28
20
20
0
0
11
11
14
14
6
6
19
19
6
6
16
16
15
15
15
15
15
15
9
9
27
28
25
25
27
4
14
14
25
25
18
18
20
20
17
29
21
21
5
5
29
29
27
27
7
7
18
18
29
29
7
7
20
20
15
15
23
23
8
8
21
21
25
25
3
3
10
10
23
23
20
20
10
24
21
21
13
13
13
13
25
25
7
7
12
28
24
24
10
10
12
12
27
27
27
27
2
2
0
0
8
8
8
8
16
16
25
25
13
13
15


In [14]:

test_df = pd.read_csv('test.csv')

def extract_features_t(audio_path):
    base_path = 'SpeechCommandTest'
    ap = os.path.join(base_path,audio_path)
    y, sr = librosa.load(ap)
    mfccs = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=13)
    deltas = librosa.feature.delta(mfccs)
    features = np.vstack([mfccs, deltas])
    return features.T

def predict_class(audio_file, gmm_models):
    y, sr = librosa.load(os.path.join('SpeechCommandTest', audio_file), sr=None)
    features = extract_features_t(audio_file)

    if features is None:
        return None

    max_score = float('-inf')
    predicted_class = None

    for class_label, gmm_model in gmm_models.items():
        score = gmm_model.score(features)
        if score > max_score:
            max_score = score
            predicted_class = mapping_dict[class_label]

    return predicted_class

predictions = []

for _, row in test_df.iterrows():
    audio_file = row['AUDIO_FILE']
    predicted_class = predict_class(audio_file, gmm_models)
    predictions.append({'ID': row['ID'], 'Target': predicted_class})

predictions_df = pd.DataFrame(predictions)
predictions_df.to_csv('predictions_mel.csv', index=False)

In [15]:

def extract_features_test(audio_path):
    y, sr = librosa.load(audio_path)
    mfccs = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=13)
    deltas = librosa.feature.delta(mfccs)
    features = np.vstack([mfccs, deltas])
    return features.T

def predict_class_test_time(audio_file, gmm_models):
    features = extract_features_test(audio_file)

    if features is None:
        return None

    max_score = float('-inf')
    predicted_class = None

    for class_label, gmm_model in gmm_models.items():
        score = gmm_model.score(features)
        if score > max_score:
            max_score = score
            predicted_class = mapping_dict[class_label]

    return predicted_class

In [20]:
audio_path = input("Enter the path of the audio file: ")
predicted_class = predict_class_test_time(audio_path, gmm_models)
print(f"The predicted class for the audio file '{audio_path}' is: {predicted_class}")

The predicted class for the audio file 'kaggle 2 demo\down_23.wav' is: 17
