In [1]:
import os
import numpy as np
import scipy.stats
from scipy.io import wavfile
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report
import librosa
import matplotlib.pyplot as plt
from collections import Counter
from IPython.display import Audio
from sklearn.preprocessing import StandardScaler
from tqdm import tqdm
from IPython.display import Audio

c:\Users\Chief Engineer (C)\anaconda3\lib\site-packages\numpy\.libs\libopenblas.EL2C6PLE4ZYW3ECEVIV3OXXGRN2NRFM2.gfortran-win_amd64.dll
c:\Users\Chief Engineer (C)\anaconda3\lib\site-packages\numpy\.libs\libopenblas64__v0.3.21-gcc_10_3_0.dll


In [3]:
# Root folder of the speaker dataset; it holds one sub-folder per speaker.
# Built with os.path.join so the notebook is not tied to the Windows path
# separator (on Windows this is the same 'archive\16000_pcm_speeches' path).
audio_files_dir = os.path.join('archive', '16000_pcm_speeches')
audio_data = []
labels = []
sr = []

# Speaker folder name -> integer class label. The keys are joined onto
# audio_files_dir below, so they must match the directory names exactly.
label_mapping = {
    'Benjamin_Netanyau': 1,
    'Jens_Stoltenberg': 2,
    'Julia_Gillard': 3,
    'Magaret_Tarcher': 4,
    'Nelson_Mandela': 5
}

for folder_name, label in tqdm(label_mapping.items()):
    folder_path = os.path.join(audio_files_dir, folder_name)
    # os.listdir returns entries in arbitrary order; sorting fixes the sample
    # order so that the seeded train/test split is reproducible everywhere.
    for filename in sorted(os.listdir(folder_path)):
        if filename.endswith('.wav'):
            file_path = os.path.join(folder_path, filename)
            # sr=None keeps each file's native sampling rate (no resampling).
            audio, sam_rate = librosa.load(file_path, sr=None)
            audio_data.append(audio)
            labels.append(label)
            sr.append(sam_rate)

# Convert the accumulated lists to arrays; the three arrays are index-aligned.
audio_data = np.array(audio_data)
labels = np.array(labels)
sr = np.array(sr)

  0%|          | 0/5 [00:00<?, ?it/s]

100%|██████████| 5/5 [01:05<00:00, 13.12s/it]


#### Feature extraction - MFCC, LPC and LDB

In [8]:
import librosa
import numpy as np
from tqdm import tqdm
from sklearn.preprocessing import StandardScaler

def extract_lpc_features(audio_data, order=10):
    """Compute linear-prediction coefficients for every signal in ``audio_data``.

    Each item is handed to ``librosa.core.lpc`` with the given ``order`` while a
    tqdm progress bar is shown; the per-item results are stacked, in input
    order, into a single NumPy array.
    """
    progress = tqdm(audio_data, desc="Extracting LPC features")
    return np.array([librosa.core.lpc(signal, order=order) for signal in progress])

def calculate_mfcc_pipeline(audio_signal, sr=16000,  n_fft=2048, hop_length=512, n_mels=23):
    """Compute MFCCs via an explicit STFT -> mel -> log -> DCT pipeline.

    Args:
        audio_signal: waveform samples along the last axis. A single 1-D
            signal is the intended input; leading batch axes are passed
            through unchanged.
        sr: sampling rate in Hz, used to build the mel filter bank.
        n_fft: STFT window length in samples.
        hop_length: number of samples between successive STFT frames.
        n_mels: number of mel bands, also used as the number of cepstral
            coefficients that are kept.

    Returns:
        Array of shape ``(..., n_frames, n_mels)`` - one row of coefficients
        per STFT frame.
    """
    # librosa.stft does its own framing and windowing. The previous version
    # framed the signal first and then ran the STFT on every frame, which
    # framed the data twice and produced a scrambled 3-D result.
    magnitude_spectrum = np.abs(librosa.stft(audio_signal, n_fft=n_fft, hop_length=hop_length))
    mel_spectrum = librosa.feature.melspectrogram(S=magnitude_spectrum**2, sr=sr, n_mels=n_mels)
    log_mel_spectrum = librosa.power_to_db(mel_spectrum)
    cepstral_coefficients = librosa.feature.mfcc(S=log_mel_spectrum, n_mfcc=n_mels)
    # Swap only the last two axes (coefficients <-> frames); a plain .T would
    # also reverse any leading batch axes.
    return np.swapaxes(cepstral_coefficients, -1, -2)

from scipy.spatial.distance import cdist
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis

def extract_ldb_features(audio_data, num_nodes=10, frame_length=2048, hop_length=512):
    """Pick, for every signal, the frames that lie closest to all other frames.

    Each signal is cut into frames of ``frame_length`` samples taken every
    ``hop_length`` samples. The pairwise Euclidean distances between the frames
    of one signal are summed per frame, and the ``num_nodes`` frames with the
    smallest sums are kept, ordered by increasing sum. The per-signal
    selections are stacked into a single NumPy array.
    """
    ldb_features = []
    for signal in tqdm(audio_data, desc="Extracting LDB features"):
        framed = librosa.util.frame(signal, frame_length=frame_length, hop_length=hop_length)
        # One row per frame.
        frame_rows = np.array(list(framed.T))
        pairwise = cdist(frame_rows, frame_rows, 'euclidean')
        ranking = np.argsort(pairwise.sum(axis=1))
        ldb_features.append([frame_rows[idx] for idx in ranking[:num_nodes]])

    return np.array(ldb_features)
lpc_features = extract_lpc_features(audio_data, order=10)
# calculate_mfcc_pipeline works on ONE signal, so call it once per clip and
# stack the results. Passing the whole batch in a single call mixed the clip
# axis into the framing, so the first axis no longer indexed clips.
mfcc_features = np.array([
    calculate_mfcc_pipeline(clip, sr=16000)
    for clip in tqdm(audio_data, desc="Extracting MFCC features")
])
ldb_features = extract_ldb_features(audio_data, num_nodes=10)



Extracting LPC features: 100%|██████████| 7501/7501 [00:03<00:00, 2230.69it/s]
Extracting LDB features: 100%|██████████| 7501/7501 [00:07<00:00, 996.10it/s] 


In [6]:
# Per-clip MFCC features read by the classifier cells below as `feature_mfccs`.
# This cell had been commented out, so a fresh "Restart & Run All" failed with
# a NameError on `feature_mfccs`; it is restored so the notebook runs top to bottom.
feature_mfccs = []
for item in tqdm(audio_data):
    mfccs = librosa.feature.mfcc(y=item, sr=16000, n_mfcc=13)
    # StandardScaler works column-wise, so each frame (column) of this clip is
    # standardised across its 13 coefficients; nothing is fitted across clips.
    mfccs = StandardScaler().fit_transform(mfccs)
    # Store as (n_frames, 13).
    feature_mfccs.append(mfccs.T)
feature_mfccs = np.array(feature_mfccs)

### SVM classifier

In [None]:
# Flatten every clip's MFCC matrix into a single feature vector.
X = feature_mfccs.reshape(len(feature_mfccs), -1)
X_train, X_test, y_train, y_test = train_test_split(
    X, labels, test_size=0.2, random_state=42
)

# Linear-kernel SVM; fit() returns the estimator itself.
svm_classifier = SVC(kernel='linear', C=1).fit(X_train, y_train)
y_pred = svm_classifier.predict(X_test)

classification_report_str = classification_report(y_test, y_pred)
accuracy = accuracy_score(y_test, y_pred)

print(
    "Using SVM with MFCCS features only:",
    f"Accuracy: {accuracy:.2f}",
    sep="\n",
)
print("Classification Report:\n", classification_report_str)

Using SVM with MFCCS features only:
Accuracy: 0.93
Classification Report:
               precision    recall  f1-score   support

           1       0.82      0.94      0.88       310
           2       0.93      0.79      0.86       310
           3       0.97      0.98      0.97       283
           4       0.96      0.95      0.96       283
           5       1.00      0.99      1.00       315

    accuracy                           0.93      1501
   macro avg       0.93      0.93      0.93      1501
weighted avg       0.93      0.93      0.93      1501



### RF classifier

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Flatten every clip's MFCC matrix into a single feature vector.
X = feature_mfccs.reshape(len(feature_mfccs), -1)
X_train, X_test, y_train, y_test = train_test_split(
    X, labels, test_size=0.2, random_state=42
)

# Random forest with 100 trees; fit() returns the estimator itself.
rf_classifier = RandomForestClassifier(n_estimators=100, random_state=42).fit(X_train, y_train)
y_pred_rf = rf_classifier.predict(X_test)

classification_report_str_rf = classification_report(y_test, y_pred_rf)
accuracy_rf = accuracy_score(y_test, y_pred_rf)

print(
    "Using RF with MFCCS features only:",
    f"Accuracy: {accuracy_rf:.2f}",
    sep="\n",
)
print("Classification Report:\n", classification_report_str_rf)

Using RF with MFCCS features only:
Accuracy: 0.97
Classification Report:
               precision    recall  f1-score   support

           1       0.97      0.92      0.94       310
           2       0.96      0.93      0.94       310
           3       0.94      0.99      0.97       283
           4       0.97      0.99      0.98       283
           5       1.00      1.00      1.00       315

    accuracy                           0.97      1501
   macro avg       0.97      0.97      0.97      1501
weighted avg       0.97      0.97      0.97      1501



# Adding Noise

In [None]:
# Paths to one clean speech clip and one background-noise recording; both are
# played and mixed in the demo cells that follow.
audio_file_benjamin_1 = r'..\archive\16000_pcm_speeches\Benjamin_Netanyau\10.wav'
noise_1 = r'..\archive\16000_pcm_speeches\_background_noise_\running_tap.wav'

In [None]:
# Play the clean speech clip straight from its file.
Audio(audio_file_benjamin_1)

In [None]:
# Play the background-noise recording straight from its file.
Audio(noise_1)

# Noise added successfully

In [None]:
import torchaudio
import torchaudio.functional as F
import torch

# Mix the noise recording into the speech clip with torchaudio at a fixed SNR.
SAMPLE_SPEECH = audio_file_benjamin_1
SAMPLE_NOISE = noise_1
# torchaudio.load returns (waveform, sample_rate); the sample rates are
# discarded here, so nothing checks that the two files share one rate.
speech, _ = torchaudio.load(SAMPLE_SPEECH)
noise, _ = torchaudio.load(SAMPLE_NOISE)
# Crop the noise along axis 1 to the speech length. This only shortens: a
# noise clip shorter than the speech is left as it is.
noise = noise[:, :speech.shape[1]]

# Requested signal-to-noise ratio in dB, passed as a one-element tensor.
snr_dbs = torch.tensor([10])
noisy_speeches = F.add_noise(speech, noise, snr_dbs)

In [None]:
# Play the mixed signal. The rate is hard-coded to 16000 Hz because the
# sample rates returned by torchaudio.load were discarded.
Audio(data=noisy_speeches,rate=16000)

# Working on augmented data

In [None]:
# Root folder of the speaker dataset, one level above the working directory.
# Built with os.path.join so the notebook is not tied to the Windows path
# separator (on Windows this is the same '..\archive\16000_pcm_speeches' path).
audio_files_dir = os.path.join('..', 'archive', '16000_pcm_speeches')
audio_data_new = []
# NOTE: `labels` and `sr` are rebuilt here and replace the arrays created by
# the earlier loading cell.
labels = []
sr = []

# Speaker folder name -> integer class label. The keys are joined onto
# audio_files_dir below, so they must match the directory names exactly.
label_mapping = {
    'Benjamin_Netanyau': 1,
    'Jens_Stoltenberg': 2,
    'Julia_Gillard': 3,
    'Magaret_Tarcher': 4,
    'Nelson_Mandela': 5
}

for folder_name, label in tqdm(label_mapping.items()):
    folder_path = os.path.join(audio_files_dir, folder_name)
    # os.listdir returns entries in arbitrary order; sorting keeps the clip
    # order (and therefore any index-based sampling) reproducible.
    for filename in sorted(os.listdir(folder_path)):
        if filename.endswith('.wav'):
            file_path = os.path.join(folder_path, filename)
            # torchaudio.load returns (waveform tensor, sample rate).
            audio, sam_rate = torchaudio.load(file_path)
            audio_data_new.append(audio)
            labels.append(label)
            sr.append(sam_rate)
# The waveforms stay in a Python list of tensors; only labels and rates
# are converted to arrays.
labels = np.array(labels)
sr = np.array(sr)

100%|██████████| 5/5 [00:04<00:00,  1.05it/s]


In [None]:
import os
import torchaudio.functional as F
import torch
import random

# One before/after example pair, filled in by the first background-noise
# augmentation below so it can be played back in the following cells.
original = []
changed = []

# Path to background noise folders
background_noise_folder = r'..\archive\16000_pcm_speeches\_background_noise_'
others_noise_folder = r'..\archive\16000_pcm_speeches\other'


def _list_wav_files(folder):
    """Return the sorted full paths of the .wav files directly inside ``folder``."""
    return [
        os.path.join(folder, name)
        for name in sorted(os.listdir(folder))
        if name.lower().endswith('.wav')
    ]


def add_background_noise(audio, noise, snr_dB):
    """Mix ``noise`` into ``audio`` at the requested signal-to-noise ratio.

    Args:
        audio: waveform tensor indexed as (channel, sample).
        noise: noise tensor indexed as (channel, sample). It is cropped to the
            length of ``audio``; if it is shorter it is repeated first.
        snr_dB: desired signal-to-noise ratio in dB.

    Returns:
        The result of ``torchaudio.functional.add_noise`` for the
        length-matched pair.

    Raises:
        ValueError: if ``noise`` contains no samples.
    """
    n_samples = audio.shape[1]
    n_noise = noise.shape[1]
    if n_noise == 0:
        raise ValueError("noise clip is empty")
    if n_noise < n_samples:
        # Tile the noise so that it covers the whole clip (ceil division).
        noise = noise.repeat(1, -(-n_samples // n_noise))
    noise = noise[:, :n_samples]
    return F.add_noise(audio, noise, torch.tensor([snr_dB]))


desired_snr = 10

# List the candidate noise files once. Non-wav entries (e.g. a README) are
# skipped so that torchaudio.load is never handed an unreadable file.
background_noise_paths = _list_wav_files(background_noise_folder)
others_noise_paths = _list_wav_files(others_noise_folder)

num_samples_background = int(0.15 * len(audio_data_new))
background_noise_files = random.sample(range(len(audio_data_new)), num_samples_background)

for position, i in enumerate(background_noise_files):
    noise_file = random.choice(background_noise_paths)
    noise, _ = torchaudio.load(noise_file)
    # Keep only the first channel of the noise recording.
    noise = noise[0:1, :]
    noisy = add_background_noise(audio_data_new[i], noise, snr_dB=desired_snr)
    if position == 0:
        # Remember one example pair for the playback cells. The loop used to
        # `break` here, which left all but one of the sampled clips clean.
        original = audio_data_new[i]
        changed = noisy
    audio_data_new[i] = noisy

num_samples_others = int(0.05 * len(audio_data_new))
others_noise_files = random.sample(range(len(audio_data_new)), num_samples_others)

for i in others_noise_files:
    noise_file = random.choice(others_noise_paths)
    noise, _ = torchaudio.load(noise_file)
    # Keep only the first channel of the noise recording.
    noise = noise[0:1, :]
    audio_data_new[i] = add_background_noise(audio_data_new[i], noise, snr_dB=desired_snr)

In [None]:
# Play the example clip as it was before noise was added.
Audio(data=original,rate=16000)

In [None]:
# Play the same example clip after background noise was mixed in.
Audio(data=changed,rate=16000)

# The approaches below add the noise but also change the pitch

### Approach - 1

In [None]:
# Re-declares the clean-speech and noise paths (same values as in the earlier
# noise demo) for the NumPy-based approaches below.
audio_file_benjamin_1 = r'..\archive\16000_pcm_speeches\Benjamin_Netanyau\10.wav'
noise_1 = r'..\archive\16000_pcm_speeches\_background_noise_\running_tap.wav'

In [None]:
# dsfa, _ = torchaudio.load(audio_file_benjamin_1)
# Load the clip with librosa at a target rate of 16 kHz and play it back at
# that same rate.
dsfa, _ = librosa.load(audio_file_benjamin_1,sr=16000)
Audio(data=dsfa, rate=16000)

In [None]:
import numpy as np

def add_noise(clean_audio, noise, snr_dB):
    """Mix a random segment of ``noise`` into ``clean_audio`` at ``snr_dB``.

    Args:
        clean_audio: 1-D array of clean samples.
        noise: 1-D array of noise samples, at least as long as ``clean_audio``.
        snr_dB: desired signal-to-noise ratio in decibels, defined as
            ``10 * log10(P_signal / P_noise)`` with P the mean squared amplitude.

    Returns:
        Array of the same length as ``clean_audio`` containing the mixture.
        If the chosen noise segment is silent, or ``clean_audio`` is empty,
        the clean samples are returned unchanged (as a new array).

    Raises:
        AssertionError: if ``noise`` is shorter than ``clean_audio``.
    """
    clean_audio = np.asarray(clean_audio)
    noise = np.asarray(noise)

    # A full-length segment can only be cut out of the noise if the noise is
    # at least as long as the clean clip (equal length is fine).
    assert len(clean_audio) <= len(noise), "Noise must be at least as long as the clean audio."

    if len(clean_audio) == 0:
        return clean_audio.copy()

    # Generate a random starting point within the noise signal
    start_idx = np.random.randint(0, len(noise) - len(clean_audio) + 1)

    # Extract a segment of noise of the same length as clean audio
    noise_segment = noise[start_idx:start_idx + len(clean_audio)]

    # The SNR is a ratio of POWERS, so the dB value is converted with /10 and
    # the noise is scaled relative to the power of the clean signal. The old
    # code ignored the clean signal, so the requested SNR was never achieved.
    signal_power = np.mean(clean_audio ** 2)
    noise_power = np.mean(noise_segment ** 2)
    if noise_power == 0:
        # A silent noise segment cannot be scaled to any target power.
        return clean_audio + noise_segment

    desired_noise_power = signal_power / (10 ** (snr_dB / 10.0))
    scale_factor = np.sqrt(desired_noise_power / noise_power)

    # Scale and add noise to the clean audio
    noisy_audio = clean_audio + scale_factor * noise_segment

    return noisy_audio

# Load both clips at 16 kHz, the rate used for playback. Without sr=,
# librosa.load resamples to its default 22050 Hz, and playing that back at
# 16000 Hz slows the audio down and lowers its pitch.
clean_sample,_ = librosa.load(audio_file_benjamin_1, sr=16000)
noisy_sample,_ = librosa.load(noise_1, sr=16000)
snr_dB = 10  # Adjust the desired SNR

noisy_audio = add_noise(clean_sample, noisy_sample, snr_dB)

In [None]:
# Play the clean clip. The playback rate must equal the rate the samples
# were loaded at, otherwise speed and pitch change.
Audio(data=clean_sample, rate=16000)

In [None]:
# Play the noise-augmented clip. The playback rate must equal the rate the
# samples were loaded at, otherwise speed and pitch change.
Audio(data=noisy_audio, rate=16000)

### Approach 2

In [None]:
# import numpy as np
# from random import randint
# rand = []
# def augment_audio_with_noise_v2(clean_audio, noise_audio, min_snr=5, max_snr=20):
#     def cut_random_section(noise, size):
#         starting_point = randint(0, (noise.size - size))
#         end_point = starting_point + size
#         noise_cut_part = noise[starting_point:end_point]
#         return noise_cut_part

#     def mix(audio, noise, snr):
#         audio_max = max(audio)
#         if audio_max == 0:
#             audio_max = int(np.random.uniform(0.7, 1) * 32767)
#         audio = audio * 1.0
#         audio = audio / audio_max
#         noise = cut_random_section(noise, audio.size)
#         noise = noise * 1.0
#         noise = noise / max(noise)
#         gain = pow(10, (snr / 10.))
#         numerator = np.mean(abs(audio) ** 2)
#         denominator = numerator / gain
#         noise_power = np.mean(abs(noise) ** 2)
#         mult_value = (denominator / noise_power) ** 0.5
#         temp = noise * mult_value
#         noisy = audio + temp
#         if max(audio) == 0:
#             noisy = noise
#         else:
#             noisy = noisy / max(noisy)
#         print(noisy)
#         scale_factor = np.max(np.abs(audio)) / np.iinfo(np.int16).max
#         noisy_scaled = (noisy / scale_factor).astype(np.int16)
#         print(noisy_scaled)
#         return noisy_scaled

#     # Select a random segment from noisy audio with the same duration as clean audio
#     noisy_segment = cut_random_section(noise_audio, len(clean_audio))
#     # Randomly select SNR
#     snr_dB = np.random.uniform(min_snr, max_snr)
#     # Mix the clean audio with the noisy segment to achieve the desired SNR
#     augmented_audio = mix(clean_audio, noisy_segment, snr_dB)
#     return augmented_audio

# # Example of using the function with a clean audio sample and a noisy audio sample
# clean_sample,_ = librosa.load(audio_file_benjamin_1)
# noisy_sample,_ = librosa.load(noise_1)
# augmented_sample = augment_audio_with_noise_v2(clean_sample, noisy_sample)

[-0.02330106 -0.01470888  0.02698577 ... -0.18013197 -0.02908643
  0.08353435]
[ -706  -446   818 ... -5464  -882  2534]


In [None]:
# Audio(data=augmented_sample, rate=16000)