In [53]:
import os

# Folder containing the raw recordings to rename
folder_path = "test_audio"

# Snapshot of the folder contents; the position of a file in this listing
# becomes the numeric suffix of its new name.
files = os.listdir(folder_path)

for i, file in enumerate(files):
    # Only raw recordings are renamed: their names contain "录音" ("recording").
    if "录音" not in file:
        continue

    new_file_name = f"haoyu_audio_{i}.wav"
    old_file_path = os.path.join(folder_path, file)
    new_file_path = os.path.join(folder_path, new_file_name)

    # Never clobber an existing clip: os.rename silently replaces the target
    # on POSIX and raises on Windows when the destination already exists.
    if os.path.exists(new_file_path):
        print(f"Skipped {file}: {new_file_name} already exists")
        continue

    os.rename(old_file_path, new_file_path)


In [26]:
import os

folder_path = "audio_files"  # Folder holding the clips to relabel

# Relabel clips "cyhh_audio_<n>.wav" with 101 <= n <= 160 as "stranger_audio_<n>.wav".
for filename in os.listdir(folder_path):
    # Guard clause: anything not shaped like "cyhh_audio_*.wav" is left alone.
    if not filename.startswith("cyhh_audio_") or not filename.endswith(".wav"):
        continue
    try:
        # The clip number is the text between the last "_" and the first "." after it.
        number_text = filename.split("_")[-1].split(".")[0]
        file_number = int(number_text)
        if file_number < 101 or file_number > 160:
            continue
        new_filename = filename.replace("cyhh_audio_", "stranger_audio_")
        source_path = os.path.join(folder_path, filename)
        target_path = os.path.join(folder_path, new_filename)
        os.rename(source_path, target_path)
        print(f"Renamed {filename} to {new_filename}")
    except ValueError:
        # A non-numeric suffix means the file does not follow the expected format.
        continue


In [4]:
import audioread

# Path of the audio file to inspect
audio_file = "audio_files/haoyu_audio_31.wav"

try:
    # Open the file with audioread and report its basic properties
    with audioread.audio_open(audio_file) as clip:
        print("音频文件信息:")
        print(f"  文件名: {audio_file}")
        print(f"  时长: {clip.duration:.2f} 秒")
        print(f"  采样率: {clip.samplerate} Hz")
        print(f"  通道数: {clip.channels}")

# NOTE(review): keep this handler ahead of DecodeError — NoBackendError is
# presumably a subclass of it in audioread; confirm before reordering.
except audioread.NoBackendError:
    # No decoding backend could be found for this file
    print("无法找到支持的后端，无法打开音频文件。")
except audioread.DecodeError:
    # The file could not be decoded (possibly an unsupported encoding)
    print("无法解码音频文件。可能是因为不支持的编码格式。")


音频文件信息:
  文件名: audio_files/haoyu_audio_31.wav
  时长: 3.53 秒
  采样率: 48000 Hz
  通道数: 2


In [27]:
import numpy as np
import librosa
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras import layers, models, utils
import tensorflow as tf
from tensorflow import keras

In [28]:
def extract_features(file_path, mfcc=True, chroma=True, mel=True, sr=22050):
    """Build one flat feature vector for an audio file.

    Each enabled feature family is computed with librosa, averaged over
    axis 1, and appended in the fixed order MFCC, chroma, mel spectrogram.

    Args:
        file_path: Path of the audio file to load.
        mfcc: Include the 13 averaged MFCC coefficients.
        chroma: Include the averaged chroma-STFT bins.
        mel: Include the averaged mel-spectrogram bands.
        sr: Sampling rate the audio is loaded at and that the feature
            functions are told about.

    Returns:
        list: The concatenated feature values (the notebook obtains 153
        values per file with all three families enabled).
    """
    # Load at the requested rate. Previously the file was always loaded at
    # librosa's default rate while the feature functions were handed `sr`,
    # so any non-default `sr` described the samples incorrectly.
    audio_data, loaded_sr = librosa.load(file_path, sr=sr)

    features = []
    if mfcc:
        mfcc_means = np.mean(
            librosa.feature.mfcc(y=audio_data, sr=loaded_sr, n_mfcc=13), axis=1
        )
        features.extend(mfcc_means)
    if chroma:
        # Separate local name so the boolean flag is not overwritten.
        chroma_means = np.mean(
            librosa.feature.chroma_stft(y=audio_data, sr=loaded_sr), axis=1
        )
        features.extend(chroma_means)
    if mel:
        mel_means = np.mean(
            librosa.feature.melspectrogram(y=audio_data, sr=loaded_sr), axis=1
        )
        features.extend(mel_means)
    return features

In [29]:
def load_data(file_paths, sr):
    """Extract features and speaker labels for a list of audio files.

    Args:
        file_paths: Iterable of audio file paths. Each file name is expected
            to look like "<speaker_id>_<other_info>.wav".
        sr: Sampling rate forwarded to extract_features.

    Returns:
        tuple: (X, y) where X is a NumPy array with one feature vector per
        file and y is a NumPy array of the matching speaker-id strings.
    """
    import os  # local import keeps this cell independent of other cells

    X = []
    y = []
    for file_path in file_paths:
        # `sr` must be passed by keyword: the second positional parameter of
        # extract_features is the `mfcc` flag, so a positional `sr` was
        # silently swallowed as a truthy flag.
        X.append(extract_features(file_path, sr=sr))
        # The speaker id is the file-name text before the first underscore;
        # basename also copes with Windows path separators.
        label = os.path.basename(file_path).split("_")[0]
        y.append(label)
    return np.array(X), np.array(y)

In [30]:
import os
def collect_audio_paths(directory):
    """Collect the paths of all .wav files under a directory tree.

    Args:
        directory: Root folder to walk recursively.

    Returns:
        list: Paths (root joined with file name) of every file whose name
        ends in ".wav", compared case-insensitively, in os.walk order.
    """
    audio_paths = []
    for root, _dirs, files in os.walk(directory):
        for file in files:
            # Skip anything that is not a .wav file (hidden files, notes,
            # checkpoints) so it never reaches the audio loader.
            if file.lower().endswith(".wav"):
                audio_paths.append(os.path.join(root, file))
    return audio_paths


In [33]:
# Directory containing the training .wav files
audio_directory = "audio_files/"
# Sampling rate handed to feature extraction
sr = 22050

# Index i of the model's softmax output is decoded later as class_names[i].
class_names = ["cyhh", "haoyu", "stranger"]

# Collect the audio paths and turn each file into (feature vector, speaker label)
file_paths = collect_audio_paths(audio_directory)
X, y = load_data(file_paths, sr)

# Encode the string labels as integers
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)

# The integer codes follow label_encoder.classes_; if that order differs from
# class_names, every decoded prediction downstream would name the wrong speaker.
if list(label_encoder.classes_) != class_names:
    raise ValueError(
        f"class_names {class_names} does not match the encoded label order "
        f"{list(label_encoder.classes_)}"
    )

print("Shape of X:", X.shape)
print("Shape of y_encoded:", y_encoded.shape)

# Add a trailing channel axis for the Conv1D model. The feature length is read
# from the data instead of being hard-coded (it is 153 in the recorded run).
shape_x_0 = X.shape[0]
X = X.reshape((shape_x_0, X.shape[1], 1))

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(
    X, y_encoded, test_size=0.2, random_state=42
)
print(X_train.shape)
print(y_train.shape)

# Number of feature steps per sample; used to build the model's input shape
input_shape = X_train.shape[1]
print(input_shape)

print(y_test)

Shape of X: (161, 153)
Shape of y_encoded: (161,)
(128, 153, 1)
(128,)
153
[2 2 2 1 1 0 2 1 1 2 0 1 0 1 0 0 2 2 1 0 1 0 0 0 0 2 1 1 2 2 2 1 1]


In [34]:
def residual_block(x, filters, conv_num=3, activation="relu"):
    """Apply one 1-D residual stage to a Keras tensor.

    The main path runs ``conv_num - 1`` (Conv1D kernel 3 -> activation) pairs
    and one final Conv1D with kernel 3. It is added to a kernel-1 Conv1D
    projection of the input, then activated and max-pooled (pool size 2,
    stride 2, "same" padding).

    Args:
        x: Input tensor of the stage.
        filters: Number of filters used by every convolution in the stage.
        conv_num: Total number of kernel-3 convolutions on the main path.
        activation: Activation applied after the convolutions and the add.

    Returns:
        The pooled output tensor of the stage.
    """
    layers = keras.layers

    # Skip path: kernel-1 projection so both branches have `filters` channels.
    shortcut = layers.Conv1D(filters, 1, padding="same")(x)

    # Main path: all but the last convolution are followed by an activation.
    branch = x
    for _ in range(conv_num - 1):
        branch = layers.Conv1D(filters, 3, padding="same")(branch)
        branch = layers.Activation(activation)(branch)
    branch = layers.Conv1D(filters, 3, padding="same")(branch)

    merged = layers.Add()([branch, shortcut])
    activated = layers.Activation(activation)(merged)
    return layers.MaxPool1D(pool_size=2, strides=2, padding='same')(activated)


def build_model(input_shape, num_classes):
    """Assemble the residual 1-D CNN classifier.

    Five residual stages are followed by average pooling (pool size 3,
    stride 3), flattening, two ReLU dense layers (256 and 128 units) and a
    softmax output layer with ``num_classes`` units.

    Args:
        input_shape: Shape of one input sample, passed to keras.layers.Input.
        num_classes: Number of units in the softmax output layer.

    Returns:
        An uncompiled keras Model mapping the "input" layer to the "output" layer.
    """
    inputs = keras.layers.Input(shape=input_shape, name="input")

    # (filters, conv_num) of each residual stage, applied in this order.
    stage_specs = ((16, 2), (32, 2), (64, 3), (128, 3), (128, 3))
    x = inputs
    for filters, conv_num in stage_specs:
        x = residual_block(x, filters, conv_num)

    x = keras.layers.AveragePooling1D(pool_size=3, strides=3)(x)
    x = keras.layers.Flatten()(x)

    # Fully connected head
    for units in (256, 128):
        x = keras.layers.Dense(units, activation="relu")(x)

    outputs = keras.layers.Dense(num_classes, activation="softmax", name="output")(x)

    return keras.models.Model(inputs=inputs, outputs=outputs)

In [35]:
# Build the network for inputs of shape (input_shape, 1), with one output
# unit per entry of class_names, and print its layer summary.
model = build_model((input_shape, 1), len(class_names))
model.summary()

# Labels are integer-encoded, hence the sparse categorical cross-entropy loss.
model.compile(
    optimizer="adam",
    loss="sparse_categorical_crossentropy",
    metrics=["accuracy"],
)

Model: "model_1"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input (InputLayer)              [(None, 153, 1)]     0                                            
__________________________________________________________________________________________________
conv1d_19 (Conv1D)              (None, 153, 16)      64          input[0][0]                      
__________________________________________________________________________________________________
activation_13 (Activation)      (None, 153, 16)      0           conv1d_19[0][0]                  
__________________________________________________________________________________________________
conv1d_20 (Conv1D)              (None, 153, 16)      784         activation_13[0][0]              
____________________________________________________________________________________________

In [36]:
# Fit on the training split only; no validation data is passed during training.
model.fit(x=X_train, y=y_train, batch_size=4, epochs=50)

# Score the trained model on the held-out split.
test_metrics = model.evaluate(X_test, y_test)
loss, accuracy = test_metrics
print(f"Test Accuracy: {accuracy}")



Train on 128 samples
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50
Test Accuracy: 1.0


In [60]:
# Extract the feature vector of a single clip from the test folder
test_feature = extract_features(
    "test_audio/cyhh_audio_1.wav", mfcc=True, chroma=True, mel=True, sr=22050
)
print(len(test_feature))

# Shape it as a batch of one sample: (1, 153 feature steps, 1 channel)
test_feature = np.array(test_feature).reshape(1, 153, 1)

# Run the classifier and show the raw class probabilities
predictions = model.predict(test_feature)
print(predictions)

# The predicted class is the index with the highest probability
predicted_label = np.argmax(predictions)
print("Predicted label:", class_names[predicted_label])

153
[[9.9999905e-01 7.0940030e-07 2.1518402e-07]]
Predicted label: cyhh


In [54]:
########## calculate the success rate #####################
import os
import librosa  # For audio processing
import numpy as np
from sklearn.metrics import accuracy_score

# The `model` trained in the cells above is used as-is; load a saved model
# here instead if this cell is run in a fresh session.

folder_path = "test_audio"  # Folder with the held-out test clips
true_labels = []
predicted_labels = []

# Classify every .wav file in the folder, one file at a time
wav_names = [name for name in os.listdir(folder_path) if name.endswith(".wav")]
for filename in wav_names:
    # The true speaker id is the file-name text before the first underscore
    label = filename.split("_")[0]
    file_path = os.path.join(folder_path, filename)

    # Same preprocessing as for training, shaped as a batch of one sample
    test_feature = extract_features(file_path, mfcc=True, chroma=True, mel=True, sr=22050)
    test_feature = np.array(test_feature).reshape(1, 153, 1)

    # Decode the most probable class index into its speaker name
    predictions = model.predict(test_feature)
    predicted_label = class_names[np.argmax(predictions)]

    true_labels.append(label)
    predicted_labels.append(predicted_label)

# Tally the results once all files have been classified
total_count = len(true_labels)
success_count = sum(
    1 for truth, guess in zip(true_labels, predicted_labels) if truth == guess
)
accuracy = accuracy_score(true_labels, predicted_labels)

print(f"Total samples: {total_count}")
print(f"Success count: {success_count}")
print(f"Accuracy: {accuracy:.2f}")


Total samples: 42
Success count: 38
Accuracy: 0.90


In [55]:
# Persist the trained network so it can be reloaded or converted later
model.save(filepath="speaker_recognition_model")

# Persist the class labels learned by the encoder alongside the model, so
# integer predictions can be mapped back to speaker names
np.save(file="label_encoder.npy", arr=label_encoder.classes_)

INFO:tensorflow:Assets written to: speaker_recognition_model\assets


In [56]:
# TODO list, item 1: dynamic range quantization.
# Convert the saved model to TFLite with the default optimization set enabled.
converter = tf.lite.TFLiteConverter.from_saved_model("speaker_recognition_model")
converter.optimizations = [tf.lite.Optimize.DEFAULT]
tflite_dy_range_quant_model = converter.convert()

# Write the converted model bytes to disk.
with open('dynamicQuant_speaker_recogniztion.tflite', mode='wb') as tflite_file:
    tflite_file.write(tflite_dy_range_quant_model)