## Dictionary and Imports

In [627]:
# Dictionary of users and passwords registered
AUTHORIZED_USERS = {
    "Arianna": "falcon",
    "Andre": "tiger"
}

In [628]:
import os
import random
import numpy as np
import librosa
import sounddevice as sd
import torch

from scipy.signal import butter, lfilter
from scipy.spatial.distance import cosine
from speechbrain.inference import SpeakerRecognition
from torchvision import models


## Voice acquisition module
The user speaks into the microphone.
The signal is preprocessed to remove noise and normalized.

Alternatively, one of the audio files already present in the folder can be used.
The signal is processed in the same way.

In [629]:
def preprocess_audio(signal, sr, low_cutoff=100, high_cutoff=3000):
    normalized_signal = librosa.util.normalize(signal)
    nyquist = 0.5 * sr
    low = low_cutoff / nyquist
    high = high_cutoff / nyquist
    b, a = butter(5, [low, high], btype='band')

    return lfilter(b, a, normalized_signal)

# function to register audio in real time
def acquire_audio_new():
    sr = 16000  # sampling rate
    duration = 4
    print("Recording audio...")
    
    audio = sd.rec(int(duration * sr), samplerate=sr, channels=1, dtype='float32')
    sd.wait()  # wait the end of registration

    print("Audio recorded. Preprocessing...")
    processed_audio = preprocess_audio(audio.flatten(), sr)
    print("Audio preprocessed.")
    return processed_audio, sr

# function to acquire audio from "test" folder"
def acquire_audio_from_test(n):
    test_folder = "./dataset/test" 
    audio_files = [f for f in os.listdir(test_folder) if f.endswith('.wav')]

    if not audio_files:
        raise FileNotFoundError("No audio files found in the 'test' folder.")

    if n == 0:  # Se n == 0, random choice
        selected_file = random.choice(audio_files)
    else:  # otherwise, choose n-th file
        if n > len(audio_files):
            raise IndexError(f"The test folder contains only {len(audio_files)} files.")
        audio_files.reverse()
        selected_file = audio_files[n - 1]

    file_path = os.path.join(test_folder, selected_file)
    print(f"Loading audio file: {selected_file}")
    signal, sr = librosa.load(file_path, sr=None)

    print("Preprocessing audio...")
    processed_audio = preprocess_audio(signal, sr)
    print("Audio preprocessed.")
    return processed_audio, sr

## Speaker identification

Generates a unique numerical embedding from voice characteristics to identify the speaker. 

Returns the estimated identity ("Arianna", "Andre", or "Unauthorized").

In [630]:
recognizer = SpeakerRecognition.from_hparams(source="speechbrain/spkrec-ecapa-voxceleb", savedir="./model")

# function to create authorized speaker embeddings
def create_speaker_embedding(audio_paths):
    embeddings = []
    
    for audio_path in audio_paths:
        signal, sr = librosa.load(audio_path, sr=None)
        processed_audio = preprocess_audio(signal, sr)
        embedding = recognizer.encode_batch(torch.tensor([processed_audio]))
        embeddings.append(embedding.flatten().detach().numpy())

    mean_embedding = np.mean(embeddings, axis=0)
    return mean_embedding

# function to generate sentence file paths
def generate_sentence_file_paths(base_name, num_files):
    return [f"dataset/registered/sentences/{base_name}_pw{i}.wav" for i in range(1, num_files + 1)]

INFO:speechbrain.utils.fetching:Fetch hyperparams.yaml: Fetching from HuggingFace Hub 'speechbrain/spkrec-ecapa-voxceleb' if not cached
INFO:speechbrain.utils.fetching:Fetch custom.py: Fetching from HuggingFace Hub 'speechbrain/spkrec-ecapa-voxceleb' if not cached
INFO:speechbrain.utils.fetching:Fetch embedding_model.ckpt: Fetching from HuggingFace Hub 'speechbrain/spkrec-ecapa-voxceleb' if not cached
INFO:speechbrain.utils.fetching:Fetch mean_var_norm_emb.ckpt: Fetching from HuggingFace Hub 'speechbrain/spkrec-ecapa-voxceleb' if not cached
INFO:speechbrain.utils.fetching:Fetch classifier.ckpt: Fetching from HuggingFace Hub 'speechbrain/spkrec-ecapa-voxceleb' if not cached
INFO:speechbrain.utils.fetching:Fetch label_encoder.txt: Fetching from HuggingFace Hub 'speechbrain/spkrec-ecapa-voxceleb' if not cached
INFO:speechbrain.utils.parameter_transfer:Loading pretrained files for: embedding_model, mean_var_norm_emb, classifier, label_encoder
  state_dict = torch.load(path, map_location=de

In [631]:
arianna_sentence_files = generate_sentence_file_paths('arianna', 20)
andre_sentence_files = generate_sentence_file_paths('andre', 20) 

arianna_embedding = create_speaker_embedding(arianna_sentence_files)
print(f"arianna embeddings: {arianna_embedding}")
andre_embedding = create_speaker_embedding(andre_sentence_files)
print(f"andre embeddings: {andre_embedding}")

torch.save({"Arianna": arianna_embedding, "Andre": andre_embedding}, "speakers_embeddings.pth")
print("Embeddings created for authorized people")

arianna embeddings: [ 35.52124     21.893566    22.68876    -21.29689      6.066318
  26.263565     2.7747447    6.102174     0.26386526  24.066471
   2.9939454    2.250876     9.40031      4.4074373   15.027156
  -6.3589296   34.96337     25.187433     6.101095     8.3119545
  25.3614      27.620804    17.98833     18.622684    18.084957
 -14.810003   -32.013073   -11.776002    26.108643   -12.655676
   5.726528   -10.453799    -4.306484   -16.427935   -28.083847
  -7.281644     1.3660741   21.933296    -0.06710432  38.876648
   7.875436     6.077498    21.915012   -20.998077    24.029057
   0.53644764 -19.099873     3.9147706   22.783436    -8.539175
  19.578297    -7.700962   -18.176714    25.095062    -7.23887
   2.9536932  -32.201027     0.8318567  -10.680226    13.25445
  36.97503     -4.4166102  -25.694834    -5.250004    -9.53453
   5.4720173   16.373264   -10.620026     6.1696253   -7.76967
  15.917302    21.576717    -4.0674734  -20.908527     1.2099098
  16.609087   -23.8779

In [632]:
# function to compare test audio with registered speakers
def verify_speaker(processed_audio, speakers_embeddings, threshold=0.7):
    test_embedding = recognizer.encode_batch(torch.tensor([processed_audio])) 
    test_embedding = test_embedding.squeeze()  
    test_embedding = test_embedding / torch.norm(test_embedding)

    scores = {}
    for speaker, reference_embedding in speakers_embeddings.items():
        reference_embedding = reference_embedding / torch.norm(torch.tensor([reference_embedding]))
        similarity_score = 1 - cosine(test_embedding, reference_embedding)
        scores[speaker] = similarity_score

    recognized_speaker = max(scores, key=scores.get)
    if scores[recognized_speaker] >= threshold: 
        return recognized_speaker, scores
    else:
        return "Unauthorized", scores

## Password Verification

The system checks that the person is using their own password and not someone else's. The spoken password is transcribed into text and compared with the registered password of the identified user. To prevent transcription errors, the system also performs a direct audio verification, comparing the spoken sounds with the stored acoustic model of the password.

In [633]:
# function to generate password file paths
def generate_password_file_paths(base_name, num_files):
    return [f"dataset/registered/passwords/{base_name}_pw{i}.wav" for i in range(1, num_files + 1)]

arianna_pw_files = generate_password_file_paths('arianna', 20)
andre_pw_files = generate_password_file_paths('andre', 20) 

arianna_pw_embedding = create_speaker_embedding(arianna_pw_files)
andre_pw_embedding = create_speaker_embedding(andre_pw_files)

torch.save({"Arianna": arianna_pw_embedding, "Andre": andre_pw_embedding}, "password_embeddings.pth")
print("Password embeddings created for authorized people")

Password embeddings created for authorized people


In [634]:
def verify_password(audio_test, user, password_embeddings, threshold=0.6):
    user_embedding = password_embeddings.get(user)
    user_embedding = user_embedding / torch.norm(torch.tensor([user_embedding]))

    # verify through audio comparing
    test_embedding = recognizer.encode_batch(torch.tensor([audio_test]))
    test_embedding = test_embedding.squeeze()  
    test_embedding = test_embedding / torch.norm(test_embedding)
    similarity_score = 1 - cosine(test_embedding, user_embedding)

    audio_match = similarity_score >= threshold
    
    return audio_match

## Decision

If both the speaker and the password are correct, access is granted. Otherwise, access is denied.

In [635]:
def manage_access(audio_test, user, password_embeddings):
    if verify_password(audio_test, user, password_embeddings):
        print("Access allowed!")
    else:
        print("Access denied.")
    pass

## Main

In [636]:
speakers_embeddings = torch.load("speakers_embeddings.pth")
password_embeddings = torch.load("password_embeddings.pth")

# --- Test audio loading
# [processed_audio, sr] = acquire_audio_new()
[processed_audio, sr] = acquire_audio_from_test(8)

Loading audio file: test_andre_falcon2.wav
Preprocessing audio...
Audio preprocessed.


  speakers_embeddings = torch.load("speakers_embeddings.pth")
  password_embeddings = torch.load("password_embeddings.pth")


In [637]:
# --- First step: verify if the test speaker is a registered one
result = verify_speaker(processed_audio, speakers_embeddings)
print(f"Identified: {result}")

if result[0] == "Unauthorized":
    print("The test speaker is not registered among the authorized users...bye.")
    raise SystemExit()
else:
    print("The test speaker is a registered user!")
    # --- Second step (only if the user is registered): verify if the password is correct
    manage_access(processed_audio, result[0], password_embeddings)

Identified: ('Andre', {'Arianna': np.float32(0.7714257), 'Andre': np.float32(0.8707419)})
The test speaker is a registered user!
Access allowed!
