In [None]:
pip install opencv-python face_recognition sounddevice soundfile librosa numpy

Collecting librosa
  Using cached librosa-0.11.0-py3-none-any.whl.metadata (8.7 kB)
Collecting audioread>=2.1.9 (from librosa)
  Using cached audioread-3.0.1-py3-none-any.whl.metadata (8.4 kB)
Collecting pooch>=1.1 (from librosa)
  Using cached pooch-1.8.2-py3-none-any.whl.metadata (10 kB)
Collecting soxr>=0.3.2 (from librosa)
  Using cached soxr-0.5.0.post1-cp311-cp311-macosx_10_14_x86_64.whl.metadata (5.6 kB)
Using cached librosa-0.11.0-py3-none-any.whl (260 kB)
Using cached audioread-3.0.1-py3-none-any.whl (23 kB)
Using cached pooch-1.8.2-py3-none-any.whl (64 kB)
Using cached soxr-0.5.0.post1-cp311-cp311-macosx_10_14_x86_64.whl (203 kB)
Installing collected packages: soxr, audioread, pooch, librosa
Successfully installed audioread-3.0.1 librosa-0.11.0 pooch-1.8.2 soxr-0.5.0.post1
Note: you may need to restart the kernel to use updated packages.


In [1]:
import cv2
import face_recognition
import sounddevice as sd
import soundfile as sf
import librosa
import numpy as np
import json
import os
import time
import tkinter as tk
from tkinter import simpledialog, messagebox

# JSON registry of enrolled users, relative to the working directory.
# enroll_gui writes it and recognize_gui reads it; each entry maps a user's
# name to {"face": <image path>, "voice": <wav path>}.
DATA_FILE = "enrolled_data.json"

def enroll_gui():
    """Enroll a user from one webcam frame and a 3-second voice sample.

    Asks for a name, shows a live camera preview until SPACE (capture) or
    'q' (cancel) is pressed, then records 3 seconds of mono audio at 16 kHz.
    The frame is saved as ``<name>_face.jpg`` and the audio as
    ``<name>_voice.wav`` in the working directory, and both paths are stored
    under the user's name in ``DATA_FILE``.

    Enrollment is aborted (with an error dialog) when the camera cannot be
    opened or read, when no face is found in the captured frame, or when the
    image cannot be written.

    Returns:
        None. Outcomes are reported through Tk message boxes.
    """
    name = simpledialog.askstring("Enroll", "Enter your name:")
    # A cancelled dialog returns None; treat whitespace-only input the same way.
    name = name.strip() if name else ""
    if not name:
        return

    # The name becomes part of two file names, so replace every character that
    # could act as a path separator or be rejected by the file system.
    file_stem = "".join(ch if ch.isalnum() or ch in " -_" else "_" for ch in name)

    # Capture face
    cap = cv2.VideoCapture(0)
    try:
        if not cap.isOpened():
            messagebox.showerror("Error", "Could not open the camera!")
            return
        time.sleep(2)  # presumably gives the camera time to warm up
        messagebox.showinfo("Face Capture", "Look at the camera. Press SPACE to capture your face.")
        while True:
            ret, frame = cap.read()
            if not ret or frame is None:
                # Without this check cv2.imshow would raise on a None frame.
                messagebox.showerror("Error", "Could not read a frame from the camera!")
                return
            cv2.imshow("Press SPACE to capture", frame)
            key = cv2.waitKey(1) & 0xFF
            if key == ord(' '):
                break
            if key == ord('q'):
                return
    finally:
        # Runs on capture, cancel and error alike, so the device is never leaked.
        cap.release()
        cv2.destroyAllWindows()

    # Reject frames without a detectable face before touching any file:
    # recognize_gui skips enrolled images that yield no face encoding, so such
    # a user could never be matched.
    rgb_frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
    if not face_recognition.face_locations(rgb_frame):
        messagebox.showerror("Error", "No face detected in the captured image!")
        return

    face_file = f"{file_stem}_face.jpg"
    if not cv2.imwrite(face_file, frame):
        # cv2.imwrite signals failure through its return value, not an exception.
        messagebox.showerror("Error", f"Could not save {face_file}!")
        return

    # Record voice
    messagebox.showinfo("Voice Capture", "Speak into the microphone for 3 seconds...")
    fs = 16000
    recording = sd.rec(int(3 * fs), samplerate=fs, channels=1)
    sd.wait()  # block until the recording has finished
    voice_file = f"{file_stem}_voice.wav"
    sf.write(voice_file, recording, fs)

    # Save data
    if os.path.exists(DATA_FILE):
        with open(DATA_FILE, "r") as f:
            data = json.load(f)
    else:
        data = {}

    # Re-enrolling an existing name replaces that user's entry.
    data[name] = {"face": face_file, "voice": voice_file}
    with open(DATA_FILE, "w") as f:
        json.dump(data, f)

    messagebox.showinfo("Enroll Complete", f"User {name} enrolled successfully!")

def recognize_gui(face_threshold=0.6, voice_threshold=150):
    """Identify the person in front of the camera against all enrolled users.

    Captures one webcam frame (SPACE to capture, 'q' to cancel) and a 3-second
    voice sample, then compares them with every entry in ``DATA_FILE``:
    the face by Euclidean distance between face encodings, the voice by
    Euclidean distance between time-averaged MFCC vectors. A user is a
    candidate only when both distances are below their thresholds; among the
    candidates the one with the lowest combined score wins.

    Args:
        face_threshold: Maximum face-encoding distance accepted as a match.
        voice_threshold: Maximum MFCC distance accepted as a match.

    Returns:
        None. The result (or an error) is shown in a Tk message box and one
        ``[INFO]`` line per compared user is printed.
    """
    if not os.path.exists(DATA_FILE):
        messagebox.showerror("Error", "No enrolled data found!")
        return

    with open(DATA_FILE, "r") as f:
        data = json.load(f)

    # Capture test face
    cap = cv2.VideoCapture(0)
    try:
        if not cap.isOpened():
            messagebox.showerror("Error", "Could not open the camera!")
            return
        time.sleep(2)  # presumably gives the camera time to warm up
        messagebox.showinfo("Face Capture", "Look at the camera. Press SPACE to capture your face.")
        while True:
            ret, frame = cap.read()
            if not ret or frame is None:
                # Without this check cv2.imshow would raise on a None frame.
                messagebox.showerror("Error", "Could not read a frame from the camera!")
                return
            cv2.imshow("Press SPACE to capture", frame)
            key = cv2.waitKey(1) & 0xFF
            if key == ord(' '):
                break
            if key == ord('q'):
                return
    finally:
        # Runs on capture, cancel and error alike, so the device is never leaked.
        cap.release()
        cv2.destroyAllWindows()

    test_face_file = "test_face.jpg"
    if not cv2.imwrite(test_face_file, frame):
        # cv2.imwrite signals failure through its return value, not an exception.
        messagebox.showerror("Error", f"Could not save {test_face_file}!")
        return

    # Load test face encoding. Done before the voice recording so the user is
    # not asked to speak when the face capture is unusable anyway.
    test_img = face_recognition.load_image_file(test_face_file)
    test_encs = face_recognition.face_encodings(test_img)
    if len(test_encs) == 0:
        messagebox.showerror("Error", "No face detected in the captured image!")
        return
    test_enc = test_encs[0]

    # Record test voice
    messagebox.showinfo("Voice Capture", "Speak into the microphone for 3 seconds...")
    fs = 16000
    recording = sd.rec(int(3 * fs), samplerate=fs, channels=1)
    sd.wait()  # block until the recording has finished
    test_voice_file = "test_voice.wav"
    sf.write(test_voice_file, recording, fs)

    # Load test voice MFCC (averaged over time into one vector per recording)
    y_test, sr_test = librosa.load(test_voice_file)
    mfcc_test = librosa.feature.mfcc(y=y_test, sr=sr_test).mean(axis=1)

    best_match = None
    best_face_dist = float("inf")
    best_voice_dist = float("inf")
    best_score = float("inf")

    for name, files in data.items():
        # A deleted or moved enrollment file must not abort the whole search.
        if not (os.path.exists(files["face"]) and os.path.exists(files["voice"])):
            print(f"[WARN] Skipping {name}: enrolled face or voice file is missing")
            continue

        enrolled_img = face_recognition.load_image_file(files["face"])
        enrolled_encs = face_recognition.face_encodings(enrolled_img)
        if len(enrolled_encs) == 0:
            continue
        enrolled_enc = enrolled_encs[0]
        face_dist = np.linalg.norm(enrolled_enc - test_enc)

        y_enrolled, sr_enrolled = librosa.load(files["voice"])
        mfcc_enrolled = librosa.feature.mfcc(y=y_enrolled, sr=sr_enrolled).mean(axis=1)
        voice_dist = np.linalg.norm(mfcc_enrolled - mfcc_test)

        print(f"[INFO] Compared with {name}: Face dist={face_dist:.2f}, Voice dist={voice_dist:.2f}")

        if face_dist < face_threshold and voice_dist < voice_threshold:
            # The two distances live on very different scales (the recorded
            # runs show face ~0.3 vs voice ~100), so a plain sum would let the
            # voice alone pick the winner. Scale each by its threshold first.
            score = face_dist / face_threshold + voice_dist / voice_threshold
            if score < best_score:
                best_match = name
                best_face_dist = face_dist
                best_voice_dist = voice_dist
                best_score = score

    if best_match is not None:
        messagebox.showinfo("Recognition Result",
                            f"Recognized as {best_match}\nFace distance: {best_face_dist:.2f}\nVoice distance: {best_voice_dist:.2f}")
    else:
        messagebox.showinfo("Recognition Result", "No match found!")

def on_hover(event):
    """Switch the widget under the pointer to its highlighted colours."""
    highlight = {"bg": "#444", "fg": "#fff"}
    target = event.widget
    target.config(**highlight)

def on_leave(event):
    """Restore the widget's resting colours once the pointer leaves it."""
    resting = {"bg": "#222", "fg": "#ddd"}
    target = event.widget
    target.config(**resting)

def main_gui():
    """Build the main window with its two action buttons and run the Tk loop.

    The window holds a title label plus an "Enroll New User" and a
    "Recognize User" button, wired to enroll_gui and recognize_gui. Both
    buttons share one look and the same hover/leave colour handlers. The call
    blocks inside ``mainloop`` until the window is closed.
    """
    window = tk.Tk()
    window.title("Face + Voice Recognition")
    window.geometry("500x300")
    window.configure(bg="#222")

    heading = tk.Label(
        window,
        text="Face + Voice Recognition",
        font=("Helvetica", 20, "bold"),
        fg="#00BFFF",
        bg="#222",
    )
    heading.pack(pady=30)

    # Options common to both buttons; only the caption and callback differ.
    button_look = dict(
        height=2,
        width=20,
        font=("Helvetica", 14),
        bg="#222",
        fg="#ddd",
        bd=0,
        activebackground="#444",
        activeforeground="#fff",
        cursor="hand2",
    )
    actions = (
        ("Enroll New User", enroll_gui),
        ("Recognize User", recognize_gui),
    )
    for caption, callback in actions:
        button = tk.Button(window, text=caption, command=callback, **button_look)
        button.pack(pady=15)
        button.bind("<Enter>", on_hover)
        button.bind("<Leave>", on_leave)

    window.mainloop()

# Start the GUI only when this code is executed as the main module, not when
# it is imported; main_gui() blocks until the window is closed.
if __name__ == "__main__":
    main_gui()


[INFO] Compared with Anna: Face dist=0.30, Voice dist=106.53
[INFO] Compared with Meghna: Face dist=0.68, Voice dist=185.49
[INFO] Compared with Aldona: Face dist=0.67, Voice dist=96.11
[INFO] Compared with Anna: Face dist=0.66, Voice dist=117.76
[INFO] Compared with Meghna: Face dist=0.56, Voice dist=196.13
[INFO] Compared with Aldona: Face dist=0.36, Voice dist=84.59
