In [1]:
import cv2
import ultralytics
import random
import os

import torch
import numpy as np
from facenet_pytorch import InceptionResnetV1
from PIL import Image

from scipy.spatial.distance import cosine

from ultralytics import YOLO
# ultralytics.checks()

# Load the YOLOv8-face model
model = YOLO(r"D:\FCR\FaceRecognition\yolov11n-face.pt")  # yolov11n-face is a pre-trained small model for face detection

In [2]:
# Load known face embeddings and labels
known_embeddings = np.load("known_face_embeddings.npy")
known_labels = np.load("known_labels.npy")

# Initialize Facenet model for generating embeddings
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
face_recognition_model = InceptionResnetV1(pretrained='vggface2').eval().to(device)

### Generate Face Embeddings for each person

In [3]:
# Constants
THRESHOLD = 0.5
MIN_FACE_SIZE = 32
TARGET_SIZE = (640, 640)

In [4]:
# Base directory containing folders of images for each person
base_dir = r"D:\FCR\FaceRecognition\Dataset"

# Lists to hold embeddings and labels
embeddings = []
labels = []

def preprocess_image(img):
    """Consistent image preprocessing"""
    if isinstance(img, np.ndarray):
        img = Image.fromarray(img)
    img = img.convert('RGB').resize((160, 160))
    img_tensor = torch.tensor(np.array(img)).permute(2, 0, 1).unsqueeze(0).float()
    img_tensor = (img_tensor - 127.5) / 128.0
    return img_tensor.to(device)

def get_embedding(img):
    """Generate normalized embedding for an image"""
    img_tensor = preprocess_image(img)
    with torch.no_grad():
        embedding = face_recognition_model(img_tensor).cpu().numpy().flatten()
        embedding = embedding / np.linalg.norm(embedding)  # L2 normalize
    return embedding

def find_closest_match(embedding, known_embeddings, known_labels, threshold=THRESHOLD):
    """Find the closest matching face using cosine similarity"""
    if len(known_embeddings) == 0:
        return "Unknown", 0.0
    
    # Calculate similarities
    similarities = [1 - cosine(embedding, known_emb) for known_emb in known_embeddings]
    max_similarity = max(similarities)
    max_index = np.argmax(similarities)
    
    # Debug print
    print(f"Max similarity: {max_similarity:.3f}, Threshold: {threshold}")
    print(f"Similarities: {list(zip(known_labels, similarities))}")
    
    # Stricter threshold check
    if max_similarity > threshold:
        return known_labels[max_index], max_similarity
    else:
        return "Unknown", max_similarity


In [5]:
# # Training phase: Generate embeddings for known faces
# print("Starting training phase...")
# for person_name in os.listdir(base_dir):
#     person_dir = os.path.join(base_dir, person_name)
#     if os.path.isdir(person_dir):
#         person_embeddings = []
#         print(f"Processing images for {person_name}...")

#         for image_name in os.listdir(person_dir):
#             image_path = os.path.join(person_dir, image_name)
#             try:
#                 img = Image.open(image_path)
#                 embedding = get_embedding(img)
#                 person_embeddings.append(embedding)
#             except Exception as e:
#                 print(f"Error processing {image_path}: {e}")

#         if person_embeddings:
#             mean_embedding = np.mean(person_embeddings, axis=0)
#             mean_embedding = mean_embedding / np.linalg.norm(mean_embedding)  # L2 normalize
#             embeddings.append(mean_embedding)
#             labels.append(person_name)

# # Save embeddings and labels
# np.save("known_face_embeddings.npy", embeddings)
# np.save("known_labels.npy", labels)
# print("Training phase completed. Embeddings saved.")

In [None]:
# Load known face embeddings and labels
known_embeddings = np.load("known_face_embeddings.npy")
known_labels = np.load("known_labels.npy")

# Real-time recognition phase
print("Starting real-time recognition...")
cap = cv2.VideoCapture(r"C:\Users\arauf\Pictures\Camera Roll\test_vid.mp4")

if not cap.isOpened():
    print("Error: Could not open video stream.")
    exit()

while True:
    try:
        ret, frame = cap.read()
        if not ret:
            print("Error: Failed to capture image.")
            break

        results = model(frame)

        for result in results:
            boxes = result.boxes

            for box in boxes:
                x1, y1, x2, y2 = box.xyxy[0].cpu().numpy().astype(int)
                confidence = box.conf[0].cpu().numpy()

                face = frame[y1:y2, x1:x2]
                
                if face.shape[0] < MIN_FACE_SIZE or face.shape[1] < MIN_FACE_SIZE:
                    face = cv2.resize(face, (MIN_FACE_SIZE, MIN_FACE_SIZE))

                embedding = get_embedding(face)
                label, similarity = find_closest_match(embedding, known_embeddings, known_labels)

                # Draw bounding box and label with both detection and recognition confidence
                cv2.rectangle(frame, (x1, y1), (x2, y2), (255, 0, 0), 2)
                cv2.putText(frame, 
                           f"{label} (D:{confidence:.2f} R:{similarity:.2f})", 
                           (x1, y1 - 10), 
                           cv2.FONT_HERSHEY_SIMPLEX, 
                           0.6, 
                           (255, 0, 0), 
                           2)

        cv2.imshow("Face Detection & Recognition", frame)

    except Exception as e:
        print(f"Error in main loop: {e}")

    if cv2.waitKey(1) & 0xFF == ord('q'):
        break

cap.release()
cv2.destroyAllWindows()

Starting real-time recognition...

0: 480x640 3 faces, 124.5ms
Speed: 0.0ms preprocess, 124.5ms inference, 4.0ms postprocess per image at shape (1, 3, 480, 640)
Max similarity: 0.804, Threshold: 0.5
Similarities: [('Abdul', 0.20121999375701027), ('Adeen', 0.8042129990927592), ('Fazal', 0.4177745927706611)]
Max similarity: 0.498, Threshold: 0.5
Similarities: [('Abdul', 0.2611704468727112), ('Adeen', 0.333141565322876), ('Fazal', 0.49769172072410583)]
Max similarity: 0.277, Threshold: 0.5
Similarities: [('Abdul', -0.1569363921880722), ('Adeen', 0.27654850482940674), ('Fazal', 0.10512511432170868)]

0: 480x640 3 faces, 1.3ms
Speed: 2.0ms preprocess, 1.3ms inference, 14.1ms postprocess per image at shape (1, 3, 480, 640)
Max similarity: 0.808, Threshold: 0.5
Similarities: [('Abdul', 0.20687712986009577), ('Adeen', 0.8075610513948782), ('Fazal', 0.42226073609228587)]
Max similarity: 0.271, Threshold: 0.5
Similarities: [('Abdul', 0.153857221969532), ('Adeen', 0.27110414081191225), ('Fazal', 