In [4]:
import torch
from facenet_pytorch import MTCNN, InceptionResnetV1
from PIL import Image
import numpy as np

# Set device
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

In [None]:
# Face detector used by the detection/cropping helpers defined below.
mtcnn = MTCNN(
    image_size=160, # Desired output size (pixels) for the cropped face
    margin=0, # No extra margin added around the detected bounding box
    min_face_size=20, # Smallest face size the detector searches for
    thresholds=[0.6, 0.7, 0.7], # Detection thresholds for stages of MTCNN
    factor=0.709, # Scale factor of the image pyramid used during detection
    # NOTE(review): post_process=False returns crops without the library's
    # post-processing step; presumably they must be standardised before being
    # fed to InceptionResnetV1 — confirm against the facenet-pytorch docs.
    post_process=False,
    device=device
)

def detect_and_crop(image_file_path, quality_threshold=0.95):
    """
    Detect faces in an image file and crop those above a confidence threshold.

    Args:
        image_file_path: Path of the image to open (converted to RGB).
        quality_threshold: A detection is kept only when its confidence is
            strictly greater than this value.

    Returns:
        A list of dicts, one per accepted face, with keys 'face_tensor'
        (the crop returned by mtcnn.extract) and 'confidence_score'.
        The list is empty when no face is detected or none passes the threshold.
    """
    img = Image.open(image_file_path).convert('RGB')

    # The detector's per-box confidence doubles as the quality score.
    boxes, probs = mtcnn.detect(img)
    if boxes is None:
        return []

    good_quality_faces = []
    for i, (box, prob) in enumerate(zip(boxes, probs)):
        if not (prob > quality_threshold):
            continue

        # mtcnn.extract is given one box at a time as a (1, 4) array.
        box_array_2d = np.array(box).reshape(1, 4)

        try:
            face_tensor = mtcnn.extract(img, box_array_2d, save_path=None)
        except Exception as e:
            # Best effort: report the failure and carry on with the next box.
            print(f"Error extracting face for box {i} (score: {prob:.4f}): {e}")
            continue

        if face_tensor is not None:
            good_quality_faces.append({
                'face_tensor': face_tensor,
                'confidence_score': prob
            })
    return good_quality_faces

In [54]:
def add_zero(number):
    """Return `number` as a string, left-padded with zeros to three digits.

    Values below 10 get two leading zeros, values below 100 get one, and
    anything else is returned as its plain string form.
    """
    padding = "00" if number < 10 else "0" if number < 100 else ""
    return padding + str(number)

# Collect, for each dog image 1..701, the most confident detection (if any)
# together with the path it came from. A low threshold (0.6) is used here;
# stricter filtering happens in later cells.
faces = []
for i in range(1, 702):
    img_path = f'/Users/carlos/Documents/BlyzAI/face_classifer/data/archive/natural_images/dog/dog_0{add_zero(i)}.jpg'
    ans = detect_and_crop(img_path, 0.6)
    if not ans:
        continue
    # max() returns the first of equally-scored detections, like a stable descending sort.
    best_one = max(ans, key=lambda x: x['confidence_score'])
    faces.append((best_one, img_path))


In [57]:
# Optional per-image listing, kept disabled:
# for face, img_path in faces:
#     if face['confidence_score'] > 0.95:
#         print(f"path {img_path}, confidence score: {face['confidence_score']}")

# Number of images whose best detection scored above 0.95.
sum(1 for f, _ in faces if f['confidence_score'] > 0.95)

21

In [41]:
from PIL import Image, ImageDraw
import numpy as np
# Assuming mtcnn is initialized and imported

def detect_and_crop_and_visualize(image_file_path, quality_threshold=0.95):
    """
    Detect faces, crop the confident ones, and draw them on a copy of the image.

    Args:
        image_file_path: Path of the image to open (converted to RGB).
        quality_threshold: A detection is kept only when its confidence is
            strictly greater than this value.

    Returns:
        A tuple (good_quality_faces, visual_img):
        - good_quality_faces: list of dicts with keys 'face_tensor',
          'confidence_score' and 'bounding_box' (coordinates as a list);
          empty when nothing is detected or accepted.
        - visual_img: a copy of the input image with a red rectangle and the
          score drawn for every accepted detection (an unmodified copy when
          no face is found).
    """
    img = Image.open(image_file_path).convert('RGB')

    # Draw on a copy so the image handed to the detector stays untouched.
    visual_img = img.copy()
    draw = ImageDraw.Draw(visual_img)

    boxes, probs = mtcnn.detect(img)

    if boxes is None:
        print("No face detected in the image.")
        return [], visual_img

    good_quality_faces = []

    for i, (box, prob) in enumerate(zip(boxes, probs)):
        if not (prob > quality_threshold):
            continue

        # --- Draw the bounding box on the visual image ---
        # Box coordinates are [x_min, y_min, x_max, y_max]
        x_min, y_min, x_max, y_max = [int(b) for b in box]

        # Red rectangle, 3 px wide, for accepted faces
        draw.rectangle([x_min, y_min, x_max, y_max], outline=(255, 0, 0), width=3)

        # Confidence label above the box; clamp to the top edge so the label
        # stays visible when the face is closer than 15 px to the image top.
        text = f"{prob:.2f}"
        label_y = max(0, y_min - 15)
        draw.text((x_min, label_y), text, fill=(255, 0, 0))

        # --- Face extraction ---
        # mtcnn.extract is given one box at a time as a (1, 4) array.
        box_array_2d = np.array(box).reshape(1, 4)

        try:
            face_tensor = mtcnn.extract(img, box_array_2d, save_path=None)

            if face_tensor is not None:
                good_quality_faces.append({
                    'face_tensor': face_tensor,
                    'confidence_score': prob,
                    'bounding_box': box.tolist()  # Store the coordinates
                })
        except Exception as e:
            # Best effort: report the failure and carry on with the next box.
            print(f"Error extracting face for box {i} (score: {prob:.4f}): {e}")

    # Return the list of good faces AND the image with boxes drawn on it
    return good_quality_faces, visual_img

In [58]:
# Assuming your variables are defined and libraries are imported
# For every image whose best detection scored above 0.96, re-run detection
# with drawing enabled and open the annotated image in the default viewer.
for face, img_path in faces:
    if not (face['confidence_score'] > 0.96):
        continue
    best_faces, image_with_boxes = detect_and_crop_and_visualize(img_path)
    image_with_boxes.show()


In [5]:
## EMBEDDING EXTRACTION - STAGE 2 ##
# Face encoder: InceptionResnetV1 with the 'vggface2' pre-trained weights,
# switched to eval mode and placed on `device`.
resnet = InceptionResnetV1(pretrained='vggface2')
resnet = resnet.eval().to(device)

def get_face_embedding(face_tensor):
    """Encode one cropped face with the module-level `resnet`.

    Args:
        face_tensor: A single face crop without a batch dimension
            (as produced by Stage 1).

    Returns:
        The encoder output for that face as a NumPy array, with the batch
        dimension removed.
    """
    with torch.no_grad():
        # The encoder works on batches, so wrap the single face in a batch of one.
        single_item_batch = face_tensor.to(device).unsqueeze(0)
        encoded = resnet(single_item_batch)
    # Drop the batch dimension again and hand back a CPU NumPy array.
    return encoded.squeeze(0).cpu().numpy()

In [None]:
resnet = InceptionResnetV1(pretrained='casia-webface').eval()

# Load two face images to be verified (forced to 3-channel RGB so that
# grayscale/RGBA files are handled the same way as in the other cells)
img1 = Image.open('path_to_image1.jpg').convert('RGB')
img2 = Image.open('path_to_image2.jpg').convert('RGB')

# Detect faces and extract embeddings
faces1, _ = mtcnn.detect(img1)
faces2, _ = mtcnn.detect(img2)

if faces1 is not None and faces2 is not None:
    aligned1 = mtcnn(img1)
    aligned2 = mtcnn(img2)

    if aligned1 is None or aligned2 is None:
        print("No face detected in one or both images.")
    else:
        # A single cropped face comes back without a batch dimension; the
        # encoder expects a batch, so add one when it is missing.
        if aligned1.dim() == 3:
            aligned1 = aligned1.unsqueeze(0)
        if aligned2.dim() == 3:
            aligned2 = aligned2.unsqueeze(0)

        # Inference only: no gradients needed.
        with torch.no_grad():
            embeddings1 = resnet(aligned1)
            embeddings2 = resnet(aligned2)

        # Calculate the Euclidean distance between embeddings
        distance = (embeddings1 - embeddings2).norm().item()
        if distance < 1.0:  # You can adjust the threshold for verification
            print("Same person")
        else:
            print("Different persons")

In [None]:
##CLASSIFICATION - STAGE 3##
##

from scipy.spatial.distance import euclidean, cosine

# Assume this is loaded from your Enrollment/Data Service
# Structure: {'Individual Name': numpy_array_centroid_embedding}
# Placeholder: left empty here. When an empty mapping is passed to
# classify_identity(), every query is reported as "Unknown".
KNOWN_FACE_CENTROIDS = {} 

def classify_identity(input_embedding, known_centroids, tolerance_threshold=0.9):
    """
    Nearest-centroid identity lookup using Euclidean distance.

    Args:
        input_embedding: Embedding vector of the face to classify.
        known_centroids: Mapping of identity -> reference embedding.
        tolerance_threshold: Largest distance still accepted as a match.
            Threshold selection is critical and should be validated on data.

    Returns:
        (identity, distance) for the closest centroid when its distance is
        within the threshold, otherwise ("Unknown", distance). With no
        centroids the result is ("Unknown", inf).
    """
    closest_identity = None
    closest_distance = float('inf')

    # Keep the first strictly-closest centroid (smaller distance = more similar).
    for candidate, reference in known_centroids.items():
        candidate_distance = euclidean(input_embedding, reference)
        if candidate_distance < closest_distance:
            closest_identity, closest_distance = candidate, candidate_distance

    accepted = closest_identity is not None and closest_distance <= tolerance_threshold
    return (closest_identity if accepted else "Unknown"), closest_distance

In [6]:
from facenet_pytorch import MTCNN, InceptionResnetV1
from PIL import Image, ImageDraw
import numpy as np
import random

def add_zero(number):
    """Return `number` as a string, zero-padded on the left to three digits.

    Numbers below 10 receive "00", numbers below 100 receive "0"; larger
    numbers are returned unchanged as a string.
    """
    for upper_bound, prefix in ((10, "00"), (100, "0")):
        if number < upper_bound:
            return prefix + str(number)
    return str(number)
# Helper that detects/aligns a face with MTCNN and embeds it with the ResNet encoder.
# It is used below to verify whether two face images show the same person.
def get_embedding(img, mtcnn_model, resnet_model, device):
    """
    Detect/align the face in `img` and return its embedding.

    Args:
        img: Image passed straight to `mtcnn_model`.
        mtcnn_model: Callable returning the aligned face tensor, or None
            when no face is found.
        resnet_model: Encoder applied to the batched face tensor.
        device: Device the face tensor is moved to before encoding.

    Returns:
        The detached encoder output for the batch, or None (after printing
        a warning) when no face was detected.
    """
    face_batch = mtcnn_model(img)

    # No detection: tell the user and let the caller decide what to do.
    if face_batch is None:
        print("Warning: No face detected in image.")
        return None

    # A lone face arrives as a 3-D tensor; promote it to a batch of one.
    if face_batch.dim() == 3:
        face_batch = face_batch.unsqueeze(0)

    face_batch = face_batch.to(device)

    with torch.no_grad():
        return resnet_model(face_batch).detach()

# Detector with default settings, run on the same device as the encoder.
mtcnn = MTCNN(device=device)

# Load pre-trained Inception ResNet model and place it on `device`:
# get_embedding() moves the face tensor to `device`, so the encoder's weights
# must live there too or the forward pass fails on a CUDA machine.
resnet = InceptionResnetV1(pretrained='casia-webface').eval().to(device)

def get_difference(img1, img2, mtcnn, resnet, device):
    """
    Distance between the face embeddings of two images.

    Both images are embedded with get_embedding(); the result is the norm of
    the difference of the two embeddings as a Python float, or None (after
    printing a message) when either image yields no embedding.
    """
    first = get_embedding(img1, mtcnn, resnet, device)
    second = get_embedding(img2, mtcnn, resnet, device)

    if first is None or second is None:
        print("Cannot calculate distance due to missing face detection.")
        return None

    return (first - second).norm().item()


# Path prefix of the "person" images; a zero-padded index and '.jpg' are
# appended below to form the full file name.
PREPATH = '/Users/carlos/Documents/BlyzAI/face_classifer/data/archive/data/natural_images/person/person_0'

# Experiment currently disabled: the loop iterates over an empty list, so the
# body never runs. Swap `[]` for `range(100)` to compare 100 random pairs.
for i in []: #range(100):
    # NOTE(review): the two upper bounds differ (985 vs 986) — looks
    # unintentional; confirm against the number of files on disk.
    # Both draws are independent, so a pair may be the same image twice.
    int1, int2 = random.randint(1, 985), random.randint(1, 986)
    #print(f"Comparing images {int1} and {int2}...")
    path1 = PREPATH + add_zero(int1) + '.jpg'
    path2 = PREPATH + add_zero(int2) + '.jpg'
    img1 = Image.open(path1).convert('RGB')
    img2 = Image.open(path2).convert('RGB')
    distance = get_difference(img1, img2, mtcnn, resnet, device)
    # Only pairs closer than 0.9 are reported; pairs with a missing face
    # (distance is None) are skipped.
    if distance is not None and distance < 0.9:
        print(f"Images {int1} and {int2} are of the SAME person with distance {distance:.4f}")


In [8]:
import cv2
import torch
import numpy as np
from PIL import Image

# --- ASSUMING THESE ARE INITIALIZED ---
# from facenet_pytorch import MTCNN, InceptionResnetV1
# mtcnn = MTCNN(image_size=160, margin=0, device=device)
# resnet = InceptionResnetV1(pretrained='vggface2').eval().to(device)
# device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

def get_single_face_embedding(frame_rgb, mtcnn_model, resnet_model, quality_threshold=0.95):
    """
    Embed the highest-confidence face found in a single frame.

    Args:
        frame_rgb: The frame handed to `mtcnn_model.detect` / `.extract`.
        mtcnn_model: Detector exposing `detect(frame)` -> (boxes, probs) and
            `extract(frame, boxes, save_path=None)` -> face tensor or None.
        resnet_model: Encoder applied to the cropped face (batch of one).
        quality_threshold: A detection is considered only when its
            confidence is strictly greater than this value.

    Returns:
        The encoder output for the best face as a NumPy array with the batch
        dimension removed, or None when no face is found, none passes the
        threshold, or extraction/encoding fails (a RuntimeWarning is issued
        in the failure case).
    """
    import warnings

    boxes, probs = mtcnn_model.detect(frame_rgb)

    if boxes is None:
        return None  # No face found

    # Keep only detections that clear the quality threshold.
    good_detections = [
        (box, prob)
        for box, prob in zip(boxes, probs)
        if prob is not None and prob > quality_threshold
    ]
    if not good_detections:
        return None  # No face meets quality threshold

    # Best face = highest confidence score.
    best_box, _ = max(good_detections, key=lambda det: det[1])

    # Run the encoder where its own weights live instead of relying on a
    # module-level `device`; a mismatch would otherwise fail on every frame.
    try:
        model_device = next(resnet_model.parameters()).device
    except (AttributeError, StopIteration):
        model_device = None  # encoder exposes no parameters: leave the tensor as is

    try:
        # mtcnn.extract is given the single best box as a (1, 4) array.
        box_array_2d = np.array(best_box).reshape(1, 4)
        face_tensor = mtcnn_model.extract(frame_rgb, box_array_2d, save_path=None)

        if face_tensor is None:
            return None

        if model_device is not None:
            face_tensor = face_tensor.to(model_device)

        with torch.no_grad():
            # unsqueeze(0) adds the batch dimension expected by the encoder.
            embedding = resnet_model(face_tensor.unsqueeze(0))

        return embedding.squeeze(0).cpu().numpy()

    except Exception as e:
        # Still best effort (None is returned), but make the failure visible
        # instead of silently looking like "no face".
        warnings.warn(f"Face extraction/embedding failed: {e}", RuntimeWarning)
        return None

In [9]:
from facenet_pytorch import MTCNN, InceptionResnetV1
from PIL import Image, ImageDraw
import numpy as np
import random
def process_video_for_embedding(video_path, frames_to_sample, mtcnn, resnet, quality_threshold=0.95):
    """
    Return face embeddings for uniformly sampled frames of a video.

    Frames are sampled every `total_frames // frames_to_sample` frames
    (at least every frame), starting at frame 0. Because of the integer
    division this can visit slightly more than `frames_to_sample` frames.

    Args:
        video_path: Path of the video file to open with OpenCV.
        frames_to_sample: Desired number of frames to sample; reduced to the
            video length when the video is shorter.
        mtcnn: Detector passed on to get_single_face_embedding().
        resnet: Encoder passed on to get_single_face_embedding().
        quality_threshold: Minimum detection confidence for a face to be used.

    Returns:
        A list with one embedding per sampled frame in which a usable face
        was found, or None when the video cannot be opened, there is nothing
        to sample, or no face was embedded.
    """
    cap = cv2.VideoCapture(video_path)
    all_embeddings = []

    # try/finally guarantees the capture is released on every exit path,
    # including the early returns and any exception raised while processing.
    try:
        if not cap.isOpened():
            print(f"Could not open video: {video_path}")
            return None

        total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))

        if total_frames < frames_to_sample:
            print(f"Video is too short ({total_frames} frames). Sampling all frames.")
            frames_to_sample = total_frames

        # Nothing to sample (also covers backends reporting a non-positive frame count).
        if frames_to_sample <= 0:
            return None

        sampling_interval = max(1, total_frames // frames_to_sample)

        # Seek straight to each sample point instead of decoding every frame.
        for frame_index in range(0, total_frames, sampling_interval):
            cap.set(cv2.CAP_PROP_POS_FRAMES, frame_index)
            ret, frame = cap.read()
            if not ret:
                break

            # OpenCV reads in BGR format, convert to RGB PIL image for facenet-pytorch
            frame_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
            frame_pil = Image.fromarray(frame_rgb)

            # Get embedding for the best face in this frame
            embedding = get_single_face_embedding(frame_pil, mtcnn, resnet, quality_threshold)
            if embedding is not None:
                all_embeddings.append(embedding)
    finally:
        cap.release()

    if not all_embeddings:
        print("No high-quality faces were detected in the sampled frames.")
        return None

    return all_embeddings

In [10]:
import cv2
video_path = '/Users/carlos/Documents/BlyzAI/face_classifer/data/videos-dataset/files/1/3.mp4'
#mtcnn = MTCNN()

# Load pre-trained Inception ResNet model and place it on `device`:
# the embedding helpers move face tensors to `device`, so the encoder's
# weights must live there too or the forward pass fails on a CUDA machine.
resnet = InceptionResnetV1(pretrained='casia-webface').eval().to(device)

# Sample roughly 30 frames from the video and embed the best face of each one
# (default quality threshold).
embeddings_video_3 = process_video_for_embedding(
    video_path,
    30,
    mtcnn,
    resnet
)

In [21]:
from PIL import Image, ImageDraw

# Still image to compare against the faces sampled from the video.
image_file_path = '/Users/carlos/Documents/BlyzAI/face_classifer/data/videos-dataset/files/9/2.jpg'
img2 = Image.open(image_file_path).convert('RGB')
embedding2 = get_embedding(img2, mtcnn, resnet, device)  # batched tensor, or None when no face is found

# Starting value; it is reported unchanged when nothing can be compared or
# when every distance is larger than it.
min_d = 2.2

# embeddings_video_3 is None when the video produced no usable face.
if embedding2 is not None and embeddings_video_3:
    # Stack the per-frame NumPy embeddings into one (N, D) tensor so that all
    # distances are computed in a single vectorised step.
    video_tensor = torch.from_numpy(np.stack(embeddings_video_3)).float().to(device)

    # Broadcasting against the one-row image embedding gives one distance per frame.
    distances = (video_tensor - embedding2).norm(dim=1)

    # .item() yields a plain float, so min_d prints as a number rather than
    # as a one-element array.
    min_d = min(min_d, distances.min().item())

print(f" Min distance is {min_d}")
        

 Min distance is [1.1759666]


In [19]:
!conda install -c conda-forge opencv

^C


In [20]:
!python -m pip install opencv-python-headless

Collecting opencv-python-headless
  Downloading opencv-python-headless-4.12.0.88.tar.gz (95.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m95.4/95.4 MB[0m [31m16.0 MB/s[0m  [33m0:00:06[0mm0:00:01[0m00:01[0m
[?25h  Installing build dependencies ... [?25ldone
[?25h  Getting requirements to build wheel ... [?25ldone
[?25h  Installing backend dependencies ... [?25ldone
[?25h  Preparing metadata (pyproject.toml) ... [?25ldone
[?25hCollecting numpy<2.3.0,>=2 (from opencv-python-headless)
  Using cached numpy-2.2.6-cp310-cp310-macosx_10_9_x86_64.whl.metadata (62 kB)
Using cached numpy-2.2.6-cp310-cp310-macosx_10_9_x86_64.whl (21.2 MB)
Building wheels for collected packages: opencv-python-headless
  Building wheel for opencv-python-headless (pyproject.toml) ... [?25l/^C
[31mERROR: Operation cancelled by user[0m[31m
[0m