Install necessary packages

In [2]:
!pip install tensorflow numpy opencv-python Pillow ultralytics torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu117

Looking in indexes: https://download.pytorch.org/whl/cu117


Import them

In [3]:
import tensorflow as tf
import torch
from ultralytics import YOLO
import numpy as np
import cv2
from PIL import Image
import glob
import os
import pickle
from collections import deque
import pandas as pd
from datetime import datetime

In [4]:
torch.cuda.is_available()

print("Is CUDA available:", torch.cuda.is_available())
print("PyTorch version:", torch.__version__)

Is CUDA available: True
PyTorch version: 2.5.1+cu124


Create features/embeddings from the dataset

In [None]:
model_path = './20180402-114759.pb'
dataset_dir = './dataset'

# Load the FaceNet model
print("Loading FaceNet model...")
with tf.io.gfile.GFile(model_path, "rb") as f:
    graph_def = tf.compat.v1.GraphDef() 
    graph_def.ParseFromString(f.read())

with tf.Graph().as_default() as graph:
    tf.import_graph_def(graph_def, name="")
print("Model loaded successfully.")

# Get input and output tensors
input_tensor = graph.get_tensor_by_name('input:0') 
output_tensor = graph.get_tensor_by_name('embeddings:0')

# Function to preprocess the image and generate embeddings
def generate_embedding(image_path):
    # Open the image and convert it to RGB format (3 channels)
    image = Image.open(image_path).convert('RGB').resize((160, 160))
    image = np.array(image) / 255.0 
    image = np.expand_dims(image, axis=0)  # Add batch dimension
    with tf.compat.v1.Session(graph=graph) as sess:
        embedding = sess.run(output_tensor, feed_dict={
            input_tensor: image,
            graph.get_tensor_by_name('phase_train:0'): False
        })

    # Normalize the embedding
    normalized_embedding = embedding[0] / np.linalg.norm(embedding[0])
    return normalized_embedding

# Dictionary to store embeddings with labels
embeddings_dict = {}

# Process each image in the dataset directory
print("Starting embedding generation for each image...")
for image_path in glob.glob(os.path.join(dataset_dir, '*.png')):  #image format
    label = os.path.basename(image_path).split('.')[0]  # Get label from file name
    print(f"Processing image: {label}")
    embedding = generate_embedding(image_path)
    # print(embedding.shape[-1])
    embeddings_dict[label] = embedding
    print(f"Embedding generated for {label}: {embedding[:5]}...") 
# Save embeddings dictionary to a file for easy mapping
with open('embeddings.pkl', 'wb') as f:
    pickle.dump(embeddings_dict, f)
print("All embeddings generated and saved successfully in 'embeddings.pkl'.")

Loading FaceNet model...

Model loaded successfully.
Starting embedding generation for each image...
Processing image: 101
Embedding generated for 101: [   0.023957    0.015615   -0.033094   -0.012375   -0.038895]...
Processing image: 102
Embedding generated for 102: [   0.037405   -0.044949   -0.037431    0.010111   -0.031994]...
Processing image: 103
Embedding generated for 103: [   0.049462    0.050944   0.0021693   -0.065702   -0.087857]...
Processing image: 104
Embedding generated for 104: [  -0.028368    0.034861   -0.033094    0.024209   -0.052906]...
Processing image: 106
Embedding generated for 106: [  -0.029574    0.025538   -0.016259   -0.045912   -0.061736]...
Processing image: 107
Embedding generated for 107: [   0.036722   0.0024776   -0.012381   -0.053597    -0.10775]...
Processing image: 108
Embedding generated for 108: [ -0.0037316   -0.027756   -0.050823   -0.043343    0.046011]...
Processing image: 112
Embedding generated for 112: [   0.025297    0.005112  -0.0024553

Test over static image recognition

Integrate Yolo for Dynamic detection with FaceNet

In [5]:
# FOR OVERWRITING

# Load FaceNet model
model_path = './20180402-114759.pb'

print("Loading FaceNet model...")
with tf.io.gfile.GFile(model_path, "rb") as f:
    graph_def = tf.compat.v1.GraphDef()
    graph_def.ParseFromString(f.read())

with tf.Graph().as_default() as graph:
    tf.import_graph_def(graph_def, name="")

input_tensor = graph.get_tensor_by_name('input:0')
output_tensor = graph.get_tensor_by_name('embeddings:0')

# Set up device for GPU in PyTorch
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# Load embeddings from the file
with open('embeddings.pkl', 'rb') as f:
    embeddings_dict = pickle.load(f)

# Initialize YOLO model and move it to GPU
yolo_model = YOLO('yolo11m.pt')
yolo_model = yolo_model.to(device)

# Set up TensorFlow session to use GPU
gpu_options = tf.compat.v1.GPUOptions(allow_growth=True)
sess = tf.compat.v1.Session(graph=graph, config=tf.compat.v1.ConfigProto(gpu_options=gpu_options))

# Initialize buffers and logs
identity_buffer = deque(maxlen=10)
detection_logs = []
detection_count = {}

# Load or create the timetable DataFrame
try:
    attendance_df = pd.read_excel("attendance.xlsx", index_col=0)
except FileNotFoundError:
    columns = ["Sunday 1:30-20:00"]
    roll_numbers = [f"{i}" for i in range(101, 173)]
    attendance_df = pd.DataFrame(index=roll_numbers, columns=columns)


cap = cv2.VideoCapture(0)

try:
    while True:
        ret, frame = cap.read()
        if not ret:
            break

        # Convert frame to a PyTorch tensor and move it to the GPU
        frame_tensor = torch.from_numpy(frame).permute(2, 0, 1).float().to(device)

        # Run YOLO detection on the GPU
        results = yolo_model.predict(frame_tensor)
        current_frame_ids = []

        for result in results:
            boxes = result.boxes.xyxy.cpu().numpy()
            confidences = result.boxes.conf.cpu().numpy()
            classes = result.boxes.cls.cpu().numpy()

            for i in range(len(boxes)):
                x1, y1, x2, y2 = map(int, boxes[i])
                conf = confidences[i]
                cls = int(classes[i])

                if cls == 0 and conf > 0.5:
                    face_crop = frame[y1:y2, x1:x2]
                    face_image = Image.fromarray(cv2.cvtColor(face_crop, cv2.COLOR_BGR2RGB)).resize((160, 160))
                    face_array = np.array(face_image) / 255.0
                    face_array = np.expand_dims(face_array, axis=0)

                    with tf.device('/device:GPU:0'):
                        face_embedding = sess.run(output_tensor, feed_dict={
                            input_tensor: face_array,
                            graph.get_tensor_by_name('phase_train:0'): False
                        })[0]

                    min_dist = float("inf")
                    identity = "Unknown"

                    for name, stored_embedding in embeddings_dict.items():
                        dist = np.linalg.norm(stored_embedding - face_embedding)
                        if dist < min_dist:
                            min_dist = dist
                            identity = name

                    if min_dist < 0.975:
                        current_frame_ids.append(identity)
                        identity_buffer.append(identity)
                        stable_identity = str(max(set(identity_buffer), key=identity_buffer.count))


                        if stable_identity != "Unknown":
                            detection_count[stable_identity] = detection_count.get(stable_identity, 0) + 1

                            print("Count: ",detection_count[stable_identity])
                            if detection_count[stable_identity] > 20:
                                print("Insode if Count: ",detection_count[stable_identity])
                                current_time = datetime.now().strftime("%H:%M")
                                time_slots = {
                                    "10:15-12:15": ("10:15", "12:15"),
                                    "1:30-8:00": ("13:30", "20:00")
                                }

                                for slot, (start, end) in time_slots.items():
                                    if (start <= current_time) and (current_time <= end):
                                        print(f"Marking {stable_identity} Present")

                                        attendance_df.index = attendance_df.index.astype(str).str.strip().str.lower()
                                        stable_identity = stable_identity.strip().lower()

                                
                                        # Check and update attendance
                                        if stable_identity in attendance_df.index:
                                            attendance_df.loc[stable_identity, f"Sunday {slot}"] = 'P'
                                        else:
                                            print(f"Stable identity '{stable_identity}' not found in DataFrame index.")
                                        break

                        label = f"{stable_identity}"
                    else:
                        identity_buffer.append("Unknown")
                        label = "Unknown"

                    cv2.rectangle(frame, (x1, y1), (x2, y2), (0, 255, 0), 2)
                    cv2.putText(frame, label, (x1, y1 - 10), cv2.FONT_HERSHEY_SIMPLEX, 0.9, (0, 255, 0), 2)

        cv2.imshow("Face Recognition", frame)
        if cv2.waitKey(1) & 0xFF == ord('q'):
            break

finally:
    cap.release()
    cv2.destroyAllWindows()
    sess.close()

    # Save updated attendance
    current_time = datetime.now().strftime("%H-%M")
    attendance_df.to_excel(f"attendance.xlsx")
    print("Attendance saved to attendance.xlsx")

Loading FaceNet model...

Using device: cuda

0: 480x640 1 person, 68.9ms
Speed: 0.0ms preprocess, 68.9ms inference, 82.8ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 1 person, 1 couch, 1 bed, 19.6ms
Speed: 0.0ms preprocess, 19.6ms inference, 5.1ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 1 person, 1 couch, 1 bed, 24.2ms
Speed: 0.0ms preprocess, 24.2ms inference, 2.0ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 1 person, 1 bed, 24.6ms
Speed: 0.0ms preprocess, 24.6ms inference, 2.7ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 1 person, 1 bed, 19.6ms
Speed: 0.0ms preprocess, 19.6ms inference, 3.3ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 1 person, 1 bed, 24.7ms
Speed: 0.0ms preprocess, 24.7ms inference, 3.1ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 1 person, 1 bed, 20.5ms
Speed: 0.0ms preprocess, 20.5ms inference, 0.0ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 