In [1]:
import torch
import torch.nn as nn
from torchvision import transforms
import cv2
import numpy as np
import os
from PIL import Image
from scipy.spatial.distance import cosine
from tqdm import tqdm

In [2]:
# Step 1: define the VGG-Face model
class VGGFaceModel(nn.Module):
    """VGG16 backbone whose classifier head has its last layer removed.

    The network is fetched through ``torch.hub`` (``pytorch/vision:v0.10.0``,
    entry point ``vgg16``, ``pretrained=True``). Dropping the final classifier
    layer makes ``forward`` return the activations of the layer before it,
    which the rest of the notebook uses as an embedding.

    NOTE(review): despite the "VGG-Face" name, the code loads the generic
    ``vgg16`` hub entry point; confirm whether face-specific weights were
    intended.
    """

    def __init__(self):
        super().__init__()
        backbone = torch.hub.load('pytorch/vision:v0.10.0', 'vgg16', pretrained=True)
        # Keep every classifier layer except the last one.
        head_layers = list(backbone.classifier.children())
        backbone.classifier = nn.Sequential(*head_layers[:-1])
        self.model = backbone

    def forward(self, x):
        """Run ``x`` through the truncated network and return its output."""
        return self.model(x)

In [3]:
# Build the embedding network and put it in evaluation mode
# (nn.Module.eval() returns the module itself, so the call can be chained).
vgg_face = VGGFaceModel().eval()
vgg_face  # last expression of the cell: displays the module structure

Using cache found in /Users/adpakw/.cache/torch/hub/pytorch_vision_v0.10.0


VGGFaceModel(
  (model): VGG(
    (features): Sequential(
      (0): Conv2d(3, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
      (1): ReLU(inplace=True)
      (2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
      (3): ReLU(inplace=True)
      (4): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
      (5): Conv2d(64, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
      (6): ReLU(inplace=True)
      (7): Conv2d(128, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
      (8): ReLU(inplace=True)
      (9): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
      (10): Conv2d(128, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
      (11): ReLU(inplace=True)
      (12): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
      (13): ReLU(inplace=True)
      (14): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
      (15): ReLU(inplace=True)
      (

In [4]:
# Image -> tensor pipeline applied to every face crop before it is fed to the
# model: resize to 224x224, convert to a tensor, then normalise each of the
# three channels with the fixed mean/std values below.
preprocess = transforms.Compose(
    [
        transforms.Resize((224, 224)),
        transforms.ToTensor(),
        transforms.Normalize(
            mean=[0.485, 0.456, 0.406],
            std=[0.229, 0.224, 0.225],
        ),
    ]
)

In [5]:
def get_face_embedding(image, model):
    """Compute a flat embedding vector for a single face image.

    Args:
        image: Face crop accepted by the module-level ``preprocess`` pipeline
            (``detect_faces`` supplies PIL images). PIL images that are not
            in RGB mode are converted first, because ``preprocess`` normalises
            exactly three channels.
        model: Callable mapping a ``(1, 3, 224, 224)`` tensor to an output
            tensor. If it exposes ``parameters()``, the input batch is moved
            to the device of its first parameter.

    Returns:
        1-D ``numpy.ndarray`` with the flattened model output.
    """
    # Grayscale / RGBA / palette images would fail in the 3-channel Normalize.
    if isinstance(image, Image.Image) and image.mode != "RGB":
        image = image.convert("RGB")

    batch = preprocess(image).unsqueeze(0)  # add the batch dimension

    # Run on whatever device the model lives on instead of assuming CPU.
    parameters = getattr(model, "parameters", None)
    first_param = next(parameters(), None) if callable(parameters) else None
    if first_param is not None:
        batch = batch.to(first_param.device)

    with torch.no_grad():
        # .cpu() is required before .numpy() when the output is on GPU/MPS;
        # it is a no-op for tensors that are already on the CPU.
        embedding = model(batch).cpu().numpy().flatten()
    return embedding

In [6]:
def _get_face_cascade():
    """Return the frontal-face Haar cascade, loading it from disk only once."""
    cascade = getattr(_get_face_cascade, "_cached", None)
    if cascade is None:
        cascade = cv2.CascadeClassifier(
            cv2.data.haarcascades + 'haarcascade_frontalface_default.xml'
        )
        _get_face_cascade._cached = cascade
    return cascade


def detect_faces(image):
    """Detect frontal faces in a BGR image and return their crops and boxes.

    Args:
        image: BGR image array (as produced by ``cv2.imread`` or
            ``VideoCapture.read``). ``None`` or an empty array yields no
            detections instead of raising inside OpenCV.

    Returns:
        Tuple ``(face_images, face_coords)``:
            * ``face_images`` - list of RGB ``PIL.Image`` crops, one per face.
            * ``face_coords`` - list of ``(x, y, w, h)`` tuples of Python
              ints, in the same order as ``face_images``.
    """
    # cv2.imread returns None for unreadable files; treat that as "no faces".
    if image is None or image.size == 0:
        return [], []

    gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
    # Reuse one classifier across calls: this function runs once per video
    # frame, and re-parsing the cascade XML each time is wasted work.
    faces = _get_face_cascade().detectMultiScale(
        gray, scaleFactor=1.1, minNeighbors=5, minSize=(30, 30)
    )

    face_images = []
    face_coords = []
    for (x, y, w, h) in faces:
        # Plain ints keep downstream drawing/serialisation code free of
        # numpy scalar types.
        x, y, w, h = int(x), int(y), int(w), int(h)
        face = cv2.cvtColor(image[y:y+h, x:x+w], cv2.COLOR_BGR2RGB)
        face_images.append(Image.fromarray(face))
        face_coords.append((x, y, w, h))

    return face_images, face_coords

In [7]:
def load_known_faces(known_faces_dir, model):
    """Build the gallery of reference embeddings from a directory of images.

    Every ``.jpg`` / ``.jpeg`` / ``.png`` file (extension matched
    case-insensitively) directly inside ``known_faces_dir`` is read, the
    first detected face is embedded, and the file name without its extension
    is used as the person's name. Files are visited in sorted order so the
    result is reproducible across runs and platforms.

    Args:
        known_faces_dir: Directory containing one image per known person.
        model: Embedding model forwarded to ``get_face_embedding``.

    Returns:
        Tuple ``(known_face_encodings, known_face_names)`` of two parallel
        lists. Unreadable images and images without a detected face are
        reported on stdout and skipped.

    Raises:
        OSError: If ``known_faces_dir`` cannot be listed.
    """
    known_face_encodings = []
    known_face_names = []

    # os.listdir order is arbitrary; sort it for deterministic output.
    for file_name in sorted(os.listdir(known_faces_dir)):
        # Lower-case the name so files such as "Photo.JPG" are not ignored.
        if not file_name.lower().endswith(('.jpg', '.jpeg', '.png')):
            continue

        image_path = os.path.join(known_faces_dir, file_name)
        image = cv2.imread(image_path)
        # cv2.imread signals failure by returning None rather than raising.
        if image is None:
            print(f"Could not read {file_name}, skipping.")
            continue

        detected_faces, _ = detect_faces(image)
        if not detected_faces:
            print(f"No face detected in {file_name}, skipping.")
            continue

        # Assume the image contains a single face (the first one found).
        face = detected_faces[0]
        embedding = get_face_embedding(face, model)
        known_face_encodings.append(embedding)
        known_face_names.append(os.path.splitext(file_name)[0])

    return known_face_encodings, known_face_names

In [8]:
def recognize_faces(frame, known_face_encodings, known_face_names, model, threshold=0.5):
    """Label every face detected in ``frame`` with its closest known identity.

    Each detected face is embedded and compared against all reference
    embeddings using cosine similarity (``1 - scipy cosine distance``). The
    reference with the highest similarity wins, provided that similarity is
    at least ``threshold`` and greater than 0; otherwise the face is labelled
    ``"Unknown"`` with a score of 0. On ties the earliest reference is kept.

    Args:
        frame: Image passed to ``detect_faces``.
        known_face_encodings: Sequence of reference embeddings.
        known_face_names: Names parallel to ``known_face_encodings``.
        model: Embedding model forwarded to ``get_face_embedding``.
        threshold: Minimum similarity required to accept a match.

    Returns:
        List of ``(name, similarity, (x, y, w, h))`` tuples, one per face.
    """
    crops, boxes = detect_faces(frame)

    matches = []
    for crop, box in zip(crops, boxes):
        left, top, box_w, box_h = box
        query = get_face_embedding(crop, model)

        best_name = "Unknown"
        best_score = 0
        for reference, candidate in zip(known_face_encodings, known_face_names):
            score = 1 - cosine(query, reference)  # cosine similarity
            # Reject anything below the acceptance threshold (NaN included).
            if not (score >= threshold):
                continue
            # Strict comparison: the first of several equal scores is kept.
            if score > best_score:
                best_name = candidate
                best_score = score

        matches.append((best_name, best_score, (left, top, box_w, box_h)))

    return matches

In [9]:
# Step 6: configure input/output locations
known_faces_dir = "../data/known_faces"
# NOTE(review): the "wâ§¸" sequence looks like a mis-decoded UTF-8 character;
# verify that this literal matches the actual file name on disk.
input_video_path = "../data/Late Night Nightmare wâ§¸ Kimmel, Meyers, Colbert, Fallon, Letterman & Noah.mp4"
output_video_path = "../data/output_video.mp4"

# Load the reference embeddings of the known faces
known_face_encodings, known_face_names = load_known_faces(known_faces_dir, vgg_face)

# Open the input video
video_capture = cv2.VideoCapture(input_video_path)
out = None
try:
    # A capture that failed to open would otherwise yield an empty output
    # file and a misleading success message.
    if not video_capture.isOpened():
        raise IOError(f"Could not open input video: {input_video_path}")

    fourcc = cv2.VideoWriter_fourcc(*"mp4v")
    # Keep the frame rate as a float: truncating e.g. 29.97 to 29 changes the
    # playback speed of the written video.
    fps = video_capture.get(cv2.CAP_PROP_FPS)
    width = int(video_capture.get(cv2.CAP_PROP_FRAME_WIDTH))
    height = int(video_capture.get(cv2.CAP_PROP_FRAME_HEIGHT))
    total_frames = int(video_capture.get(cv2.CAP_PROP_FRAME_COUNT))

    out = cv2.VideoWriter(output_video_path, fourcc, fps, (width, height))
    if not out.isOpened():
        raise IOError(f"Could not open output video for writing: {output_video_path}")

    # The reported frame count is only used for the progress bar; the loop
    # itself runs until the capture stops returning frames.
    progress_total = total_frames if total_frames > 0 else None
    with tqdm(total=progress_total, desc="Processing video") as progress:
        while True:
            ret, frame = video_capture.read()
            if not ret:
                break

            # Recognise the faces in the current frame
            results = recognize_faces(frame, known_face_encodings, known_face_names, vgg_face)

            # Draw bounding boxes and labels
            for name, similarity, (x, y, w, h) in results:
                cv2.rectangle(frame, (x, y), (x+w, y+h), (0, 255, 0), 2)
                label = f"{name}: {similarity:.2f}"
                # Clamp the text baseline so labels of faces touching the top
                # edge stay inside the frame.
                label_y = max(y - 10, 15)
                cv2.putText(frame, label, (x, label_y), cv2.FONT_HERSHEY_SIMPLEX, 0.5, (0, 255, 0), 2)

            out.write(frame)
            progress.update(1)
finally:
    # Always release the handles, including on KeyboardInterrupt, so the
    # frames written so far are flushed to the output file.
    video_capture.release()
    if out is not None:
        out.release()

print(f"Output video saved to {output_video_path}")


Processing video:  21%|██▏       | 1925/9037 [03:08<11:36, 10.22it/s]


KeyboardInterrupt: 