In [1]:
# Import required libraries
import os

# Disable Hugging Face symlinks warning.
# This has to happen before `transformers` is imported: `huggingface_hub`
# reads the flag when it is first imported, so setting it afterwards is too late.
os.environ["HF_HUB_DISABLE_SYMLINKS_WARNING"] = "1"

import cv2
import numpy as np
import torch
from PIL import Image
from torchvision import transforms
from transformers import ViTForImageClassification

# Build the ViT architecture with a 7-way classification head.
# The ImageNet checkpoint ships a 1000-class head, so `ignore_mismatched_sizes`
# lets the head be re-created with the right shape. The "newly initialized"
# warning this prints is expected: the fine-tuned state dict loaded below
# replaces the weights (load_state_dict is strict by default).
model = ViTForImageClassification.from_pretrained(
    "google/vit-base-patch16-224",  # Pre-trained ViT model
    num_labels=7,  # FER2013 has 7 emotion classes
    ignore_mismatched_sizes=True,  # Ignore size mismatch for classification head
)

# Load the fine-tuned weights onto the CPU first so the file can be read on
# machines without a GPU. `weights_only=True` restricts unpickling to tensors
# and plain containers, so a tampered checkpoint cannot execute arbitrary code.
state_dict = torch.load(
    "vit_fer2013 (1).pth",
    map_location=torch.device("cpu"),
    weights_only=True,
)
model.load_state_dict(state_dict)
model.eval()  # Set the model to evaluation mode

# Move the model to GPU if available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Class labels (FER2013 emotions), indexed by the model's output position.
# NOTE(review): this is alphabetical order, which presumably matches the label
# order used during fine-tuning -- confirm against the training code.
class_labels = ["Angry", "Disgust", "Fear", "Happy", "Neutral", "Sad", "Surprise"]

# Define image transformations applied to every face crop before inference
transform = transforms.Compose([
    transforms.Resize((224, 224)),  # Resize images to 224x224 (ViT input size)
    transforms.Grayscale(num_output_channels=3),  # Grayscale replicated to 3 channels
    transforms.ToTensor(),  # Convert images to PyTorch tensors in [0, 1]
    transforms.Normalize(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5]),  # Scale to [-1, 1]
])

# Function to predict emotion from a face image
def predict_emotion(face_image):
    """Classify the facial expression shown in one cropped face.

    Args:
        face_image: Face crop as an OpenCV image array in BGR channel order.

    Returns:
        The entry of ``class_labels`` at the index of the highest logit.
    """
    # OpenCV delivers BGR; PIL and the transform pipeline expect RGB.
    rgb_pixels = cv2.cvtColor(face_image, cv2.COLOR_BGR2RGB)
    pil_face = Image.fromarray(rgb_pixels)

    # Preprocess, add the batch dimension, and place on the model's device.
    batch = transform(pil_face).unsqueeze(0).to(device)

    # Inference only, so no gradient tracking is needed.
    with torch.no_grad():
        logits = model(batch).logits
        best_index = logits.argmax(dim=1)

    return class_labels[best_index.item()]

# Load the Haar Cascade for face detection (before grabbing the camera, so a
# missing cascade file fails fast without leaving the webcam open).
face_cascade = cv2.CascadeClassifier(cv2.data.haarcascades + "haarcascade_frontalface_default.xml")
if face_cascade.empty():
    raise RuntimeError("Could not load haarcascade_frontalface_default.xml for face detection.")

# Initialize the webcam
cap = cv2.VideoCapture(0)  # 0 for default webcam

# Real-time emotion recognition loop.
# The try/finally guarantees the camera and window are released even when the
# kernel is interrupted or inference raises; otherwise the webcam stays locked
# until the kernel is restarted.
try:
    if not cap.isOpened():
        print("Could not open the default webcam (index 0).")

    while True:
        # Capture frame-by-frame
        ret, frame = cap.read()
        if not ret:
            break

        # Convert the frame to grayscale for face detection
        gray = cv2.cvtColor(frame, cv2.COLOR_BGR2GRAY)

        # Detect faces in the frame
        faces = face_cascade.detectMultiScale(gray, scaleFactor=1.1, minNeighbors=5, minSize=(30, 30))

        # Draw on a copy so boxes/labels of one face never end up inside the
        # crop that is fed to the classifier for another, overlapping face.
        annotated = frame.copy()

        # Process each detected face
        for (x, y, w, h) in faces:
            # Extract the face region from the untouched frame
            face = frame[y:y+h, x:x+w]

            # Predict the emotion
            emotion = predict_emotion(face)

            # Draw a rectangle around the face
            cv2.rectangle(annotated, (int(x), int(y)), (int(x + w), int(y + h)), (0, 255, 0), 2)

            # Display the predicted emotion above the box; clamp so the text
            # stays visible when the face touches the top edge of the frame.
            label_y = max(int(y) - 10, 25)
            cv2.putText(annotated, emotion, (int(x), label_y), cv2.FONT_HERSHEY_SIMPLEX, 0.9, (0, 255, 0), 2)

        # Display the frame
        cv2.imshow("Real-Time Facial Expression Recognition", annotated)

        # Break the loop if 'q' is pressed
        if cv2.waitKey(1) & 0xFF == ord('q'):
            break
finally:
    # Release the webcam and close all OpenCV windows
    cap.release()
    cv2.destroyAllWindows()

  from .autonotebook import tqdm as notebook_tqdm
Some weights of ViTForImageClassification were not initialized from the model checkpoint at google/vit-base-patch16-224 and are newly initialized because the shapes did not match:
- classifier.bias: found shape torch.Size([1000]) in the checkpoint and torch.Size([7]) in the model instantiated
- classifier.weight: found shape torch.Size([1000, 768]) in the checkpoint and torch.Size([7, 768]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  model.load_state_dict(torch.load("vit_fer2013 (1).pth", map_location=torch.device('cpu')))
