In [1]:
import scipy.io
import numpy as np
import os
import glob
import pandas as pd
import cv2
import mediapipe as mp
from tqdm import tqdm
from sklearn.model_selection import train_test_split
from sklearn.multioutput import MultiOutputRegressor
from sklearn.svm import SVR
from sklearn.metrics import mean_squared_error, r2_score
import pickle

In [2]:
# Map angles to [-π, π] )
def map_to_pi(angle):
    return (angle + np.pi) % (2 * np.pi) - np.pi

In [3]:
# Preprocess landmarks
def preprocess(face, width=450, height=450):
    x_val = [lm.x * width for lm in face.landmark]
    y_val = [lm.y * height for lm in face.landmark]
    x_val = np.array(x_val) - np.mean(x_val[1])  # Center around nose
    y_val = np.array(y_val) - np.mean(y_val[1])
    x_val = x_val / x_val.max() if x_val.max() != 0 else x_val
    y_val = y_val / y_val.max() if y_val.max() != 0 else y_val
    return np.concatenate([x_val, y_val])

In [4]:
# Path to dataset folder containing .mat files and images
folder = "AFLW2000"
# Get all .mat files in the dataset folder
mat_files = sorted(glob.glob(os.path.join(folder, "*.mat")))

# Empty list to store extracted pose information
pose_data = []

# Loop over all .mat files
for filename in mat_files:
    try:
        # Load the .mat file
        mat = scipy.io.loadmat(filename)

        # Usually AFLW2000 has "Pose_Para" or "pt2d"
        if "Pose_Para" not in mat:
            print(f"⚠️ Pose_Para not found in {filename}, skipping...")
            continue

        # Extract pose parameters (yaw, pitch, roll, translation, etc.)
        pose = mat["Pose_Para"]  # shape (1,7) or (7,1)
        pose = pose.reshape(-1)

        # Extract pitch, yaw, roll and normalize to [-pi, pi]
        pitch, yaw, roll = map_to_pi(pose[0]), map_to_pi(pose[1]), map_to_pi(pose[2])


        # Store extracted pose with corresponding image filename
        pose_data.append({
            "mat_file": os.path.basename(filename),
            "image": os.path.basename(filename).replace(".mat", ".jpg"),
            "yaw": yaw,
            "pitch": pitch,
            "roll": roll
        })

    except Exception as e:
        print(f"❌ Error reading {filename}: {e}")

# Convert all extracted pose data into a DataFrame for later use
df_pose = pd.DataFrame(pose_data)

In [5]:
df_pose.head()

Unnamed: 0,mat_file,image,yaw,pitch,roll
0,image00002.mat,image00002.jpg,0.018227,-0.399231,0.085676
1,image00004.mat,image00004.jpg,1.189533,0.470065,0.300959
2,image00006.mat,image00006.jpg,0.881137,-0.18465,-0.236852
3,image00008.mat,image00008.jpg,0.299208,-0.175379,-0.373374
4,image00010.mat,image00010.jpg,1.198004,-0.882169,-1.033374


In [6]:
# Extract Face Landmarks using Mediapipe

mp_face_mesh = mp.solutions.face_mesh

folder_path = "AFLW2000"   # folder containing your .jpg files
image_files = [f for f in os.listdir(folder_path) if f.endswith(".jpg")]

# Empty list to store landmark data
all_data = []

# Use MediaPipe FaceMesh to detect face landmarks
with mp_face_mesh.FaceMesh(
    static_image_mode=True,
    max_num_faces=1,
    refine_landmarks=True,
    min_detection_confidence=0.5
) as face_mesh:

    # Use MediaPipe FaceMesh to detect face landmarks
    for img_file in tqdm(image_files, desc="Processing images"):
        img_path = os.path.join(folder_path, img_file)
        # Load image
        image = cv2.imread(img_path)
        if image is None:
            continue

        h, w, _ = image.shape # Get image dimensions
        rgb_image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB) # Convert BGR to RGB

        # Load image
        results = face_mesh.process(rgb_image)
        if results.multi_face_landmarks:
            # take the first face only
            face_landmarks = results.multi_face_landmarks[0]
            # Preprocess landmarks
            landmarks = preprocess(face_landmarks, image.shape[1], image.shape[0])
            # Append to all_data
            all_data.append([img_file, landmarks])



Processing images: 100%|██████████| 2000/2000 [00:13<00:00, 146.05it/s]


In [7]:
# Save to dataframe
df_landmark = pd.DataFrame(all_data, columns=["image", "marks_values"])

In [8]:
df_landmark.head()

Unnamed: 0,image,marks_values
0,image00002.jpg,"[-0.004888512887593222, 0.0, 0.009465049577641..."
1,image00004.jpg,"[0.09810980194008903, 0.0, 0.09116667846005758..."
2,image00006.jpg,"[0.025413889513647227, 0.0, -0.011194460805470..."
3,image00008.jpg,"[0.1455376506244284, 0.0, 0.07284436427867869,..."
4,image00013.jpg,"[0.07809535117165153, 0.0, 0.01967548868284996..."


In [9]:
# Create the final dataframe
df_final = pd.merge(df_landmark, df_pose, on="image", how="inner")

print("✅ Final DataFrame created with landmarks + pose")
df_final.head()

✅ Final DataFrame created with landmarks + pose


Unnamed: 0,image,marks_values,mat_file,yaw,pitch,roll
0,image00002.jpg,"[-0.004888512887593222, 0.0, 0.009465049577641...",image00002.mat,0.018227,-0.399231,0.085676
1,image00004.jpg,"[0.09810980194008903, 0.0, 0.09116667846005758...",image00004.mat,1.189533,0.470065,0.300959
2,image00006.jpg,"[0.025413889513647227, 0.0, -0.011194460805470...",image00006.mat,0.881137,-0.18465,-0.236852
3,image00008.jpg,"[0.1455376506244284, 0.0, 0.07284436427867869,...",image00008.mat,0.299208,-0.175379,-0.373374
4,image00013.jpg,"[0.07809535117165153, 0.0, 0.01967548868284996...",image00013.mat,0.011965,-0.026812,-0.220662


In [10]:
# Convert the list of face landmark coordinates into a NumPy array
X = np.array(df_final['marks_values'].to_list())
# Extract the head pose angles (pitch, yaw, roll) as the target values
y = np.array(df_final[['pitch', 'yaw', 'roll']])
# Print the shapes
print(f"X shape: {X.shape}, y shape: {y.shape}")

X shape: (1843, 956), y shape: (1843, 3)


In [11]:
# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [12]:
# Train SVR with MultiOutputRegressor
regressor = SVR(kernel='rbf', C=3, gamma=0.005, degree=3, epsilon=0.005)
multi_output_regressor = MultiOutputRegressor(regressor)
multi_output_regressor.fit(X_train, y_train)

In [13]:
# Evaluate
predictions = multi_output_regressor.predict(X_test)
mse = mean_squared_error(y_test, predictions)
med_ape = np.median(np.abs((y_test - predictions) / y_test), axis=None) * 100
med_spe = np.median(((y_test - predictions) ** 2 / y_test), axis=None) * 100
r2 = r2_score(y_test, predictions)

print(f'Mean Squared Error: {mse:.4f}')
print(f'Median Absolute Percentage Error: {med_ape:.2f}%')
print(f'Median Squared Percentage Error: {med_spe:.2f}%')
print(f'R² Score: {r2:.4f}')

Mean Squared Error: 0.0194
Median Absolute Percentage Error: 21.99%
Median Squared Percentage Error: -0.05%
R² Score: 0.8031


In [14]:
# Save model
with open('svr_model.pkl', 'wb') as f:
    pickle.dump(multi_output_regressor, f)

In [15]:
multi_output_regressor.score(X_train, y_train)


0.8545858219520569