In [1]:
#檢查data.h5資料結構(資料結構跟資料集的資料夾結構不相同)>>需先通過mpii_face_gaze_dataset.py來解析用到dataloader裡面再傳到模型資料才可以使用
import h5py

# 打開 data.h5 文件用
with h5py.File('./data/mpiifacegaze_preprocessed/data.h5', 'r') as hdf:
    # 列出所有頂層對象 (數據集或組)
    print("HDF5 顶层对象：", list(hdf.keys()))
    
    # 迭代打印每個數據集的細節
    for key in hdf.keys():
        data = hdf[key]
        print(f"{key} 形狀: {data.shape}, 類型: {data.dtype}")

        # 如果數據集較小，可以顯示前幾個樣本
        if isinstance(data, h5py.Dataset):
            print(f"前幾個樣本： {data[:5]}")

HDF5 顶层对象： ['file_name_base', 'gaze_location', 'gaze_pitch', 'gaze_yaw', 'screen_size']
file_name_base 形狀: (37667,), 類型: object
前幾個樣本： [b'p00/day01/0005' b'p00/day01/0030' b'p00/day01/0031' b'p00/day01/0038'
 b'p00/day01/0063']
gaze_location 形狀: (37667, 2), 類型: int64
前幾個樣本： [[ 476  758]
 [1078  693]
 [ 126  598]
 [ 211   69]
 [ 878  752]]
gaze_pitch 形狀: (37667,), 類型: float64
前幾個樣本： [-0.29827398 -0.28884337 -0.24197472 -0.03119828 -0.28038082]
gaze_yaw 形狀: (37667,), 類型: float64
前幾個樣本： [-0.02652254  0.2650824  -0.19636019 -0.13139582  0.16473648]
screen_size 形狀: (37667, 2), 類型: int64
前幾個樣本： [[1280  800]
 [1280  800]
 [1280  800]
 [1280  800]
 [1280  800]]


In [21]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader
from dataset.mpii_face_gaze_dataset import get_dataloaders
import torch.nn.functional as F

# 檢查是否有 GPU
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')


# 定義一個新的神經網絡模型
class MultiInputCNN(nn.Module):
    def __init__(self):
        super(MultiInputCNN, self).__init__()
        
        # 卷積層處理全臉圖像
        self.cnn_face = nn.Sequential(
            nn.Conv2d(in_channels=3, out_channels=16, kernel_size=3, stride=1, padding=1),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=2, stride=2)
        )
        
        # 卷積層處理眼睛圖像
        self.cnn_eye = nn.Sequential(
            nn.Conv2d(in_channels=3, out_channels=16, kernel_size=3, stride=1, padding=1),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=2, stride=2)
        )
        
        # 全連接層
        self.fc_face = nn.Linear(16 * 48 * 48, 256)
        self.fc_eye = nn.Linear(16 * 32 * 48 * 2, 512)  # 眼睛輸出，左右眼拼接
        
        # 最終全連接層
        self.fc_final = nn.Linear(256 + 512, 2)  # 拼接後的大小

    def forward(self, full_face: torch.Tensor, right_eye: torch.Tensor, left_eye: torch.Tensor):
        # 處理全臉圖像

        out_cnn_face = self.cnn_face(full_face)
        print("Shape after CNN face:", out_cnn_face.shape)  # 檢查 CNN 輸出形
        out_cnn_face = out_cnn_face.view(out_cnn_face.size(0), -1)  # 展平
        print("Shape after flattening:", out_cnn_face.shape)  # 檢查展平後的形狀
        out_fc_face = self.fc_face(out_cnn_face).to(device)
        
        # 處理左右眼圖像
        out_cnn_right_eye = self.cnn_eye(right_eye)
        out_cnn_left_eye = self.cnn_eye(left_eye)
        print("Shape after CNN right eye:", out_cnn_right_eye.shape)
        print("Shape after CNN left eye:", out_cnn_left_eye.shape)

        
        # 拼接左右眼的特徵
        out_cnn_eye = torch.cat((out_cnn_right_eye.view(out_cnn_right_eye.size(0), -1),out_cnn_left_eye.view(out_cnn_left_eye.size(0), -1)), dim=1)
        
        out_fc_eye = self.fc_eye(out_cnn_eye)

        # 拼接全臉和眼睛的特徵
        combined_features = torch.cat((out_fc_face, out_fc_eye), dim=1)
        
        # 最終預測
        output = self.fc_final(combined_features)
        
        return output

# 設置超參數和獲取數據加載器（假設已經定義 get_dataloaders 函數）
batch_size = 32
path_to_data = './data/mpiifacegaze_preprocessed'
validate_on_person = 0  # 驗證用的人員 ID
test_on_person = 1      # 測試用的人員 ID

train_loader, valid_loader, test_loader = get_dataloaders(path_to_data, validate_on_person, test_on_person, batch_size)

# 初始化模型、損失函數和優化器
model = MultiInputCNN().to(device)
optimizer = optim.Adam(model.parameters(), lr=0.001)

# 訓練模型
for epoch in range(1):  # 訓練10個epoch
    model.train()  # 設置為訓練模式
    
    for batch in train_loader:
        
        #print("gaze_location",batch['gaze_location'])
        full_face_images = batch['full_face_image'].float().to(device)   # 獲取全臉圖像
        #print("loader_full_face:",full_face_images.shape)
        right_eye_images = batch['right_eye_image'].float().to(device)   # 獲取右眼圖像
        #print("loader_right_eye_images",right_eye_images.shape)
        left_eye_images = batch['left_eye_image'].float().to(device)     # 獲取左眼圖像
        #print("loader_left_eye_images",left_eye_images.shape)

        gaze_pitch = batch['gaze_pitch'].float().to(device)
        #print("loader_gaze_pitch",gaze_pitch.shape)
        gaze_yaw = batch['gaze_yaw'].float().to(device)
        #print("loader_gaze_yaw",gaze_yaw.shape)

        labels = torch.stack([gaze_pitch, gaze_yaw], dim=1).to(device)   #俯仰角轉置變成每個樣本有2個角度
        
        optimizer.zero_grad()                          # 清除梯度
        
        outputs = model(full_face_images, right_eye_images, left_eye_images).to(device)  # 前向傳播

        #print("outputs",outputs.shape)
        
        loss = F.mse_loss(outputs, labels).to(device)              # 計算損失
        
        loss.backward()                                # 後向傳播
        optimizer.step()                               # 更新權重

    print(f'Epoch [{epoch + 1}/10], Loss: {loss.item():.4f}')

# 驗證模型性能（可選）
model.eval()  
correct = 0
total = 0

with torch.no_grad():
    for batch in valid_loader:
         
        full_face_images = batch['full_face_image'].float().to(device)   # 獲取全臉圖像
        right_eye_images = batch['right_eye_image'].float().to(device)   # 獲取右眼圖像
        left_eye_images = batch['left_eye_image'].float().to(device)     # 獲取左眼圖像

        gaze_pitch = batch['gaze_pitch'].float().to(device)
        gaze_yaw = batch['gaze_yaw'].float().to(device)

        labels = torch.stack([gaze_pitch, gaze_yaw], dim=1).to(device)   #俯仰角轉置變成每個樣本有2個角度
        
        outputs = model(full_face_images, right_eye_images, left_eye_images).to(device)
        
        # 計算損失，這裡可以選擇計算均方誤差 (MSE)
        loss = F.mse_loss(outputs, labels)  # 計算損失

        # 這裡可以選擇計算準確率的方式，例如計算預測值與標籤的平均絕對誤差
        # 如果需要計算準確率，可以設置一個閾值，例如：
        threshold = 0.1  # 設定一個容忍範圍
        correct += ((outputs - labels).abs() < threshold).sum().item()

        total += labels.size(0)  # 總樣本數量

print(f'Validation Mean Squared Error: {loss.item():.4f}')
print(f'Validation Accuracy (within threshold): {100 * correct / total:.2f}%')

train on persons [2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14]
valid on person 0
test on person 1
len(dataset_train) 60784
len(dataset_train) 2927
len(dataset_train) 2904
Shape after CNN face: torch.Size([32, 16, 48, 48])
Shape after flattening: torch.Size([32, 36864])
Shape after CNN right eye: torch.Size([32, 16, 32, 48])
Shape after CNN left eye: torch.Size([32, 16, 32, 48])
Shape after CNN face: torch.Size([32, 16, 48, 48])
Shape after flattening: torch.Size([32, 36864])
Shape after CNN right eye: torch.Size([32, 16, 32, 48])
Shape after CNN left eye: torch.Size([32, 16, 32, 48])
Shape after CNN face: torch.Size([32, 16, 48, 48])
Shape after flattening: torch.Size([32, 36864])
Shape after CNN right eye: torch.Size([32, 16, 32, 48])
Shape after CNN left eye: torch.Size([32, 16, 32, 48])
Shape after CNN face: torch.Size([32, 16, 48, 48])
Shape after flattening: torch.Size([32, 36864])
Shape after CNN right eye: torch.Size([32, 16, 32, 48])
Shape after CNN left eye: torch.Size([32, 16,