In [7]:
import cv2 
from matplotlib import pyplot as plt

In [8]:
import os
import torch
from torch.utils.data import Dataset, DataLoader
from PIL import Image
import torchvision.transforms as transforms

import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.autograd import Variable
import numpy as np
import random
import os
import csv
import cv2
from frame import FLCDataset, visualize_transformed_image


In [9]:
# Network input size. label_transform reads index 0 as the target width and
# index 1 as the target height, so the order here is [width, height].
IMG_INPUT_SIZE = [12,12]

# Image preprocessing pipeline.
transform = transforms.Compose([
    # Resize to the exact (height, width) target. Passing a single int would
    # only match the shorter edge and keep the aspect ratio, and a centre crop
    # would then cut away part of the image; label_transform, however, scales
    # x and y independently over the whole image, so the image must be
    # stretched the same way for pixels and labels to stay aligned.
    transforms.Resize((IMG_INPUT_SIZE[1], IMG_INPUT_SIZE[0])),
    transforms.ToTensor(),  # Convert the PIL image / ndarray to a FloatTensor.
    transforms.Normalize(mean=[0.485, 0.456, 0.406],  # ImageNet mean and std
                         std=[0.229, 0.224, 0.225])
])


def label_transform(label, img_size):
    """Rescale a flat coordinate list from the source image to the network input size.

    Args:
        label: Flat sequence of numbers; even positions are treated as x
            coordinates and odd positions as y coordinates.
        img_size: Size of the source image, unpacked as ``(h, w)``.
            NOTE(review): PIL's ``Image.size`` is ``(w, h)`` - confirm what the
            dataset class actually passes here.

    Returns:
        A new list of the same length with x values multiplied by
        ``IMG_INPUT_SIZE[0] / w`` and y values by ``IMG_INPUT_SIZE[1] / h``.
    """
    target_w, target_h = IMG_INPUT_SIZE[0], IMG_INPUT_SIZE[1]
    src_h, src_w = img_size

    # Index 0 holds the x factor, index 1 the y factor, so `position % 2`
    # selects the right one for each coordinate.
    factors = (target_w / src_w, target_h / src_h)

    return [coord * factors[position % 2] for position, coord in enumerate(label)]



In [10]:
# Root folder that holds <split>.csv and the matching <split>/ image directory.
# Defaults to the original location; set the FACE_LOC_DIR environment variable
# to run the notebook on another machine without editing this cell.
DATA_DIR = os.environ.get("FACE_LOC_DIR", r"C:\Users\lucyc\Desktop\face_loc")
BATCH_SIZE = 32

train_dataset = FLCDataset(os.path.join(DATA_DIR, "train.csv"), os.path.join(DATA_DIR, "train"), transform, label_transform)
val_dataset = FLCDataset(os.path.join(DATA_DIR, "val.csv"), os.path.join(DATA_DIR, "val"), transform, label_transform)
test_dataset = FLCDataset(os.path.join(DATA_DIR, "test.csv"), os.path.join(DATA_DIR, "test"), transform, label_transform)

# Only the training data is shuffled; validation and test keep a fixed order
# so that evaluation runs and inspection cells are reproducible.
train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE, shuffle=False)
test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=False)

In [11]:
# Pull a single batch from the validation loader for inspection. Each batch
# unpacks into three parts: a = image tensor, b = label tensor, c = label types
# (the same order the training loop reads as inputs[0], inputs[1], inputs[2]).
a, b, c = next(iter(val_loader))

In [12]:
# Shape of the image batch taken from the validation loader.
a.shape

torch.Size([32, 3, 12, 12])

In [13]:
# Label tensor of the sampled batch (one row of coordinates per image).
b

tensor([[  0.0000,  -4.7664,   8.4112,  12.0000,   0.0000,   0.0000,   0.0000,
           0.0000,   0.0000,   0.0000],
        [  4.3960,   3.6832,   8.0792,   3.5644,   7.1287,   6.7723,   4.6337,
           9.1485,   7.8416,   8.9109],
        [  2.3784,   3.6757,   6.0541,   3.4595,   4.7568,   5.9459,   2.4865,
           8.3243,   5.6216,   8.3243],
        [ 13.3538,   0.0000,   8.8615,  12.3077,   0.0000,   0.0000,   0.0000,
           0.0000,   0.0000,   0.0000],
        [  2.4375,   3.7500,   6.1875,   4.0312,   3.9375,   6.5625,   2.6250,
           7.5000,   6.2812,   7.6875],
        [  0.0000,  -0.5647,  11.4353,  11.7176,   0.0000,   0.0000,   0.0000,
           0.0000,   0.0000,   0.0000],
        [  4.2000,   5.1600,   8.2800,   5.2800,   5.0400,   8.0400,   5.0400,
          10.2000,   7.6800,  10.6800],
        [  0.0000,   1.2000,  11.1000,  12.0000,   0.0000,   0.0000,   0.0000,
           0.0000,   0.0000,   0.0000],
        [  0.0000,   0.0000,  10.5000,  12.3871,

In [14]:
# Label-type strings of the sampled batch (one per image).
c

('1',
 '3',
 '3',
 '2',
 '3',
 '0',
 '3',
 '0',
 '0',
 '3',
 '2',
 '3',
 '0',
 '0',
 '1',
 '0',
 '1',
 '1',
 '1',
 '1',
 '0',
 '0',
 '1',
 '2',
 '1',
 '0',
 '3',
 '1',
 '0',
 '3',
 '3',
 '1')

In [15]:
def get_args(b):
    """Convert a label into a list of floored integers.

    Args:
        b: Either a whitespace-separated string (or bytes) of numbers, an
            object exposing ``tolist()`` such as a 1-D tensor or array, or any
            iterable of numbers / numeric strings.

    Returns:
        A list of ``int`` where every value has been floored (rounded towards
        negative infinity), e.g. ``-4.7`` becomes ``-5``.
    """
    if isinstance(b, (str, bytes)):
        # Original behaviour: split a text label on whitespace.
        values = b.split()
    elif hasattr(b, "tolist"):
        # Tensor / ndarray rows, such as one row of a batched label tensor.
        values = b.tolist()
    else:
        values = b
    return [int(float(x) // 1) for x in values]

In [16]:
#visualize_transformed_image(a[6],get_args(b[6]),get_args(b[6]))

In [17]:
# Use the first CUDA GPU when one is available, otherwise fall back to the CPU.
# Device type strings are lower-case; "CPU" is rejected by torch.device.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print(device)
# Release cached GPU memory left over from earlier runs (only relevant on CUDA).
if device.type == "cuda":
    torch.cuda.empty_cache()

cuda:0


In [22]:
class PNet(nn.Module):
    """Small convolutional network with three 1x1-convolution output heads.

    For a 12x12 input the spatial size shrinks 12 -> 10 (conv1) -> 5 (max-pool)
    -> 3 (conv2) -> 1 (conv3), so after flattening each head produces one
    vector per image.
    """

    def __init__(self):
        super(PNet, self).__init__()

        # Shared feature extractor.
        self.conv1 = nn.Conv2d(3, 10, 3)  # 12 -> 10, then max-pool -> 5
        self.conv2 = nn.Conv2d(10, 16, 3)  # 5 -> 3
        self.conv3 = nn.Conv2d(16, 32, 3)  # 3 -> 1

        # Output heads; 1x1 convolutions keep the spatial size.
        self.face_det = nn.Conv2d(32, 2, 1)
        self.bbox = nn.Conv2d(32, 10, 1)
        self.landmark = nn.Conv2d(32, 10, 1)

    def forward(self, x):
        """Run the network.

        Args:
            x: Image batch of shape (N, 3, H, W).

        Returns:
            Tuple ``(facedet, bbox, landmark)``; each head's output is
            flattened from dimension 1 onwards, giving (N, 2), (N, 10) and
            (N, 10) for 12x12 inputs.
        """
        x = F.relu(self.conv1(x))  # 10
        x = F.max_pool2d(x, 2)  # 5
        x = F.relu(self.conv2(x))  # 3
        x = F.relu(self.conv3(x))  # 1

        # The heads are returned as raw linear outputs. Applying ReLU here
        # would clamp every prediction to >= 0, making negative coordinate
        # targets unreachable, and would give zero gradient to any head unit
        # whose pre-activation is negative.
        facedet = torch.flatten(self.face_det(x), 1)
        bbox = torch.flatten(self.bbox(x), 1)
        landmark = torch.flatten(self.landmark(x), 1)

        return facedet, bbox, landmark


In [50]:
class PNetLoss(nn.Module):
    """Sum of per-sample squared-error terms, selected by each sample's label type."""

    def __init__(self):
        super(PNetLoss, self).__init__()

    def forward(self, facedet, bbox, landmark, label, ltypes):
        """Compute the summed loss over a batch.

        Args:
            facedet: Face-detection head output; only column 0 is used.
            bbox: Bounding-box head output, compared against ``label`` for
                types "0" and "1".
            landmark: Landmark head output, compared against ``label`` for
                type "3".
            label: Target tensor with one row per sample.
            ltypes: Sequence of label-type strings ("0", "1", "2" or "3"), one
                per sample. Samples with any other type contribute nothing.

        Returns:
            A scalar tensor holding the sum of the per-sample losses.
        """
        # Build the accumulator on the predictions' own device and dtype
        # instead of relying on a notebook-global `device`. It does not need
        # requires_grad: gradients flow through the per-sample terms added to it.
        loss_total = torch.zeros((), device=facedet.device, dtype=facedet.dtype)

        for i, ltype in enumerate(ltypes):
            # Face-detection term.
            # NOTE(review): the target is 1 for every label type, including
            # whatever type marks non-face samples - confirm this is intended.
            facedet_loss = torch.pow(facedet[i][0] - 1, 2)

            if ltype == "2":
                # Type 2: face-detection term only.
                loss_total = loss_total + facedet_loss
            elif ltype in ["1", "0"]:
                # Types 1 and 0: add the bounding-box regression term.
                bbox_loss = F.mse_loss(bbox[i], label[i])
                loss_total = loss_total + (bbox_loss + facedet_loss)
            elif ltype == "3":
                # Type 3: add the landmark regression term.
                landmark_loss = F.mse_loss(landmark[i], label[i])
                loss_total = loss_total + (landmark_loss + facedet_loss)

        return loss_total

In [51]:
# Release cached, currently unused GPU memory held by PyTorch's allocator.
torch.cuda.empty_cache()
# Enable autograd anomaly detection so a failing backward pass reports the
# forward operation that caused it.
# NOTE(review): this is a debugging aid that slows training down; consider
# switching it off once the loss/backward code is stable.
torch.autograd.set_detect_anomaly(True)

<torch.autograd.anomaly_mode.set_detect_anomaly at 0x263db32b090>

In [59]:
model = PNet()
print(model)

model.to(device)  # Move the model to the GPU when one is available.

# Loss function and optimizer.
criterion = PNetLoss()
optimizer = optim.Adam(model.parameters(), lr=0.0001)

# Train the model.
num_epochs = 5

for epoch in range(num_epochs):
    # ---- training pass ----
    model.train()

    train_loss_acc = 0.
    train_num = 0

    for inputs in train_loader:

        img_tensor = inputs[0].to(device)
        label = inputs[1].to(device)
        ltypes = inputs[2]

        optimizer.zero_grad()  # Clear gradients left from the previous step.
        facedet, bbox, landmark = model(img_tensor)
        loss = criterion(facedet, bbox, landmark, label, ltypes)
        loss.backward()  # Back-propagate to compute the current gradients.
        optimizer.step()  # Update the parameters.

        train_loss_acc += loss.item()
        train_num += 1

        print(train_num, loss.item(), train_loss_acc/train_num, train_num*train_loader.batch_size, len(train_dataset))

    # ---- validation pass ----
    # Evaluation only: no zero_grad/backward/step here. Under torch.no_grad()
    # no graph is recorded, so backward() would raise, and stepping the
    # optimizer would train on the validation data.
    model.eval()
    val_loss_acc = 0.
    val_num = 0
    with torch.no_grad():
        for inputs in val_loader:
            img_tensor = inputs[0].to(device)
            label = inputs[1].to(device)
            ltypes = inputs[2]

            facedet, bbox, landmark = model(img_tensor)
            loss = criterion(facedet, bbox, landmark, label, ltypes)

            val_loss_acc += loss.item()
            val_num += 1

            print(val_num, loss.item(), val_loss_acc/val_num, val_num*val_loader.batch_size, len(val_dataset))

    # Guard against empty loaders so the summary never divides by zero.
    train_loss = train_loss_acc / max(train_num, 1)
    val_loss = val_loss_acc / max(val_num, 1)
    print("Epoch [{}/{}], Loss: {:.2f}, Val_loss: {:.2f}".format(epoch+1, num_epochs, train_loss, val_loss))


PNet(
  (conv1): Conv2d(3, 10, kernel_size=(3, 3), stride=(1, 1))
  (conv2): Conv2d(10, 16, kernel_size=(3, 3), stride=(1, 1))
  (conv3): Conv2d(16, 32, kernel_size=(3, 3), stride=(1, 1))
  (face_det): Conv2d(32, 2, kernel_size=(1, 1), stride=(1, 1))
  (bbox): Conv2d(32, 10, kernel_size=(1, 1), stride=(1, 1))
  (landmark): Conv2d(32, 10, kernel_size=(1, 1), stride=(1, 1))
)
1 683.6535034179688 683.6535034179688 32 130000
2 617.8057861328125 650.7296447753906 64 130000
3 684.5799560546875 662.0130818684896 96 130000
4 795.099609375 695.2847137451172 128 130000
5 679.6245727539062 692.152685546875 160 130000
6 625.7022705078125 681.0776163736979 192 130000
7 866.8304443359375 707.6137346540179 224 130000
8 801.1862182617188 719.3102951049805 256 130000
9 600.3334350585938 706.0906439887153 288 130000
10 498.264892578125 685.3080688476563 320 130000
11 738.489013671875 690.1427001953125 352 130000
12 795.678955078125 698.9373881022135 384 130000
13 716.2138671875 700.266348031851 416 1300

KeyboardInterrupt: 