In [1]:
# Colab-only: mount Google Drive so the dataset archives and checkpoints under /content/drive/ are reachable.
from google.colab import drive

# force_remount=True remounts even when the drive is already mounted in this session.
drive.mount('/content/drive/', force_remount=True)

Mounted at /content/drive/


In [3]:
import zipfile
import subprocess
import os
# import matplotlib.pyplot as plt
import torch
import torch.nn as nn
import torch.nn.functional as F
from torchvision import datasets, transforms
import torchsummary
import numpy as np
from PIL import Image
from PIL import ImageFile
import json
# import cv2
import time
from concurrent.futures import ThreadPoolExecutor, as_completed
import random

# Tell Pillow to load truncated image files instead of raising an error on them.
ImageFile.LOAD_TRUNCATED_IMAGES = True

In [3]:
def seed_everything(seed=42):
    """
    Fix every random seed used by this notebook to the given value.

    Seeds, in order: Python's `random`, NumPy's global RNG, PyTorch's CPU RNG,
    the current CUDA device and finally all CUDA devices.
    """
    seeders = (
        random.seed,                   # Python random module
        np.random.seed,                # NumPy global RNG
        torch.manual_seed,             # PyTorch RNG
        torch.cuda.manual_seed,        # PyTorch RNG for the current GPU
        torch.cuda.manual_seed_all,    # PyTorch RNG for every GPU
    )
    for apply_seed in seeders:
        apply_seed(seed)

seed_everything()

In [4]:
class CustomDataset(torch.utils.data.Dataset):
    """Dataset of cropped face images whose files live inside zip archives.

    On construction every ``.png`` member of the image archives and every
    ``.json`` member of the label archives is copied into ``cache_dir`` and
    indexed by its lower-cased base name (leading underscores stripped).
    Only samples that have BOTH an image and a label are exposed.

    Args:
        image_zip_paths: list of paths to zip archives holding ``.png`` images.
        label_zip_paths: list of paths to zip archives holding ``.json`` labels.
        cache_dir: directory the archive members are extracted into.
        transform: optional callable applied to the cropped PIL image.
        workers: number of threads used to process archives in parallel.
    """

    def __init__(self, image_zip_paths, label_zip_paths, cache_dir, transform=None, workers=4):
        super().__init__()
        self.image_zip_paths = image_zip_paths
        self.label_zip_paths = label_zip_paths
        self.cache_dir = cache_dir
        self.transform = transform
        self.workers = workers

        os.makedirs(self.cache_dir, exist_ok=True)

        self.image_names = {}   # base name -> cached image path
        self.label_names = {}   # base name -> cached label path
        self.image_list = []    # sorted base names that have both image and label

        self._prepare()

    def _extract_and_cache(self, zip_path, file_name, cache_file_path, attempts=3, zip_ref=None):
        """Copy one archive member to ``cache_file_path``; return True on success.

        The member is written to exactly ``cache_file_path`` (the path the index
        records), via a temporary ``.part`` file that is renamed into place, so
        an interrupted extraction never leaves a half-written file that a later
        run would mistake for a valid cache entry.

        Args:
            zip_path: archive containing the member (used when ``zip_ref`` is None
                and in log messages).
            file_name: member name inside the archive.
            cache_file_path: destination path of the extracted file.
            attempts: how many times to try before giving up.
            zip_ref: optional already-open ``zipfile.ZipFile`` to read from, which
                avoids re-opening the archive for every member.

        Raises:
            OSError: errno 107 (transport endpoint not connected) is re-raised
                immediately so the caller can retry the whole archive.
        """
        if os.path.exists(cache_file_path):
            return True

        os.makedirs(os.path.dirname(cache_file_path), exist_ok=True)
        partial_path = cache_file_path + ".part"

        for attempt in range(attempts):
            try:
                if zip_ref is not None:
                    payload = zip_ref.read(file_name)
                else:
                    with zipfile.ZipFile(zip_path, 'r') as own_ref:
                        payload = own_ref.read(file_name)

                with open(partial_path, 'wb') as out_file:
                    out_file.write(payload)
                os.replace(partial_path, cache_file_path)  # atomic publish
                return True
            except Exception as e:
                # A dropped Drive connection invalidates the open archive handle;
                # let the archive-level retry in _prepare deal with it.
                if isinstance(e, OSError) and e.errno == 107:
                    raise
                print(f"Attempt {attempt + 1}: Error unzipping file => {zip_path}")
                if attempt < attempts - 1:
                    time.sleep(2 ** attempt)  # exponential back-off between attempts

        # Every attempt failed.
        print(f"Failed to extract {file_name}")
        return False

    def _prepare(self):
        """Extract all archives in parallel and build the sample index.

        Archives that fail with OSError errno 107 are retried up to
        ``max_tries`` times; the back-off sleep happens once per retry round
        (10 s, 20 s, 40 s, ...) rather than once per failing archive.
        """
        max_tries = 3
        base_wait_seconds = 10

        # Retry counter per archive.
        retries = {zip_path: 0 for zip_path in self.image_zip_paths + self.label_zip_paths}

        # Every archive paired with the member extension it should yield.
        to_process = [(zip_path, '.png') for zip_path in self.image_zip_paths] + \
                     [(zip_path, '.json') for zip_path in self.label_zip_paths]

        retry_round = 0
        while to_process:
            to_retry = []

            with ThreadPoolExecutor(max_workers=self.workers) as executor:
                future_to_job = {
                    executor.submit(self._process_zip, zip_path, file_extension): (zip_path, file_extension)
                    for zip_path, file_extension in to_process
                }

                for future in as_completed(future_to_job):
                    zip_path, file_extension = future_to_job[future]

                    try:
                        success = future.result()
                    except OSError as e:  # typically a dropped Google Drive connection
                        if e.errno == 107 and retries[zip_path] < max_tries:
                            print(f"OSError [Errno 107] => {zip_path}")
                            to_retry.append((zip_path, file_extension))
                            retries[zip_path] += 1
                        else:
                            print(f"Unexpected Error : {zip_path}: {e}")
                    except Exception as e:
                        print(f"Exception processing {zip_path}: {e}")
                    else:
                        if not success:
                            print(f"Exception processing {zip_path}: Failed to process {zip_path}")

            if to_retry:
                # Wait once per round, doubling the delay each round.
                time.sleep(base_wait_seconds * (2 ** retry_round))
                retry_round += 1

            to_process = to_retry

        # Only expose samples that have both an image and a label; an image
        # without a label would otherwise crash __getitem__.
        paired = self.image_names.keys() & self.label_names.keys()
        unpaired = len(self.image_names) - len(paired)
        if unpaired:
            print(f"Skipping {unpaired} image(s) without a matching label file")
        self.image_list = sorted(paired)

    def _process_zip(self, zip_path, file_extension):
        """Extract every member of ``zip_path`` ending in ``file_extension``.

        Successfully extracted members are registered in ``image_names``
        (for ``'.png'``) or ``label_names`` (any other extension).
        Returns True only if every matching member was extracted.
        """
        success = True
        registry = self.image_names if file_extension == '.png' else self.label_names
        archive_cache_dir = os.path.join(self.cache_dir, os.path.basename(zip_path).replace('.zip', ''))

        with zipfile.ZipFile(zip_path, 'r') as zip_ref:
            for file_name in sorted(zip_ref.namelist()):
                if not file_name.endswith(file_extension):
                    continue

                # Base name without extension, used as the sample key.
                base_name = os.path.splitext(os.path.basename(file_name))[0].lstrip('_')

                # Flattened path the sample is read from later.
                cache_file_path = os.path.join(archive_cache_dir, file_name.replace('/', '_').lstrip('_'))

                if self._extract_and_cache(zip_path, file_name, cache_file_path, zip_ref=zip_ref):
                    registry[base_name.lower()] = cache_file_path
                else:
                    success = False

        return success

    def __len__(self):
        """Number of samples that have both an image and a label."""
        return len(self.image_list)

    def __getitem__(self, idx):
        """Return ``(image, label)`` for sample ``idx``, or None if the image is unreadable.

        The image is cropped to the first annotation's box and passed through
        ``transform`` when one was given. ``label`` is a dict with ``age_past``
        (taken verbatim from the JSON) and ``gender`` (0 for ``'male'``, 1 otherwise).
        """
        base_name = self.image_list[idx]
        image_path = self.image_names.get(base_name)

        try:
            # convert() forces the decode inside the try block and guarantees
            # three channels regardless of the PNG's colour mode.
            with Image.open(image_path) as raw_image:
                image = raw_image.convert('RGB')
        except Exception:
            # Best effort: unreadable images are dropped by the collate function.
            return None

        label_path = self.label_names.get(base_name)
        with open(label_path, 'r') as f:
            label_data = json.load(f)

        age_past = label_data['age_past']
        gender = label_data['gender']
        box = label_data['annotation'][0]['box']

        image = image.crop((box['x'], box['y'], box['x'] + box['w'], box['y'] + box['h']))

        if self.transform:
            image = self.transform(image)

        gender_label = 0 if gender == 'male' else 1

        label = {'age_past': age_past, 'gender': gender_label}

        return image, label


In [5]:
# Training pipeline: small random rotation (RandomRotation(5)) as augmentation,
# then resize to 224x224 and convert the PIL image to a tensor.
transform_train = transforms.Compose([
    transforms.RandomRotation(5),
    transforms.Resize((224, 224)),
    transforms.ToTensor()
])

# Validation pipeline: same resize and tensor conversion, without augmentation.
transform_val = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
])

In [6]:
def get_zip_files(directory):
    """Return the full paths of the entries in `directory` whose names end with '.zip'.

    The result keeps `os.listdir` order; it is deliberately left unsorted
    because the original author notes that sorting the list slows things down.
    """
    zip_paths = []
    for entry in os.listdir(directory):
        if entry.endswith('.zip'):
            zip_paths.append(os.path.join(directory, entry))
    return zip_paths

# Google Drive folders holding the training image/label archives, plus the
# local directory the training files are extracted into.
train_images = '/content/drive/Othercomputers/Home/data_age/data/Training/image'
train_labels = '/content/drive/Othercomputers/Home/data_age/data/Training/label'
train_cache = '/content/cache/train'

# Same layout for the validation split.
val_images = '/content/drive/Othercomputers/Home/data_age/data/Validation/image'
val_labels = '/content/drive/Othercomputers/Home/data_age/data/Validation/label'
val_cache = '/content/cache/val'

# Collect the .zip archives found in each folder.
train_image_zips = get_zip_files(train_images)
train_label_zips = get_zip_files(train_labels)
val_image_zips = get_zip_files(val_images)
val_label_zips = get_zip_files(val_labels)

In [7]:
# Build the training dataset (the constructor extracts the archives into train_cache) and time it.
%time train_dataset = CustomDataset(train_image_zips, train_label_zips, train_cache, transform_train)

Attempt 1: Error unzipping file => /content/drive/Othercomputers/Home/data_age/data/Training/image/TS_0301.zipAttempt 1: Error unzipping file => /content/drive/Othercomputers/Home/data_age/data/Training/image/TS_0015.zip
Attempt 1: Error unzipping file => /content/drive/Othercomputers/Home/data_age/data/Training/image/TS_0584.zip

Attempt 1: Error unzipping file => /content/drive/Othercomputers/Home/data_age/data/Training/image/TS_0480.zip
Attempt 2: Error unzipping file => /content/drive/Othercomputers/Home/data_age/data/Training/image/TS_0015.zip
Attempt 2: Error unzipping file => /content/drive/Othercomputers/Home/data_age/data/Training/image/TS_0584.zip
Attempt 2: Error unzipping file => /content/drive/Othercomputers/Home/data_age/data/Training/image/TS_0301.zip
Attempt 2: Error unzipping file => /content/drive/Othercomputers/Home/data_age/data/Training/image/TS_0480.zip
OSError [Errno 107] => /content/drive/Othercomputers/Home/data_age/data/Training/image/TS_0015.zip
OSError [Errn

In [8]:
# Build the validation dataset (the constructor extracts the archives into val_cache) and time it.
%time val_dataset = CustomDataset(val_image_zips, val_label_zips, val_cache, transform_val)

CPU times: user 5.55 s, sys: 2.4 s, total: 7.95 s
Wall time: 4min 34s


In [9]:
def custom_collate_fn(batch):
    """Collate a batch with PyTorch's default collate after dropping `None` samples.

    The dataset returns `None` for images it cannot open; those entries are
    filtered out here so the remaining samples can still be batched.
    """
    usable_samples = list(filter(lambda sample: sample is not None, batch))
    return torch.utils.data.dataloader.default_collate(usable_samples)

In [10]:
# Number of samples requested per batch (batches can be smaller when the
# collate function drops unreadable samples).
batch_size = 64

# Training loader: reshuffled every epoch, loaded by 2 worker processes.
train_loader = torch.utils.data.DataLoader(
    dataset=train_dataset,
    collate_fn=custom_collate_fn,
    batch_size=batch_size,
    shuffle=True,
    num_workers=2,
    prefetch_factor=4
)

# Validation loader: same settings, but without shuffling.
val_loader = torch.utils.data.DataLoader(
    dataset=val_dataset,
    collate_fn=custom_collate_fn,
    batch_size=batch_size,
    num_workers=2,
    prefetch_factor=4
)

In [11]:
# Sanity check: pull one training batch and inspect the image tensor's shape.
x, y = next(iter(train_loader))
x.shape

torch.Size([64, 3, 224, 224])

In [12]:
# Sanity check: both label tensors should have one entry per sample in the batch.
y['age_past'].shape, y['gender'].shape

(torch.Size([64]), torch.Size([64]))

In [6]:
# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(device)

cuda


In [71]:
class Age_Net(nn.Module):
    """CNN that regresses a single non-negative age value per image.

    Four stages of three 3x3 'same' convolutions, each stage followed by a
    stride-2 max-pool; 1x1 convolutions reduce the channel count between
    stages. `fc1` expects 25088 features, i.e. 128 channels x 14 x 14, which
    is what a 3x224x224 input produces after the four poolings.

    Attribute names are part of the checkpoint format (state_dict keys) and
    must not be renamed.
    """

    def __init__(self):
        super().__init__()
        self.conv1_1 = nn.Conv2d(3, 128, kernel_size=3, padding='same')
        self.conv1_2 = nn.Conv2d(128, 128, kernel_size=3, padding='same')
        self.conv1_3 = nn.Conv2d(128, 128, kernel_size=3, padding='same')

        # Shared pooling layer: halves the spatial size at the end of every stage.
        self.pool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1)

        self.one_conv1 = nn.Conv2d(128, 64, kernel_size=1)
        self.conv2_1 = nn.Conv2d(64, 128, kernel_size=3, padding='same')
        self.conv2_2 = nn.Conv2d(128, 128, kernel_size=3, padding='same')
        self.conv2_3 = nn.Conv2d(128, 128, kernel_size=3, padding='same')

        self.one_conv2 = nn.Conv2d(128, 64, kernel_size=1)
        self.conv3_1 = nn.Conv2d(64, 128, kernel_size=3, padding='same')
        self.conv3_2 = nn.Conv2d(128, 256, kernel_size=3, padding='same')
        self.conv3_3 = nn.Conv2d(256, 256, kernel_size=3, padding='same')

        self.one_conv3 = nn.Conv2d(256, 64, kernel_size=1)
        self.conv4_1 = nn.Conv2d(64, 128, kernel_size=3, padding='same')
        self.conv4_2 = nn.Conv2d(128, 256, kernel_size=3, padding='same')
        self.conv4_3 = nn.Conv2d(256, 512, kernel_size=3, padding='same')
        self.one_conv4 = nn.Conv2d(512, 128, kernel_size=1)

        self.fc1 = nn.Linear(25088, 128)
        self.dropout = nn.Dropout(0.25)
        self.fc2 = nn.Linear(128, 1)

    def forward(self, x):
        """Map a (batch, 3, 224, 224) tensor to predictions of shape (batch,)."""
        # Each stage is a run of conv+ReLU layers followed by the shared pool.
        stages = (
            (self.conv1_1, self.conv1_2, self.conv1_3),
            (self.one_conv1, self.conv2_1, self.conv2_2, self.conv2_3),
            (self.one_conv2, self.conv3_1, self.conv3_2, self.conv3_3),
            (self.one_conv3, self.conv4_1, self.conv4_2, self.conv4_3, self.one_conv4),
        )
        for stage in stages:
            for conv in stage:
                x = F.relu(conv(x))
            x = self.pool(x)

        x = x.view(x.size(0), -1)
        x = F.relu(self.fc1(self.dropout(x)))
        x = self.fc2(x)
        # Final ReLU keeps the prediction non-negative. Squeeze only the feature
        # axis: a bare squeeze() would also drop the batch axis for a batch of one.
        x = F.relu(x).squeeze(-1)
        return x

In [15]:
class Gender_Net(nn.Module):
    """CNN that outputs one raw gender logit per image.

    Five unpadded 3x3 convolutions, each followed by ReLU and a stride-2
    max-pool. `fc1` expects 512 * 4 * 4 features, which is what a 3x224x224
    input produces. The output is a logit (no sigmoid), as expected by
    `nn.BCEWithLogitsLoss`.
    """

    def __init__(self):
        super().__init__()
        self.conv1 = nn.Conv2d(3, 36, kernel_size=3)
        self.conv2 = nn.Conv2d(36, 64, kernel_size=3)
        self.conv3 = nn.Conv2d(64, 128, kernel_size=3)
        self.conv4 = nn.Conv2d(128, 256, kernel_size=3)
        self.conv5 = nn.Conv2d(256, 512, kernel_size=3)

        self.pool = nn.MaxPool2d(kernel_size=3, stride=2)

        self.fc1 = nn.Linear(512 * 4 * 4, 512)
        self.dropout = nn.Dropout(0.25)
        self.fc2 = nn.Linear(512, 1)

    def forward(self, x):
        """Map a (batch, 3, 224, 224) tensor to logits of shape (batch,)."""
        for conv in (self.conv1, self.conv2, self.conv3, self.conv4, self.conv5):
            x = self.pool(F.relu(conv(x)))

        # Flatten per sample. Unlike view(-1, 8192), this can never silently
        # change the batch size; a wrong input size fails loudly in fc1 instead.
        x = torch.flatten(x, 1)
        x = F.relu(self.fc1(self.dropout(x)))
        x = self.fc2(x)
        # Squeeze only the feature axis so a batch of one keeps its batch axis.
        x = x.squeeze(-1)
        return x

In [72]:
# Instantiate the age model on the selected device and print its layer-by-layer summary for a 3x224x224 input.
age_model = Age_Net().to(device)
torchsummary.summary(age_model, (3, 224, 224))

----------------------------------------------------------------
        Layer (type)               Output Shape         Param #
            Conv2d-1        [-1, 128, 224, 224]           3,584
            Conv2d-2        [-1, 128, 224, 224]         147,584
            Conv2d-3        [-1, 128, 224, 224]         147,584
         MaxPool2d-4        [-1, 128, 112, 112]               0
            Conv2d-5         [-1, 64, 112, 112]           8,256
            Conv2d-6        [-1, 128, 112, 112]          73,856
            Conv2d-7        [-1, 128, 112, 112]         147,584
            Conv2d-8        [-1, 128, 112, 112]         147,584
         MaxPool2d-9          [-1, 128, 56, 56]               0
           Conv2d-10           [-1, 64, 56, 56]           8,256
           Conv2d-11          [-1, 128, 56, 56]          73,856
           Conv2d-12          [-1, 256, 56, 56]         295,168
           Conv2d-13          [-1, 256, 56, 56]         590,080
        MaxPool2d-14          [-1, 256,

In [38]:
# Scratch arithmetic (only the last expression is displayed): 50176 / 128 = 392,
# and sqrt(392) is not an integer.
# NOTE(review): presumably left over from working out the flattened feature size — confirm or remove.
50176 / 128
np.sqrt(392)

19.79898987322333

In [17]:
# Instantiate the gender model on the selected device and print its layer-by-layer summary for a 3x224x224 input.
gender_model = Gender_Net().to(device)
torchsummary.summary(gender_model, (3, 224, 224))

----------------------------------------------------------------
        Layer (type)               Output Shape         Param #
            Conv2d-1         [-1, 36, 222, 222]           1,008
         MaxPool2d-2         [-1, 36, 110, 110]               0
            Conv2d-3         [-1, 64, 108, 108]          20,800
         MaxPool2d-4           [-1, 64, 53, 53]               0
            Conv2d-5          [-1, 128, 51, 51]          73,856
         MaxPool2d-6          [-1, 128, 25, 25]               0
            Conv2d-7          [-1, 256, 23, 23]         295,168
         MaxPool2d-8          [-1, 256, 11, 11]               0
            Conv2d-9            [-1, 512, 9, 9]       1,180,160
        MaxPool2d-10            [-1, 512, 4, 4]               0
          Dropout-11                 [-1, 8192]               0
           Linear-12                  [-1, 512]       4,194,816
           Linear-13                    [-1, 1]             513
Total params: 5,766,321
Trainable param

In [18]:
from torch.optim.lr_scheduler import ReduceLROnPlateau
import torch.optim as optim

# Adam optimiser for the age model, with a scheduler that reduces the learning
# rate when the monitored value (mode='min', i.e. a loss) stops decreasing.
# NOTE(review): a scheduler is bound to the optimiser object passed to it here;
# if the optimiser is re-created later, the scheduler has to be re-created too.
opt_age = optim.Adam(age_model.parameters(), lr=0.0003)
age_lr_scheduler = ReduceLROnPlateau(opt_age, mode='min', verbose=True)

# Same setup for the gender model.
opt_gender = optim.Adam(gender_model.parameters(), lr=0.0003)
gender_lr_scheduler = ReduceLROnPlateau(opt_gender, mode='min', verbose=True)

In [19]:
class EarlyStopping:
    """Track a validation loss and flag when it has stopped improving.

    Call the instance once per epoch with the validation loss. A loss counts
    as an improvement when it is lower than the best seen so far by more than
    `delta`. After `patience` consecutive calls without improvement,
    `early_stop` becomes True (and "Early stopping" is printed when `verbose`).
    """

    def __init__(self, patience=5, verbose=False, delta=0):
        self.patience, self.verbose, self.delta = patience, verbose, delta
        self.best_loss = np.inf   # lowest loss seen so far
        self.early_stop = False   # set once patience is exhausted
        self.counter = 0          # consecutive calls without improvement

    def __call__(self, val_loss):
        improved = self.best_loss - val_loss > self.delta
        if improved:
            self.best_loss = val_loss
            self.counter = 0
            return

        self.counter += 1
        if self.counter < self.patience:
            return

        self.early_stop = True
        if self.verbose:
            print("Early stopping")

In [20]:
# Early-stopping tracker: flags a stop after 5 consecutive epochs without validation-loss improvement.
# NOTE(review): the instance keeps state between calls, so use a fresh one for each separate training run.
early_stopping = EarlyStopping(patience=5, verbose=True)

In [21]:
def train_loop_age(dataloader, model, loss_fn, optimizer, epoch, device=None):
    """Train the age model for one epoch and return the mean per-batch loss.

    Args:
        dataloader: yields `(x, y)` batches where `y['age_past']` holds the targets;
            must expose `.dataset` (used only for progress output).
        model: network being trained.
        loss_fn: callable `(prediction, target) -> loss`.
        optimizer: optimiser stepping `model`'s parameters.
        epoch: zero-based epoch index (printed as `epoch + 1`).
        device: device the batches are moved to. Defaults to the device of the
            model's parameters, so the function no longer depends on a
            notebook-level `device` variable.

    Returns:
        Sum of the batch losses divided by the number of batches.
    """
    if device is None:
        device = next((p.device for p in model.parameters()), torch.device('cpu'))

    model.train()
    size = len(dataloader.dataset)
    total_loss = 0.0
    start_time = time.time()  # epoch start

    for batch, (x, y) in enumerate(dataloader):
        batch_start_time = time.time()  # batch start
        x, y = x.to(device), y['age_past'].float().to(device)
        pred = model(x)
        # Flatten both sides to (batch,) so a batch of one (0-dim or (1, 1)
        # prediction) still lines up with its target.
        loss = loss_fn(pred.reshape(-1), y.reshape(-1))

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        total_loss += loss.item()

        batch_process_time = time.time() - batch_start_time

        if batch % 10 == 0:
            processed = (batch + 1) * len(x)
            print(f'Epoch {epoch+1} : [{processed} / {size}] loss : {loss.item()}, Batch time: {batch_process_time:.4f} sec')

    average_loss = total_loss / len(dataloader)
    epoch_time = time.time() - start_time

    print(f"Epoch {epoch+1} finished, Total Epoch time: {epoch_time:.4f} sec")
    return average_loss


In [22]:
def validation_loop_age(dataloader, model, loss_fn, device):
    """Evaluate the age model and return the mean per-batch loss.

    Runs in eval mode without gradient tracking. Predictions and targets are
    flattened to (batch,) before the loss, as in the training loop, so a
    final batch of one does not trigger a broadcast between a 0-dim
    prediction and a one-element target.

    Args:
        dataloader: yields `(x, y)` batches where `y['age_past']` holds the targets.
        model: network to evaluate.
        loss_fn: callable `(prediction, target) -> loss`.
        device: device the batches are moved to.

    Returns:
        Sum of the batch losses divided by the number of batches.
    """
    model.eval()
    val_loss = 0.0
    with torch.no_grad():
        for x, y in dataloader:
            x, y = x.to(device), y['age_past'].float().to(device)
            pred = model(x)
            loss = loss_fn(pred.reshape(-1), y.reshape(-1))
            val_loss += loss.item()
    val_loss /= len(dataloader)
    return val_loss

In [23]:
def save_model(epoch, model, optimizer, path):
    """Write a training checkpoint to `path`.

    The checkpoint is a dict with the keys 'epoch', 'model_state_dict' and
    'optimizer_state_dict'. When `path` is a filesystem path, its parent
    directory is created first, so a missing checkpoint folder cannot make
    the save fail after an epoch of training.
    """
    if isinstance(path, (str, os.PathLike)):
        parent_dir = os.path.dirname(os.fspath(path))
        if parent_dir:
            os.makedirs(parent_dir, exist_ok=True)

    torch.save({
        'epoch': epoch,
        'model_state_dict': model.state_dict(),
        'optimizer_state_dict': optimizer.state_dict(),
    }, path)

In [24]:
# start = time.time()
# for epoch in range(30):
#     age_loss = train_loop_age(train_loader, age_model, nn.MSELoss(), opt_age, epoch)
#     val_loss = validation_loop_age(val_loader, age_model, nn.MSELoss(), device)
#     age_lr_scheduler.step(val_loss)

#     early_stopping(val_loss)
#     if early_stopping.early_stop:
#         print("Early stopping triggered")
#         save_model(epoch, age_model, opt_age, '/content/drive/MyDrive/DL_DATA/Model/Cashe/age_model_checkpoint.pth')
#         break

#     save_model(epoch, age_model, opt_age, f'/content/drive/MyDrive/DL_DATA/Model/Cashe/age_model_checkpoint_epoch_{epoch+1}.pth')
#     print(f'Epoch : {epoch+1}, Loss : {age_loss}, Val_loss : {val_loss}')

# total_time = time.time() - start
# # 전체 학습 시간 출력
# hours, rem = divmod(total_time, 3600)
# minutes, seconds = divmod(rem, 60)
# print("Total training time: {:0>2}:{:0>2}:{:05.2f}".format(int(hours),int(minutes),seconds))

In [25]:
# Resume training of the age model from a checkpoint
def load_model(model, optimizer, path, map_location=None):
    """Restore `model` and `optimizer` from a checkpoint written by `save_model`.

    Args:
        model: module whose parameters are overwritten in place.
        optimizer: optimiser whose state is overwritten in place.
        path: checkpoint file.
        map_location: forwarded to `torch.load`; pass e.g. `'cpu'` or the target
            device to load a checkpoint saved on a GPU that is not available.
            The default (None) keeps `torch.load`'s own behaviour.

    Returns:
        `(model, optimizer, epoch)` where `epoch` is the value stored in the checkpoint.
    """
    checkpoint = torch.load(path, map_location=map_location)
    model.load_state_dict(checkpoint['model_state_dict'])
    optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
    epoch = checkpoint['epoch']
    return model, optimizer, epoch

In [26]:
# Re-create the model and optimiser, then restore their state from the checkpoint
age_model = Age_Net().to(device)
opt_age = optim.Adam(age_model.parameters(), lr=0.0003)

# Load the checkpoint
age_model, opt_age, start_epoch = load_model(age_model, opt_age, '/content/drive/MyDrive/DL_DATA/Model/norm_batch/age_model_checkpoint_epoch_11.pth')

# The scheduler must wrap the optimiser that is actually stepped. The earlier
# `age_lr_scheduler` was bound to the previous `opt_age` object, so its learning
# rate reductions would never reach this new optimiser.
# NOTE: the scheduler's own history is not stored in the checkpoint, so it starts fresh.
age_lr_scheduler = ReduceLROnPlateau(opt_age, mode='min', verbose=True)

# Fresh early-stopping tracker so no counter/best-loss state from an earlier run leaks in.
age_early_stopping = EarlyStopping(patience=5, verbose=True)

age_loss_fn = nn.MSELoss()

# Resume training
start = time.time()
for epoch in range(start_epoch + 1, start_epoch + 28):
    age_loss = train_loop_age(train_loader, age_model, age_loss_fn, opt_age, epoch)
    val_loss = validation_loop_age(val_loader, age_model, age_loss_fn, device)
    age_lr_scheduler.step(val_loss)

    age_early_stopping(val_loss)
    if age_early_stopping.early_stop:
        print("Early stopping triggered")
        save_model(epoch, age_model, opt_age, '/content/drive/MyDrive/DL_DATA/Model/norm_batch/age_model_checkpoint.pth')
        break

    # Checkpoint files are numbered with the one-based epoch.
    save_model(epoch, age_model, opt_age, f'/content/drive/MyDrive/DL_DATA/Model/norm_batch/age_model_checkpoint_epoch_{epoch+1}.pth')
    print(f'Epoch : {epoch + 1}, Loss : {age_loss}, Val_loss : {val_loss}')

total_time = time.time() - start

# Report the total training time as HH:MM:SS.ss
hours, rem = divmod(total_time, 3600)
minutes, seconds = divmod(rem, 60)
print("Total training time: {:0>2}:{:0>2}:{:05.2f}".format(int(hours),int(minutes),seconds))

Epoch 12 : [64 / 40150] loss : 49.313148498535156, Batch time: 3.2401 sec
Epoch 12 : [704 / 40150] loss : 32.76795196533203, Batch time: 1.7047 sec
Epoch 12 : [1344 / 40150] loss : 29.350566864013672, Batch time: 1.7053 sec
Epoch 12 : [1984 / 40150] loss : 35.77113723754883, Batch time: 1.6939 sec
Epoch 12 : [2624 / 40150] loss : 25.802255630493164, Batch time: 1.8060 sec
Epoch 12 : [3264 / 40150] loss : 29.548646926879883, Batch time: 1.8062 sec
Epoch 12 : [3904 / 40150] loss : 39.687034606933594, Batch time: 1.8207 sec
Epoch 12 : [4544 / 40150] loss : 24.013076782226562, Batch time: 1.8464 sec
Epoch 12 : [5184 / 40150] loss : 18.948627471923828, Batch time: 1.7831 sec
Epoch 12 : [5824 / 40150] loss : 26.584285736083984, Batch time: 1.8123 sec
Epoch 12 : [6464 / 40150] loss : 37.56182861328125, Batch time: 1.8067 sec
Epoch 12 : [7104 / 40150] loss : 35.59412384033203, Batch time: 1.8399 sec
Epoch 12 : [7744 / 40150] loss : 39.27695083618164, Batch time: 1.8276 sec
Epoch 12 : [8384 / 4



Epoch 12 : [10304 / 40150] loss : 40.217262268066406, Batch time: 1.8048 sec
Epoch 12 : [10944 / 40150] loss : 18.308425903320312, Batch time: 1.8371 sec
Epoch 12 : [11584 / 40150] loss : 37.61986541748047, Batch time: 1.8410 sec
Epoch 12 : [12224 / 40150] loss : 33.50717544555664, Batch time: 1.8225 sec
Epoch 12 : [12864 / 40150] loss : 31.02480125427246, Batch time: 1.8345 sec
Epoch 12 : [13504 / 40150] loss : 27.472766876220703, Batch time: 1.8199 sec
Epoch 12 : [14144 / 40150] loss : 38.95669937133789, Batch time: 1.8177 sec
Epoch 12 : [14784 / 40150] loss : 46.026344299316406, Batch time: 1.8363 sec
Epoch 12 : [15424 / 40150] loss : 40.643348693847656, Batch time: 1.8505 sec
Epoch 12 : [16064 / 40150] loss : 50.301918029785156, Batch time: 1.8402 sec




Epoch 12 : [16704 / 40150] loss : 29.849184036254883, Batch time: 1.8474 sec
Epoch 12 : [17344 / 40150] loss : 28.641448974609375, Batch time: 1.8435 sec
Epoch 12 : [17984 / 40150] loss : 37.5587158203125, Batch time: 1.7993 sec
Epoch 12 : [18624 / 40150] loss : 49.304649353027344, Batch time: 1.8356 sec
Epoch 12 : [19264 / 40150] loss : 39.83075714111328, Batch time: 1.8409 sec
Epoch 12 : [19904 / 40150] loss : 36.525779724121094, Batch time: 1.8192 sec
Epoch 12 : [20544 / 40150] loss : 25.27975082397461, Batch time: 1.8117 sec
Epoch 12 : [21184 / 40150] loss : 36.42578887939453, Batch time: 1.8074 sec
Epoch 12 : [21824 / 40150] loss : 18.983585357666016, Batch time: 1.7873 sec
Epoch 12 : [22464 / 40150] loss : 28.57994842529297, Batch time: 1.8159 sec
Epoch 12 : [23104 / 40150] loss : 29.65227508544922, Batch time: 1.8260 sec
Epoch 12 : [23744 / 40150] loss : 29.826427459716797, Batch time: 1.8204 sec
Epoch 12 : [24384 / 40150] loss : 25.215641021728516, Batch time: 1.8087 sec
Epoch 



Epoch 13 : [6464 / 40150] loss : 25.525609970092773, Batch time: 1.7889 sec
Epoch 13 : [7104 / 40150] loss : 25.569198608398438, Batch time: 1.7522 sec
Epoch 13 : [7744 / 40150] loss : 22.512928009033203, Batch time: 1.7663 sec
Epoch 13 : [8384 / 40150] loss : 26.23139762878418, Batch time: 1.7818 sec
Epoch 13 : [9024 / 40150] loss : 36.2441520690918, Batch time: 1.7645 sec
Epoch 13 : [9664 / 40150] loss : 31.853008270263672, Batch time: 1.7727 sec
Epoch 13 : [10304 / 40150] loss : 55.403236389160156, Batch time: 1.7479 sec
Epoch 13 : [10944 / 40150] loss : 29.43818473815918, Batch time: 1.7714 sec
Epoch 13 : [11584 / 40150] loss : 28.72286033630371, Batch time: 1.7836 sec
Epoch 13 : [12224 / 40150] loss : 25.07830810546875, Batch time: 1.7835 sec
Epoch 13 : [12864 / 40150] loss : 24.822071075439453, Batch time: 1.7673 sec
Epoch 13 : [13504 / 40150] loss : 36.05870819091797, Batch time: 1.7977 sec
Epoch 13 : [14144 / 40150] loss : 19.2220516204834, Batch time: 1.7796 sec




Epoch 13 : [14784 / 40150] loss : 24.11465835571289, Batch time: 1.7458 sec
Epoch 13 : [15424 / 40150] loss : 23.352338790893555, Batch time: 1.7839 sec
Epoch 13 : [16064 / 40150] loss : 35.312408447265625, Batch time: 1.7763 sec
Epoch 13 : [16704 / 40150] loss : 15.86916732788086, Batch time: 1.8230 sec
Epoch 13 : [17344 / 40150] loss : 47.18305206298828, Batch time: 1.8403 sec
Epoch 13 : [17984 / 40150] loss : 30.59844207763672, Batch time: 1.8249 sec
Epoch 13 : [18624 / 40150] loss : 48.353355407714844, Batch time: 1.8367 sec
Epoch 13 : [19264 / 40150] loss : 22.91344451904297, Batch time: 1.8201 sec
Epoch 13 : [19904 / 40150] loss : 30.083328247070312, Batch time: 1.8255 sec
Epoch 13 : [20544 / 40150] loss : 29.23086166381836, Batch time: 1.8359 sec
Epoch 13 : [21184 / 40150] loss : 26.505874633789062, Batch time: 1.8628 sec
Epoch 13 : [21824 / 40150] loss : 28.83131980895996, Batch time: 1.8422 sec
Epoch 13 : [22464 / 40150] loss : 23.743576049804688, Batch time: 1.8371 sec
Epoch 



Epoch 14 : [33344 / 40150] loss : 33.43273162841797, Batch time: 1.8535 sec




Epoch 14 : [33984 / 40150] loss : 24.89082145690918, Batch time: 1.8180 sec
Epoch 14 : [34624 / 40150] loss : 27.422351837158203, Batch time: 1.8415 sec
Epoch 14 : [35264 / 40150] loss : 37.34911346435547, Batch time: 1.8291 sec
Epoch 14 : [35904 / 40150] loss : 30.30063247680664, Batch time: 1.8503 sec
Epoch 14 : [36544 / 40150] loss : 40.22540283203125, Batch time: 1.8148 sec
Epoch 14 : [37184 / 40150] loss : 31.748165130615234, Batch time: 1.8150 sec
Epoch 14 : [37824 / 40150] loss : 27.97020721435547, Batch time: 1.8197 sec
Epoch 14 : [38464 / 40150] loss : 26.59662628173828, Batch time: 1.8211 sec
Epoch 14 : [39104 / 40150] loss : 30.77665138244629, Batch time: 1.8226 sec
Epoch 14 : [39744 / 40150] loss : 15.491265296936035, Batch time: 1.8319 sec
Epoch 14 finished, Total Epoch time: 2889.4244 sec
Epoch : 14, Loss : 28.95467392198599, Val_loss : 48.438780072369156
Epoch 15 : [64 / 40150] loss : 22.69908905029297, Batch time: 1.7023 sec
Epoch 15 : [704 / 40150] loss : 22.0556297302



Epoch 15 : [7744 / 40150] loss : 29.098098754882812, Batch time: 1.7618 sec
Epoch 15 : [8384 / 40150] loss : 21.43895721435547, Batch time: 1.7741 sec
Epoch 15 : [9024 / 40150] loss : 18.96926498413086, Batch time: 1.8161 sec
Epoch 15 : [9664 / 40150] loss : 27.68683433532715, Batch time: 1.8036 sec
Epoch 15 : [10304 / 40150] loss : 28.271503448486328, Batch time: 1.8381 sec
Epoch 15 : [10944 / 40150] loss : 26.09821891784668, Batch time: 1.8117 sec
Epoch 15 : [11584 / 40150] loss : 23.60952377319336, Batch time: 1.8081 sec
Epoch 15 : [12224 / 40150] loss : 29.241058349609375, Batch time: 1.7701 sec
Epoch 15 : [12864 / 40150] loss : 27.679889678955078, Batch time: 1.8071 sec
Epoch 15 : [13504 / 40150] loss : 42.434268951416016, Batch time: 1.7455 sec
Epoch 15 : [14144 / 40150] loss : 20.566181182861328, Batch time: 1.7248 sec
Epoch 15 : [14784 / 40150] loss : 33.84473419189453, Batch time: 1.7266 sec
Epoch 15 : [15424 / 40150] loss : 24.68648910522461, Batch time: 1.7352 sec
Epoch 15 :



Epoch 15 : [18624 / 40150] loss : 28.493064880371094, Batch time: 1.6580 sec
Epoch 15 : [19264 / 40150] loss : 24.080921173095703, Batch time: 1.6573 sec
Epoch 15 : [19904 / 40150] loss : 13.415287971496582, Batch time: 1.7353 sec
Epoch 15 : [20544 / 40150] loss : 26.49669647216797, Batch time: 1.6873 sec
Epoch 15 : [21184 / 40150] loss : 21.77327537536621, Batch time: 1.6796 sec
Epoch 15 : [21824 / 40150] loss : 30.151508331298828, Batch time: 1.6756 sec
Epoch 15 : [22464 / 40150] loss : 23.37619972229004, Batch time: 1.6849 sec
Epoch 15 : [23104 / 40150] loss : 27.62038803100586, Batch time: 1.6747 sec
Epoch 15 : [23744 / 40150] loss : 21.897462844848633, Batch time: 1.6921 sec
Epoch 15 : [24384 / 40150] loss : 14.183609962463379, Batch time: 1.7153 sec
Epoch 15 : [25024 / 40150] loss : 20.420272827148438, Batch time: 1.6742 sec
Epoch 15 : [25664 / 40150] loss : 33.19867706298828, Batch time: 1.7193 sec
Epoch 15 : [26304 / 40150] loss : 39.24749755859375, Batch time: 1.6742 sec
Epoch



Epoch 16 : [6464 / 40150] loss : 27.20549774169922, Batch time: 1.6992 sec
Epoch 16 : [7104 / 40150] loss : 16.421920776367188, Batch time: 1.7656 sec
Epoch 16 : [7744 / 40150] loss : 14.774759292602539, Batch time: 1.7897 sec
Epoch 16 : [8384 / 40150] loss : 21.49358558654785, Batch time: 1.7894 sec
Epoch 16 : [9024 / 40150] loss : 22.397260665893555, Batch time: 1.7943 sec
Epoch 16 : [9664 / 40150] loss : 20.811403274536133, Batch time: 1.8275 sec
Epoch 16 : [10304 / 40150] loss : 17.684261322021484, Batch time: 1.8159 sec
Epoch 16 : [10944 / 40150] loss : 31.81106185913086, Batch time: 1.8083 sec
Epoch 16 : [11584 / 40150] loss : 21.407276153564453, Batch time: 1.7990 sec
Epoch 16 : [12224 / 40150] loss : 36.502685546875, Batch time: 1.8175 sec
Epoch 16 : [12864 / 40150] loss : 12.487272262573242, Batch time: 1.7974 sec
Epoch 16 : [13504 / 40150] loss : 21.930740356445312, Batch time: 1.8248 sec
Epoch 16 : [14144 / 40150] loss : 19.30422592163086, Batch time: 1.8091 sec
Epoch 16 : [



Epoch 16 : [16064 / 40150] loss : 16.196388244628906, Batch time: 1.8230 sec
Epoch 16 : [16704 / 40150] loss : 17.8514404296875, Batch time: 1.8069 sec
Epoch 16 : [17344 / 40150] loss : 22.1536808013916, Batch time: 1.7963 sec
Epoch 16 : [17984 / 40150] loss : 19.58378791809082, Batch time: 1.7871 sec
Epoch 16 : [18624 / 40150] loss : 25.387161254882812, Batch time: 1.7961 sec
Epoch 16 : [19264 / 40150] loss : 20.077302932739258, Batch time: 1.8073 sec
Epoch 16 : [19904 / 40150] loss : 27.32141876220703, Batch time: 1.8177 sec
Epoch 16 : [20544 / 40150] loss : 22.887775421142578, Batch time: 1.8139 sec
Epoch 16 : [21184 / 40150] loss : 18.911575317382812, Batch time: 1.8177 sec
Epoch 16 : [21824 / 40150] loss : 36.678955078125, Batch time: 1.7917 sec
Epoch 16 : [22464 / 40150] loss : 21.196876525878906, Batch time: 1.7777 sec
Epoch 16 : [23104 / 40150] loss : 26.73309326171875, Batch time: 1.7965 sec
Epoch 16 : [23744 / 40150] loss : 29.107816696166992, Batch time: 1.8133 sec
Epoch 16 

KeyboardInterrupt: 

- Epoch : 1, Loss : 183.76921428333628, Val_loss : 138.7342578125
- Epoch : 2, Loss : 128.2447030203683, Val_loss : 143.2067679595947
- Epoch : 3, Loss : 122.56000444965977, Val_loss : 96.83059211730956
- Epoch : 4, Loss : 87.70850903081437, Val_loss : 78.38082936604818
- Epoch : 5, Loss : 68.04693739406598, Val_loss : 72.64038126627604
- Epoch : 6, Loss : 57.276853927027304, Val_loss : 68.96054336547851
- Epoch : 7, Loss : 51.59112667961242, Val_loss : 57.317286516825355
- Epoch : 8, Loss : 47.01944582150006, Val_loss : 55.66085670471191
- Epoch : 9, Loss : 42.54086226624803, Val_loss : 53.47876875559489
- Epoch : 10, Loss : 39.80607411198723, Val_loss : 54.47281494140625
- Epoch : 11, Loss : 36.430880546569824, Val_loss : 54.80409914652507
- Epoch : 12, Loss : 33.627891385631195, Val_loss : 48.48928497411028
- Epoch : 13, Loss : 31.059670966142303, Val_loss : 51.898619048203095
- Epoch : 14, Loss : 28.95467392198599, Val_loss : 48.438780072369156
- Epoch : 15, Loss : 26.666760903255195, Val_loss : 49.81154549272755
- Epoch : 16, Loss : 24.51864461989919, Val_loss : 43.89916703067249

In [None]:
def train_loop_gender(dataloader, model, loss_fn, optimizer, epoch, device=None):
    """Train the gender model for one epoch and return the mean per-batch loss.

    Args:
        dataloader: yields `(x, y)` batches where `y['gender']` holds the 0/1 targets;
            must expose `.dataset` (used only for progress output).
        model: network being trained.
        loss_fn: callable `(prediction, target) -> loss`.
        optimizer: optimiser stepping `model`'s parameters.
        epoch: zero-based epoch index (printed as `epoch + 1`).
        device: device the batches are moved to. Defaults to the device of the
            model's parameters, so the function no longer depends on a
            notebook-level `device` variable.

    Returns:
        Sum of the batch losses divided by the number of batches.
    """
    if device is None:
        device = next((p.device for p in model.parameters()), torch.device('cpu'))

    model.train()
    size = len(dataloader.dataset)
    total_loss = 0.0
    start_time = time.time()  # epoch start

    for batch, (x, y) in enumerate(dataloader):
        batch_start_time = time.time()  # batch start
        x, y = x.to(device), y['gender'].float().to(device)
        pred = model(x)
        # Flatten both sides to (batch,): losses such as BCEWithLogitsLoss
        # require identical shapes, which a bare squeeze() breaks for a batch of one.
        loss = loss_fn(pred.reshape(-1), y.reshape(-1))

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        total_loss += loss.item()

        batch_process_time = time.time() - batch_start_time

        if batch % 10 == 0:
            processed = (batch + 1) * len(x)
            print(f'Epoch {epoch+1} : [{processed} / {size}] loss : {loss.item()}, Batch time: {batch_process_time:.4f} sec')

    average_loss = total_loss / len(dataloader)
    epoch_time = time.time() - start_time

    print(f"Epoch {epoch+1} finished, Total Epoch time: {epoch_time:.4f} sec")
    return average_loss


In [None]:
def validation_loop_gender(dataloader, model, loss_fn, device):
    """Evaluate the gender model and return the mean per-batch loss.

    Runs in eval mode without gradient tracking. Predictions and targets are
    flattened to (batch,) before the loss: with a final batch of one the model
    output can be 0-dimensional, and `BCEWithLogitsLoss` rejects a target whose
    shape differs from the input.

    Args:
        dataloader: yields `(x, y)` batches where `y['gender']` holds the 0/1 targets.
        model: network to evaluate.
        loss_fn: callable `(prediction, target) -> loss`.
        device: device the batches are moved to.

    Returns:
        Sum of the batch losses divided by the number of batches.
    """
    model.eval()
    val_loss = 0.0
    with torch.no_grad():
        for x, y in dataloader:
            x, y = x.to(device), y['gender'].float().to(device)
            pred = model(x)
            loss = loss_fn(pred.reshape(-1), y.reshape(-1))
            val_loss += loss.item()
    val_loss /= len(dataloader)
    return val_loss

In [None]:
# Train the gender model for up to 30 epochs.
# Use a dedicated early-stopping tracker: re-using the instance from the age
# training would carry over its best loss, counter and stop flag.
gender_early_stopping = EarlyStopping(patience=5, verbose=True)
gender_loss_fn = nn.BCEWithLogitsLoss()

start = time.time()
for epoch in range(30):
    gender_loss = train_loop_gender(train_loader, gender_model, gender_loss_fn, opt_gender, epoch)
    val_loss = validation_loop_gender(val_loader, gender_model, gender_loss_fn, device)
    gender_lr_scheduler.step(val_loss)
    gender_early_stopping(val_loss)

    if gender_early_stopping.early_stop:
        print("Early stopping triggered")
        save_model(epoch, gender_model, opt_gender, '/content/drive/MyDrive/DL_DATA/Model/gender_model_checkpoint.pth')
        break

    # Checkpoint files are numbered with the one-based epoch.
    save_model(epoch, gender_model, opt_gender, f'/content/drive/MyDrive/DL_DATA/Model/gender_model_checkpoint_epoch_{epoch+1}.pth')
    print(f'Epoch : {epoch+1}, Loss : {gender_loss}, Val_loss : {val_loss}')

# Report the total training time as HH:MM:SS.ss
total_time = time.time() - start
hours, rem = divmod(total_time, 3600)
minutes, seconds = divmod(rem, 60)
print("Total training time: {:0>2}:{:0>2}:{:05.2f}".format(int(hours),int(minutes),seconds))