In [1]:
import dataset
import model

import random
import numpy as np
import pandas as pd
import os
from tqdm import tqdm
from sklearn.model_selection import train_test_split

import torch
import torch.nn as nn
from torch.utils.data import DataLoader

import torchvision

In [2]:
# Pin every random number generator in use so that runs are repeatable.
random_seed = 42

# Python's built-in generator and NumPy's legacy global generator.
random.seed(random_seed)
np.random.seed(random_seed)

# PyTorch generators: CPU, the current CUDA device, and every CUDA device
# (the last one matters when several GPUs are used).
torch.manual_seed(random_seed)
torch.cuda.manual_seed(random_seed)
torch.cuda.manual_seed_all(random_seed)

# cuDNN: disable the auto-tuner and request deterministic algorithms.
# Pinning these reportedly slows training down.
torch.backends.cudnn.benchmark = False
torch.backends.cudnn.deterministic = True

# Use the first GPU when CUDA is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda:0')
else:
    device = torch.device('cpu')

In [3]:
def createDirectory(directory):
    """Create ``directory`` (including missing parents) if it does not exist.

    This is a best-effort helper: a failure is reported on stdout instead of
    being raised, so the caller keeps running.

    Args:
        directory: Path of the directory to create.

    Returns:
        None.
    """
    try:
        # exist_ok=True makes the call idempotent without a separate
        # exists() check (which is racy and also returns True for a regular
        # file, hiding the fact that no directory is available).
        os.makedirs(directory, exist_ok=True)
    except OSError as err:
        print(f"Error: Failed to create the directory. ({directory}: {err})")

In [4]:
def train_model(model_):
    """Train ``model_`` and evaluate it after every epoch.

    The function reads the following notebook-level globals, which must be
    defined before it is called: ``DL_train`` and ``DL_valid`` (data loaders),
    ``epoch_num``, ``optimizer``, ``criterion``, ``device``, ``test_name`` and
    ``model_name``.

    Side effects:
        * Each time the accuracy of the "test" phase improves, the model's
          ``state_dict`` is written to
          ``./model/{test_name}/{model_name}_state_dict/``.
        * After the last epoch the whole model object (as it is at that
          point, not the best checkpoint) is saved to
          ``./model/{test_name}/{model_name}.pt``.
        * Per-epoch metrics and a final summary are printed, and the CUDA
          cache is emptied.

    Args:
        model_: Model to train. It is called as ``model_(images)`` and its
            output is arg-maxed over dim 1 and compared with the labels.
            Presumably it already lives on ``device``; this function only
            moves the batches - verify against the caller.

    Returns:
        None.
    """
    best_test_accuracy = 0.
    best_test_loss = 9999.

    # The "test" phase is fed by the validation loader.
    dataloaders = {
        "train": DL_train,
        "test": DL_valid,
    }

    for epoch in range(1, epoch_num + 1):
        for phase in ["train", "test"]:
            is_train = phase == "train"
            if is_train:
                model_.train()
            else:
                model_.eval()

            running_loss = 0.
            running_correct = 0
            n_samples = len(dataloaders[phase].dataset)

            for images, labels in tqdm(dataloaders[phase]):
                images = images.to(device)
                labels = labels.to(device)

                # Gradients only exist (and only need clearing) when training.
                if is_train:
                    optimizer.zero_grad()

                with torch.set_grad_enabled(is_train):
                    preds = model_(images)
                    loss = criterion(preds, labels)

                    if is_train:
                        loss.backward()
                        optimizer.step()

                    # Weight the batch loss by the batch size so that the
                    # epoch value is a per-sample average.
                    running_loss += loss.item() * images.size(0)
                    preds_num = torch.argmax(preds, dim=1)
                    # .item() turns the count into a Python int; otherwise the
                    # accuracy (and the best-accuracy tracker) would be a
                    # tensor and the final summary would print as
                    # "tensor(...)".
                    running_correct += torch.sum(preds_num == labels).item()

            # End of one phase of the epoch: average over the whole dataset.
            epoch_loss = running_loss / n_samples
            epoch_acc = running_correct / n_samples

            print(f"epoch-{epoch} {phase}-데이터 셋 평균 Loss : {epoch_loss:.3f}, 평균 Accuracy : {epoch_acc:.3f}")
            if phase == "test" and best_test_accuracy < epoch_acc:
                # New best accuracy: checkpoint the weights.
                best_test_accuracy = epoch_acc
                torch.save(model_.state_dict(), f'./model/{test_name}/{model_name}_state_dict/{epoch:03d}_{best_test_accuracy:0.4f}.pt')
            if phase == "test" and best_test_loss > epoch_loss:
                best_test_loss = epoch_loss
    print("학습 종료!")
    print(f"최고 accuracy : {best_test_accuracy}, 최고 낮은 loss : {best_test_loss}")
    torch.save(model_, f'./model/{test_name}/{model_name}.pt')
    torch.cuda.empty_cache()  # release cached GPU memory

In [8]:
# --- Experiment configuration ---
test_name = 'T1'
model_name = 'age'          # == target (label column used for stratifying and training)

# createDirectory relies on os.makedirs, which also creates missing parents,
# so one call covers ./model, ./model/<test_name> and the checkpoint folder.
createDirectory(f'./model/{test_name}/{model_name}_state_dict')

df_path = "train_new.csv"
class_num = 3

train_transform = dataset.albumentations_transform()
valid_transform = dataset.BaseAugmentation()
# NOTE(review): the recorded run of this cell ended with "CUDA out of memory"
# on the very first batch; a smaller batch size may be required - confirm
# against the model and image size.
batch_size = 128
lr = 0.0001
epoch_num = 100
criterion = nn.CrossEntropyLoss()

# --- Model and optimizer ---
model_ = model.BaseModel(class_num=class_num)
model_.to(device)
optimizer = torch.optim.Adam(model_.parameters(), lr=lr)

# --- Data: split the data frame into train and validation sets ---
df = pd.read_csv(df_path)
train_df, valid_df = train_test_split(df, test_size=0.2, stratify=df[model_name], random_state=random_seed)

# Build the train / validation datasets and loaders.
DS_train = dataset.AlbumentationsDataset(train_df, target=model_name, transform=train_transform)
DS_valid = dataset.CustomDataset(valid_df, target=model_name, transform=valid_transform)
DL_train = DataLoader(DS_train, batch_size=batch_size, shuffle=True)
# Evaluation gains nothing from shuffling; a fixed order keeps validation
# batches reproducible from epoch to epoch.
DL_valid = DataLoader(DS_valid, batch_size=batch_size, shuffle=False)

train_model(model_)

  0%|          | 0/473 [00:00<?, ?it/s]


RuntimeError: CUDA out of memory. Tried to allocate 1.47 GiB (GPU 0; 31.75 GiB total capacity; 29.12 GiB already allocated; 899.50 MiB free; 29.71 GiB reserved in total by PyTorch)