In [1]:
import os
import pandas as pd

import torch
import torch.optim as optim
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
import torchvision.transforms as transforms

import torchvision.models as models # 이미지
# from transformers import AutoModelForSeq2SeqLM, AutoTokenizer # 텍스트
from transformers import T5Tokenizer, T5ForConditionalGeneration # 텍스트

from tqdm.auto import tqdm
from PIL import Image

In [2]:
# Pick the compute backend in priority order: CUDA, then Apple MPS, then CPU.
# The conditional expression is evaluated lazily, so the MPS probe only runs
# when CUDA is unavailable.
device = torch.device(
    'cuda' if torch.cuda.is_available()
    else 'mps' if torch.backends.mps.is_available()
    else 'cpu'
)
print(f'Using {device}')

Using cuda


In [3]:
class VQADataset(Dataset):
    """Dataset yielding one (image, tokenized question[, tokenized answer]) sample per row.

    Args:
        df: DataFrame with ``image_id`` and ``question`` columns, plus an
            ``answer`` column unless ``is_test`` is true.
        tokenizer: Callable invoked as ``tokenizer(text, truncation=True,
            padding='max_length', max_length=..., return_tensors="pt")`` that
            returns a mapping with ``input_ids`` and ``attention_mask``.
        transform: Callable applied to the RGB PIL image; expected to return a
            tensor.
        img_path: Directory that contains ``<image_id>.jpg`` files.
        is_test: When true, samples carry no ``answer`` entry.
        max_length: Fixed token length questions and answers are padded or
            truncated to. Defaults to 32, the value that used to be hard-coded.
    """

    def __init__(self, df, tokenizer, transform, img_path, is_test=False, max_length=32):
        self.df = df
        self.tokenizer = tokenizer
        self.transform = transform
        self.img_path = img_path
        self.is_test = is_test
        self.max_length = max_length

    def __len__(self):
        """Return the number of rows (samples) in the backing DataFrame."""
        return len(self.df)

    def _encode(self, text):
        """Tokenize ``text`` to exactly ``self.max_length`` tokens as PyTorch tensors."""
        return self.tokenizer(
            text,
            truncation=True,
            padding='max_length',
            max_length=self.max_length,
            return_tensors="pt",
        )

    def __getitem__(self, idx):
        """Load and preprocess the sample at positional index ``idx``.

        Returns:
            dict with ``image``, ``question`` (input ids) and
            ``question_attention_mask``; when ``is_test`` is false it also
            contains ``answer`` (input ids of the tokenized answer).
        """
        row = self.df.iloc[idx]

        img_name = os.path.join(self.img_path, row['image_id'] + '.jpg')
        # Use a context manager so the underlying file is closed as soon as the
        # pixels are decoded; convert() returns an independent in-memory image.
        with Image.open(img_name) as raw_image:
            image = raw_image.convert('RGB')

        image = self.transform(image)
        # Only drop a leading singleton batch axis. A bare squeeze() would also
        # collapse a 1-channel or 1-pixel axis and change the sample's rank.
        if image.dim() == 4 and image.size(0) == 1:
            image = image.squeeze(0)

        question = self._encode(row['question'])
        # The tokenizer is called with return_tensors="pt" on a single text, so
        # squeeze(0) removes just the batch axis it adds.
        sample = {
            'image': image,
            'question': question['input_ids'].squeeze(0),
            'question_attention_mask': question["attention_mask"].squeeze(0),
        }

        if not self.is_test:
            answer = self._encode(row['answer'])
            sample['answer'] = answer['input_ids'].squeeze(0)

        return sample

In [4]:
# Image directories for the two splits.
train_img_path = 'image/train'
test_img_path = 'image/test'

# Load the tabular data.
train_df = pd.read_csv('trainsformed_train.csv')
test_df = pd.read_csv('trainsformed_test.csv')
sample_submission = pd.read_csv('sample_submission.csv')

# Tokenizer shared by questions and answers.
tokenizer = T5Tokenizer.from_pretrained("google/flan-t5-small")
vocab_size = len(tokenizer)

# Image preprocessing: resize to 64x64, convert to a tensor, then normalize
# each channel with the mean/std constants below.
transform = transforms.Compose(
    [
        transforms.Resize((64, 64)),
        transforms.ToTensor(),
        transforms.Normalize(
            mean=[0.485, 0.456, 0.406],
            std=[0.229, 0.224, 0.225],
        ),
    ]
)

# Dataset & dataloader for training.
train_dataset = VQADataset(
    train_df,
    tokenizer,
    transform,
    train_img_path,
    is_test=False,
)
train_loader = DataLoader(train_dataset, batch_size=1, shuffle=True)

In [5]:
class VQAModel(nn.Module):
    """Visual question answering model: ResNet-18 image features fused with FLAN-T5 logits.

    The ResNet-18 output (1000 values per image) is broadcast along the answer
    sequence, concatenated with the T5 per-token logits, and projected to
    ``vocab_size`` scores per answer position by a linear classifier.

    Args:
        vocab_size: Number of output classes per answer position (the notebook
            passes ``len(tokenizer)``).
    """

    def __init__(self, vocab_size):
        super().__init__()
        if torch.cuda.is_available() : self.device = torch.device('cuda')
        elif torch.backends.mps.is_available() : self.device = torch.device('mps')
        else : self.device=torch.device('cpu')

        self.vocab_size = vocab_size

        # Explicit weights enum instead of the deprecated ``weights=True``;
        # this is the same ImageNet checkpoint (resnet18-f37072fd.pth).
        self.resnet = models.resnet18(weights=models.ResNet18_Weights.IMAGENET1K_V1)

        # NOTE(review): the text model is loaded in 8-bit while the notebook later
        # calls ``.to(device)`` and fine-tunes every parameter with AdamW.
        # Confirm both are supported for quantized weights in the installed
        # transformers/bitsandbytes versions.
        self.flan = T5ForConditionalGeneration.from_pretrained('google/flan-t5-small',load_in_8bit=True)
        # 1000 ResNet outputs + one T5 logit per entry of the T5 vocabulary.
        combined_features_size = 1000 + self.flan.lm_head.out_features

        self.classifier = nn.Linear(combined_features_size, vocab_size)

    def _shift_answer_right(self, answer):
        """Build teacher-forcing decoder inputs: start token followed by ``answer[:, :-1]``."""
        config = self.flan.config
        decoder_input_ids = answer.new_full(answer.shape, config.decoder_start_token_id)
        decoder_input_ids[:, 1:] = answer[:, :-1]
        # -100 (the conventional "ignore" label) is not a valid embedding index,
        # so map any such position back to the pad token.
        return decoder_input_ids.masked_fill(decoder_input_ids == -100, config.pad_token_id)

    def forward(self, images, question,question_attention_mask, answer):
        """Score every answer position over the vocabulary.

        Args:
            images: Image batch fed to the ResNet.
            question: Question token ids, [batch, question_len].
            question_attention_mask: Attention mask matching ``question``.
            answer: Target answer token ids, [batch, answer_len]. They are
                shifted right before being fed to the decoder, so position t
                is predicted only from answer tokens before t.

        Returns:
            Tensor of shape [batch, answer_len, vocab_size].
        """
        image_features = self.resnet(images)
        image_features = image_features.view(image_features.size(0),-1)

        # Feeding ``answer`` itself as decoder input would show the decoder the
        # very token it is asked to predict at each position; shift it instead.
        decoder_input_ids = self._shift_answer_right(answer)
        question_features = self.flan(
            input_ids=question,
            attention_mask=question_attention_mask,
            decoder_input_ids=decoder_input_ids,
        ).logits  # [batch, answer_len, t5_vocab]

        image_features = image_features.unsqueeze(1).expand(-1, question_features.size(1),-1) # [batch, answer_len, 1000]
        combined = torch.cat([image_features, question_features], dim=-1) # [batch, answer_len, 1000 + t5_vocab]
        output = self.classifier(combined) # [batch, answer_len, vocab_size]
        return output
        

In [6]:

def train(model, loader, optimizer, criterion):
    """Run one optimisation pass over ``loader`` and return the mean per-batch loss.

    Each batch is a dict with ``image``, ``question``,
    ``question_attention_mask`` and ``answer`` tensors; they are moved to the
    module-level ``device`` before the forward pass.
    """
    model.train()
    running_loss = 0

    for batch in tqdm(loader, total=len(loader)):
        inputs = {
            'images': batch['image'].to(device),
            'question': batch['question'].to(device),
            'question_attention_mask': batch['question_attention_mask'].to(device),
            'answer': batch['answer'].to(device),
        }
        targets = inputs['answer']

        optimizer.zero_grad()
        logits = model(**inputs)

        # Flatten [batch, sequence, vocab] logits and [batch, sequence] targets
        # so the criterion scores every token position.
        flat_logits = logits.view(-1, logits.size(-1))
        batch_loss = criterion(flat_logits, targets.view(-1))
        running_loss += batch_loss.item()

        batch_loss.backward()
        optimizer.step()

    return running_loss / len(loader)

In [7]:
# Build the model with one output class per tokenizer entry, then move all of
# its parameters and buffers to the selected device.
model = VQAModel(len(tokenizer))
model = model.to(device)

Downloading: "https://download.pytorch.org/models/resnet18-f37072fd.pth" to /root/.cache/torch/hub/checkpoints/resnet18-f37072fd.pth
100%|██████████| 44.7M/44.7M [00:01<00:00, 35.5MB/s]


In [8]:
# Bare expression: the notebook renders the module's repr (its layer hierarchy).
model

VQAModel(
  (resnet): ResNet(
    (conv1): Conv2d(3, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)
    (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (relu): ReLU(inplace=True)
    (maxpool): MaxPool2d(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=False)
    (layer1): Sequential(
      (0): BasicBlock(
        (conv1): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
        (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (relu): ReLU(inplace=True)
        (conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
        (bn2): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      )
      (1): BasicBlock(
        (conv1): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
        (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_run

In [9]:
os.makedirs("models", exist_ok=True)

# Criterion and optimizer.
# Answers are padded to a fixed length, so padding positions are excluded from
# the loss; otherwise they would dominate it.
criterion = nn.CrossEntropyLoss(ignore_index=tokenizer.pad_token_id)
optimizer = optim.AdamW(model.parameters(), lr=5e-5)

# Training loop
best_loss = float('inf')  # best (lowest) average epoch loss seen so far
num_epochs = 100000       # upper bound on the number of epochs to run

# Honour num_epochs instead of looping forever; interrupt the kernel to stop
# earlier. Epochs are numbered from 1, as before.
for epoch in range(1, num_epochs + 1):
    avg_loss = train(model, train_loader, optimizer, criterion)
    print(f"Epoch: {epoch}, Loss: {avg_loss:.4f}")

    # Save a checkpoint whenever the average training loss improves.
    if avg_loss < best_loss:
        torch.save(model.state_dict(), f"models/{epoch}_{avg_loss}.pth")
        best_loss = avg_loss

  0%|          | 0/359521 [00:00<?, ?it/s]

KeyboardInterrupt: 