In [None]:
import os
import pandas as pd
import numpy as np

import torch
import torch.optim as optim
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
import torchvision.transforms as transforms

# from transformers import ViTImageProcessor, ViTForImageClassification
import torchvision.models as models # 이미지

# from transformers import AutoModelForSeq2SeqLM, AutoTokenizer # 텍스트
from transformers import BertModel, BertTokenizer # 텍스트

from tqdm.auto import tqdm
from PIL import Image


In [None]:
if torch.cuda.is_available() : device = torch.device('cuda')
elif torch.backends.mps.is_available() : device = torch.device('mps')
else : device=torch.device('cpu')
print(f'Using {device}')

In [None]:
class VQADataset(Dataset):
    def __init__(self, df, tokenizer, transform, img_path, is_test=False):
        self.df = df
        self.tokenizer = tokenizer
        self.transform = transform
        self.img_path = img_path
        self.is_test = is_test
        
    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        row = self.df.iloc[idx]

        img_name = os.path.join(self.img_path, row['image_id'] + '.jpg')
        image = Image.open(img_name).convert('RGB')

        image = self.transform(image)

        question = row['question']
        question = self.tokenizer(question, truncation=True, padding='max_length', max_length=100, return_tensors="pt")

        if not self.is_test:
            answer = row['answer']

            answer = self.tokenizer(answer, truncation=True, padding='max_length', max_length=100, return_tensors="pt")
            
            return {
                'image': image.squeeze(),
                'question': question['input_ids'].squeeze(),
                'answer': answer['input_ids'].squeeze()
            }
        else:
            return {
                'image': image.squeeze(),
                'question': question['input_ids'].squeeze(),
            }


In [None]:
# 데이터 불러오기
train_df = pd.read_csv('trainsformed_train.csv')
test_df = pd.read_csv('trainsformed_test.csv')
sample_submission = pd.read_csv('sample_submission.csv')
train_img_path = 'image/train'
test_img_path = 'image/test'

# dataset & dataloader
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

transform = transforms.Compose([
    transforms.Resize((256, 256)),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
])

train_dataset = VQADataset(train_df, tokenizer, transform, train_img_path, is_test=False)
train_loader = DataLoader(train_dataset, batch_size=8, shuffle=True)

In [None]:
class VQAModel(nn.Module):
    def __init__(self, vocab_size):
        super(VQAModel, self).__init__()
        if torch.cuda.is_available() : self.device = torch.device('cuda')
        elif torch.backends.mps.is_available() : self.device = torch.device('mps')
        else : self.device=torch.device('cpu')

        self.vocab_size = vocab_size

        # self.vit = ViTForImageClassification.from_pretrained('google/vit-base-patch16-224')
        # self.vit_processor = ViTImageProcessor.from_pretrained('google/vit-base-patch16-224')
        self.vgg = models.resnet50(weights=True)
        self.bert = BertModel.from_pretrained('bert-base-uncased')

        self.bert.pooler = nn.Linear(768, 768)

        # combined_features_size = self.vit.classifier.out_features + self.flan.lm_head.out_features
        combined_features_size = 1000 + self.bert.pooler.out_features

        self.classifier = nn.Linear(combined_features_size, vocab_size)

    def forward(self, images, question):
        # inputs = self.vit_processor(images=images, return_tensors="pt")

        # inputs = inputs.to(self.device)
        # image_features = self.vit(**inputs).logits
        image_features = self.vgg(images)

        image_features = image_features.view(image_features.size(0),-1)

        # question_features = self.flan(question,question_attention_mask,answer).logits
        question_features = self.bert(question).last_hidden_state

        image_features = image_features.unsqueeze(1).expand(-1, question_features.size(1),-1) # [batch, sequence, 1000]

        combined = torch.cat([image_features, question_features], dim=-1) # [batch, sequence, 1000+hidden]

        output = self.classifier(combined) # [batch, vocab_size]

        return output
            

In [None]:
def train(model, loader, optimizer, criterion, accumulation_steps=8):
    model.train()
    total_loss = 0

    for i, data in tqdm(enumerate(loader), total=len(loader)):
        images = data['image'].to(device)
        question = data['question'].to(device)
        answer = data['answer'].to(device)

        optimizer.zero_grad() if i % accumulation_steps == 0 else None

        outputs = model(images=images, question=question)

        loss = criterion(outputs.view(-1, outputs.size(-1)), answer.view(-1))

        total_loss += loss.item()

        loss /= accumulation_steps
        loss.backward()

        if (i + 1) % accumulation_steps == 0:
            optimizer.step()

    avg_loss = total_loss / len(loader)
    return avg_loss


In [None]:
model = VQAModel(len(tokenizer)).to(device)


In [None]:
os.makedirs("models",exist_ok=True)
# Criterion and Optimizer
criterion = nn.CrossEntropyLoss()
optimizer = optim.AdamW(model.parameters(), lr=5e-5)

# Training loop
best_loss = float('inf')  # 초기 최적의 손실값을 무한대로 설정

epoch = 0
while True:
    epoch += 1
    avg_loss = train(model, train_loader, optimizer, criterion)
    print(f"Epoch: {epoch}, Loss: {avg_loss:.4f}")

    # 성능이 좋아질 때마다 모델 저장
    if avg_loss < best_loss:
        torch.save(model.state_dict(), f"models/{epoch}_{avg_loss}.pth")
        best_loss = avg_loss


In [None]:
PATH = "models/7_0.024020754328503088.pth"
model.load_state_dict(torch.load(PATH,map_location=device))


In [None]:
def inference(model, loader):
    model.eval()
    preds = []
    with torch.no_grad():
        for data in tqdm(loader, total=len(loader)):
            images = data['image'].to(device)
            question = data['question'].to(device)

            outputs = model(images, question) # [batch, sequence, vocab]

            _, pred = torch.max(outputs, dim=2) # values, indices = _, pred
            preds.extend(pred.cpu().numpy())

    return preds

In [20]:
# Dataset & DataLoader
test_dataset = VQADataset(test_df, tokenizer, transform, test_img_path, is_test=True)
test_loader = DataLoader(test_dataset, batch_size=8, shuffle=False)

# inference
preds = inference(model, test_loader)

# no_pad_output = []
# for pred in preds:
#     output = pred[pred != 50257] # [PAD] token 제외
#     no_pad_output.append(tokenizer.decode(output).strip()) # 토큰 id -> 토큰

  0%|          | 0/1265 [00:00<?, ?it/s]

In [39]:
s_token_ids = set(tokenizer.all_special_ids)
res = []

for pred in preds:
    output = [token for token in pred if token not in s_token_ids]  # Exclude special tokens
    res.append(tokenizer.decode(output).strip())  # Convert token ids to tokens


In [40]:
res

['no',
 'surf',
 'red',
 'white',
 'no',
 'black',
 '1',
 'no',
 'yes',
 'blue',
 'yes',
 'bathroom',
 'yes',
 '0',
 'white',
 '0 :',
 'glasses',
 'white',
 'yes',
 '1',
 'yes',
 'no',
 'yes',
 'no',
 '1',
 'elephant',
 'yes',
 'clear',
 'yes',
 'wall',
 '2',
 'san',
 '4',
 'no',
 '2',
 'no',
 'blue',
 'sunset',
 'no',
 '11 : 20',
 'no',
 'flying',
 'yes',
 '1',
 'camera',
 '2',
 'no',
 'english',
 'left',
 'skateboarding',
 'brick',
 'red',
 'yes',
 'yes',
 'abstract',
 'yes',
 'white',
 'no',
 'breakfast',
 'fork',
 'collie',
 'no',
 'chair',
 'motorcycles',
 'green',
 'cardinals',
 'antique',
 'left',
 'sink',
 'baseball',
 'yes',
 'tan',
 'yes',
 'no',
 'street',
 'blue',
 'yes',
 'apartment',
 'lunch',
 'left elephant',
 'yes',
 'no',
 'pink',
 'kitchen',
 'yes',
 'cat',
 'left',
 'cake',
 'bathroom',
 'dog',
 'red',
 'cat',
 'wwwinx',
 'snow',
 'girl',
 'birthday',
 'red',
 'yes',
 'no',
 'i',
 '2',
 'yes',
 'no',
 'lettuce',
 'grass',
 '2',
 'yes',
 'yes',
 'winter',
 'long',
 '

In [41]:
sample_submission['answer'] = res
sample_submission.to_csv('submission.csv', index=False)