In [2]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import matplotlib.pyplot as plt
import cv2
from PIL import Image
import os
from torchvision.datasets import ImageFolder
from torch.utils.data import DataLoader, Dataset, random_split
import torchvision.transforms as transforms
from torchinfo import summary
import torchvision.models as models
import spacy

In [3]:
# 인코딩용 CNN 모델 생성 : RESNET18 (가중치O)

# res_model = models.resnet18(weights = ( "ResNet18_Weights.DEFAULT"))
# 전결합층 변경
# res_model.fc = nn.Linear(in_features = 512, out_features = 1)


class encoderCNN(nn.Module):
    """Image encoder: a pretrained ResNet-18 with a new, trainable output layer.

    The backbone's original classification layer is replaced by a linear
    layer that projects to ``embed_size``. Every other ResNet parameter is
    frozen, so only the new layer is updated during training.

    Args:
        embed_size: Size of the feature vector produced for each image.
    """

    def __init__(self, embed_size):
        super(encoderCNN, self).__init__()
        self.resnet = models.resnet18(weights = ( "ResNet18_Weights.DEFAULT"))

        # Freeze the pretrained backbone once, at construction time. Doing
        # this inside forward() repeats the work on every batch and only
        # takes effect after the first graph has already been built.
        for param in self.resnet.parameters():
            param.requires_grad = False

        # Replace the final fully connected layer. A freshly created
        # nn.Linear has requires_grad=True, so it remains trainable.
        self.resnet.fc = nn.Linear(self.resnet.fc.in_features, embed_size)

        # Not applied in forward(); kept so the attribute stays available.
        self.dropout = nn.Dropout(0.5)
        self.relu = nn.ReLU()

    def forward(self, x):
        """Return ReLU-activated image features.

        Args:
            x: Batch of images accepted by the ResNet backbone.

        Returns:
            Tensor with one ``embed_size``-long feature vector per image.
        """
        features = self.resnet(x)
        return self.relu(features)

In [4]:
# 디코딩용 RNN 모델 생성 : LSTM

class decoderRNN(nn.Module):
    """LSTM caption decoder.

    The image feature vector is placed in front of the embedded caption
    tokens as the first time step, and the LSTM output at every step is
    projected to vocabulary-sized scores.

    Args:
        embed_size: Dimension of word embeddings (and of the image features).
        vocab_size: Number of tokens in the vocabulary.
        hidden_size: LSTM hidden state size.
        num_layers: Number of stacked LSTM layers.
    """

    def __init__(self, embed_size, vocab_size, hidden_size, num_layers):
        super().__init__()
        self.embedding = nn.Embedding(num_embeddings=vocab_size, embedding_dim=embed_size)
        self.lstm = nn.LSTM(input_size=embed_size, hidden_size=hidden_size, num_layers=num_layers)
        self.linear = nn.Linear(in_features=hidden_size, out_features=vocab_size)
        # Defined but not applied in forward().
        self.dropout = nn.Dropout(p=0.5)

    def forward(self, features, caption):
        """Score every vocabulary token at every time step.

        Args:
            features: Image features, shape (batch, embed_size).
            caption: Token indices, shape (seq_len, batch); the LSTM is
                sequence-first (batch_first is left at its default).

        Returns:
            Tensor of shape (seq_len + 1, batch, vocab_size).
        """
        image_step = features[None]              # (1, batch, embed_size)
        word_steps = self.embedding(caption)     # (seq_len, batch, embed_size)
        sequence = torch.cat([image_step, word_steps], dim=0)
        hidden_states = self.lstm(sequence)[0]
        return self.linear(hidden_states)

In [5]:
# CNN 과 RNN을 연결시키자

class CNN2RNN(nn.Module):
    """Image-captioning model that chains the CNN encoder and the LSTM decoder.

    Args:
        embed_size: Size of the image feature / word embedding vectors.
        vocab_size: Number of tokens in the vocabulary.
        hidden_size: LSTM hidden state size.
        num_layers: Number of stacked LSTM layers.
    """

    def __init__(self, embed_size, vocab_size, hidden_size, num_layers):
        super(CNN2RNN, self).__init__()
        self.encoderCNN = encoderCNN(embed_size)
        self.decoderRNN = decoderRNN(embed_size, vocab_size, hidden_size, num_layers)

    def forward(self, images, captions):
        """Encode ``images`` and decode them against ``captions``.

        Args:
            images: Batch of images for the encoder.
            captions: Token indices passed to the decoder unchanged.

        Returns:
            Whatever the decoder returns for (image features, captions).
        """
        features = self.encoderCNN(images)
        return self.decoderRNN(features, captions)

    def captionImage(self, image, vocabulary, maxlength = 50):
        """Greedily generate a caption for a single image.

        Args:
            image: Batch containing exactly one image (``.item()`` is called
                on the prediction, which requires a single element).
            vocabulary: Object whose ``itos`` maps token index -> string.
            maxlength: Maximum number of tokens to generate.

        Returns:
            List of token strings; generation stops after "<EOS>" is
            produced (it is included) or after ``maxlength`` tokens.
        """
        result_caption = []

        with torch.no_grad():
            # The image features act as the first LSTM input step.
            x = self.encoderCNN(image).unsqueeze(0)
            states = None

            for _ in range(maxlength):
                hiddens, states = self.decoderRNN.lstm(x, states)
                output = self.decoderRNN.linear(hiddens.squeeze(0))
                predicted = output.argmax(1)

                token = predicted.item()
                result_caption.append(token)

                if vocabulary.itos[token] == "<EOS>":
                    break

                # Feed the predicted token index (not the raw logits) back in:
                # nn.Embedding looks rows up by integer index.
                x = self.decoderRNN.embedding(predicted).unsqueeze(0)

        return [vocabulary.itos[i] for i in result_caption]


In [6]:
# Load spaCy's "en_core_web_sm" pipeline; Vocabulary.tokenizer_eng uses its tokenizer.
spacy_eng = spacy.load("en_core_web_sm")

In [7]:
class Vocabulary:
    """Bidirectional word <-> index mapping built from caption text.

    Indices 0-3 are reserved for the special tokens "<PAD>", "<SOS>",
    "<EOS>" and "<UNK>". Other words get an index once they have been seen
    at least ``freq_threshold`` times.

    Args:
        freq_threshold: Minimum number of occurrences a word needs before it
            is added to the vocabulary.
    """

    def __init__(self, freq_threshold):
        self.itos = {0: "<PAD>", 1: "<SOS>", 2: "<EOS>", 3: "<UNK>"}
        self.stoi = {"<PAD>": 0, "<SOS>": 1, "<EOS>": 2, "<UNK>": 3}

        self.freq_threshold = freq_threshold

    def __len__(self):
        """Return the number of index -> token entries."""
        return len(self.itos)

    @staticmethod
    def tokenizer_eng(text):
        """Split ``text`` into lower-cased tokens with the spaCy tokenizer."""
        return [tok.text.lower() for tok in spacy_eng.tokenizer(text)]

    def build_vocabulary(self, sentences):
        """Add every sufficiently frequent word in ``sentences``.

        Each word is added exactly once, at the moment its running count
        reaches ``freq_threshold``; later occurrences leave its index alone.

        Args:
            sentences: Iterable of caption strings.
        """
        # Continue after the highest index in use instead of assuming 4, so
        # existing entries are never overwritten.
        idx = len(self.itos)
        frequency = {}

        for sentence in sentences:
            for word in self.tokenizer_eng(sentence):
                frequency[word] = frequency.get(word, 0) + 1

                # The membership check prevents re-registering a word on
                # every occurrence past the threshold, which would inflate
                # the vocabulary size and leave stale itos entries.
                if word not in self.stoi and frequency[word] >= self.freq_threshold:
                    self.itos[idx] = word
                    self.stoi[word] = idx
                    idx += 1

    def numericalize(self, sentence):
        """Convert ``sentence`` to a list of token indices.

        Words missing from the vocabulary map to the "<UNK>" index.
        """
        unk = self.stoi["<UNK>"]
        return [self.stoi.get(word, unk) for word in self.tokenizer_eng(sentence)]
                    

In [15]:
# Read only columns 1 and 3 of the CSV and name them 'image' and 'caption'.
annotation = pd.read_csv("./data/encoded_data_v2.csv", usecols =[1,3])
annotation.columns = ['image','caption']
# Preview the first rows.
annotation.head()

Unnamed: 0,image,caption
0,1000092795.jpg,Two young guys with shaggy hair look at their ...
1,1000092795.jpg,Two young white males are outside near many b...
2,1000092795.jpg,Two men in green shirts are standing in a yard
3,1000092795.jpg,A man in a blue shirt standing in a garden
4,1000092795.jpg,Two friends enjoy time spent together


In [16]:
# Show the first two captions as plain Python strings.
annotation['caption'].tolist()[:2]

['Two young guys with shaggy hair look at their hands while hanging out in the yard',
 'Two young  white males are outside near many bushes']

In [None]:
class FlickrDataset(Dataset):
    """Image/caption pairs read from a CSV file plus an image directory.

    The CSV must contain a 'caption' column and an 'image' column holding
    file names relative to ``root_dir``. A ``Vocabulary`` is built from the
    loaded captions at construction time.

    Args:
        root_dir: Directory containing the image files.
        caption_path: Path of the CSV file with 'image' and 'caption' columns.
        freq_threshold: Minimum word frequency passed to ``Vocabulary``.
        transform: Optional callable applied to each loaded RGB PIL image.
        data_length: Only the first ``data_length`` CSV rows are used.
    """

    def __init__(self, root_dir="./data/flickr30k_images/", caption_path="./data/data_v2.csv", freq_threshold=5, transform=None, data_length=10000):
        self.freq_threshold = freq_threshold
        self.transform = transform
        self.root_dir = root_dir

        self.df = pd.read_csv(caption_path)[:data_length]

        self.captions = self.df['caption']
        self.images = self.df['image']

        self.vocab = Vocabulary(freq_threshold)
        self.vocab.build_vocabulary(self.captions.tolist())

    def __len__(self):
        """Return the number of (image, caption) rows."""
        return len(self.df)

    def __getitem__(self, index):
        """Return ``(image, caption_tensor)`` for the row at position ``index``.

        The caption is numericalized and wrapped in "<SOS>" ... "<EOS>".
        """
        # Positional lookup: samplers hand out positions, which only match
        # the Series labels while the index is a default RangeIndex.
        caption = self.captions.iloc[index]
        image = self.images.iloc[index]

        # convert() returns a new in-memory image, so the file handle can be
        # released as soon as the conversion is done.
        with Image.open(os.path.join(self.root_dir, image)) as raw:
            img = raw.convert("RGB")

        if self.transform:
            img = self.transform(img)

        numericalized_caption = [self.vocab.stoi["<SOS>"]]
        numericalized_caption += self.vocab.numericalize(caption)
        numericalized_caption.append(self.vocab.stoi["<EOS>"])

        return img, torch.tensor(numericalized_caption)

In [None]:
class MyCollate:
    """Collate function that batches images and pads captions to equal length.

    Args:
        pad_value: Token index used to fill captions shorter than the
            longest caption in the batch.
    """

    def __init__(self, pad_value):
        self.pad_value = pad_value

    def __call__(self, batch):
        """Combine ``(image, caption)`` samples into one batch.

        Args:
            batch: Sequence of ``(image_tensor, caption_tensor)`` pairs; all
                image tensors must share one shape.

        Returns:
            ``(images, targets)``: images stacked along a new first
            dimension, and captions padded to shape (max_len, batch_size).
        """
        img = torch.stack([item[0] for item in batch], dim=0)
        # Use the fully qualified name through the already-imported ``nn``:
        # a bare ``pad_sequence`` is never imported in this notebook.
        targets = nn.utils.rnn.pad_sequence(
            [item[1] for item in batch],
            batch_first=False,
            padding_value=self.pad_value,
        )

        return img, targets

In [None]:
# Default image preprocessing pipeline (used as get_loader's default `transform`).
transform = transforms.Compose(
        [
            # Resize to 356x356, then take a random 299x299 crop.
            transforms.Resize((356, 356)),
            transforms.RandomCrop((299, 299)),
            # Convert the PIL image to a float tensor scaled to [0, 1].
            transforms.ToTensor(),
            # Per channel (x - 0.5) / 0.5, so values end up in [-1, 1].
            # NOTE(review): the encoder loads pretrained torchvision ResNet-18
            # weights, whose documented preprocessing uses ImageNet mean/std
            # rather than 0.5/0.5 - confirm this choice is intentional.
            transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5)),
        ]
    )

In [18]:
def get_loader(root_dir="./data/flickr30k_images/", caption_path="./data/data_v2.csv", transform=transform, batch_size=32, num_workers=8, shuffle=True, pin_memory=True):
    """Build the caption dataset and a DataLoader over it.

    Args:
        root_dir: Directory containing the image files.
        caption_path: CSV file with the image names and captions.
        transform: Callable applied to each loaded image.
        batch_size: Number of samples per batch.
        num_workers: Number of DataLoader worker processes.
        shuffle: Whether to reshuffle the data every epoch.
        pin_memory: Whether the DataLoader should pin host memory.

    Returns:
        ``(loader, dataset)``: the DataLoader and the FlickrDataset behind it.
    """
    dataset = FlickrDataset(root_dir=root_dir, caption_path=caption_path, transform=transform)
    pad_value = dataset.vocab.stoi["<PAD>"]

    # Forward the caller's settings; previously these were hard-coded here,
    # so the function arguments had no effect.
    loader = DataLoader(
        dataset=dataset,
        batch_size=batch_size,
        num_workers=num_workers,
        shuffle=shuffle,
        pin_memory=pin_memory,
        collate_fn=MyCollate(pad_value),
    )

    return loader, dataset

In [19]:
# Build the DataLoader and its dataset using get_loader's default arguments.
loader, dataset = get_loader()

NameError: name 'pandas' is not defined