In [1]:
import os
import sys
import cv2
import tqdm
import time
import spacy 
import random
import scipy.io
import itertools
import numpy as np
from math import ceil
import pandas as pd
from itertools import chain
from tqdm.contrib import tzip
import matplotlib.pyplot as plt
from skimage.io import imread
from scipy.ndimage.filters import gaussian_filter
from sklearn.model_selection import train_test_split

In [2]:
import torch
from torch.nn.utils.rnn import pad_sequence 
import torchvision.transforms as transforms

import torch
import torch.nn as nn
import torchvision.models as models
from torch.utils.data import DataLoader
from tqdm import tqdm
import torch.nn as nn
import torch.optim as optim
from torch.utils.tensorboard import SummaryWriter

In [3]:
# Dataset locations, relative to the notebook's working directory.
# captions_path: caption table read with pandas.read_csv (first column image file name,
#                second column caption text).
# images_path:   directory whose entries are all read as images.
# NOTE(review): the directory name suggests the Flickr8k dataset — confirm against the local checkout.
captions_path = "../../datasets/part1/captions.txt"
images_path = "../../datasets/part1/Flicker8k_Dataset/"

In [4]:
class Dataset(torch.utils.data.Dataset):
    """Map-style dataset pairing each image with one numericalized caption.

    Args:
        images: indexable collection of images.
        captions: indexable collection of captions, aligned with ``images``;
            each entry must be convertible with ``torch.tensor``.
        transform: optional callable applied to the image on access, or ``None``.
    """

    def __init__(self, images, captions, transform):
        self.X = images
        self.y = captions
        self.transform = transform

    def __len__(self):
        """Number of samples, taken from the image collection."""
        return len(self.X)

    def __getitem__(self, index):
        """Return ``(image, caption_tensor)`` for the sample at ``index``."""
        caption, image = self.y[index], self.X[index]

        # Apply the transform (when configured) before building the caption tensor.
        sample = image if self.transform is None else self.transform(image)

        return sample, torch.tensor(caption)

In [5]:
class DataUtils():
    """Load an image-captioning dataset into memory and prepare it for training.

    On construction the class reads every image in ``images_path``, reads the
    caption table at ``captions_path``, builds a word vocabulary from the
    captions and converts every caption into a fixed-length list of token ids.

    Attributes set by ``__init__``:
        images: list of loaded images.
        captions: list (aligned with ``images``) of lists of caption strings.
        vocabulary: dict mapping token id -> token string.
        rev_vocabulary: dict mapping token string -> token id.
        numericalize_captions: list (aligned with ``images``) of lists of
            padded token-id lists, one per caption.
    """

    def __init__(self, captions_path, images_path, min_word_frequency = 2, max_cap_length = 24):
        """Load the data and build the vocabulary.

        Args:
            captions_path: path of the caption CSV (first column image file
                name, second column caption text).
            images_path: directory containing the image files.
            min_word_frequency: a word enters the vocabulary once it has been
                seen this many times.
            max_cap_length: fixed length (including <SOS>/<EOS>) every
                numericalized caption is padded or truncated to.

        Raises:
            ValueError: if ``max_cap_length`` is smaller than 1.
        """
        if max_cap_length < 1:
            raise ValueError("max_cap_length must be at least 1")

        self.min_word_frequency = min_word_frequency
        self.max_cap_length = max_cap_length
        self.english_tokenizer = spacy.load("en_core_web_sm")

        print("loading images ...")
        images = self.image_loader(images_path)
        print("loading captions ...")
        captions = self.caption_loader(captions_path)

        self.images = []
        self.captions = []

        # Only keep images that have at least one caption; an image without
        # captions cannot be used and previously aborted loading with KeyError.
        skipped = 0
        for key, image in images.items():
            if key not in captions:
                skipped += 1
                continue
            self.images.append(image)
            self.captions.append(captions[key])
        if skipped:
            print("skipped", skipped, "images without captions")

        print("creating vocabulary ...")
        self.create_vocabulary(self.captions)

        self.numericalize_captions = self.captions_numericalizer(self.captions)

        print("images:", len(self.images), "captions:",  len(self.captions), "Vocab:", len(self.vocabulary))

    @staticmethod
    def _print_progress(done, total):
        """Print ``done / total`` as a percentage on a single, overwritten line."""
        print(str(round(done / total * 100, 2)) + '%', end="\r")

    @staticmethod
    def _image_key(file_name):
        """Return the part of ``file_name`` before its first dot.

        Used for both image files and caption rows so the two always agree.
        """
        return file_name.split('.')[0]

    def captions_numericalizer(self, caps):
        """Convert nested caption strings into nested padded token-id lists.

        Args:
            caps: list of lists of caption strings (one inner list per image).

        Returns:
            A list with the same nesting where every caption is replaced by
            its padded/truncated token-id list.
        """
        return [
            [self.pad(self.numericalize_caption(caption)) for caption in image_caps]
            for image_caps in caps
        ]

    def create_vocabulary(self, all_caps):
        """Build ``vocabulary`` / ``rev_vocabulary`` from the given captions.

        Ids 0-3 are reserved for <PAD>, <SOS>, <EOS> and <UNK>. Every other
        token receives the next free id at the moment its running count
        reaches ``min_word_frequency``.

        Args:
            all_caps: list of lists of caption strings.
        """
        self.vocabulary = {0: "<PAD>", 1: "<SOS>", 2: "<EOS>", 3: "<UNK>"}
        self.rev_vocabulary = {"<PAD>": 0, "<SOS>": 1, "<EOS>": 2, "<UNK>": 3}
        min_word_frequency = self.min_word_frequency

        frequencies = {}
        idx = 4

        total = len(all_caps)
        for done, sentence_list in enumerate(all_caps, start=1):
            for sentence in sentence_list:
                for word in self.tokenizer_eng(sentence):
                    frequencies[word] = frequencies.get(word, 0) + 1

                    # Equality (not >=) so each word is registered exactly once.
                    if frequencies[word] == min_word_frequency:
                        self.rev_vocabulary[word] = idx
                        self.vocabulary[idx] = word
                        idx += 1
            self._print_progress(done, total)
        print('---done!---')

    def pad(self, sequence):
        """Force ``sequence`` to exactly ``max_cap_length`` token ids.

        Longer sequences are cut and their last kept position is overwritten
        with <EOS>; shorter ones are extended in place with <PAD>.

        Args:
            sequence: list of token ids.

        Returns:
            The fixed-length list of token ids.
        """
        max_cap_length = self.max_cap_length
        if len(sequence) > max_cap_length:
            sequence = sequence[:max_cap_length]
            sequence[max_cap_length - 1] = self.rev_vocabulary["<EOS>"]
        else:
            missing = max_cap_length - len(sequence)
            sequence.extend([self.rev_vocabulary["<PAD>"]] * missing)
        return sequence

    def tokenizer_eng(self, text):
        """Split ``text`` with the spaCy tokenizer and lower-case every token."""
        return [tok.text.lower() for tok in self.english_tokenizer.tokenizer(text)]

    def numericalize(self, text):
        """Tokenize ``text`` and map each token to its id (<UNK> if unknown)."""
        unknown = self.rev_vocabulary["<UNK>"]
        return [self.rev_vocabulary.get(token, unknown) for token in self.tokenizer_eng(text)]

    def reverse_numericalize(self, encoded):
        """Turn a sequence of token ids back into a space-separated sentence.

        Decoding stops at the first <EOS>. The first <SOS> and every <PAD>
        are dropped; ids missing from the vocabulary decode to <UNK>.

        Args:
            encoded: iterable of token ids. Elements exposing ``.item()``
                (e.g. 0-d torch tensors or numpy scalars) are converted to
                plain Python scalars first, because tensors hash by identity
                and would otherwise never match an integer vocabulary key.

        Returns:
            The decoded sentence as a string (empty if nothing remains).
        """
        words = []
        for token in encoded:
            key = token.item() if hasattr(token, "item") else token
            word = self.vocabulary.get(key, self.vocabulary[3])
            if word == '<EOS>':
                break
            words.append(word)

        if '<SOS>' in words:
            words.remove('<SOS>')

        return ' '.join(word for word in words if word != '<PAD>')

    def numericalize_caption(self, caption):
        """Return the token ids of ``caption`` wrapped in <SOS> ... <EOS>."""
        return (
            [self.rev_vocabulary["<SOS>"]]
            + self.numericalize(caption)
            + [self.rev_vocabulary["<EOS>"]]
        )

    def get_images_and_captions(self, test_size = 0.2):
        """Pick one random caption per image and split into train/test sets.

        Args:
            test_size: forwarded to ``train_test_split``.

        Returns:
            ``(im_train, im_test, cap_train, cap_test)``. The same values are
            stored on the instance; the caption training split is available
            under both ``cap_trin`` (historical spelling) and ``cap_train``.
        """
        # Draw the index from the actual number of captions of each image
        # instead of assuming there are always exactly five.
        sample_captions = [
            image_caps[np.random.randint(len(image_caps))]
            for image_caps in self.numericalize_captions
        ]

        self.im_train, self.im_test, self.cap_trin, self.cap_test =\
            train_test_split(self.images, sample_captions, test_size = test_size, random_state=42)
        self.cap_train = self.cap_trin
        return self.im_train, self.im_test, self.cap_trin, self.cap_test

    def image_loader(self, images_path):
        """Read every file in ``images_path`` with ``imread``.

        Images are kept at their native resolution.

        Args:
            images_path: directory containing the image files (with or
                without a trailing path separator).

        Returns:
            dict mapping the file name up to its first dot -> loaded image.
        """
        images = {}
        # os.listdir returns entries in arbitrary order; sorting keeps the
        # image order (and therefore the seeded train/test split) stable.
        file_names = sorted(os.listdir(images_path))
        total = len(file_names)
        for done, file_name in enumerate(file_names, start=1):
            images[self._image_key(file_name)] = imread(os.path.join(images_path, file_name))
            self._print_progress(done, total)
        print("---done!---")
        return images

    def caption_loader(self, captions_path):
        """Read the caption CSV and group captions by image.

        Args:
            captions_path: CSV file whose first column is the image file name
                and whose second column is the caption text.

        Returns:
            dict mapping the image file name up to its first dot -> list of
            caption strings, in file order.
        """
        captions = {}
        table = pd.read_csv(captions_path)
        total = len(table)
        for done, row in enumerate(table.itertuples(index=False, name=None), start=1):
            captions.setdefault(self._image_key(row[0]), []).append(row[1])
            self._print_progress(done, total)
        print("---done!---")
        return captions
        

In [6]:
def imshow(img, t = ''):
    """Show the first image of a batch with matplotlib.

    Args:
        img: torch tensor indexed as ``img[0]`` and then transposed with
            ``(1, 2, 0)``, i.e. a 4-D batch whose first element is
            presumably channel-first (C, H, W) — confirm against the caller.
        t: title drawn above the image.
    """
    # .numpy() refuses tensors that require grad or live on an accelerator,
    # so detach and move to host memory first (no-op for plain CPU tensors).
    img = img.detach().cpu()
    # Maps values from [-1, 1] to [0, 1]; presumably undoes a
    # Normalize(mean=0.5, std=0.5) transform — verify against the pipeline.
    img = img / 2 + 0.5
    npimg = img.numpy()
    plt.imshow(np.transpose(npimg[0], (1, 2, 0)))
    plt.title(t)
    plt.show()