In [3]:
# Read the dataset download directory from the first line of drive-path.txt.
with open("drive-path.txt", 'r') as f:
    # readline() keeps the line terminator; strip it so the value can be used
    # directly as a filesystem path.
    download_path = f.readline().strip()

In [4]:
import urllib
import zipfile
from tqdm import tqdm

# Source: https://stackoverflow.com/a/53877507/1558946
class DownloadProgressBar(tqdm):
    """tqdm progress bar exposing a callback that takes absolute progress."""

    def update_to(self, b=1, bsize=1, tsize=None):
        """Move the bar to the absolute position ``b * bsize``.

        Args:
            b: Block count reported by the caller.
            bsize: Size of one block, in the bar's unit.
            tsize: Total size; when not ``None`` it replaces ``self.total``.
        """
        if tsize is not None:
            self.total = tsize
        # tqdm.update() takes an increment, so convert the absolute position
        # into a delta against the current counter.
        position = b * bsize
        self.update(position - self.n)

def download_data(url):
    """Download the zip archive at ``url`` and extract it into ``download_path``.

    The archive is fetched to a temporary file while a byte-level progress bar
    is shown, every member is extracted into the notebook-level
    ``download_path`` directory, and the temporary file is removed afterwards.

    Args:
        url: Address of the zip archive. Its last path component is used as
            the label of the download progress bar.
    """
    # A bare ``import urllib`` does not guarantee that the ``request``
    # submodule is loaded, so import it explicitly before using it.
    import urllib.request

    print(f"{url} 다운로드 중 ...")
    try:
        with DownloadProgressBar(unit='B', unit_scale=True,
                                 miniters=1, desc=url.split('/')[-1]) as t:
            zip_path, _ = urllib.request.urlretrieve(url, reporthook=t.update_to)

        print("압축을 푸는 중 ...")
        with zipfile.ZipFile(zip_path, "r") as f:
            # Read the member list once instead of once per use.
            names = f.namelist()
            for name in tqdm(iterable=names, total=len(names)):
                f.extract(member=name, path=download_path)
    finally:
        # urlretrieve() without a filename writes to a temporary file that
        # would otherwise be left behind; urlcleanup() deletes it, including
        # when the download or the extraction fails.
        urllib.request.urlcleanup()

In [None]:
# Download annotations_trainval2014.zip and extract it into download_path.
download_data("http://images.cocodataset.org/annotations/annotations_trainval2014.zip")

In [5]:
# Download train2014.zip and extract it into download_path.
download_data("http://images.cocodataset.org/zips/train2014.zip")

http://images.cocodataset.org/zips/train2014.zip 다운로드 중 ...


train2014.zip: 13.5GB [11:02:06, 340kB/s]                              


압축을 푸는 중 ...


100%|██████████| 82784/82784 [05:36<00:00, 245.85it/s]


In [6]:
# Download val2014.zip and extract it into download_path.
download_data("http://images.cocodataset.org/zips/val2014.zip")

http://images.cocodataset.org/zips/val2014.zip 다운로드 중 ...


val2014.zip: 6.65GB [5:18:23, 348kB/s]                               


압축을 푸는 중 ...


100%|██████████| 40505/40505 [02:15<00:00, 299.69it/s]


# Import

In [2]:
%pip install pycocotools

Collecting pycocotoolsNote: you may need to restart the kernel to use updated packages.

  Obtaining dependency information for pycocotools from https://files.pythonhosted.org/packages/24/b2/ef28a34cf6ca50b6b2f7ad81e5837ed45c252ffef22f5a704b94141ea842/pycocotools-2.0.7-cp311-cp311-win_amd64.whl.metadata
  Downloading pycocotools-2.0.7-cp311-cp311-win_amd64.whl.metadata (1.1 kB)
Downloading pycocotools-2.0.7-cp311-cp311-win_amd64.whl (85 kB)
   ---------------------------------------- 0.0/85.8 kB ? eta -:--:--
   -------------------------------------- - 81.9/85.8 kB 4.5 MB/s eta 0:00:01
   ---------------------------------------- 85.8/85.8 kB 1.2 MB/s eta 0:00:00
Installing collected packages: pycocotools
Successfully installed pycocotools-2.0.7


In [3]:
import os
import nltk
import pickle
import numpy as np
from PIL import Image
from collections import Counter
from pycocotools.coco import COCO
import matplotlib.pyplot as plt

import torch
import torch.nn as nn
import torch.utils.data as data
from torchvision import transforms
import torchvision.models as models
import torchvision.transforms as transforms
from torch.nn.utils.rnn import pack_padded_sequence

In [None]:
# Tokenizer data used by nltk.tokenize.word_tokenize. Recent NLTK releases
# load the 'punkt_tab' resource instead of the pickled 'punkt' models, so
# fetch both to keep tokenization working regardless of the installed version.
nltk.download('punkt')
nltk.download('punkt_tab')

# Build Vocab

In [None]:
class Vocab:
    """Bidirectional mapping between tokens and consecutive integer ids.

    Attributes are ``w2i`` (token -> id), ``i2w`` (id -> token) and ``index``
    (the id that the next new token will receive).
    """

    def __init__(self):
        self.w2i = {}
        self.i2w = {}
        self.index = 0

    def __call__(self, token):
        """Return the id of ``token``, or the id of ``'<unk>'`` if it is unknown.

        Raises:
            KeyError: If ``token`` is unknown and ``'<unk>'`` was never added.
        """
        if token in self.w2i:
            return self.w2i[token]
        return self.w2i['<unk>']

    def __len__(self):
        """Return the number of distinct tokens stored."""
        return len(self.w2i)

    def add_token(self, token):
        """Register ``token`` under the next free id; known tokens are ignored."""
        if token in self.w2i:
            return
        new_id = self.index
        self.w2i[token] = new_id
        self.i2w[new_id] = token
        self.index = new_id + 1

In [None]:
def build_vocabulary(json, threshold):
    """Build a ``Vocab`` from the captions of a COCO annotation file.

    Every caption is lower-cased and tokenized with NLTK's ``word_tokenize``;
    tokens seen at least ``threshold`` times are added after the four special
    tokens ``<pad>``, ``<start>``, ``<end>`` and ``<unk>``.

    Args:
        json: Annotation file handed to ``pycocotools.coco.COCO``.
        threshold: Minimum number of occurrences a token needs to be kept.

    Returns:
        The populated ``Vocab`` instance.
    """
    coco = COCO(json)
    annotation_ids = coco.anns.keys()
    total = len(annotation_ids)

    # Count how often each token occurs across all captions.
    frequencies = Counter()
    for done, ann_id in enumerate(annotation_ids, start=1):
        text = str(coco.anns[ann_id]['caption'])
        frequencies.update(nltk.tokenize.word_tokenize(text.lower()))

        if done % 1000 == 0:
            print("[{}/{}] Tokenized the captions.".format(done, total))

    # Special tokens go in first so that they receive the lowest ids.
    vocab = Vocab()
    for special in ('<pad>', '<start>', '<end>', '<unk>'):
        vocab.add_token(special)

    # Tokens seen fewer than 'threshold' times are left out.
    for word, count in frequencies.items():
        if count >= threshold:
            vocab.add_token(word)
    return vocab

In [None]:
# Build the vocabulary from the training captions and pickle it to disk.
vocab_path = './data_dir/vocabulary.pkl'
vocab = build_vocabulary(
    json='data_dir/annotations/captions_train2014.json',
    threshold=4,
)

with open(vocab_path, 'wb') as f:
    pickle.dump(vocab, f)

print("Total vocabulary size: {}".format(len(vocab)))
print("Saved the vocabulary wrapper to '{}'".format(vocab_path))