### Optimization of Build_vocab function using Concurrency Methods(Threading, Multiprocessing) to improve tokenization of captions

In [1]:
from tqdm import tqdm
import nltk
from collections import Counter
from time import time
import multiprocessing as mp
import pickle
import numpy as np
import os
from queue import Queue
from threading import Thread
from threading import Lock


In [2]:
class Vocabulary(object):
    """Simple vocabulary wrapper."""
    def __init__(self):
        self.word2idx = {}
        self.idx2word = {}
        self.idx = 0

    def add_word(self, word):
        if not word in self.word2idx:
            self.word2idx[word] = self.idx
            self.idx2word[self.idx] = word
            self.idx += 1

    def __call__(self, word):
        if not word in self.word2idx:
            return self.word2idx['<unk>']
        return self.word2idx[word]

    def __len__(self):
        return len(self.word2idx)
    
    def get_id(self, w):
        return self.word2idx[w]
    
    def encode_seq(self, l):
        return [self.word2idx[i] if i in self.word2idx else self.word2idx['<unk>'] for i in l]
    
    def get_token(self, idx):
        return self.idx2word[idx]
    
    def decode_seq(self, l):
        return [self.idx2word[i] for i in l]

### Building Vocabulary using Mulitprocessing

#### Non optimized version

In [3]:
def build_vocab(ann_file = '../flickr30k/results_20130124.token', threshold = 4):
    """Build a simple vocabulary wrapper."""
    punc_set = set([',',';',':','.','?','!','(',')'])
    counter = Counter()
    caption_list = []
    split = pickle.load(open('train_set.p', 'rb'))
    ann_file = os.path.expanduser(ann_file)
    with open(ann_file) as fh:
        for line in fh:
            img, caption = line.strip().split('\t')
            if img[:-2] in split:
                caption_list.append(caption)
    for caption in tqdm(caption_list):
        tokens = nltk.tokenize.word_tokenize(caption.lower())
        tokens = [elem for elem in tokens if elem not in punc_set] 
        counter.update(tokens)

    # If the word frequency is less than 'threshold', then the word is discarded.
    words = [word for word, cnt in counter.items() if cnt >= threshold]

    # Create a vocab wrapper and add some special tokens.
    vocab = Vocabulary()
    vocab.add_word('<pad>')
    vocab.add_word('<start>')
    vocab.add_word('<end>')
    vocab.add_word('<unk>')
    vocab.add_word('<break>')

    # Add the words to the vocabulary.
    for i, word in enumerate(words):
        vocab.add_word(word)

    return vocab

In [4]:
start_time = time()

build_vocab()

end_time = time()        
 
print("Time for Non-optimized version: %ssecs" % (end_time - start_time))


100%|██████████| 141960/141960 [00:25<00:00, 5472.37it/s]

Time for Non-optimized version: 26.20974588394165secs





#### Multiprocessing with N=4 processes

In [15]:
def build_vocab(ann_file = '../flickr30k/results_20130124.token', threshold = 4):
    """Build a simple vocabulary wrapper."""
    punc_set = set([',',';',':','.','?','!','(',')'])
    counter = Counter()
    caption_list = []
    split = pickle.load(open('train_set.p', 'rb'))
    ann_file = os.path.expanduser(ann_file)
    with open(ann_file) as fh:
        for line in fh:
            img, caption = line.strip().split('\t')
            if img[:-2] in split:
                caption_list.append(caption)
                
    pool = mp.Pool(4)
    tokens = pool.map(nltk.tokenize.word_tokenize, [caption.lower() for caption in tqdm(caption_list)])
    pool.close()
    tokens = [item for elem in tokens for item in elem]
    tokens = [elem for elem in tokens if elem not in punc_set]
    counter = Counter(tokens)

    # If the word frequency is less than 'threshold', then the word is discarded.
    words = [word for word, cnt in counter.items() if cnt >= threshold]

    # Create a vocab wrapper and add some special tokens.
    vocab = Vocabulary()
    vocab.add_word('<pad>')
    vocab.add_word('<start>')
    vocab.add_word('<end>')
    vocab.add_word('<unk>')
    vocab.add_word('<break>')

    # Add the words to the vocabulary.
    for i, word in enumerate(words):
        vocab.add_word(word)
    return vocab

In [17]:
start_time = time()

build_vocab()

end_time = time()        
 
print("Time for Multiprocessing with N=4 processes: %ssecs" % (end_time - start_time))

100%|██████████| 141960/141960 [00:00<00:00, 875697.33it/s]


Time for Multiprocessing with N=4 processes: 12.161671161651611secs


#### Threading with N=4 threads

In [10]:
def build_vocab(ann_file = '../flickr30k/results_20130124.token', threshold = 4):
    """Build a simple vocabulary wrapper."""
    punc_set = set([',',';',':','.','?','!','(',')'])
    counter = Counter()
    caption_list = []
    split = pickle.load(open('train_set.p', 'rb'))
    ann_file = os.path.expanduser(ann_file)
    with open(ann_file) as fh:
        for line in fh:
            img, caption = line.strip().split('\t')
            if img[:-2] in split:
                caption_list.append(caption)
                
    
    global tokens 
    tokens = []
    lock = Lock()

    class DownloadWorker(Thread):
        def __init__(self, queue):
            Thread.__init__(self)
            self.queue = queue
        def run(self):
            while True:
                caption = self.queue.get()
                lock.acquire()
                tokens.append(nltk.tokenize.word_tokenize(caption))
                lock.release()
                self.queue.task_done()      
    queue = Queue()
    for _ in range(4):
        worker = DownloadWorker(queue)
        worker.daemon = True
        worker.start()

    for caption in caption_list:
        queue.put(caption.lower())
    
    queue.join()
    
    tokens = [item for elem in tokens for item in elem]
    tokens = [elem for elem in tokens if elem not in punc_set]
    counter = Counter(tokens)

    # If the word frequency is less than 'threshold', then the word is discarded.
    words = [word for word, cnt in counter.items() if cnt >= threshold]

    # Create a vocab wrapper and add some special tokens.
    vocab = Vocabulary()
    vocab.add_word('<pad>')
    vocab.add_word('<start>')
    vocab.add_word('<end>')
    vocab.add_word('<unk>')
    vocab.add_word('<break>')

    # Add the words to the vocabulary.
    for i, word in enumerate(words):
        vocab.add_word(word)
    return vocab

In [13]:
start_time = time()

build_vocab()

end_time = time()        
 
print("Time for Threading with N=4 threads: %ssecs" % (end_time - start_time))


Time for Threading with N=4 threads: 36.773322105407715secs


As evident from the timings above, the most efficient method is multiprocessing with N=4 processes. It took 12.16 seconds to generate the counter word pairs (while dropping the uncommon words) as compared to non optimized version which took 26.2 seconds. 