In [5]:
# annotations_path is the path to vizwiz/annotations folder.
import csv
import json
import os
import string
from collections import Counter
from itertools import dropwhile
from typing import Dict, List

from textblob import TextBlob

from paths import annotations_path

In [6]:
def collect_words(split: str) -> Dict:
    """Collect the noun phrases found in the captions of every image in a split.

    Reads ``<annotations_path>/<split>.json``. The file must hold an
    ``'annotations'`` list whose items carry ``'image_id'`` and ``'caption'``
    keys. Noun phrases are extracted from each caption with TextBlob.

    Args:
        split: Base name of the annotation file without the ``.json``
            extension, e.g. ``'train'`` or ``'val'``.

    Returns:
        A dict mapping each image id to the set of noun phrases found across
        all captions of that image. An image whose captions yield no noun
        phrase still gets an entry, mapped to an empty set.

    Raises:
        FileNotFoundError: If the annotation file does not exist.
        KeyError: If the JSON content lacks one of the expected keys.
    """
    # Join the pieces instead of concatenating them, so the folder path works
    # with or without a trailing separator.
    json_path = os.path.join(annotations_path, f'{split}.json')

    # JSON is UTF-8; do not depend on the platform's default encoding.
    with open(json_path, encoding='utf-8') as json_file:
        data = json.load(json_file)

    img_tokens = dict()
    for item in data['annotations']:
        # setdefault creates the per-image set on first sight of the id, and
        # the set deduplicates phrases repeated across captions.
        img_tokens.setdefault(item['image_id'], set()).update(
            TextBlob(item['caption']).noun_phrases
        )

    return img_tokens

In [3]:
def count_tokens(tokens: dict, threshold: int) -> Counter:
    """Count how often each token occurs and keep only the frequent ones.

    Args:
        tokens: Mapping whose values are iterables of tokens. Every element
            of every value contributes one occurrence to that token's count.
        threshold: Minimum count a token needs in order to be kept.

    Returns:
        A Counter holding only the tokens whose count is at least
        ``threshold``.
    """
    # Tally every token across all value collections in a single pass.
    frequencies = Counter(
        token for subset in tokens.values() for token in subset
    )
    # Rebuild the counter from the entries that reach the threshold.
    return Counter(
        {token: n for token, n in frequencies.items() if n >= threshold}
    )


In [7]:
# Noun phrases per image for the training split (image id -> set of phrases).
x = collect_words('train')

In [11]:
# Noun phrases per image for the validation split.
y = collect_words('val')
# Combine both splits into one mapping. Where an image id occurs in both, the
# validation entry replaces the training one (the two sets are not unioned).
split = dict(x)
split.update(y)

In [14]:
# A noun phrase must occur in at least this many training images to be kept.
MIN_IMAGE_COUNT = 15
count = count_tokens(x, MIN_IMAGE_COUNT)
# Keywords from most to least frequent; the stable sort keeps equally
# frequent phrases in the counter's own order.
keywords = sorted(count, key=count.get, reverse=True)

In [15]:
# Write the keywords to disk, one per row, in the order of `keywords`.
# newline='' lets the csv module control line endings itself.
with open('keywords_new_train.csv', 'w', newline='') as out_file:
    csv.writer(out_file).writerows([kw] for kw in keywords)