In [1]:
import string

In [2]:
def load_doc(filename, encoding=None):
    """Read a whole text file and return its contents as a single string.

    Args:
        filename: Path of the file to read.
        encoding: Text encoding forwarded to ``open``. The default ``None``
            lets ``open`` pick the platform default, which is what this
            function did before the parameter existed.

    Returns:
        The complete file contents as a ``str``.

    Raises:
        OSError: If the file cannot be opened or read.
    """
    # The context manager closes the handle even when read() raises,
    # which the manual open()/close() pair did not guarantee.
    with open(filename, 'r', encoding=encoding) as file:
        return file.read()


In [3]:
def load_descriptions(doc):
    """Group caption lines by image identifier.

    Each non-empty line of ``doc`` is split on whitespace. The first token
    is the image reference; everything before its first ``.`` becomes the
    image id (so ``name.jpg#0`` maps to ``name``). The remaining tokens are
    re-joined with single spaces to form the caption.

    Args:
        doc: Full text of the caption file, one caption per line.

    Returns:
        A dict mapping each image id to the list of its captions, in the
        order they appear in ``doc``.
    """
    mapping = dict()
    for line in doc.split('\n'):
        tokens = line.split()
        # Require an id plus at least one caption word. Testing the token
        # count (rather than the raw line length) also skips whitespace-only
        # lines, which would otherwise fail on tokens[0] with IndexError,
        # and lines that carry an id but no caption text.
        if len(tokens) < 2:
            continue
        image_id = tokens[0].split('.')[0]
        image_desc = ' '.join(tokens[1:])
        mapping.setdefault(image_id, []).append(image_desc)
    return mapping

In [4]:
def clean_descriptions(descriptions):
    """Normalise every caption in ``descriptions`` in place.

    For each whitespace-separated word the caption is lower-cased and has
    all ``string.punctuation`` characters removed. Words that end up one
    character or shorter, or that are not purely alphabetic, are dropped.
    The surviving words are joined with single spaces and written back into
    the same list slot.

    Args:
        descriptions: Dict mapping an image id to a list of caption strings.
            The lists are modified in place.

    Returns:
        None.
    """
    strip_punct = str.maketrans('', '', string.punctuation)
    for captions in descriptions.values():
        for idx, caption in enumerate(captions):
            kept = []
            for raw in caption.split():
                token = raw.lower().translate(strip_punct)
                # Keep only multi-character, purely alphabetic tokens.
                if len(token) > 1 and token.isalpha():
                    kept.append(token)
            captions[idx] = ' '.join(kept)


In [5]:
def to_vocabulary(descriptions):
    """Collect the distinct words used across all captions.

    Args:
        descriptions: Dict mapping an image id to a list of caption strings.

    Returns:
        A set containing every whitespace-separated word that occurs in any
        caption.
    """
    vocab = set()
    for captions in descriptions.values():
        for caption in captions:
            vocab.update(caption.split())
    return vocab

In [6]:
def save_descriptions(descriptions, filename, encoding=None):
    """Write all captions to a text file, one ``<image_id> <caption>`` per line.

    Lines are separated by ``'\\n'`` with no trailing newline after the last
    one. An existing file at ``filename`` is overwritten.

    Args:
        descriptions: Dict mapping an image id to a list of caption strings.
        filename: Path of the file to write.
        encoding: Text encoding forwarded to ``open``. The default ``None``
            lets ``open`` pick the platform default, which is what this
            function did before the parameter existed.

    Raises:
        OSError: If the file cannot be opened or written.
    """
    lines = [
        key + ' ' + desc
        for key, desc_list in descriptions.items()
        for desc in desc_list
    ]
    # The context manager flushes and closes the handle even when write()
    # raises, which the manual open()/close() pair did not guarantee.
    with open(filename, 'w', encoding=encoding) as file:
        file.write('\n'.join(lines))

In [7]:
# End-to-end caption preparation: load, parse, clean, summarise, save.
# Path is relative to the notebook's working directory.
filename = 'Flickr8k_text/Flickr8k.token.txt'
doc = load_doc(filename)
# Parse the raw text into {image_id: [caption, ...]}.
descriptions = load_descriptions(doc)
# len() of the mapping is the number of distinct image ids, not of captions.
print('Loaded: %d ' % len(descriptions))
# Mutates `descriptions` in place (returns None), so the raw captions are
# no longer available from this variable after this line.
clean_descriptions(descriptions)
# The vocabulary is built from the cleaned captions.
vocabulary = to_vocabulary(descriptions)
print('Vocabulary Size: %d' % len(vocabulary))
# Writes one "<image_id> <caption>" line per caption to the working directory.
save_descriptions(descriptions, 'descriptions.txt')

Loaded: 8092 
Vocabulary Size: 8763


In [8]:
# Display the full vocabulary set for inspection. Sets are unordered, so the
# listing order is arbitrary and may differ between runs.
vocabulary

{'jockeys',
 'desks',
 'amuseument',
 'throughwindow',
 'cross',
 'sweatband',
 'jacked',
 'bearer',
 'winnie',
 'pops',
 'rack',
 'takeing',
 'faded',
 'block',
 'ox',
 'asian',
 'barechested',
 'handbag',
 'goucho',
 'props',
 'frisbees',
 'patrollers',
 'woamn',
 'landscape',
 'naked',
 'offering',
 'nugent',
 'socks',
 'fellows',
 'sheepdogs',
 'clemson',
 'plling',
 'bathed',
 'scottish',
 'giant',
 'sunshade',
 'crumb',
 'sunflowers',
 'bison',
 'equipments',
 'diving',
 'orca',
 'hiviz',
 'haystack',
 'awning',
 'chins',
 'roadside',
 'interviewing',
 'warming',
 'extravagent',
 'watercraft',
 'fancy',
 'tribal',
 'impact',
 'kickflip',
 'jugs',
 'policeman',
 'males',
 'headlight',
 'headgear',
 'pursues',
 'player',
 'spirit',
 'earrings',
 'perches',
 'vehicles',
 'completing',
 'ponchos',
 'masses',
 'subway',
 'scarily',
 'pig',
 'farmers',
 'chutes',
 'shells',
 'backstage',
 'blurred',
 'social',
 'headcover',
 'progressively',
 'shiner',
 'fabric',
 'wheel',
 'amplifier'