# **Processing Captions 🚀**

In [None]:
# Imports
import pandas as pd
import pickle
import random
import gensim
import pickle
import os
import nltk
nltk.download('punkt')
from nltk.tokenize import word_tokenize
import re
import numpy as np

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


## **Creating final CSV with image names and corresponding Captions 📈**



In [None]:

def create_final_csv(caption_path, output_path='/CSVs/flickr8K_final.csv'):
  """Build a one-caption-per-image CSV from a raw captions file.

  The input is read as one header line followed by lines shaped like
  ``<image>.jpg,<caption>``. Only the first caption seen for each image is
  kept, captions are lower-cased, and the result is written with the columns
  ``Images_name`` and ``captions``.

  Args:
    caption_path: Path of the raw captions text file.
    output_path: Destination CSV path. Defaults to the location this
      function has always written to.

  Returns:
    None. The result is written to ``output_path``.
  """
  separator = '.jpg,'
  with open(caption_path, encoding='utf-8') as f:
    # The first line is the column header, not a caption.
    lines = [line.strip() for line in f.readlines()[1:]]

  # Split on the FIRST separator only, so a caption that itself contains
  # ".jpg," stays in one piece. Lines without the separator (e.g. blank
  # lines) carry no image/caption pair and are skipped.
  rows = [line.split(separator, 1) for line in lines if separator in line]

  df = pd.DataFrame(rows, columns=['Images_name', 'captions'])
  # Keep a single caption per image: the first one encountered.
  df = df.drop_duplicates(subset=['Images_name'])
  # The split consumed the extension, so put it back.
  df['Images_name'] = df['Images_name'].astype(str) + '.jpg'
  df['captions'] = df['captions'].str.lower()
  df = df.reset_index(drop=True)

  output_dir = os.path.dirname(output_path)
  if output_dir:
    os.makedirs(output_dir, exist_ok=True)
  df.to_csv(output_path, index=False)

# Build the one-caption-per-image CSV from the raw captions file.
caption_path = '/Captions/captions.txt'
create_final_csv(caption_path)

## **Cleaning captions: removing punctuation, stop words, spaces, etc. 🔡**

In [None]:
def clean_and_tokenize_comments_for_image(comment):
    """Normalise captions and split each one into a list of tokens.

    For every caption: the characters in ``punctuation`` are deleted, hyphens
    are replaced by a space unless they sit between a four-digit number and a
    two- or four-digit number, the text is lower-cased and tokenized with
    ``word_tokenize``, and the stop words are dropped.

    Args:
        comment: Iterable of captions. A single string is treated as one
            caption. ``None`` / NaN entries (e.g. empty CSV cells) are
            treated as empty captions.

    Returns:
        A list with one list of tokens per input caption, in input order.
    """
    stop_words = {'a', 'and', 'of', 'to'}
    punctuation = r"""!"#$%&'()*+,./:;<=>?@[\]^_`…’{|}~"""
    # Built once instead of once per caption; maps every punctuation
    # character to "delete".
    strip_punctuation = str.maketrans('', '', punctuation)
    hyphen = re.compile(r"-(?:(?<!\b[0-9]{4}-)|(?![0-9]{2}(?:[0-9]{2})?\b))")

    # Iterating a bare string would yield characters, not captions.
    if isinstance(comment, str):
        comment = [comment]

    sentences = []
    for caption in comment:
        # NaN is the only float that is not equal to itself.
        if caption is None or (isinstance(caption, float) and caption != caption):
            caption = ''
        text = str(caption).translate(strip_punctuation)
        text = hyphen.sub(' ', text)  # replace with space

        tokens = [t for t in word_tokenize(text.lower()) if t not in stop_words]
        sentences.append(tokens)
    return sentences

## **Generating vectors of captions using Word2Vec 🔢**

In [None]:
def create_feature_vectors_for_single_comment(word2vec_model, cleaned_comments, image_names):
    """Turn each tokenized caption into one vector by averaging word vectors.

    Args:
        word2vec_model: Keyed-vectors object supporting ``model[word]``
            (raising ``KeyError`` for unknown words). Its ``vector_size`` is
            used when present, otherwise 300 is assumed.
        cleaned_comments: Iterable of token lists, one per caption.
        image_names: Iterable of image names, paired positionally with
            ``cleaned_comments`` (pairing stops at the shorter of the two).

    Returns:
        ``(image_list, vectors)`` where ``image_list`` is the list of image
        names and ``vectors`` is a float32 array with one row per caption.
        A caption with no tokens gets an all-zero row instead of NaN.
    """
    dim = getattr(word2vec_model, 'vector_size', 300)
    vocabulary = None  # resolved lazily, only if an unknown word shows up
    vectorized_list = []
    image_list = []

    for comments, image in zip(cleaned_comments, image_names):
        # Collect in a list: np.append would copy the whole array per word.
        word_vectors = []
        for word in comments:
            try:
                word_vectors.append(word2vec_model[word])
            except KeyError:
                print(word)
                if vocabulary is None:
                    # gensim 4 exposes ``index_to_key``; older releases
                    # expose ``index2entity``.
                    vocabulary = getattr(word2vec_model, 'index_to_key', None)
                    if vocabulary is None:
                        vocabulary = word2vec_model.index2entity
                # Unknown words are replaced by a random vocabulary vector.
                word_vectors.append(word2vec_model[random.choice(vocabulary)])

        if word_vectors:
            # Average in float64, then downcast.
            mean_vector = np.mean(np.asarray(word_vectors, dtype='float64'), axis=0).astype('float32')
        else:
            # The mean of zero vectors is NaN (with a RuntimeWarning); use zeros.
            mean_vector = np.zeros(dim, dtype='float32')

        vectorized_list.append(mean_vector)
        image_list.append(image)

    return image_list, np.array(vectorized_list)

## **Generating embeddings and saving to pickle  🗃️**

In [None]:
def create_sentence_embeddings(csv_path='/CSVs/flickr8K_final.csv',
                               model_path='/word2vec_pretrained_model/GoogleNews-vectors-negative300.bin',
                               output_path='/Pickles/flickr8k_embeddings.p'):
    """Embed every caption in the CSV and pickle an image-name -> vector dict.

    Reads the ``captions`` and ``Images_name`` columns from ``csv_path``,
    tokenizes the captions with ``clean_and_tokenize_comments_for_image``,
    averages word vectors per caption with
    ``create_feature_vectors_for_single_comment`` and pickles the resulting
    dictionary to ``output_path``.

    Args:
        csv_path: CSV with ``Images_name`` and ``captions`` columns. The
            default is the absolute path that ``create_final_csv`` writes to,
            not a path relative to the working directory.
        model_path: Binary word2vec file loaded via gensim ``KeyedVectors``.
        output_path: Destination of the pickled dictionary.

    Returns:
        None.
    """
    df = pd.read_csv(csv_path)
    model = gensim.models.KeyedVectors.load_word2vec_format(model_path, binary=True)
    cleaned_captions = clean_and_tokenize_comments_for_image(df['captions'].values)
    image_names = df['Images_name'].values
    print('Done tokenizing....')
    images, vectors = create_feature_vectors_for_single_comment(model, cleaned_captions, image_names)
    word_vector_dict = dict(zip(images, vectors))
    # Context manager so the file is flushed and closed even if dump() raises.
    with open(output_path, "wb") as f:
        pickle.dump(word_vector_dict, f)
    print('Done')

# Run the full pipeline: load the CSV, embed every caption, and pickle the result.
create_sentence_embeddings()

Done tokenizing....
atvs
atvs
grey
grey
bloe
grey
12
diveboard
grey
grey
cathing
stonesign
panelling
medatative
grey
starbuck
grey
razzling
grey
angerly
vike
attrative
grey
chasseing
foggyday
grey
grey
contracption
grey
kildare
groucho
campflauge
hawaiin
grey
vuitton
cappedhills
grey
grey
grey
ripstik
grey
broen
grey
froup
bouncey
furocious
buddist
28
19
catc
grey
grey
busstop
darked
fton
grey
outstreached
grey
lilypads
grey
medow
grey
grey
streetpole


  out=out, **kwargs)
  ret, rcount, out=ret, casting='unsafe', subok=False)


demonstarting
tobaggons
budweiser
clibing
seaguls
colourful
25
standind
grey
swaetshirt
frolicks
tourquoise
grey
bikina
saroog
surfboarder
perforced
colourfully
worshipping
grey
horro
riwal
footbride
surfboarder
bicylist
parasurfer
biek
drak
underhang
surfboarder
hudge
throughwindow
grey
ractrack
kingsworth
rollerskater
rollerskater
christmastime
basett
biscut
grey
telephot
moustache
hurridly
downsteps
colourful
dacshund
93
lionist
grey
waing
equpitment
30
halway
casque
obligatoire
outstreached
griding
seedoo
grey
grey
dooorway
dirtbikers
battons
indescript
grey
outstreached
528
tongee
retreiver
outstreached
grey
rodderick
fronmt
281
unner
grey
skislope
grey
dupar
intertube
grey
penske
grey
grey
colourful
headwraps
hdr
stiped
grey
obsured
bunchh
grey
waterskies
waterskies
aggitates
grey
grey
iove
outfir
shelton
grey
boogieboard
brighty
tophats
facepaintings
surfboarder
10
gaurdian
midpitch
woooden
plungles
rollskating
aligator
streght
swimmies
corgie
orangesunset
dandylions
moustache
g