#### Imports 

In [9]:
from stop_words import get_stop_words
from gensim import corpora, models
import pandas as pd
import numpy as np
import string
import gensim
import glob

#### Load model

In [10]:
model = models.Word2Vec.load('./data/models/word2vec.model')
model

<gensim.models.word2vec.Word2Vec at 0x7f2d767a9e48>

#### Load and prep captions

In [11]:
whitelist = string.ascii_letters + string.digits + ' '
whitelist

'abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789 '

In [12]:
stop_words = get_stop_words('en')
stop_words[:5]

['a', 'about', 'above', 'after', 'again']

In [13]:
filter_words = ['http', 'https', 'photo', 'picture', 'image', 'insta', 'instagram', 'post']
for word in filter_words:
    stop_words.append(word)

In [14]:
def clean_caption(caption: str) -> str:
    caption = caption.replace('#', '').replace('  ', ' ')
    cleaned_caption = []
    for char in caption:
        if char in whitelist:
            cleaned_caption.append(char)
    return ''.join(cleaned_caption)

In [15]:
captions = {}

for file_name in glob.glob('./data/captions/newyork/*.txt'):
    with open(file_name, 'r') as f:
        caption_id = file_name.split('/')[-1].replace('.txt', '')
        lines = []
        for line in f.readlines():
            lines.append(line)
        caption = ' '.join(lines)
        caption = clean_caption(caption)
        captions[caption_id] = caption

#### Tokenize 

In [16]:
tokenized_captions = {}

for id_, caption in captions.items():
    caption = caption.lower()
    tokens = gensim.utils.simple_preprocess(caption)
    filtered_tokens = [token for token in tokens if not token in stop_words]
    filtered_tokens_in_vocab = [token for token in filtered_tokens if token in model.wv.key_to_index]
    tokenized_captions[id_] = filtered_tokens_in_vocab

#### Create embedding

In [17]:
vector_size = 100 # word representation size

In [18]:
caption_embedding = {}
for id_, caption in tokenized_captions.items():
    embedding = np.zeros(vector_size)
    # compute embedding of each word and the whole caption - average embedding of words
    for token in caption:
        embedding += model.wv[token]
    # normalize the embedding - divide by number of words
    if len(caption) > 1:
        embedding /= len(caption)
    # min at 0 and divide by max
    if min(embedding) < 0:
        embedding = embedding - min(embedding)
    if max(embedding) > 0:
         embedding = embedding / max(embedding)
    vec = []
    for i in range(vector_size):
        vec.append(str(embedding[i]))

    caption_embedding[id_] = ','.join(vec)

In [19]:
len(caption_embedding)

10

In [20]:
df = pd.DataFrame(list(caption_embedding.items()), columns=['id', 'caption_vec'])

In [21]:
df.head(10)

Unnamed: 0,id,caption_vec
0,1480879599445220766,"0.5555035161845882,0.9626889195468908,0.550639..."
1,1480879493756363036,"0.5589446679391084,0.4026430898918909,0.512165..."
2,1480879485913200243,"0.7945310088517241,0.2373072246507963,0.229642..."
3,1480879545439850674,"0.12139521366120073,0.7272543320605608,0.70181..."
4,1480879528443144184,"0.3936537196372196,0.1793626396890647,0.910459..."
5,1480879622614007520,"0.39641157046095477,0.47501276679717197,0.8663..."
6,1480879551018326741,"0.42926041480330285,0.37984876536332884,0.5169..."
7,1480879623192671215,"0.0,0.7998890996349065,0.39711905403346415,0.2..."
8,1480879572073167385,"0.7411503542520272,0.7737430387089678,0.467731..."
9,1480879539524935620,"0.0,0.6279793888068568,0.15500728594231702,0.6..."


In [23]:
df.to_csv('./data/embedding/caption_embedding.csv', sep='\t', index=False, header=False)