#### Imports 

In [1]:
from stop_words import get_stop_words
from gensim import corpora, models
import pandas as pd
import numpy as np
import string
import gensim
import glob

#### Load model

In [2]:
model = models.Word2Vec.load('./data/models/word2vec.model')

#### Load and prep captions

In [3]:
whitelist = string.ascii_letters + string.digits + ' '
whitelist

'abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789 '

In [4]:
stop_words = get_stop_words('en')
stop_words[:5]

['a', 'about', 'above', 'after', 'again']

In [5]:
filter_words = ['http', 'https', 'photo', 'picture', 'image', 'insta', 'instagram', 'post']
for word in filter_words:
    stop_words.append(word)

In [6]:
def clean_caption(caption: str) -> str:
    caption = caption.replace('#', '').replace('  ', ' ')
    cleaned_caption = []
    for char in caption:
        if char in whitelist:
            cleaned_caption.append(char)
    return ''.join(cleaned_caption)

In [7]:
captions = {}

for file_name in glob.glob('./data/captions/*.txt'):
    with open(file_name, 'r') as f:
        caption_id = file_name.split('/')[-1].replace('.txt', '')
        lines = []
        for line in f.readlines():
            lines.append(line)
        caption = ' '.join(lines)
        caption = clean_caption(caption)
        captions[caption_id] = caption

#### Tokenize 

In [8]:
tokenized_captions = {}

for id_, caption in captions.items():
    caption = caption.lower()
    tokens = gensim.utils.simple_preprocess(caption)
    filtered_tokens = [token for token in tokens if not token in stop_words]
    filtered_tokens_in_vocab = [token for token in filtered_tokens if token in model.wv.key_to_index]
    tokenized_captions[id_] = filtered_tokens_in_vocab

#### Create embedding

In [9]:
vector_size = 1000 # word representation size

In [10]:
caption_embedding = {}
for id_, caption in tokenized_captions.items():
    embedding = np.zeros(vector_size)
    # compute embedding of each word and the whole caption - average embedding of words
    for token in caption:
        embedding += model.wv[token]
    # normalize the embedding - divide by number of words
    if len(caption) > 1:
        embedding /= len(caption)
    # min at 0 and divide by max
    if min(embedding) < 0:
        embedding = embedding - min(embedding)
    if max(embedding) > 0:
         embedding = embedding / max(embedding)
    vec = []
    for i in range(vector_size):
        vec.append(str(embedding[i]))

    caption_embedding[id_] = ','.join(vec)

In [11]:
len(caption_embedding)

20

In [12]:
df = pd.DataFrame(list(caption_embedding.items()), columns=['id', 'caption_vec'])

In [13]:
df.head(10)

Unnamed: 0,id,caption_vec
0,1481007530145672379,"0.3525857591226041,0.5628003472746195,0.689418..."
1,1490659882930594965,"0.3981266390220058,0.6765545631293353,0.492382..."
2,1481097035704453947,"0.5060395263183227,0.5219469383988314,0.750949..."
3,1489658491986857252,"0.6603368547307249,0.4578474946102846,0.573608..."
4,1487676925685022333,"0.7360358085075475,0.3539930954278208,0.475945..."
5,1489896267786304778,"0.6593034192456314,0.4165845340416728,0.353498..."
6,1489696141964320706,"0.42207672933397067,0.7121002365829232,0.61733..."
7,1481040658940297881,"0.6359996386674361,0.6005916804915926,0.266619..."
8,1487747485463161463,"0.540376623123047,0.5238551694830208,0.6327733..."
9,1480941788440054562,"0.4797552057907202,0.4400496271326795,0.816089..."


In [14]:
df.to_csv('./data/embeddings/caption_embeddings.csv', sep='\t', index=False, header=False)