#### Prerequisites

In [1]:
%%capture

!pip install gensim==4.0.1
!pip install python-Levenshtein
!pip install stop-words

#### Imports 

In [2]:
from stop_words import get_stop_words
import gensim
import string
import glob

#### Essentials

We will keep only letters and digits of the caption and remove other symbols, emojis, etc.

In [3]:
whitelist = string.ascii_letters + string.digits + ' '
whitelist

'abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789 '

In [4]:
stop_words = get_stop_words('en')
stop_words[:5]

['a', 'about', 'above', 'after', 'again']

In [5]:
filter_words = ['http', 'https', 'photo', 'picture', 'image', 'insta', 'instagram', 'post']
for word in filter_words:
    stop_words.append(word)

#### Load and prep captions

In [6]:
def clean_caption(caption: str) -> str:
    caption = caption.replace('#', '').replace('  ', ' ')
    cleaned_caption = []
    for char in caption:
        if char in whitelist:
            cleaned_caption.append(char)
    return ''.join(cleaned_caption)

In [7]:
captions = []

for file_name in glob.glob('./data/captions/*.txt'):
    with open(file_name, 'r') as f:
        lines = []
        for line in f.readlines():
            lines.append(line)
        caption = ' '.join(lines)
        caption = clean_caption(caption)
        captions.append(caption)

In [8]:
captions[0]

'Blind idealism is reactionary scary DEADLY  Barbara Kruger'

#### Tokenize

In [9]:
tokenized_captions = []
for caption in captions:
    caption = caption.lower()
    tokens = gensim.utils.simple_preprocess(caption)
    filtered_tokens = [token for token in tokens if not token in stop_words]
    tokenized_captions.append(filtered_tokens)

In [10]:
len(tokenized_captions)

20

#### Training parameters

In [11]:
vector_size = 1000 # word representation size
min_count = 1 # discard words with less than 1 appearances
epochs = 10 # iterate over the training corpus x times (train for x epochs)
window = 8 # words window used during training
training_cores = 8 # number of CPU cores used to train the model (based on chosen instance type)

#### Train model

In [12]:
model = gensim.models.Word2Vec(tokenized_captions, 
                               vector_size=vector_size, 
                               min_count=min_count, 
                               workers=training_cores, 
                               epochs=epochs, 
                               window=window)

In [13]:
model.__dict__

{'vector_size': 1000,
 'workers': 8,
 'epochs': 10,
 'train_count': 1,
 'total_train_time': 0.0264550908759702,
 'batch_words': 10000,
 'sg': 0,
 'alpha': 0.025,
 'min_alpha': 0.0001,
 'window': 8,
 'random': RandomState(MT19937) at 0x7FC41865F678,
 'hs': 0,
 'negative': 5,
 'ns_exponent': 0.75,
 'cbow_mean': 1,
 'compute_loss': False,
 'running_training_loss': 0.0,
 'min_alpha_yet_reached': 0.0025899999999999986,
 'corpus_count': 20,
 'corpus_total_words': 394,
 'max_final_vocab': None,
 'max_vocab_size': None,
 'min_count': 1,
 'sample': 0.001,
 'sorted_vocab': 1,
 'null_word': 0,
 'cum_table': array([  51322688,   73837507,   90448669,  107059830,  120447222,
         133834613,  147222004,  157099060,  166976116,  176853172,
         186730228,  196607284,  206484340,  216361396,  226238452,
         236115508,  245992563,  255869619,  265746675,  275623731,
         285500787,  295377843,  305254899,  315131955,  325009011,
         334886067,  344763123,  354640179,  364517235,  

#### Save model

In [14]:
model.save('./data/models/word2vec.model')

#### Test model

In [15]:
from gensim import models

In [16]:
model = models.Word2Vec.load('./data/models/word2vec.model')

In [17]:
# model.wv.index_to_key

['newyork',
 'newyorkcity',
 'nyc',
 'bebworldcom',
 'otro',
 'times',
 'can',
 'travel',
 'canon',
 'california',
 'vacation',
 'follow',
 'hair',
 'australia',
 'change',
 'vacations',
 'jcsmoothcom',
 'city',
 'york',
 'mansions',
 'timessquare',
 'fashion',
 'new',
 'menswear',
 'square',
 'repost',
 'dapper',
 'paris',
 'make',
 'manhattan',
 'casanova',
 'tonight',
 'hit',
 'stage',
 'vegas',
 'party',
 'tour',
 'st',
 'paul',
 'mn',
 'ny',
 'lasvegas',
 'losangeles',
 'style',
 'big',
 'badgirlclub',
 'teamcanon',
 'mm',
 'brooklyn',
 'mpls',
 'stpaul',
 'blackandwhite',
 'localmusic',
 'artist',
 'snapchat',
 'creative',
 'gorgeous',
 'shotsbyzaya',
 'mnstandup',
 'madrid',
 'flatlays',
 'amsterdam',
 'netherlands',
 'mywhitetable',
 'italy',
 'berlin',
 'kashalotbeauty',
 'live',
 'rarest',
 'thing',
 'world',
 'people',
 'just',
 'exist',
 'oscar',
 'wilde',
 'france',
 'donetsk',
 'igukraine',
 'order',
 'viber',
 'whats',
 'textile',
 'dm',
 'scandinavian',
 'interior',
 's

Test most similar words

In [18]:
model.wv.most_similar(positive=['fashion', 'newyork'])

[('times', 0.0945308655500412),
 ('igukraine', 0.092128224670887),
 ('beverlyhills', 0.08560965955257416),
 ('japan', 0.0844288170337677),
 ('vintage', 0.08180374652147293),
 ('ago', 0.08149947226047516),
 ('order', 0.07725626230239868),
 ('lasvegas', 0.06876716017723083),
 ('repost', 0.06680316478013992),
 ('australia', 0.06671624630689621)]

In [19]:
model.wv.most_similar(negative=['fashion', 'newyork'])

[('events', 0.08811479806900024),
 ('igers', 0.07289917021989822),
 ('ukraine', 0.07155462354421616),
 ('dapperstyle', 0.06987272202968597),
 ('back', 0.06290832906961441),
 ('mensfashions', 0.061511777341365814),
 ('boys', 0.06037839874625206),
 ('porque', 0.05505591630935669),
 ('dapper', 0.05375199019908905),
 ('coddiwomple', 0.052894849330186844)]

In [20]:
model.wv.most_similar(positive=['newyork'], negative=['deadly'])

[('couple', 0.09467583149671555),
 ('singapore', 0.08557996153831482),
 ('igukraine', 0.07807522267103195),
 ('streetwear', 0.07463937252759933),
 ('teamcanon', 0.0723017230629921),
 ('tbt', 0.07054192572832108),
 ('vintage', 0.06983204185962677),
 ('styleblogger', 0.0657968819141388),
 ('cars', 0.06495926529169083),
 ('villa', 0.06416767090559006)]

In [21]:
model.wv.doesnt_match(['fashion', 'food', 'show'])

'fashion'

In [22]:
# model.wv.similarity('food', 'truck') - throws an exception since truck is out of vocab