#### Prerequisites

In [1]:
%%capture

!pip install gensim==4.0.1
!pip install python-Levenshtein
!pip install stop-words

#### Imports 

In [2]:
from stop_words import get_stop_words
import gensim
import string
import glob

#### Essentials

We will keep only letters and digits of the caption and remove other symbols, emojis, etc.

In [4]:
whitelist = string.ascii_letters + string.digits + ' '
whitelist

'abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789 '

In [5]:
stop_words = get_stop_words('en')
stop_words[:5]

['a', 'about', 'above', 'after', 'again']

In [6]:
filter_words = ['http', 'https', 'photo', 'picture', 'image', 'insta', 'instagram', 'post']
for word in filter_words:
    stop_words.append(word)

#### Load and prep captions

In [7]:
def clean_caption(caption: str) -> str:
    caption = caption.replace('#', '').replace('  ', ' ')
    cleaned_caption = []
    for char in caption:
        if char in whitelist:
            cleaned_caption.append(char)
    return ''.join(cleaned_caption)

In [11]:
captions = []

for file_name in glob.glob('./data/captions/newyork/*.txt'):
    with open(file_name, 'r') as f:
        lines = []
        for line in f.readlines():
            lines.append(line)
        caption = ' '.join(lines)
        caption = clean_caption(caption)
        captions.append(caption)

In [12]:
captions[0]

'Rubbing elbows and eating Shrimp with some of LAs Best for Clique LA mixology101   thegrovela TheGroveLA Secret CliqueLA Shrimp   WePlanYourEvent EventProfs SpecialEvents EventSpecialist EventPlanner EventPlanning EventProduction Activations PrivateEvent RedCarpetEvent CorporateEvent CompanyParty NonProfitEvent HolidayParty PremiereParty WrapParty ClientOffSites IncentiveTrip LosAngeles NewYork EventLife Events ADMPEvents   w w w  a d m p E v e n t s  c o m admpevents'

#### Tokenize

In [14]:
tokenized_captions = []
for caption in captions:
    caption = caption.lower()
    tokens = gensim.utils.simple_preprocess(caption)
    filtered_tokens = [token for token in tokens if not token in stop_words]
    tokenized_captions.append(filtered_tokens)

In [15]:
len(tokenized_captions)

10

#### Training parameters

In [19]:
vector_size = 100 # word representation size
min_count = 1 # discard words with less than 1 appearances
epochs = 10 # iterate over the training corpus x times (train for x epochs)
window = 8 # words window used during training
training_cores = 8 # number of CPU cores used to train the model (based on chosen instance type)

#### Train model

In [20]:
model = gensim.models.Word2Vec(tokenized_captions, 
                               vector_size=vector_size, 
                               min_count=min_count, 
                               workers=training_cores, 
                               epochs=epochs, 
                               window=window)

In [22]:
model.__dict__

{'vector_size': 100,
 'workers': 8,
 'epochs': 10,
 'train_count': 1,
 'total_train_time': 0.009853740033577196,
 'batch_words': 10000,
 'sg': 0,
 'alpha': 0.025,
 'min_alpha': 0.0001,
 'window': 8,
 'random': RandomState(MT19937) at 0x7F0DD6AE2678,
 'hs': 0,
 'negative': 5,
 'ns_exponent': 0.75,
 'cbow_mean': 1,
 'compute_loss': False,
 'running_training_loss': 0.0,
 'min_alpha_yet_reached': 0.0025899999999999986,
 'corpus_count': 10,
 'corpus_total_words': 270,
 'max_final_vocab': None,
 'max_vocab_size': None,
 'min_count': 1,
 'sample': 0.001,
 'sorted_vocab': 1,
 'null_word': 0,
 'cum_table': array([  51712475,   84534415,  104050457,  123566500,  143082542,
         162598584,  182114627,  201630669,  221146711,  235545412,
         249944113,  264342814,  278741515,  293140215,  307538916,
         321937617,  336336318,  350735018,  365133719,  379532420,
         393931121,  408329822,  422728522,  437127223,  451525924,
         465924625,  480323326,  494722026,  509120727, 

#### Save model

In [24]:
model.save('./data/models/word2vec.model')

#### Test model

In [25]:
from gensim import models

In [27]:
model = models.Word2Vec.load('./data/models/word2vec.model')

Test most similar words

In [28]:
model.wv.most_similar(positive=['fashion', 'newyork'])

[('corporateevent', 0.3125542402267456),
 ('nfl', 0.22858406603336334),
 ('party', 0.2223261445760727),
 ('onlinestore', 0.22230403125286102),
 ('model', 0.21324695646762848),
 ('shrimp', 0.18388710916042328),
 ('wrapparty', 0.17837926745414734),
 ('bowtie', 0.17491039633750916),
 ('true', 0.171781986951828),
 ('crooklyn', 0.1617811620235443)]

In [29]:
model.wv.most_similar(negative=['fashion', 'newyork'])

[('uberpromocode', 0.3113328218460083),
 ('sandiego', 0.2539641857147217),
 ('losangeles', 0.24870024621486664),
 ('beast', 0.1847950667142868),
 ('download', 0.1765165627002716),
 ('met', 0.17650361359119415),
 ('book', 0.1742529273033142),
 ('admpevents', 0.15765082836151123),
 ('foodspo', 0.15553607046604156),
 ('celebration', 0.14369958639144897)]

In [30]:
model.wv.most_similar(positive=['food'], negative=['healthy'])

[('uberjust', 0.2485421597957611),
 ('eventspecialist', 0.24686424434185028),
 ('redcarpetevent', 0.23033036291599274),
 ('done', 0.22004669904708862),
 ('suitandtie', 0.21576322615146637),
 ('fashion', 0.200453981757164),
 ('clique', 0.19829021394252777),
 ('nfl', 0.17814338207244873),
 ('uberfreeride', 0.17691302299499512),
 ('drop', 0.16916896402835846)]

In [36]:
model.wv.doesnt_match(['fashion', 'food', 'show'])

'food'

In [39]:
# model.wv.similarity('food', 'truck') - throws an exception since truck is out of vocab