In [1]:
import random
import sys

import numpy as np

np.random.seed(42)
import smart_open
import pandas as pd
import gensim
from gensim.models.callbacks import CallbackAny2Vec
from gensim.test.utils import get_tmpfile

In [2]:
# In case your sys.path does not contain the base repo, go there.
print(sys.path)
%cd '/Users/axelsirota/repos/ml-solr-course'

['/Users/axelsirota/repos/ml-solr-course/2-ranking/lab4', '/Users/axelsirota/.pyenv/versions/3.7.3/lib/python37.zip', '/Users/axelsirota/.pyenv/versions/3.7.3/lib/python3.7', '/Users/axelsirota/.pyenv/versions/3.7.3/lib/python3.7/lib-dynload', '', '/Users/axelsirota/repos/ml-solr-course/.venv/lib/python3.7/site-packages', '/Users/axelsirota/repos/ml-solr-course/.venv/lib/python3.7/site-packages/IPython/extensions', '/Users/axelsirota/.ipython']
/Users/axelsirota/repos/ml-solr-course


In [3]:
train_file_path = 'dataset/train_corpus_descriptions_airbnb.csv'
test_file_path = 'dataset/test_corpus_descriptions_airbnb.csv'
embedding_dim = 100
epochs = 50

In [4]:
def read_corpus(fname, tokens_only=False):
    with smart_open.open(fname, encoding="iso-8859-1") as f:
        for i, line in enumerate(f):
            tokens = gensim.utils.simple_preprocess(line)
            if tokens_only:
                yield tokens
            else:
                # For training data, add tags
                yield gensim.models.doc2vec.TaggedDocument(tokens, [i])

In [5]:
train_corpus = list(read_corpus(train_file_path))
test_corpus = list(read_corpus(test_file_path, tokens_only=True))

In [6]:
print(train_corpus[:2])

[TaggedDocument(words=['run', 'runyon', 'canyon', 'our', 'gym', 'sauna', 'open', 'beautifully', 'furnished', 'mirrored', 'mini', 'suite', 'with', 'fireplacedyson', 'hot', 'cool', 'bladeless', 'fan', 'heaterbeekman', 'goat', 'milk', 'soappremium', 'memory', 'foam', 'pillowsfirst', 'morning', 'complementary', 'starbuck', 'coffee', 'latte', 'style', 'coffee', 'protein', 'bars', 'granola', 'bars', 'fresh', 'baked', 'swedish', 'cinnamon', 'roll', 'continental', 'bottle', 'of', 'artesian', 'or', 'sparkling', 'mineral', 'waterterry', 'robe', 'amish', 'wildflower', 'soapcandy', 'bowltrail', 'mix', 'jarcdc', 'cleaningthe', 'spacewe', 'opportunely', 'strive', 'for', 'peace', 'quiet', 'cleanness', 'neatness', 'by', 'means', 'of', 'happiness', 'here', 'runyon', 'canyon', 'beau', 'furn', 'mir', 'mini', 'suite', 'fireplacerun', 'runyon', 'canyon', 'beautifully', 'furnished', 'mirrored', 'mini', 'suite', 'with', 'fireplace', 'gyms', 'are', 'open', 'plunge', 'pool', 'jacuzzi', 'sundeck', 'runyon', 'ca

In [7]:
class MonitorCallback(CallbackAny2Vec):
    def on_epoch_end(self, model):
        print("Model loss:", model.get_latest_training_loss())
        # print loss

monitor = MonitorCallback()

In [8]:
model = gensim.models.doc2vec.Doc2Vec(vector_size=embedding_dim, min_count=2, epochs=epochs, workers=5, callbacks=[monitor])
model.build_vocab(train_corpus)

In [9]:
model.train(train_corpus, total_examples=model.corpus_count, epochs=model.epochs)

In [10]:
vector = model.infer_vector(['only', 'you', 'can', 'prevent', 'forest', 'fires'])
print(vector)

[ 0.4062481  -0.45970774  0.8737589  -1.1125178   0.36117405 -0.57193446
  0.1993261  -0.35838404  0.3433686  -0.23151097  0.05307581  0.37294447
  0.14602664 -0.7243367   0.39131477  0.42685416 -0.24062386  0.3529859
  0.8403859   0.48177123  0.6516271   0.30122602 -0.8258079   0.00409049
  0.00799834  0.38958058  0.08138184  0.09224967 -0.36196205 -0.369399
  0.05991855  0.05529932  0.2047091   0.717981    0.03228188  0.03249573
 -0.05112756 -0.1730143  -0.2371005   0.8752679  -0.03052846 -0.2884936
 -0.0365307   0.3999309  -0.5021756   0.26192415  0.25978473  0.320639
 -0.07888479 -0.8229658   1.6289259  -0.13849625 -0.20243748  0.89142334
  0.14780585 -0.17813304 -0.0855213  -0.8307622  -0.47072843  0.8830436
 -0.01577668 -0.11214862 -0.35596466  0.14229958 -0.8232616  -0.6206001
  1.1714388   0.12207097 -0.8596817   1.0001296  -0.20719041  0.26260647
  0.24186273 -0.13863896 -0.67035455  0.915935   -0.36935154 -0.94100636
  0.41584057  0.55235136  0.2305257  -0.08570104 -0.2115279

In [11]:
with open("2-ranking/lab4/airbnb_model", 'w') as _:
    model.save("2-ranking/lab4/airbnb_model")



In [12]:
# Pick a random document from the train corpus and infer a vector from the model
doc_id = random.randint(0, len(test_corpus) - 1)
inferred_vector = model.infer_vector(test_corpus[doc_id])
sims = model.dv.most_similar([inferred_vector], topn=len(model.dv))

# Compare and print the most/median/least similar documents from the train corpus
print('Test Document ({}): «{}»\n'.format(doc_id, ' '.join(test_corpus[doc_id])))
print(u'SIMILAR/DISSIMILAR DOCS PER MODEL %s:\n' % model)
for label, index in [('MOST', 0), ('MEDIAN', len(sims)//2), ('LEAST', len(sims) - 1)]:
    print(u'%s %s: «%s»\n' % (label, sims[index], ' '.join(test_corpus[sims[index][0]])))

Test Document (7589): «the blackbird is located in southeast washington adjacent to popular capitol hill and just couple miles from the national mall where you will find the us capitol building and smithsonian museums this location is within walking distance of all you need including the potomac metro station harris teeter for groceries the anacostia river trail for riverside jogs and views barracks row shops and your choice of eateries and nightlife few local favorites include ted bulletin rose luxury and trusty neighborhood bar this upscale community features top notch amenities and the roost large dining hall with multiple restaurants including coffee shop beer bar pizza place and more on the ground level enjoy stay with zeus we have reviews see below the spaceat zeus we offer thoughtfully furnished homes for day stays our homes come equipped with handpicked essentials from premium mattresses and linens to»

SIMILAR/DISSIMILAR DOCS PER MODEL Doc2Vec(dm/m,d100,n5,w5,mc2,s0.001,t5):



IndexError: list index out of range