In [1]:
import random
import sys

import numpy as np

# Seed every RNG this notebook draws from: NumPy's global generator and the
# stdlib `random` module (used later to pick a sample document).
np.random.seed(42)
random.seed(42)
import smart_open
import pandas as pd
import gensim
from gensim.models.callbacks import CallbackAny2Vec
from gensim.test.utils import get_tmpfile

In [2]:
# Print the module search path, then change the working directory to the base
# of the repository (in case sys.path does not already contain it).
# NOTE(review): the absolute path below is specific to the original author's
# machine — point it at your own checkout before running this cell.
print(sys.path)
%cd '/Users/axelsirota/repos/ml-solr-course'

['/Users/axelsirota/repos/ml-solr-course/2- ranking/lab4', '/Users/axelsirota/.pyenv/versions/3.7.3/lib/python37.zip', '/Users/axelsirota/.pyenv/versions/3.7.3/lib/python3.7', '/Users/axelsirota/.pyenv/versions/3.7.3/lib/python3.7/lib-dynload', '', '/Users/axelsirota/repos/ml-solr-course/.venv/lib/python3.7/site-packages', '/Users/axelsirota/repos/ml-solr-course/.venv/lib/python3.7/site-packages/IPython/extensions', '/Users/axelsirota/.ipython']
/Users/axelsirota/repos/ml-solr-course


In [3]:
# Hyper-parameters handed to the Doc2Vec constructor further down.
embedding_dim, epochs = 100, 50

# Location of the training corpus, given as a relative path.
train_file_path = "dataset/train_corpus_descriptions_airbnb.csv"

In [4]:
def read_corpus(fname, tokens_only=False):
    """Lazily yield one pre-processed document per line of ``fname``.

    The file is opened through ``smart_open`` and decoded as ISO-8859-1; every
    line is tokenised with ``gensim.utils.simple_preprocess``.

    Args:
        fname: Location of the corpus file, passed straight to ``smart_open.open``.
        tokens_only: When true, yield the bare token lists. Otherwise wrap each
            token list in a ``TaggedDocument`` whose single tag is the
            zero-based line number.

    Yields:
        A list of tokens, or a ``TaggedDocument`` when ``tokens_only`` is false.
    """
    with smart_open.open(fname, encoding="iso-8859-1") as corpus_file:
        for line_number, raw_line in enumerate(corpus_file):
            words = gensim.utils.simple_preprocess(raw_line)
            # Training documents carry their line number as the sole tag.
            yield (
                words
                if tokens_only
                else gensim.models.doc2vec.TaggedDocument(words, [line_number])
            )

In [5]:
# read_corpus is a generator; materialise it into a list so the corpus can be
# indexed and iterated more than once. tokens_only is left at its default
# (False), so every element is a TaggedDocument.
train_corpus = list(read_corpus(train_file_path))


In [6]:
# Sanity check: show the first two tagged documents of the corpus.
print(train_corpus[:2])

[TaggedDocument(words=['unit', 'upgraded', 'with', 'new', 'bamboo', 'flooring', 'brand', 'new', 'ultra', 'hd', 'sony', 'tv', 'new', 'paint', 'new', 'lighting', 'new', 'mattresses', 'ultra', 'fast', 'cable', 'internet', 'connection', 'apple', 'tv', 'google', 'chromecast', 'br', 'br', 'gorgeous', 'and', 'elegant', 'furnished', 'condo', 'in', 'front', 'of', 'culver', 'city', 'fox', 'hills', 'park', 'br', 'upper', 'corner', 'unit', 'total', 'silence', 'protected', 'by', 'trees', 'br', 'short', 'walk', 'to', 'the', 'new', 'westfield', 'mall', 'br', 'tennis', 'courts', 'heated', 'pool', 'and', 'jacuzzi', 'hot', 'tub', 'br', 'br', 'the', 'space', 'br', 'unit', 'upgraded', 'with', 'new', 'bamboo', 'flooring', 'brand', 'new', 'ultra', 'hd', 'sony', 'tv', 'new', 'paint', 'new', 'lighting', 'new', 'mattresses', 'ultra', 'fast', 'cable', 'internet', 'connection', 'br', 'br', 'gorgeous', 'and', 'elegant', 'furnished', 'apartment', 'in', 'front', 'of', 'culver', 'city', 'fox', 'hills', 'park', 'br',

In [7]:
class MonitorCallback(CallbackAny2Vec):
    """Training callback that reports the model's latest training loss after each epoch."""

    def on_epoch_end(self, model):
        """Print whatever ``model.get_latest_training_loss()`` returns.

        NOTE(review): whether the Doc2Vec model actually tracks a loss is not
        visible from this notebook — confirm the printed value is meaningful.
        """
        latest_loss = model.get_latest_training_loss()
        print("Model loss:", latest_loss)


# Single callback instance used when setting up the model.
monitor = MonitorCallback()

In [8]:
# Set up the Doc2Vec model from the configured hyper-parameters, then scan the
# training corpus once to build its vocabulary (no training happens yet).
model = gensim.models.doc2vec.Doc2Vec(
    vector_size=embedding_dim,
    min_count=2,
    epochs=epochs,
    workers=5,
    callbacks=[monitor],
)
model.build_vocab(train_corpus)

In [None]:
# Train for the number of epochs configured on the model.
# The callbacks are passed here explicitly: gensim 4 does not retain callbacks
# given to the constructor when no corpus is supplied at construction time, so
# without this argument `monitor` would never be invoked.
model.train(
    train_corpus,
    total_examples=model.corpus_count,
    epochs=model.epochs,
    callbacks=[monitor],
)

In [None]:
# Infer an embedding for a short, hand-written list of tokens and display it.
vector = model.infer_vector(['only', 'you', 'can', 'prevent', 'forest', 'fires'])
print(vector)

In [11]:
# Pick a random document from the train corpus and infer a vector from the model.
# Each corpus entry is a TaggedDocument, so only its token list (`.words`) is
# passed to infer_vector and joined for display — not the (words, tags) pair.
doc_id = random.randint(0, len(train_corpus) - 1)
doc_words = train_corpus[doc_id].words
inferred_vector = model.infer_vector(doc_words)

# Rank every document vector in the model by similarity to the inferred one.
sims = model.dv.most_similar([inferred_vector], topn=len(model.dv))

# Compare and print the most/median/least similar documents from the train corpus
print('Test Document ({}): «{}»\n'.format(doc_id, ' '.join(doc_words)))
print(u'SIMILAR/DISSIMILAR DOCS PER MODEL %s:\n' % model)
for label, index in [('MOST', 0), ('MEDIAN', len(sims)//2), ('LEAST', len(sims) - 1)]:
    # sims[index] is a (tag, similarity) pair; tags are corpus line numbers.
    print(u'%s %s: «%s»\n' % (label, sims[index], ' '.join(train_corpus[sims[index][0]].words)))

Test Document (5074): «best location in town located in hells kitchen surrounded by bars and restaurants couple of blocks away from times square you can walk to chelsea market high lane moma from here it is the center of the culture here walking distance to the most major subway hub in the city and surprisingly it is very quiet and peaceful inside of the apartment where all essential amenities are included with full kitchen even mini library and comfy queen size bed br br the space br very spacious large living room and spacious bedroom mini library full kitchen and great size bathroom»

SIMILAR/DISSIMILAR DOCS PER MODEL Doc2Vec(dm/m,d100,n5,w5,mc2,s0.001,t5):

MOST (9935, 0.4349706470966339): «our renovated townhouse very spacious airy and bright stands proud in lively crown heights with culture galore parks large and small enviable food choices and great access to public transportation the architecture and decor here balance historic and modern elegance it is an oasis of calm and com

In [None]:
# Save the model to disk. get_tmpfile is imported from gensim.test.utils.
# NOTE(review): presumably this resolves to a path inside a temporary
# directory, so the saved file may not persist — confirm, and switch to a
# stable path if the model is needed later.
fname = get_tmpfile("airbnb_model")
model.save(fname)