In [1]:
import itertools
import random
import sys

import numpy as np

np.random.seed(42)
import smart_open
import pandas as pd
import gensim
from gensim.models.callbacks import CallbackAny2Vec
from gensim.test.utils import get_tmpfile

In [2]:
# In case your sys.path does not contain the base repo, go there.
# The relative paths used later in this notebook ('dataset/...',
# '2-ranking/lab4/...') are resolved against the working directory set here.
# NOTE(review): the %cd target is an absolute path on the author's machine;
# change it to your own checkout of the repository before running.
print(sys.path)
%cd '/Users/axelsirota/repos/ml-solr-course'

['/Users/axelsirota/repos/ml-solr-course/2-ranking/lab4', '/Users/axelsirota/.pyenv/versions/3.7.3/lib/python37.zip', '/Users/axelsirota/.pyenv/versions/3.7.3/lib/python3.7', '/Users/axelsirota/.pyenv/versions/3.7.3/lib/python3.7/lib-dynload', '', '/Users/axelsirota/repos/ml-solr-course/.venv/lib/python3.7/site-packages', '/Users/axelsirota/repos/ml-solr-course/.venv/lib/python3.7/site-packages/IPython/extensions', '/Users/axelsirota/.ipython']
/Users/axelsirota/repos/ml-solr-course


In [3]:
# Corpus files, given as paths relative to the current working directory.
train_file_path = 'dataset/train_corpus_descriptions_airbnb.csv'
test_file_path = 'dataset/test_corpus_descriptions_airbnb.csv'
# Size of each learned document vector (passed to Doc2Vec as vector_size).
embedding_dim = 100
# NOTE: despite its name, this caps the number of training *documents* kept
# when train_corpus is loaded; it is not a limit on the vocabulary size.
vocabulary_size_to_use = 50000
# Number of training epochs (passed to Doc2Vec as epochs).
epochs = 10

In [4]:
def read_corpus(fname, tokens_only=False):
    """Lazily yield one tokenized document per line of ``fname``.

    The file is opened through ``smart_open`` and decoded as ISO-8859-1.
    Every line is tokenized with ``gensim.utils.simple_preprocess``.

    Args:
        fname: Path of the corpus file to read.
        tokens_only: When true, yield the bare token list for each line.
            Otherwise yield a ``TaggedDocument`` whose single tag is the
            zero-based line number, as needed for training.

    Yields:
        A list of tokens, or a ``TaggedDocument(tokens, [line_number])``.
    """
    with smart_open.open(fname, encoding="iso-8859-1") as corpus_file:
        for line_number, raw_line in enumerate(corpus_file):
            words = gensim.utils.simple_preprocess(raw_line)
            # Training documents carry their line number as the tag.
            yield (
                words
                if tokens_only
                else gensim.models.doc2vec.TaggedDocument(words, [line_number])
            )

In [5]:
# Keep at most `vocabulary_size_to_use` training documents. islice stops
# pulling from the generator once the cap is reached, so the rest of the
# training file is never read or tokenized (the result is the same as
# slicing the fully materialized list, without the wasted work).
train_corpus = list(
    itertools.islice(read_corpus(train_file_path), vocabulary_size_to_use)
)
# The test corpus is loaded in full, as plain token lists (no tags).
test_corpus = list(read_corpus(test_file_path, tokens_only=True))

In [6]:
# Sanity check: show the first two tagged training documents.
print(train_corpus[:2])

[TaggedDocument(words=['run', 'runyon', 'canyon', 'our', 'gym', 'sauna', 'open', 'beautifully', 'furnished', 'mirrored', 'mini', 'suite', 'with', 'fireplacedyson', 'hot', 'cool', 'bladeless', 'fan', 'heaterbeekman', 'goat', 'milk', 'soappremium', 'memory', 'foam', 'pillowsfirst', 'morning', 'complementary', 'starbuck', 'coffee', 'latte', 'style', 'coffee', 'protein', 'bars', 'granola', 'bars', 'fresh', 'baked', 'swedish', 'cinnamon', 'roll', 'continental', 'bottle', 'of', 'artesian', 'or', 'sparkling', 'mineral', 'waterterry', 'robe', 'amish', 'wildflower', 'soapcandy', 'bowltrail', 'mix', 'jarcdc', 'cleaningthe', 'spacewe', 'opportunely', 'strive', 'for', 'peace', 'quiet', 'cleanness', 'neatness', 'by', 'means', 'of', 'happiness', 'here', 'runyon', 'canyon', 'beau', 'furn', 'mir', 'mini', 'suite', 'fireplacerun', 'runyon', 'canyon', 'beautifully', 'furnished', 'mirrored', 'mini', 'suite', 'with', 'fireplace', 'gyms', 'are', 'open', 'plunge', 'pool', 'jacuzzi', 'sundeck', 'runyon', 'ca

In [7]:
# Create an untrained Doc2Vec model from the notebook settings, then scan the
# training corpus once to build its vocabulary before any training happens.
model = gensim.models.doc2vec.Doc2Vec(
    vector_size=embedding_dim,
    min_count=2,  # words occurring fewer than 2 times are ignored
    epochs=epochs,
    workers=5,
)
model.build_vocab(train_corpus)

In [8]:
# Train on the tagged corpus. The example count and epoch count are read back
# from the model so they match what build_vocab() and the constructor set.
model.train(train_corpus, total_examples=model.corpus_count, epochs=model.epochs)

In [9]:
# Smoke test: infer a document vector for a hand-written token list and print it.
vector = model.infer_vector(['only', 'you', 'can', 'prevent', 'forest', 'fires'])
print(vector)

[ 0.0543617  -0.09607653 -0.18506919  0.04053085  0.031168   -0.16437367
 -0.04537961  0.19203864  0.07020725 -0.0524042  -0.13580535  0.07174805
  0.01030537  0.0649034  -0.02263543  0.03978287 -0.05187762 -0.03672089
  0.01561754  0.12912893  0.10245254  0.1494444  -0.31008714  0.06073276
  0.13001624 -0.00914801 -0.05314909 -0.19197766 -0.08392991  0.01108635
 -0.12909304 -0.00677363 -0.00275039  0.07569529  0.00140331  0.01929314
 -0.13223873 -0.04534362  0.14854592 -0.10925484  0.01756385  0.02368191
  0.03049802 -0.03207293 -0.05552221  0.01124749 -0.17571908 -0.0949407
  0.14118816 -0.08533246 -0.0667005   0.09750918 -0.01931177  0.1278121
  0.29584658  0.12149689 -0.2980152   0.05182191  0.02721016 -0.06129049
  0.02502088 -0.00191163  0.05696656 -0.21691896  0.00512567  0.04483531
 -0.0731632   0.00524787  0.12567219  0.13939835 -0.0976295   0.02583137
 -0.01094146 -0.16175671 -0.0579786  -0.04792865  0.07094374 -0.10970317
 -0.03815283  0.08914021 -0.04156846  0.17817551  0.0

In [10]:
# Persist the trained model. Doc2Vec.save() opens and writes the target path
# itself, so it must not be wrapped in open(path, 'w'): that only truncated
# the file and kept a second, unused handle open on it during the save.
model.save("2-ranking/lab4/airbnb_model")



In [31]:
# Pick a random document from the test corpus and infer a vector from the model
doc_id = random.randint(0, len(test_corpus) - 1)
inferred_vector = model.infer_vector(test_corpus[doc_id])
# Rank every training document by similarity to the inferred vector;
# sims is a list of (tag, similarity) pairs, best match first.
sims = model.dv.most_similar([inferred_vector], topn=len(model.dv))

# Compare and print the most similar document from the train corpus.
# sims[0] is the best match (the original used an undefined name `index`);
# its tag is the line number assigned by read_corpus, which is also the
# document's position in train_corpus.
print('Test Document ({}): «{}»\n'.format(doc_id, ' '.join(test_corpus[doc_id])))
print(u'MOST SIMILAR %s: «%s»\n' % (sims[0], ' '.join(train_corpus[sims[0][0]].words)))

Test Document (10420): «comfortable and elegant flat with private garden located minutes walk from archway tube station and local amenities including supermarkets bars and restaurants just minutes away from trendy camden town and minutes from central london by tube the area is very safe and modern new couch tv fully equipped kitchen and beautiful garden will make your stay very easy and comfortable»

MOST SIMILAR (49977, 0.6798374652862549): «comfortable private room located in north austin located off of near major shopping centers bus train stops and minutes from downtown»

