In [None]:
import itertools
import random
import sys

import numpy as np

np.random.seed(42)
import smart_open
import pandas as pd
import gensim
from gensim.models.callbacks import CallbackAny2Vec
from gensim.test.utils import get_tmpfile

In [None]:
# In case your sys.path does not contain the base repo, go there.
# NOTE(review): the directory below is an absolute, machine-specific path; point it
# at your own checkout of the repository before running. The relative paths used by
# later cells ('dataset/...', '2-ranking/lab4/...') are resolved against the working
# directory set here.
print(sys.path)
%cd '/Users/axelsirota/repos/ml-solr-course'

In [None]:
# Corpus locations, relative to the current working directory.
train_file_path = 'dataset/train_corpus_descriptions_airbnb.csv'
test_file_path = 'dataset/test_corpus_descriptions_airbnb.csv'

# Dimensionality of the learned document vectors.
embedding_dim = 100
# Despite the name, this caps how many training *documents* are kept
# (it is used to truncate the training corpus), not the vocabulary size.
vocabulary_size_to_use = 50_000
# Number of passes over the training corpus.
epochs = 10

In [None]:
def read_corpus(fname, tokens_only=False):
    """Lazily yield one pre-processed document per line of ``fname``.

    The file is opened through ``smart_open`` and decoded as ISO-8859-1; every
    line is tokenised with ``gensim.utils.simple_preprocess``.

    Args:
        fname: Path (or anything ``smart_open.open`` accepts) of the corpus file.
        tokens_only: When true, yield the bare token list for each line.
            Otherwise yield a ``TaggedDocument`` whose single tag is the
            zero-based line index, which is the form used for training data.

    Yields:
        A list of tokens, or a ``TaggedDocument``, per input line.
    """
    make_tagged = gensim.models.doc2vec.TaggedDocument
    with smart_open.open(fname, encoding="iso-8859-1") as handle:
        for line_no, raw_line in enumerate(handle):
            words = gensim.utils.simple_preprocess(raw_line)
            # Training data carries a tag (the line index); inference data does not.
            yield words if tokens_only else make_tagged(words, [line_no])

In [None]:
# Keep only the first `vocabulary_size_to_use` training documents. `islice` stops
# consuming the generator once the cap is reached, so the rest of the file is never
# tokenised or held in memory (the previous `list(...)[:n]` built the full list
# first and then threw most of it away). The resulting list is the same.
train_corpus = list(itertools.islice(read_corpus(train_file_path), vocabulary_size_to_use))
# The test corpus is kept in full, as plain token lists (no tags).
test_corpus = list(read_corpus(test_file_path, tokens_only=True))

In [None]:
# Sanity check: show the first two tagged training documents.
print(train_corpus[:2])

In [None]:
# Configure the Doc2Vec model; no training happens yet.
model = gensim.models.doc2vec.Doc2Vec(
    vector_size=embedding_dim,  # size of each document vector
    min_count=2,                # minimum word frequency passed to gensim
    epochs=epochs,
    workers=5,                  # worker threads used during training
)
# Scan the training corpus once to build the vocabulary.
model.build_vocab(train_corpus)

In [None]:
# Train on the tagged corpus, reusing the document count recorded by
# `build_vocab` and the epoch count the model was configured with.
model.train(
    train_corpus,
    total_examples=model.corpus_count,
    epochs=model.epochs,
)

In [None]:
# Smoke test: infer a vector for an ad-hoc, already tokenised sentence.
vector = model.infer_vector(
    ['only', 'you', 'can', 'prevent', 'forest', 'fires']
)
print(vector)

In [None]:
# Persist the trained model. `model.save` takes the destination path and performs
# its own file handling, so it must not be wrapped in `open(path, 'w')`: that wrapper
# only truncated the target and kept an unused text-mode handle open on the very file
# being written.
model.save("2-ranking/lab4/airbnb_model")



In [None]:
# Pick a random document from the *test* corpus and infer a vector for it
doc_id = random.randrange(len(test_corpus))
inferred_vector = model.infer_vector(test_corpus[doc_id])
# Rank all trained document vectors by similarity to the inferred vector.
sims = model.dv.most_similar([inferred_vector], topn=len(model.dv))

# Compare and print the most similar document from the train corpus.
# sims[0] is the best (tag, similarity) pair. Tags are the line indices assigned in
# read_corpus, and train_corpus holds the leading lines of the training file in order,
# so the tag can be used directly as an index into train_corpus.
print('Test Document ({}): «{}»\n'.format(doc_id, ' '.join(test_corpus[doc_id])))
print(u'MOST SIMILAR %s: «%s»\n' % (sims[0], ' '.join(train_corpus[sims[0][0]].words)))