<a href="https://colab.research.google.com/github/axel-sirota/practical-nlp/blob/main/1-similarity/Practical_NLP_3_Doc2Vec.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [14]:
import random
import sys

import numpy as np

import smart_open
import pandas as pd
import gensim
from gensim.models.callbacks import CallbackAny2Vec
from gensim.test.utils import get_tmpfile
from sklearn.model_selection import train_test_split


np.random.seed(42)
embedding_dim = 100
vocabulary_size_to_use = 50000
epochs = 10
train_file_path = './train_yelp.csv'
test_file_path = './test_yelp.csv'

In [15]:
%%writefile get_data.sh
if [ ! -f yelp.csv ]; then
  wget -O yelp.csv https://www.dropbox.com/s/xds4lua69b7okw8/yelp.csv?dl=0
fi

Overwriting get_data.sh


In [16]:
!bash get_data.sh

In [17]:
path = './yelp.csv'
yelp = pd.read_csv(path)
# Create a new DataFrame that only contains the 5-star and 1-star reviews.
yelp_best_worst = yelp[(yelp.stars==5) | (yelp.stars==1)]
X = yelp_best_worst.text
y = yelp_best_worst.stars.map({1:0, 5:1})
y = yelp_best_worst.stars.map({1:0, 5:1})
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state=42)
X_train.to_csv(train_file_path, header=False, index=False, columns=['text'])
X_test.to_csv(test_file_path, header=False, index=False, columns=['text'])

In [18]:
def read_corpus(fname, tokens_only=False):
    with smart_open.open(fname, encoding="iso-8859-1") as f:
        for i, line in enumerate(f):
            tokens = gensim.utils.simple_preprocess(line)
            if tokens_only:
                yield tokens
            else:
                # For training data, add tags
                yield gensim.models.doc2vec.TaggedDocument(tokens, [i])

In [19]:
train_corpus = list(read_corpus(train_file_path))[:vocabulary_size_to_use]
test_corpus = list(read_corpus(test_file_path, tokens_only=True))

In [20]:
print(train_corpus[:2])


[TaggedDocument(words=['if', 'could', 'give', 'it', 'more', 'than', 'would', 'sweet', 'pea', 'and', 'live', 'down', 'the', 'street', 'literally', 'down', 'the', 'street', 'from', 'this', 'bar', 'we', 'waited', 'for', 'it', 'to', 'open', 'for', 'what', 'seemed', 'like', 'decades', 'praying', 'that', 'this', 'was', 'going', 'to', 'be', 'the', 'type', 'of', 'place', 'that', 'could', 'become', 'our', 'local', 'it', 'has', 'exceeded', 'our', 'expectations', 'the', 'atmosphere', 'is', 'amazing', 'the', 'drinks', 'are', 'amazing', 'every', 'last', 'one', 'of', 'them', 'but', 'the', 'margaritas', 'are', 'the', 'best', 've', 'ever', 'had', 'they', 'tasted', 'like', 'fresh', 'squeeze', 'of', 'sunshine', 'that', 'makes', 'me', 'happy', 'inside', 'margarita', 'mondays', 'margs', 'and', 'free', 'food', 'happy', 'hours', 'are', 'amazing', 'new', 'year', 'eve', 'last', 'year', 'was', 'amazing', 'the', 'year', 'anniversary', 'party', 'was', 'amazing', 'but', 'most', 'of', 'all', 'the', 'owner', 'and',

In [21]:
model = gensim.models.doc2vec.Doc2Vec(vector_size=embedding_dim, min_count=2, epochs=epochs, workers=5)
model.build_vocab(train_corpus)

In [22]:
model.train(train_corpus, total_examples=model.corpus_count, epochs=model.epochs)


In [23]:
vector = model.infer_vector(['only', 'you', 'can', 'prevent', 'forest', 'fires'])
print(vector)

[-0.02762986  0.00179712  0.03048734 -0.00375619 -0.03567026 -0.05924828
 -0.00979371  0.05149005 -0.05577854 -0.00663567 -0.01950982 -0.01442671
  0.02828592  0.04166871  0.03524429 -0.07006898  0.02940768 -0.05654595
 -0.02355556 -0.02870545  0.02540269 -0.02266831 -0.0315986   0.01148102
 -0.04188496 -0.10340449  0.01125128  0.03223917  0.06352893 -0.00545531
  0.070298   -0.05531643 -0.0072738   0.01125406 -0.03984931  0.05630789
  0.05494026  0.05946151 -0.02652008 -0.06166969  0.03374788 -0.01944711
 -0.01121592  0.03518479 -0.05244188 -0.01358799 -0.04486623 -0.04464933
  0.00853691 -0.02403549  0.00398715 -0.03169059  0.02048805 -0.01051473
 -0.00430318 -0.03759081  0.01480608  0.0246211  -0.00719381  0.0687056
 -0.02043215 -0.06467283  0.0650305   0.01485862 -0.06586582 -0.04737824
 -0.00392825 -0.0202585  -0.0050929  -0.0207573   0.04117596  0.00476713
  0.02556793 -0.00289785  0.03210641 -0.052872   -0.00215181 -0.00910484
  0.00828335 -0.05540834  0.0420768  -0.03149442 -0.

In [28]:
# Pick a random document from the train corpus and infer a vector from the model
doc_id = random.randint(0, len(test_corpus) - 1)
inferred_vector = model.infer_vector(test_corpus[doc_id])
sims = model.docvecs.most_similar([inferred_vector], topn=len(model.docvecs))

# Compare and print the most similar documents from the train corpus
print('Test Document ({}): «{}»\n'.format(doc_id, ' '.join(test_corpus[doc_id])))
print(u'MOST SIMILAR %s: «%s»\n' % (sims[0], ' '.join(train_corpus[sims[0][0]].words)))

Test Document (1531): «do agree with couple of the reviewers who mentioned salt there were couple of dishes that had just tad too much however think that was not due to mistake of the chef believe it is just because they use cured meats in lot of their dishes another reviewer mentioned bad service but we did not experience that at all our server was extremely warm and friendly she was very knowledgeable on the food and could answer any questions we had for her she also took the time for each course to explain each dish and in my case the wine and why they went together only one time did my wine not come out at the same time with the food course and in that case our server apologized immediately do agree with some of the reviewers that the service is very slow however believe that this is intentional they have created very cozy and comfortable environment and they want you to linger and enjoy yourself with the farmer feast in particular it took us almost hours start to finish but enjoye