<a href="https://colab.research.google.com/github/axel-sirota/practical-nlp/blob/main/1-similarity/Practical_NLP_3_Doc2Vec.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [12]:
import random
import sys

import numpy as np

import smart_open
import pandas as pd
import gensim
from gensim.models.callbacks import CallbackAny2Vec
from gensim.test.utils import get_tmpfile
from sklearn.model_selection import train_test_split


np.random.seed(42)
embedding_dim = 100
vocabulary_size_to_use = 50000
epochs = 10
train_file_path = './train_yelp.csv'
test_file_path = './test_yelp.csv'

In [13]:
%%writefile get_data.sh
if [ ! -f yelp.csv ]; then
  wget -O yelp.csv https://www.dropbox.com/s/xds4lua69b7okw8/yelp.csv?dl=0
fi

Overwriting get_data.sh


In [14]:
!bash get_data.sh

In [15]:
path = './yelp.csv'
yelp = pd.read_csv(path)
# Create a new DataFrame that only contains the 5-star and 1-star reviews.
yelp_best_worst = yelp[(yelp.stars==5) | (yelp.stars==1)]
X = yelp_best_worst.text
y = yelp_best_worst.stars.map({1:0, 5:1})
y = yelp_best_worst.stars.map({1:0, 5:1})
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state=42)
X_train.to_csv(train_file_path, header=False, index=False, columns=['text'])
X_test.to_csv(test_file_path, header=False, index=False, columns=['text'])

In [16]:
def read_corpus(fname, tokens_only=False):
    with smart_open.open(fname, encoding="iso-8859-1") as f:
        for i, line in enumerate(f):
            tokens = gensim.utils.simple_preprocess(line)  # ' I like ice cream'  -> ['i', 'like', 'ice', 'cream']
            if tokens_only:
                yield tokens
            else:
                # For training data, add tags
                yield gensim.models.doc2vec.TaggedDocument(tokens, [i])

In [17]:
train_corpus = list(read_corpus(train_file_path))[:vocabulary_size_to_use]
test_corpus = list(read_corpus(test_file_path, tokens_only=True))

In [18]:
print(train_corpus[:2])


[TaggedDocument(words=['if', 'could', 'give', 'it', 'more', 'than', 'would', 'sweet', 'pea', 'and', 'live', 'down', 'the', 'street', 'literally', 'down', 'the', 'street', 'from', 'this', 'bar', 'we', 'waited', 'for', 'it', 'to', 'open', 'for', 'what', 'seemed', 'like', 'decades', 'praying', 'that', 'this', 'was', 'going', 'to', 'be', 'the', 'type', 'of', 'place', 'that', 'could', 'become', 'our', 'local', 'it', 'has', 'exceeded', 'our', 'expectations', 'the', 'atmosphere', 'is', 'amazing', 'the', 'drinks', 'are', 'amazing', 'every', 'last', 'one', 'of', 'them', 'but', 'the', 'margaritas', 'are', 'the', 'best', 've', 'ever', 'had', 'they', 'tasted', 'like', 'fresh', 'squeeze', 'of', 'sunshine', 'that', 'makes', 'me', 'happy', 'inside', 'margarita', 'mondays', 'margs', 'and', 'free', 'food', 'happy', 'hours', 'are', 'amazing', 'new', 'year', 'eve', 'last', 'year', 'was', 'amazing', 'the', 'year', 'anniversary', 'party', 'was', 'amazing', 'but', 'most', 'of', 'all', 'the', 'owner', 'and',

In [19]:
model = gensim.models.doc2vec.Doc2Vec(vector_size=embedding_dim, min_count=2, epochs=epochs, workers=5)
model.build_vocab(train_corpus)

In [20]:
model.train(train_corpus, total_examples=model.corpus_count, epochs=model.epochs)


In [22]:
vector = model.infer_vector(gensim.utils.simple_preprocess('only you can prevent. \n forest fires'))
print(vector)

[-0.04697444  0.02877179  0.00197103  0.00176261 -0.04424812 -0.10461224
  0.03244643  0.05416036 -0.07026895  0.00257482  0.03210533 -0.0506878
  0.00201686  0.00659462  0.02546122 -0.02567191  0.03484739 -0.01378562
  0.01637085 -0.05976913  0.05574014  0.03910057  0.03734747 -0.05033958
 -0.05854894 -0.00663432 -0.06904349 -0.02832006 -0.02064372 -0.03622911
  0.03930648  0.0045964   0.01615743 -0.05271109 -0.00996348  0.01414592
  0.05090109 -0.03366894 -0.0433057  -0.06742075  0.01101882 -0.07000057
  0.01633526 -0.00498816 -0.02093288 -0.01915625 -0.05492952 -0.01395677
 -0.05535625  0.02585294  0.01661433 -0.01738672 -0.01116242 -0.03142674
 -0.02715968  0.04111617  0.00095678  0.00482185 -0.04075468 -0.02324656
  0.02286347 -0.00769972 -0.05162548  0.00200101 -0.06422206  0.10389072
  0.01468617  0.08428773 -0.03896919  0.06988147 -0.00429043  0.02294633
  0.0713889   0.00587143  0.08054425  0.0689339  -0.01796136  0.04527169
 -0.04427878 -0.03100914 -0.01562753  0.03802808 -0.

In [24]:
# Pick a random document from the train corpus and infer a vector from the model
doc_id = random.randint(0, len(test_corpus) - 1)
inferred_vector = model.infer_vector(test_corpus[doc_id])
sims = model.docvecs.most_similar([inferred_vector], topn=len(model.docvecs))

# Compare and print the most similar documents from the train corpus
print('Test Document ({}): «{}»\n'.format(doc_id, ' '.join(test_corpus[doc_id])))
print(u'MOST SIMILAR %s: «%s»\n' % (sims[0], ' '.join(train_corpus[sims[0][0]].words)))

Test Document (1315): «we had an extremely atypical experience at the royal palms our first night have no question about that but they more than made up for it they were gracious they smiled they apologized and they made it right immediately can ask for anything more from that from place that prides itself in its customer service»

MOST SIMILAR (3695, 0.678717851638794): «it seems like cutting something in half so we could easily share it particularly after the plate has been delivered to the table for us to appreciate it beauty and after we ve communicated to our server and demonstrated with the first two courses that we were splitting the items is in the vein of nothing fancy»



  sims = model.docvecs.most_similar([inferred_vector], topn=len(model.docvecs))
