In [None]:
!pip install 'gensim==4.2.0'

In [21]:
import gensim
import numpy as np
import pandas as pd
from gensim.models import Doc2Vec

np.random.seed(42)

In [22]:
%%writefile get_data.sh

if [ ! -f yelp.csv ]; then
  wget -O yelp.csv https://www.dropbox.com/s/xds4lua69b7okw8/yelp.csv?dl=0
fi
if [ ! -f doc2vec_yelp_model ]; then
  wget -O doc2vec_yelp_model https://www.dropbox.com/s/bibu9bashb0cd68/doc2vec_yelp_model?dl=0
fi

Overwriting get_data.sh


In [23]:
!bash get_data.sh

In [24]:
model = Doc2Vec.load("./doc2vec_yelp_model")

In [25]:
query = 'Best french restaurant'

In [26]:
# Use the same simple_preprocess from the last lab 4 to tokenize the query
tokenized_query = list(gensim.utils.simple_preprocess(query))

In [27]:
inferred_vector = model.infer_vector(tokenized_query)
print(inferred_vector)

[-0.015852   -0.01245708  0.02448442  0.02316836 -0.00636528 -0.05666345
  0.01188675  0.05251787 -0.00991789 -0.01537098  0.00817956 -0.02695475
  0.00129227  0.01644902  0.00480043 -0.02299844  0.019534   -0.00244595
 -0.01085451 -0.0460495   0.04253983  0.01998582  0.02777729  0.00373844
 -0.02647129  0.02146443 -0.03385974  0.04424274 -0.03547577 -0.00168219
 -0.01801299 -0.02068545 -0.00867598 -0.0788483  -0.00564839 -0.0199126
  0.01797485 -0.00956179 -0.01614873 -0.0399931  -0.00167424 -0.0258941
 -0.0281709  -0.05421452 -0.01145232 -0.00881896 -0.05787582  0.01509779
 -0.02921747  0.00258792  0.03219331 -0.01198142 -0.03314912 -0.02825442
 -0.03034867  0.00687339 -0.01623205 -0.04018967 -0.00646119  0.0002666
 -0.02310766 -0.01837127 -0.0026562  -0.00716793 -0.01591156  0.0560048
  0.02939916  0.04081299 -0.04366104  0.03770551 -0.0025445   0.01388076
 -0.00741539 -0.03035516  0.05523631  0.02708992 -0.02044608  0.03778033
 -0.01453955 -0.03884731 -0.03727143  0.03676966  0.008

In [28]:
path = './yelp.csv'
yelp = pd.read_csv(path)
# Create a new DataFrame that only contains the 5-star and 1-star reviews.
yelp_best_worst = yelp[(yelp.stars==5) | (yelp.stars==1)]
sample_reviews = yelp_best_worst.text.sample(n=200)

In [29]:
similarities = []
for review in sample_reviews:
    try:
        similarity = model.similarity_unseen_docs(doc_words1= list(gensim.utils.simple_preprocess(query)), doc_words2= list(gensim.utils.simple_preprocess(review)))
    except KeyError:
        similarity = 0
    similarities.append(similarity)

    print(similarities)

[0.263291]
[0.263291, 0.15908472]
[0.263291, 0.15908472, 0.012330348]
[0.263291, 0.15908472, 0.012330348, 0.50509095]
[0.263291, 0.15908472, 0.012330348, 0.50509095, 0.15128589]
[0.263291, 0.15908472, 0.012330348, 0.50509095, 0.15128589, 0.5587085]
[0.263291, 0.15908472, 0.012330348, 0.50509095, 0.15128589, 0.5587085, 0.23583852]
[0.263291, 0.15908472, 0.012330348, 0.50509095, 0.15128589, 0.5587085, 0.23583852, 0.051990747]
[0.263291, 0.15908472, 0.012330348, 0.50509095, 0.15128589, 0.5587085, 0.23583852, 0.051990747, 0.16713591]
[0.263291, 0.15908472, 0.012330348, 0.50509095, 0.15128589, 0.5587085, 0.23583852, 0.051990747, 0.16713591, 0.17089179]
[0.263291, 0.15908472, 0.012330348, 0.50509095, 0.15128589, 0.5587085, 0.23583852, 0.051990747, 0.16713591, 0.17089179, 0.114798315]
[0.263291, 0.15908472, 0.012330348, 0.50509095, 0.15128589, 0.5587085, 0.23583852, 0.051990747, 0.16713591, 0.17089179, 0.114798315, 0.59309036]
[0.263291, 0.15908472, 0.012330348, 0.50509095, 0.15128589, 0.5587

Ok, what we have done is having a query term, then finding the similarity between the query and the reviews.

The idea behind the algorithm would be to reorder the results based on the similarity score, not on BM25. Let's see which one is better.

In [36]:
# Create a dataframe that has as columns the sample reviews and the similarity with the query
reviews_with_similarities = pd.concat([sample_reviews.reset_index(drop=True), pd.Series(similarities).reset_index(drop=True)], axis=1).rename(columns={0: 'similarity', 'text': 'review'})
reviews_with_similarities.shape

(200, 2)

In [38]:
a = reviews_with_similarities.sort_values(by="similarity", ascending=False) # Sort the df_results by similarity column in descending order
a.shape
print(a)

                                                review  similarity
99   Best bruschetta I have had here. Good wine sel...    0.851139
90                          Huge store, friendly staff    0.825329
49   This place is great! I really appreciate the e...    0.764835
40   Jon, Tess, Charlie, Erin, Kathleen and whole g...    0.762752
83                                Fresh and delicious.    0.721806
..                                                 ...         ...
158  Lets just say i was not to please with my expe...   -0.080616
122         Came for the beer, stayed for the scenery.   -0.092666
80   Do not get your oil changed at this walmart or...   -0.166028
139  If I were ever to be executed, I'd want a Meat...   -0.197665
53   Easily the worst "burrito" I have ever had.\nI...   -0.269309

[200 rows x 2 columns]


In [39]:

print(f'Most similar document after reranking within retrieved results has description: \n\n{a["review"].iloc[0]}\n\nWith similarity: {a["similarity"].iloc[0]}\n\n---------\n\n')

Most similar document after reranking within retrieved results has description: 

Best bruschetta I have had here. Good wine selection as well.

My favorite bruschetta are:
-smoked prosciutto, fig, mascarpone, truffle oil
-roasted chicken, sundried tomatoes, goat cheese
-pulled pork, housemade pesto, roasted red pepper
-mozz, basil, roma tomato, qc olive oil & balsamic

With similarity: 0.8511393666267395

---------




In [None]:
print(f'Most similar document before reranking within retrieved results has description: \n\n{reviews_with_similarities["review"].iloc[0]}\n\nWith similarity: {reviews_with_similarities["similarity"].iloc[0]}\n\n---------\n\n')

In [None]:
print(f'Number of documents that surpass 0.5 similarity threshold: {len(a[a["similarity"] >= 0.5])}')

It is remarkable how using DBOW the most similar result understood the need for good prices and good food (which can be said characterizes french food). On the other hand the least similar result is a sports bar, which seems about right as well!.

It is not a perfect method, but a very good indication. A good idea is to have something like this **between** the raw results (thousands), filter them by similarity (hundreds) and then have a learning to rank recommender (dozens).

Tensorflow has opensources TF Recommenders which is great to plug in as an algorithm **after** these results. But this alone would work just fine.
