In [None]:
import gensim
import numpy as np
import pandas as pd
import pysolr
from gensim.models import Doc2Vec

# Seed NumPy's global RNG. Only NumPy is seeded here; nothing in the visible
# cells seeds TensorFlow's RNG.
np.random.seed(42)
import tensorflow as tf
import sys

In [None]:
# In case your sys.path does not contain the base repo, go there.
# Prints the current module search path, then uses the IPython `%cd` magic to
# switch the working directory to ~/ml-solr-course. The model paths loaded in
# later cells are relative, so presumably they are resolved against this
# directory -- confirm the checkout location matches.
print(sys.path)
%cd '~/ml-solr-course'

In [None]:
# The search phrase used throughout the rest of the notebook.
query = 'Midtown sunny two bedroom'

# Load the TensorFlow SavedModel that produces alternative queries.
model_path = '3-query-generation/lab6/alternative_queries'
alternative_queries_model = tf.saved_model.load(model_path)
print('Model loaded')


In [None]:
# Client for the `airbnb` core of the local Solr instance.
# `timeout` and `always_commit` are optional settings.
solr = pysolr.Solr(
    'http://localhost:8983/solr/airbnb',
    timeout=10,
    always_commit=True,
)

In [None]:
# Run the original query against Solr, asking for up to 100 rows.
non_expanded_results = solr.search(query, rows=100)

In [None]:
# Show the first 10 hits returned for the original (non-expanded) query.
# The earlier counter-based loop incremented `i` before testing `i == 10`,
# so it stopped after printing only 9 rows; enumerate() makes the cut-off exact.
for rank, result in enumerate(non_expanded_results):
    if rank >= 10:
        break
    print(f'The Neighborhood is {result["neighbourhood_cleansed"]} and title is {result["name"]}')


In [None]:
# Start generation from the query text and run 75 steps, passing the
# `states` value returned by each call into the next call.
alternative_query = []
next_char = tf.constant([query])
states = None

for step in range(75):
    next_char, states = alternative_queries_model.generate_one_step(next_char, states=states)
    alternative_query.append(next_char)

# Join the per-step outputs and print the first element as UTF-8 text.
generated_text = tf.strings.join(alternative_query)[0].numpy().decode("utf-8")
print(generated_text)


In [None]:
# Build the expanded query: the original query followed by the generated text.
# The lone ' ' element in the join means the two parts end up separated by
# three spaces.
generated_text = tf.strings.join(alternative_query)[0].numpy().decode("utf-8")
new_query = ' '.join((query, ' ', generated_text))

In [None]:
# Display the expanded query string as the cell output.
new_query

In [None]:
# Run the expanded query against Solr, asking for up to 100 rows.
expanded_results = solr.search(new_query, rows=100)

In [None]:
# Show the first 10 hits returned for the expanded query.
# The earlier counter-based loop incremented `i` before testing `i == 10`,
# so it stopped after printing only 9 rows; enumerate() makes the cut-off exact.
for rank, result in enumerate(expanded_results):
    if rank >= 10:
        break
    print(f'The Neighborhood is {result["neighbourhood_cleansed"]} and title is {result["name"]}')

In [None]:
# Load the Doc2Vec model from disk. Note that `model_path` is rebound here
# and no longer points at the alternative-queries model.
model_path = '2-ranking/lab4/airbnb_model'
doc2vec_model = Doc2Vec.load(model_path)
print('Doc2Vec Model loaded')

In [None]:
# Tokenize both the original and the expanded query with gensim's
# simple_preprocess.
tokenized_query, tokenized_new_query = (
    list(gensim.utils.simple_preprocess(text)) for text in (query, new_query)
)

In [None]:
# Display the tokens of the expanded query as the cell output.
tokenized_new_query

In [None]:
# Score each hit of the original query against the tokenized original query
# with Doc2Vec, then tabulate the hits from most to least similar.
similarities = []
for hit in non_expanded_results:
    try:
        description_tokens = list(gensim.utils.simple_preprocess(hit["description"]))
        score = doc2vec_model.similarity_unseen_docs(doc_words1=tokenized_query, doc_words2=description_tokens)
    except KeyError:
        # Any KeyError (e.g. a hit with no "description" field) scores 0.
        score = 0
    similarities.append(score)

df_non_expanded_results = (
    pd.DataFrame(non_expanded_results)
    .assign(Similarity=pd.Series(similarities))
    .sort_values(by="Similarity", ascending=False)
)


In [None]:
# Display the non-expanded results table (sorted by "Similarity", descending).
df_non_expanded_results

In [None]:
# Report the top row of the sorted table: its description and similarity.
top_description_before = df_non_expanded_results["description"].iloc[0]
top_similarity_before = df_non_expanded_results["Similarity"].iloc[0]
print(f'Most similar document before expansion has description: \n\n{top_description_before}\nWith similarity: {top_similarity_before}')

In [None]:
# Score each hit of the expanded query with Doc2Vec, then tabulate the hits
# from most to least similar.
# NOTE(review): hits are scored against `tokenized_query` (the ORIGINAL query),
# not `tokenized_new_query` -- confirm this is the intended comparison.
new_similarities = []
for hit in expanded_results:
    try:
        description_tokens = list(gensim.utils.simple_preprocess(hit["description"]))
        score = doc2vec_model.similarity_unseen_docs(doc_words1=tokenized_query, doc_words2=description_tokens)
    except KeyError:
        # Any KeyError (e.g. a hit with no "description" field) scores 0.
        score = 0
    new_similarities.append(score)

df_expanded_results = (
    pd.DataFrame(expanded_results)
    .assign(Similarity=pd.Series(new_similarities))
    .sort_values(by="Similarity", ascending=False)
)

In [None]:
# Report the top row of the sorted table: its description and similarity.
top_description_after = df_expanded_results["description"].iloc[0]
top_similarity_after = df_expanded_results["Similarity"].iloc[0]
print(f'Most similar document after expansion has description: \n\n{top_description_after}\nWith similarity: {top_similarity_after}')

In [None]:
# Count the expanded-query hits whose similarity is at least 0.5.
above_threshold = df_expanded_results["Similarity"] >= 0.5
print(f'Number of documents that surpass 0.5 similarity threshold: {int(above_threshold.sum())}')
