In [None]:
# Standard library
import sys

# Third-party
import gensim
import numpy as np
import pandas as pd
import pysolr
from gensim.models import Doc2Vec
from transformers import pipeline, set_seed

# Fix the random seeds once, up front, using a single named value for both
# the transformers helper and NumPy's global generator.
SEED = 42
set_seed(SEED)
np.random.seed(SEED)

In [None]:
# Show the current import search path, then switch the working directory to the
# course repository checkout. Later cells use paths relative to the working
# directory (e.g. the Doc2Vec model path), so presumably they expect to run from
# the repository root -- adjust the location below if your clone lives elsewhere.
print(sys.path)
%cd '~/ml-solr-course'

In [None]:
# Short user query that the language model will extend into longer candidates.
query = 'Midtown sunny two bedroom'

# Text-generation pipeline backed by the GPT-2 checkpoint.
generator = pipeline(task='text-generation', model='gpt2')

# Ask for 10 generated sequences for the query, each bounded by max_length=50.
expanded_queries = generator(
    query,
    max_length=50,
    num_return_sequences=10,
)

In [None]:
# Display the raw generator output: one entry per generated candidate, each
# carrying its text under the "generated_text" key.
expanded_queries

In [None]:
# Location of the saved Doc2Vec model, relative to the current working directory.
model_path = '2-ranking/lab4/airbnb_model'

# Load the saved model; it is used below to score text similarity.
doc2vec_model = Doc2Vec.load(model_path)
print('Doc2Vec Model loaded')

In [None]:
# Rank every generated candidate by its Doc2Vec similarity to the original query.
#
# The original query is tokenised once, outside the loop, because it does not
# change between candidates.
query_tokens = gensim.utils.simple_preprocess(query)

queries = []
# The loop variable is deliberately NOT called `expanded_query`: that name is
# used further down for the selected query *string*, while each item here is a
# dict produced by the generator.
for candidate in expanded_queries:
    candidate_text = candidate["generated_text"]
    queries.append({
        "query": candidate_text,
        "similarity": doc2vec_model.similarity_unseen_docs(
            doc_words1=gensim.utils.simple_preprocess(candidate_text),
            doc_words2=query_tokens,
        ),
    })

# Best-scoring candidates first; sort_values returns a new frame by default.
df_queries = pd.DataFrame(queries).sort_values(by="similarity", ascending=False)
df_queries.head()

In [None]:
# df_queries is sorted by descending similarity, so the first row holds the
# candidate text closest to the original query.
expanded_query = df_queries["query"].iloc[0]

In [None]:
# Create a pysolr client for the `airbnb` core of the Solr instance on localhost:8983.
# `timeout=10` is the request timeout handed to pysolr; no authentication is configured.
# NOTE(review): `always_commit=True` presumably only matters for index-modifying
# calls, and the cells visible here only call search() -- confirm whether it is needed.
solr = pysolr.Solr('http://localhost:8983/solr/airbnb', always_commit=True, timeout=10)

In [None]:
# Baseline retrieval: search with the original, unexpanded query and request
# up to 100 rows.
non_expanded_results = solr.search(query, rows=100)

In [None]:
# Display the selected expansion (the top row of df_queries, i.e. the generated
# text with the highest similarity to the original query).
expanded_query

In [None]:
# Retrieval with the expanded query.
#
# The expanded query is free-form generated text, so it can contain characters
# that Solr's standard (Lucene) query parser treats as syntax -- for example
# `:` (field selector), `"` (phrase), `(`/`)` (grouping) or `/` (regex). Sent
# raw, these can trigger parse / undefined-field errors or silently change the
# meaning of the query. Each such character is therefore backslash-escaped so
# that the text is searched as plain terms.
# NOTE(review): upper-case AND / OR / NOT inside the generated text would still
# act as boolean operators -- lower-case the text first if that matters.
_SOLR_SPECIAL_CHARS = '\\+-&|!(){}[]^"~*?:/'
_solr_escape_table = str.maketrans(
    {char: '\\' + char for char in _SOLR_SPECIAL_CHARS}
)
escaped_expanded_query = expanded_query.translate(_solr_escape_table)

# Same row limit as the baseline search so both result sets are comparable.
expanded_results = solr.search(escaped_expanded_query, **{
                'rows': 100,
            })

In [None]:
# Tokenise the original and the expanded query with the same gensim
# preprocessing, yielding one token list per query.
tokenized_query, tokenized_new_query = (
    list(gensim.utils.simple_preprocess(text))
    for text in (query, expanded_query)
)

In [None]:
# Score every baseline (non-expanded) search hit by the Doc2Vec similarity
# between the original query tokens and the hit's description.
similarities = []
for result in non_expanded_results:
    if "description" not in result:
        # Hits without a description cannot be scored; they get 0, as before.
        # Checking the key explicitly (instead of wrapping the model call in
        # `except KeyError`) keeps unrelated KeyErrors from being hidden.
        similarities.append(0)
        continue
    description_tokens = list(gensim.utils.simple_preprocess(result["description"]))
    similarities.append(
        doc2vec_model.similarity_unseen_docs(
            doc_words1=tokenized_query,
            doc_words2=description_tokens,
        )
    )

# Build the frame in one expression: scores are attached positionally (one per
# hit, in iteration order) and the sorted copy is assigned, so re-running the
# cell always starts from the raw search results instead of mutating in place.
df_non_expanded_results = (
    pd.DataFrame(non_expanded_results)
    .assign(Similarity=similarities)
    .sort_values(by="Similarity", ascending=False)
)


In [None]:
# The frame is sorted by descending similarity, so position 0 is the best hit.
top_description_before = df_non_expanded_results["description"].iloc[0]
top_similarity_before = df_non_expanded_results["Similarity"].iloc[0]
print(f'Most similar document before expansion has description: \n\n{top_description_before}\n\nWith similarity: {top_similarity_before}')

In [None]:
# Score every hit returned for the expanded query by the Doc2Vec similarity
# between the query tokens and the hit's description.
# NOTE(review): hits are scored against `tokenized_query` (the ORIGINAL query),
# not `tokenized_new_query`. That measures both result sets against the same
# reference, but leaves `tokenized_new_query` unused -- confirm this is intended.
new_similarities = []
for result in expanded_results:
    if "description" not in result:
        # Hits without a description cannot be scored; they get 0, as before.
        # Checking the key explicitly (instead of wrapping the model call in
        # `except KeyError`) keeps unrelated KeyErrors from being hidden.
        new_similarities.append(0)
        continue
    description_tokens = list(gensim.utils.simple_preprocess(result["description"]))
    new_similarities.append(
        doc2vec_model.similarity_unseen_docs(
            doc_words1=tokenized_query,
            doc_words2=description_tokens,
        )
    )

# Build the frame in one expression: scores are attached positionally (one per
# hit, in iteration order) and the sorted copy is assigned, so re-running the
# cell always starts from the raw search results instead of mutating in place.
df_expanded_results = (
    pd.DataFrame(expanded_results)
    .assign(Similarity=new_similarities)
    .sort_values(by="Similarity", ascending=False)
)

In [None]:
# The frame is sorted by descending similarity, so position 0 is the best hit.
top_description_after = df_expanded_results["description"].iloc[0]
top_similarity_after = df_expanded_results["Similarity"].iloc[0]
print(f'Most similar document after expansion has description: \n\n{top_description_after}\n\nWith similarity: {top_similarity_after}')

In [None]:
# Count the expanded-query hits whose similarity is at least 0.5 by summing the
# boolean mask (True counts as 1).
num_above_threshold = int((df_expanded_results["Similarity"] >= 0.5).sum())
print(f'Number of documents that surpass 0.5 similarity threshold: {num_above_threshold}')
