In [None]:
import sys

import numpy as np
import pysolr
from gensim.models import Doc2Vec

np.random.seed(42)
import smart_open
import pandas as pd
import gensim

In [None]:
# The model and dataset paths used further down are relative, so they resolve
# against the current working directory.
# Print sys.path to check what is importable, then switch the working directory
# to the base repo (adjust the path if your checkout lives somewhere else).
print(sys.path)
%cd '~/ml-solr-course'

In [None]:
# Search settings shared by the cells below.
query = 'Midtown sunny chateau'
number_of_initial_retrieved = 100  # passed to Solr as `rows` for the first-stage retrieval

# Load the saved Doc2Vec model (path is relative to the working directory).
model_path = '2-ranking/lab4/airbnb_model'
model = Doc2Vec.load(model_path)
print('Model loaded')


In [None]:
train_file_path = 'dataset/train_corpus_descriptions_airbnb.csv'


def read_corpus(fname, tokens_only=False):
    """Lazily tokenize a text file, one document per line.

    The file is opened through ``smart_open`` with ISO-8859-1 decoding and each
    line is tokenized with ``gensim.utils.simple_preprocess``.

    Args:
        fname: Path (or anything ``smart_open.open`` accepts) of the file to read.
        tokens_only: When true, yield the bare token list of each line.
            Otherwise wrap the tokens in a ``TaggedDocument`` whose single tag
            is the zero-based line number.

    Yields:
        A list of tokens, or a ``TaggedDocument``, for every line in the file.
    """
    with smart_open.open(fname, encoding="iso-8859-1") as handle:
        for line_no, raw_line in enumerate(handle):
            words = gensim.utils.simple_preprocess(raw_line)
            # Training documents carry a tag; here the tag is the line number.
            document = (
                words
                if tokens_only
                else gensim.models.doc2vec.TaggedDocument(words, [line_no])
            )
            yield document

# read_corpus is a generator; list() consumes it so the whole tagged corpus is held in memory.
# NOTE(review): train_corpus is not referenced by any of the visible cells — confirm it is still needed.
train_corpus = list(read_corpus(train_file_path))

In [None]:
# pysolr client pointed at the `airbnb` core/collection of a Solr server on localhost.
# NOTE(review): always_commit presumably only affects update requests, and the visible
# cells only search — confirm whether it is needed here.
solr = pysolr.Solr('http://localhost:8983/solr/airbnb', always_commit=True, timeout=10)

In [None]:
# First-stage retrieval: ask Solr for at most `number_of_initial_retrieved` hits.
results = solr.search(query, rows=number_of_initial_retrieved)
print(f'Number of results were {len(results)}')

In [None]:
# Tokenize the query with the same gensim preprocessing that read_corpus applies to each corpus line.
tokenized_query = list(gensim.utils.simple_preprocess(query))

In [None]:
tokenized_query

In [None]:
# Infer a Doc2Vec vector for the tokenized query with the loaded model and print it for inspection.
inferred_vector = model.infer_vector(tokenized_query)
print(inferred_vector)

In [None]:
# Score every Solr hit by the Doc2Vec similarity between the query and the
# hit's description; the scores are used afterwards to re-rank the hits.
df_results = pd.DataFrame(results)

# The query is the same for every hit, so tokenize it once instead of once per
# iteration. simple_preprocess already returns a list, so no list() wrapper.
query_tokens = gensim.utils.simple_preprocess(query)

similarities = []
for result in results:
    try:
        similarity = model.similarity_unseen_docs(
            doc_words1=query_tokens,
            doc_words2=gensim.utils.simple_preprocess(result["description"]),
        )
    except KeyError:
        # Best effort: a hit without a "description" field (or any other
        # KeyError raised while scoring) gets a similarity of 0.
        similarity = 0
    similarities.append(similarity)

# One score per hit, in the same order as the rows of df_results. Assigning the
# list directly is positional, so a length mismatch raises instead of being
# silently NaN-filled by index alignment.
df_results["Similarity"] = similarities

In [None]:
df_results.head()

In [None]:
# Re-rank: order the hits by similarity, highest first. sort_values returns a new
# frame, so df_results itself is left in its original order.
a = df_results.sort_values(by="Similarity", ascending=False)

In [None]:
# Keep at most the first ten rows of `a` and renumber them from 0.
a = a.iloc[:10].reset_index(drop=True)

In [None]:
# Row 0 of `a` is the first row of the similarity-ordered frame, i.e. the best hit after re-ranking.
print(f'Most similar document after reranking within retrieved results has description: \n\n{a["description"].iloc[0]}\nWith similarity: {a["Similarity"].iloc[0]}')

In [None]:
# df_results was built straight from the Solr results and never re-sorted, so its row 0 is
# the first hit as returned by Solr; shown with its Doc2Vec similarity for comparison.
print(f'Most similar document before reranking within retrieved results has description: \n\n{df_results["description"].iloc[0]}\nWith similarity: {df_results["Similarity"].iloc[0]}')


In [None]:
# Count the rows of `a` whose similarity reaches the 0.5 cut-off.
# NOTE(review): `a` was cut down to its first ten rows earlier, so this count
# covers those rows only, not every retrieved hit — confirm that is intended.
above_threshold = a["Similarity"] >= 0.5
print(f'Number of documents that surpass 0.5 similarity threshold: {above_threshold.sum()}')