In [1]:
import random
import sys

import numpy as np
import pysolr
from gensim.models import Doc2Vec

np.random.seed(42)
import smart_open
import pandas as pd
import gensim
from gensim.models.callbacks import CallbackAny2Vec
from gensim.test.utils import get_tmpfile

In [2]:
# In case your sys.path does not contain the base repo, go there.
# Printing sys.path first shows which directories the kernel searches for modules.
print(sys.path)
# IPython magic: change the kernel's working directory so that the relative
# paths used in later cells (model and dataset files) resolve from this folder.
# NOTE(review): this is an absolute, machine-specific path; adjust it to your own checkout.
%cd '/Users/axelsirota/repos/ml-solr-course'

['/Users/axelsirota/repos/ml-solr-course/2-ranking/lab5', '/Users/axelsirota/.pyenv/versions/3.7.3/lib/python37.zip', '/Users/axelsirota/.pyenv/versions/3.7.3/lib/python3.7', '/Users/axelsirota/.pyenv/versions/3.7.3/lib/python3.7/lib-dynload', '', '/Users/axelsirota/repos/ml-solr-course/.venv/lib/python3.7/site-packages', '/Users/axelsirota/repos/ml-solr-course/.venv/lib/python3.7/site-packages/IPython/extensions', '/Users/axelsirota/.ipython']
/Users/axelsirota/repos/ml-solr-course


In [3]:
# Retrieval settings: the free-text query and how many Solr hits to fetch
# before they are reranked.
query = 'Midtown sunny chateau'
number_of_initial_retrieved = 100

# Load the saved Doc2Vec model; the path is relative to the working directory.
model_path = '2-ranking/lab4/airbnb_model'
model = Doc2Vec.load(model_path)
print('Model loaded')


Model loaded


In [4]:
train_file_path = 'dataset/train_corpus_descriptions_airbnb.csv'


def read_corpus(fname, tokens_only=False):
    """Yield one tokenized document per line of ``fname``.

    Every line is decoded as ISO-8859-1 and tokenized with
    ``gensim.utils.simple_preprocess``.

    Args:
        fname: Location of the text file, passed straight to ``smart_open.open``.
        tokens_only: When true, yield the bare token list; otherwise yield a
            ``TaggedDocument`` whose single tag is the zero-based line number.

    Yields:
        A list of tokens, or a ``TaggedDocument`` wrapping that list.
    """
    with smart_open.open(fname, encoding="iso-8859-1") as corpus_file:
        for line_number, raw_line in enumerate(corpus_file):
            words = gensim.utils.simple_preprocess(raw_line)
            if not tokens_only:
                # For training data, add tags
                yield gensim.models.doc2vec.TaggedDocument(words, [line_number])
            else:
                yield words


# Materialize the whole tagged corpus in memory.
train_corpus = list(read_corpus(train_file_path))

In [5]:
# Client bound to the "airbnb" core of the Solr instance on localhost.
solr = pysolr.Solr(
    'http://localhost:8983/solr/airbnb',
    timeout=10,
    always_commit=True,
)

In [6]:
# First-pass retrieval: ask Solr for the candidate documents matching the query.
results = solr.search(query, rows=number_of_initial_retrieved)
print(f'Number of results were {len(results)}')

Number of results were 100


In [7]:
# Tokenize the query with the same preprocessing used for the corpus lines.
# simple_preprocess already returns a list, so no extra copy is needed.
tokenized_query = gensim.utils.simple_preprocess(query)

In [8]:
# Display the query tokens produced in the previous cell.
tokenized_query

['midtown', 'sunny', 'chateau']

In [9]:
# Infer an embedding for the tokenized query with the loaded Doc2Vec model
# and print it for inspection.
# NOTE(review): no later cell in this notebook reads inferred_vector; the
# reranking cell lets the model infer vectors itself.
inferred_vector = model.infer_vector(tokenized_query)
print(inferred_vector)

[-0.13317741 -0.08327162 -0.05896323  0.198863    0.08837474 -0.10128927
 -0.03774839  0.12804486 -0.11637027  0.07752918 -0.08379339 -0.24341781
 -0.1152764  -0.0380342  -0.01162513  0.00096184  0.15842196 -0.03149448
 -0.06447918 -0.04966461  0.05297375  0.04499943  0.26300025 -0.00495637
 -0.00811639 -0.01237526 -0.31035635 -0.04554832 -0.08744516  0.13197044
 -0.07512539 -0.07277572  0.27309597 -0.03244337 -0.1021488   0.03336933
  0.1033741   0.05921059  0.08747765 -0.22919369  0.05673542 -0.03701238
 -0.05196878  0.06557074  0.05705949  0.18787165  0.11636234 -0.08702054
  0.16798095 -0.04939402  0.12859981 -0.09130021 -0.11478294  0.09781408
 -0.24457586 -0.13920201  0.04701333  0.05600748 -0.11426292  0.32006794
  0.08258063  0.05610396  0.17860837  0.1011209  -0.1183629   0.06168132
  0.19987011  0.3221601  -0.20740926  0.17994316 -0.05479532 -0.02522329
  0.03568282 -0.02662557  0.08518705  0.08833198  0.0362552   0.07632203
 -0.08863655  0.09839624 -0.08653539  0.07403546  0

In [10]:
# Rerank the Solr hits: score every retrieved document by the Doc2Vec
# similarity between its description and the query.
df_results = pd.DataFrame(results)

# The query tokens are the same for every document, so tokenize once
# instead of on every loop iteration.
query_tokens = list(gensim.utils.simple_preprocess(query))

similarities = []
for result in results:
    try:
        description = result["description"]
    except KeyError:
        # A document without a description cannot be compared; score it 0.
        # The try block covers only this lookup so that an unrelated KeyError
        # raised while scoring is not silently turned into a 0.
        similarity = 0
    else:
        similarity = model.similarity_unseen_docs(
            doc_words1=query_tokens,
            doc_words2=list(gensim.utils.simple_preprocess(description)),
        )
    similarities.append(similarity)

# One score per retrieved document, in retrieval order.
df_results["Similarity"] = pd.Series(similarities)

In [11]:
# Peek at the first rows, which now include the "Similarity" column.
df_results.head()

Unnamed: 0,id,listing_url,name,description,neighborhood_overview,host_id,host_url,host_name,neighbourhood,neighbourhood_cleansed,...,accommodates,bathrooms_text,bedrooms,beds,amenities,price,number_of_reviews,reviews_per_month,_version_,Similarity
0,10684164,https://www.airbnb.com/rooms/10684164,Private 1 Bdrm Apt - Prospect Park,"Located in the beautiful Chateau Frontenac, Br...","A nice retreat from the city, this area of Pro...",19227044,https://www.airbnb.com/users/show/19227044,Adam,"Brooklyn, New York, United States",Flatbush,...,4,1 bath,1.0,2.0,"[""Kitchen"", ""TV"", ""Essentials"", ""Washer"", ""Dry...",$75.00,1,[0.02],1705862867379552257,0.183487
1,4248788,https://www.airbnb.com/rooms/4248788,W70s Lg. Studio with outdoor space,Welcome to the chateau 70-something! This larg...,Charming townhouse on tree-lined Street. <br /...,22047286,https://www.airbnb.com/users/show/22047286,Chloe,"New York, United States",Upper West Side,...,2,1 bath,,1.0,"[""Heating"", ""TV"", ""Fire extinguisher"", ""Carbon...",$250.00,5,[0.07],1705862866071977984,0.259785
2,48270841,https://www.airbnb.com/rooms/48270841,Romantic East Village Getaway,Welcome to the french chateau in the trees. Th...,The apartment is located in the heart of the E...,202489613,https://www.airbnb.com/users/show/202489613,Louis,"New York, United States",Gramercy,...,2,1 bath,1.0,1.0,"[""Kitchen"", ""TV"", ""Hangers"", ""Washer"", ""Essent...",$120.00,0,,1705862874349436928,0.221894
3,39920510,https://www.airbnb.com/rooms/39920510,Large Elevator 1 Bedroom Best Location,Sunny & Spacious furnished apartment with an a...,,61391963,https://www.airbnb.com/users/show/61391963,Stay With Vibe,,Midtown,...,3,1 bath,1.0,2.0,"[""Essentials"", ""Dryer"", ""Long term stays allow...",$87.00,4,[0.26],1705862872164204548,0.610566
4,15721847,https://www.airbnb.com/rooms/15721847,Colorful Artist Loft w. your own Ocean Oasis,"The Broadway Chateau, as it's known, is a uniq...",We are in a classic south Williamsburg nabe fi...,909710,https://www.airbnb.com/users/show/909710,Claire,"Brooklyn, New York, United States",Williamsburg,...,1,1 shared bath,1.0,1.0,"[""Dishes and silverware"", ""Smoke alarm"", ""Stov...",$49.00,17,[0.32],1705862868196392963,0.143336


In [12]:
# Order the rows from most to least similar to the query.
# Reassigning the sorted copy avoids mutating, in place, a frame that an
# earlier cell already displayed.
df_results = df_results.sort_values(by="Similarity", ascending=False)

In [13]:
# Keep the first ten rows of the frame and renumber them starting from 0.
a = df_results.head(10).reset_index(drop=True)

In [14]:
# Display the rows selected in the previous cell.
a

Unnamed: 0,id,listing_url,name,description,neighborhood_overview,host_id,host_url,host_name,neighbourhood,neighbourhood_cleansed,...,accommodates,bathrooms_text,bedrooms,beds,amenities,price,number_of_reviews,reviews_per_month,_version_,Similarity
0,39920510,https://www.airbnb.com/rooms/39920510,Large Elevator 1 Bedroom Best Location,Sunny & Spacious furnished apartment with an a...,,61391963,https://www.airbnb.com/users/show/61391963,Stay With Vibe,,Midtown,...,3,1 bath,1.0,2.0,"[""Essentials"", ""Dryer"", ""Long term stays allow...",$87.00,4,[0.26],1705862872164204548,0.610566
1,19160912,https://www.airbnb.com/rooms/19160912,Spacious Guest room 3 blocks from NY Presbyter...,"Spacious guest bedroom available in sunny, cha...",,1728792,https://www.airbnb.com/users/show/1728792,Brittany,,Washington Heights,...,2,1 shared bath,1.0,1.0,"[""Hot water"", ""Essentials"", ""Smoke alarm"", ""Wi...",$76.00,4,[0.16],1705862868714389505,0.58536
2,46962568,https://www.airbnb.com/rooms/46962568,15 foot ceiling Bright Studio - Madison Avenue!,"Open Studio with vaulted 15 foot ceilings, per...",Located in midtown south in a pre classic pre-...,86771798,https://www.airbnb.com/users/show/86771798,Daia,"New York, United States",Midtown,...,2,1 bath,,1.0,"[""Kitchen"", ""TV"", ""Carbon monoxide alarm"", ""Ha...",$79.00,0,,1705862873947832320,0.504169
3,25971425,https://www.airbnb.com/rooms/25971425,"Big, beautiful, sunny room in Hamilton Heights","Big sunny room!<br />Close to 1, A B C D train...","Hamilton Heights is an amazing, vibrant part o...",54369863,https://www.airbnb.com/users/show/54369863,,"New York, United States",Harlem,...,2,1 shared bath,1.0,1.0,"[""Lock on bedroom door"", ""Dryer"", ""Essentials""...",$55.00,1,[0.03],1705862869853143041,0.493059
4,11584340,https://www.airbnb.com/rooms/11584340,Super Cute and Sunny Studio,This sunny studio is located in the heart of H...,,10563705,https://www.airbnb.com/users/show/10563705,Hailey,,Washington Heights,...,2,1 bath,,1.0,"[""Kitchen"", ""TV"", ""Carbon monoxide alarm"", ""Es...",$110.00,0,,1705862867488604161,0.466355
5,4707057,https://www.airbnb.com/rooms/4707057,"Private, sunny room!",Located in northern Harlem and close to the tr...,"Great neighborhood, quiet and friendly people....",6327629,https://www.airbnb.com/users/show/6327629,Bb,"New York, United States",Harlem,...,1,1.5 baths,1.0,1.0,"[""Kitchen"", ""Washer"", ""Dryer"", ""Wifi"", ""Long t...",$90.00,1,[0.01],1705862866169495552,0.445695
6,26291721,https://www.airbnb.com/rooms/26291721,Gem in NoMad - entire apartment,Beautiful and very cozy renovated 1 bedroom ap...,Nomad is a blooming neighborhood near the Empi...,197675041,https://www.airbnb.com/users/show/197675041,Alexandra,"New York, United States",Midtown,...,2,1 bath,1.0,1.0,"[""Heating"", ""Shampoo"", ""Smoke alarm"", ""Carbon ...",$149.00,7,[0.34],1705862869890891776,0.408655
7,13912725,https://www.airbnb.com/rooms/13912725,Master bedroom with a California king size bed,One sunny master bedroom located on the fourth...,Astoria is a very diverse neighborhood with a ...,8163438,https://www.airbnb.com/users/show/8163438,PeiYoung,"Queens, New York, United States",Astoria,...,2,1 private bath,1.0,1.0,"[""Hair dryer"", ""Shampoo"", ""Kitchen"", ""Breakfas...",$70.00,0,,1705862867927957504,0.396342
8,39411928,https://www.airbnb.com/rooms/39411928,Modern Sunny Apartment - 10 mins to Manhattan,Looking for a SHORT TERM SUB-LETTER for 3 mont...,Astoria has an amazing selection of restaurant...,927072,https://www.airbnb.com/users/show/927072,Hind,"Queens, New York, United States",Astoria,...,4,1 bath,1.0,2.0,"[""Shampoo"", ""Kitchen"", ""Private entrance"", ""De...",$90.00,0,,1705862872004820992,0.39527
9,25635162,https://www.airbnb.com/rooms/25635162,LARGE SUNNY 1BD 10 MIN TO MANHATTAN,"Steps to the Subway Stop, 15 min to Midtown Ma...",,193126561,https://www.airbnb.com/users/show/193126561,,,Astoria,...,4,1 bath,1.0,2.0,"[""Kitchen"", ""TV"", ""Hot water"", ""Hangers"", ""Ess...",$109.00,70,[2.13],1705862869819588610,0.391785


In [15]:
# Report the description and similarity stored in the first row of `a`.
best_description = a["description"].iloc[0]
best_similarity = a["Similarity"].iloc[0]
print(f'Most similar document after reranking within retrieved results has description: \n\n{best_description}\nWith similarity: {best_similarity}')

Most similar document after reranking within retrieved results has description: 

Sunny & Spacious furnished apartment with an amazing layout located right in the heart of Midtown East.
With similarity: 0.610565721988678


In [16]:
# Count the rows of `a` whose similarity is at least 0.5.
# NOTE(review): `a` holds only a slice of the reranked results, so this count
# can never exceed its length; confirm that is the intended scope.
above_threshold = a[a["Similarity"] >= 0.5]
print(f'Number of documents that surpass 0.5 similarity threshold: {len(above_threshold)}')

Number of documents that surpass 0.5 similarity threshold: 3
