In [6]:
# Standard library
import sys

# Third-party
import gensim
import numpy as np
import pandas as pd
import pysolr
from gensim.models import Doc2Vec
from transformers import pipeline, set_seed

# Seed transformers and NumPy up front so that repeated runs are comparable.
set_seed(42)
np.random.seed(42)

In [7]:
# Print the module search path, then change the working directory to the base repo
# in case sys.path does not already contain it.
# NOTE(review): the directory below is an absolute, machine-specific path; edit it to
# point at your own checkout before running. Relative paths used later in the notebook
# presumably rely on this working directory -- confirm.
print(sys.path)
%cd '/Users/axelsirota/repos/ml-solr-course'

['/Users/axelsirota/repos/ml-solr-course/3-query-generation/lab8', '/Users/axelsirota/.pyenv/versions/3.7.3/lib/python37.zip', '/Users/axelsirota/.pyenv/versions/3.7.3/lib/python3.7', '/Users/axelsirota/.pyenv/versions/3.7.3/lib/python3.7/lib-dynload', '', '/Users/axelsirota/repos/ml-solr-course/.venv/lib/python3.7/site-packages', '/Users/axelsirota/repos/ml-solr-course/.venv/lib/python3.7/site-packages/IPython/extensions', '/Users/axelsirota/.ipython']
/Users/axelsirota/repos/ml-solr-course


In [10]:
# Build a GPT-2 text-generation pipeline and use it to expand the user's query.
generator = pipeline('text-generation', model='gpt2')
query = 'Midtown sunny two bedroom'

# Ask for a single continuation (max_length=50) and keep only its generated text.
generated_sequences = generator(query, max_length=50, num_return_sequences=1)
expanded_query = generated_sequences[0]['generated_text']

All model checkpoint layers were used when initializing TFGPT2LMHeadModel.

All the layers of TFGPT2LMHeadModel were initialized from the model checkpoint at gpt2.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFGPT2LMHeadModel for predictions without further training.
All model checkpoint layers were used when initializing TFGPT2Model.

All the layers of TFGPT2Model were initialized from the model checkpoint at gpt2.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFGPT2Model for predictions without further training.
Setting `pad_token_id` to 50256 (first `eos_token_id`) to generate sequence


In [11]:
# Client for the `airbnb` Solr endpoint on localhost.
# `always_commit` and `timeout` are optional settings and may be omitted.
solr = pysolr.Solr(
    'http://localhost:8983/solr/airbnb',
    always_commit=True,
    timeout=10,
)

In [15]:
# Baseline search: the original query as typed, limited to the first 10 rows.
non_expanded_results = solr.search(query, rows=10)

In [16]:
# List the id and title of every hit returned for the original query.
for doc in non_expanded_results:
    listing_id, title = doc["id"], doc["name"]
    print(f'The ID is {listing_id} and title is {title}')


The ID is 24894476 and title is Super Large Bedroom in Harlem
The ID is 30811958 and title is Sunny 2 Bedroom Penthouse with Good Vibes
The ID is 18971993 and title is Sunny Bedroom Steps to Subway & Central Park
The ID is 27903031 and title is Lovely Room/ private Bath in Prewar Midtown Gem
The ID is 21422090 and title is Bedroom 5 min away from E, 7, F, M, R trains!
The ID is 18108648 and title is Large 2bd 15 min to Midtown Manhattan
The ID is 982276 and title is *Perfect Bedroom! Large Sunny Apt*Manhattan*NYC
The ID is 4221999 and title is Spacious & Bright: 1 BR Midtown!
The ID is 16339429 and title is Quiet Room in Unique Carroll Gardens
The ID is 387666 and title is Historic townhouse in convenient location


In [17]:
# Inspect the text GPT-2 generated from the original query before it is sent to Solr.
expanded_query

'Midtown sunny two bedroom, two bathrooms available. A few bedrooms.\n\n1-800-273-8255\n\nwww.jimsonjames.com\n\n(206) 522-2046\n\nwww.n'

In [19]:
# Same search as the baseline, but using the GPT-2 expanded query text (first 10 rows).
# NOTE(review): the generated text is passed to Solr verbatim; characters such as ':'
# or unbalanced quotes/parentheses may be interpreted as query syntax -- confirm
# whether the text should be escaped or cleaned first.
expanded_results = solr.search(expanded_query, rows=10)

In [20]:
# List the id and title of every hit returned for the expanded query.
for doc in expanded_results:
    listing_id, title = doc["id"], doc["name"]
    print(f'The ID is {listing_id} and title is {title}')


The ID is 30562364 and title is Stunning 2,000 sq ft 2BR in Williamsburg
The ID is 20607037 and title is Midtown East 2 Bed/2 Bath Elevator Building Apt
The ID is 43324727 and title is 2-bedroom apartment right next to Mt. Sinai West
The ID is 387666 and title is Historic townhouse in convenient location
The ID is 1150869 and title is Large 2-bedr UWS Apt $200.00/day, May- Sept 2020
The ID is 11486601 and title is Beautiful Spacious Loft near subway and train
The ID is 3395238 and title is 3 Bedroom House with Garden
The ID is 21422090 and title is Bedroom 5 min away from E, 7, F, M, R trains!
The ID is 48102119 and title is 3 Bedroom Apartment in Midtown West Manhattan
The ID is 295231 and title is Sunny, clean br available


In [21]:
# Load the previously trained Doc2Vec model used to score query/description similarity.
# The path is relative, so it is resolved against the current working directory
# (the notebook changes directory in an earlier cell).
model_path = '2-ranking/lab4/airbnb_model'
doc2vec_model = Doc2Vec.load(model_path)
# Plain string: the message has no placeholders, so no f-string is needed.
print('Doc2Vec Model loaded')

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
Doc2Vec Model loaded


In [22]:
# Tokenize both the original and the expanded query with gensim's simple_preprocess,
# which already returns a list of tokens.
tokenized_query = gensim.utils.simple_preprocess(query)
tokenized_new_query = gensim.utils.simple_preprocess(expanded_query)

In [23]:
# Show the tokens that simple_preprocess produced from the expanded query.
tokenized_new_query

['midtown',
 'sunny',
 'two',
 'bedroom',
 'two',
 'bathrooms',
 'available',
 'few',
 'bedrooms',
 'www',
 'jimsonjames',
 'com',
 'www']

In [24]:
# Score every hit of the un-expanded search by Doc2Vec similarity between the
# tokenized query and the hit's tokenized description, then rank the hits by it.
similarities = []
for result in non_expanded_results:
    # Guard only the field lookup: a hit without a description scores 0, while a
    # KeyError raised inside the model is no longer silently turned into a 0 score.
    try:
        description = result["description"]
    except KeyError:
        similarity = 0
    else:
        similarity = doc2vec_model.similarity_unseen_docs(
            doc_words1=tokenized_query,
            doc_words2=gensim.utils.simple_preprocess(description),
        )
    similarities.append(similarity)

# Build the ranked frame in one chain (no in-place mutation); the scores are in the
# same order as the hits, so they are attached positionally.
df_non_expanded_results = (
    pd.DataFrame(non_expanded_results)
    .assign(Similarity=similarities)
    .sort_values(by="Similarity", ascending=False)
)


In [25]:
# Display the un-expanded hits, ordered by descending similarity to the query.
df_non_expanded_results

Unnamed: 0,id,listing_url,name,description,neighborhood_overview,host_id,host_url,host_name,neighbourhood,neighbourhood_cleansed,...,accommodates,bathrooms_text,bedrooms,beds,amenities,price,number_of_reviews,reviews_per_month,_version_,Similarity
0,24894476,https://www.airbnb.com/rooms/24894476,Super Large Bedroom in Harlem,Large and sunny bedroom in Harlem. Two blocks ...,Sugar Hill is a landmark residential area in N...,15940253,https://www.airbnb.com/users/show/15940253,Raimundo,"New York, United States",Harlem,...,2,1 shared bath,1.0,1.0,"[""Shampoo"", ""Wifi"", ""Long term stays allowed"",...",$72.00,4,[0.14],1705862869734653954,0.512166
1,30811958,https://www.airbnb.com/rooms/30811958,Sunny 2 Bedroom Penthouse with Good Vibes,Welcome to NYC!!!<br />Spacious Private Sunny ...,"Jacky Robinson park, New York sports club, man...",230498530,https://www.airbnb.com/users/show/230498530,Patricia,"New York, United States",Harlem,...,5,1 bath,2.0,2.0,"[""Refrigerator"", ""Essentials"", ""Dryer"", ""Long ...",$95.00,3,[0.13],1705862870632235009,0.503906
6,982276,https://www.airbnb.com/rooms/982276,*Perfect Bedroom! Large Sunny Apt*Manhattan*NYC,"Large sunny apt in historic Chinatown, Lower M...","I live in Chinatown, Lower East Side. I love t...",1449904,https://www.airbnb.com/users/show/1449904,Edel,"New York, United States",Two Bridges,...,1,1.5 baths,1.0,1.0,"[""Refrigerator"", ""Essentials"", ""Long term stay...",$98.00,49,[0.51],1705862865376772098,0.492481
4,21422090,https://www.airbnb.com/rooms/21422090,"Bedroom 5 min away from E, 7, F, M, R trains!",Sunny bedroom available in a two bedroom apart...,"The neighborhood is very quiet and dynamic, fu...",106766048,https://www.airbnb.com/users/show/106766048,Chelsea,"Queens, New York, United States",Jackson Heights,...,2,1 shared bath,1.0,1.0,"[""Kitchen"", ""Essentials"", ""Hangers"", ""Heating""...",$35.00,12,[0.42],1705862869131722752,0.487014
2,18971993,https://www.airbnb.com/rooms/18971993,Sunny Bedroom Steps to Subway & Central Park,Sunny queen size bedroom in a large two bedroo...,,128237188,https://www.airbnb.com/users/show/128237188,Mante,,East Harlem,...,2,1 shared bath,1.0,1.0,"[""Kitchen"", ""TV"", ""Carbon monoxide alarm"", ""Ha...",$79.00,91,[1.97],1705862868682932224,0.44807
9,387666,https://www.airbnb.com/rooms/387666,Historic townhouse in convenient location,"Welcome to Michaelny, our town house in the Ju...",The Morris Jumel HIstoric District is a few bl...,1939728,https://www.airbnb.com/users/show/1939728,Michael & Aleksandra,"New York, United States",Washington Heights,...,2,2 shared baths,2.0,3.0,"[""Cable TV"", ""Refrigerator"", ""Essentials"", ""Lo...",$99.00,9,[0.29],1705862865104142338,0.437023
5,18108648,https://www.airbnb.com/rooms/18108648,Large 2bd 15 min to Midtown Manhattan,Large and sunny 2 bedroom apartment. Elevator ...,"Located in Astoria, one of the best neighborho...",10194688,https://www.airbnb.com/users/show/10194688,Mariela,"Queens, New York, United States",Ditmars Steinway,...,6,1 bath,2.0,4.0,"[""Cable TV"", ""Refrigerator"", ""Essentials"", ""Lo...",$176.00,152,[3.16],1705862868544520194,0.421073
3,27903031,https://www.airbnb.com/rooms/27903031,Lovely Room/ private Bath in Prewar Midtown Gem,Very well located elegant 3 bedroom sunny prew...,This is a great family apartment in the middle...,7580102,https://www.airbnb.com/users/show/7580102,Jamie,"New York, United States",Midtown,...,2,1 shared bath,1.0,1.0,"[""Cable TV"", ""Essentials"", ""Dryer"", ""Long term...",$175.00,5,[0.16],1705862870150938624,0.393158
7,4221999,https://www.airbnb.com/rooms/4221999,Spacious & Bright: 1 BR Midtown!,Our spacious and clean one bedroom apartment s...,"The neighborhood and our block is residential,...",20335235,https://www.airbnb.com/users/show/20335235,Stephen,"New York, United States",Midtown,...,2,1 bath,1.0,1.0,"[""Kitchen"", ""Cable TV"", ""TV"", ""Carbon monoxide...",$225.00,10,[0.13],1705862866067783680,0.354973
8,16339429,https://www.airbnb.com/rooms/16339429,Quiet Room in Unique Carroll Gardens,A sunny furnished room in quiet building on th...,,106977720,https://www.airbnb.com/users/show/106977720,Kristan,,Carroll Gardens,...,1,2 shared baths,1.0,1.0,"[""Kitchen"", ""Dedicated workspace"", ""Wifi"", ""TV...",$40.00,2,[0.04],1705862868306493444,0.338038


In [30]:
# The frame is sorted by descending similarity, so position 0 is the best match.
top_description_before = df_non_expanded_results["description"].iloc[0]
top_similarity_before = df_non_expanded_results["Similarity"].iloc[0]
print(f'Most similar document before expansion has description: \n\n{top_description_before}\n\nWith similarity: {top_similarity_before}')

Most similar document before expansion has description: 

Large and sunny bedroom in Harlem. Two blocks from A and D express train (that will take you to midtown in 20 minutes) and 7 minutes to the 1 train. Plenty of stores, restaurants, cafes, parks, supermarkets nearby, but still a very quiet apartment. Beautiful area.<br /><br /><b>The space</b><br />The room is very large and sunny. Two windows that receive amazing light and a nice view to two large trees :)

With similarity: 0.5121657252311707


In [27]:
# Score every hit of the expanded search by Doc2Vec similarity between the tokenized
# query and the hit's tokenized description, then rank the hits by it.
# NOTE(review): scoring uses `tokenized_query` (the original query), not
# `tokenized_new_query` -- presumably so both result sets are judged against the
# user's original intent; confirm this is intended.
new_similarities = []
for result in expanded_results:
    # Guard only the field lookup: a hit without a description scores 0, while a
    # KeyError raised inside the model is no longer silently turned into a 0 score.
    try:
        description = result["description"]
    except KeyError:
        similarity = 0
    else:
        similarity = doc2vec_model.similarity_unseen_docs(
            doc_words1=tokenized_query,
            doc_words2=gensim.utils.simple_preprocess(description),
        )
    new_similarities.append(similarity)

# Build the ranked frame in one chain (no in-place mutation); the scores are in the
# same order as the hits, so they are attached positionally.
df_expanded_results = (
    pd.DataFrame(expanded_results)
    .assign(Similarity=new_similarities)
    .sort_values(by="Similarity", ascending=False)
)

In [31]:
# The frame is sorted by descending similarity, so position 0 is the best match.
top_description_after = df_expanded_results["description"].iloc[0]
top_similarity_after = df_expanded_results["Similarity"].iloc[0]
print(f'Most similar document after expansion has description: \n\n{top_description_after}\n\nWith similarity: {top_similarity_after}')

Most similar document after expansion has description: 

Spacious & beautiful two bedroom, two bathroom apartment available in Midtown Manhattan. Great for families or couples, this apartment has two large bedrooms (queen beds), 2 sleek bathrooms, updated kitchen and large living room. Equipped with wifi, cable TV, dishwasher and more! Elevator building just blocks from Grand Central Station, offering easy access to subways, buses and all that midtown has to offer! Feel free to reach out with specific questions.

With similarity: 0.5262804627418518


In [29]:
# Count the expanded-search hits whose similarity is at least 0.5.
above_threshold = df_expanded_results["Similarity"] >= 0.5
print(f'Number of documents that surpass 0.5 similarity threshold: {int(above_threshold.sum())}')


Number of documents that surpass 0.5 similarity threshold: 1
