# Homework 3 - Find the perfect place to stay in Texas!

###### Alessandro Flaborea, Egon Ferri, Melis Kaymaz

The homework consists of analyzing the text of Airbnb property listings and building a search engine.

In [68]:
import pandas as pd
from nltk.tokenize import sent_tokenize, word_tokenize, PunktSentenceTokenizer
from nltk.corpus import stopwords
import string
from nltk.stem import PorterStemmer
from nltk import pos_tag, ne_chunk
import nltk
import math
import json
from geopy import distance
import numpy
import heapq_max


## Step 2: Create documents

We want to create a `.tsv` file for each record of the dataset.
The first thing to do is to read the file.

In [573]:
# Load the CSV with every Airbnb listing in Texas into a DataFrame.
f = pd.read_csv(
    r'C:\Users\mccol\Desktop\Sapienza\ADM\HW3\Airbnb_Texas_Rentals.csv'
)

# Store each nightly rate as text with its first character dropped
# (presumably the leading '$' sign) in a new `price_rank` column.
f['price_rank'] = [str(rate)[1:] for rate in f['average_rate_per_night']]


Now we can create `.tsv` files and store them in a directory.

In [None]:
# Write one tab-separated file per listing: doc_<i>.tsv holds the first ten
# columns of row i, each value followed by a tab character.
for i in range(f.index.max()+1):
    doc_path = r'C:\Users\mccol\Desktop\Sapienza\ADM\HW3\doc\doc_' + str(i) + '.tsv'
    # `with` guarantees the handle is closed even if a write fails midway.
    with open(doc_path, 'w', encoding="utf-8") as op:
        for j in range(10):
            op.write('%s\t' %f.iloc[i, j])

## Step 3: Search Engine

Now, we want to create two different Search Engines that, given as input a query, return the houses that match the query.

As a first common step, we want to preprocess the documents by

1. Removing stopwords
2. Removing punctuation
3. Stemming

Then we want to build a file named `vocabulary.txt`, that maps each word to an integer (`term_id`).

In [574]:
#FUNCTIONS

def preprocess(l):
    """Stem every token and drop stopwords and punctuation.

    Takes a list of word tokens and returns a new list holding the stem of
    each token, in the original order. A stem is skipped when it is a member
    of ``stopWords`` or when it occurs inside the ``string.punctuation``
    string (a substring test, since that value is a plain string).
    """
    kept = []
    for token in l:
        stem = ps.stem(token)
        if stem in stopWords or stem in string.punctuation:
            continue
        kept.append(stem)
    return kept

def vocabularization(vocabulary, final, index):
    """Assign a term id to every word of `final` not yet in `vocabulary`.

    Ids are handed out consecutively starting from `index` and are stored as
    strings. `vocabulary` is updated in place.

    Returns a tuple (vocabulary, next_free_index).
    """
    next_id = index
    for token in final:
        if token in vocabulary:
            continue
        vocabulary[token] = str(next_id)
        next_id += 1
    return vocabulary, next_id

# Porter stemmer instance shared by preprocess().
ps = PorterStemmer()
# English stopwords, kept in a set.
stopWords = set(stopwords.words('english'))
# Extend the punctuation characters with a dash and typographic quotes.
# Note: this rebinds the attribute on the standard `string` module itself.
string.punctuation += '–“”’'


In [71]:
# Build the vocabulary: every distinct preprocessed term found in fields 5 and
# 8 of each document (presumably description and title) gets an integer id.
vocabulary= {}
index = 0

for i in range(18259):
    doc_path = r'C:\Users\mccol\Desktop\Sapienza\ADM\HW3\doc\doc_' + str(i) + '.tsv'
    # `with` closes the document even if parsing a line raises.
    with open(doc_path, 'r', encoding="utf-8") as op:
        for line in op:
            ou = line.strip().split('\t')
            sentence = ou[5] + ' ' + ou[8]
            # Replace literal escape sequences and separators with spaces so
            # the tokenizer sees clean word boundaries.
            for junk in ('\\n', '/', '*', '\\r', '\\t'):
                sentence = sentence.replace(junk, ' ')

    # Stem the tokens and drop stopwords and punctuation.
    final = preprocess(word_tokenize(sentence))

    # Add every term that is not in the vocabulary yet.
    vocabulary, index = vocabularization(vocabulary, final, index)

# Persist the vocabulary as JSON so later cells can reload it.
with open(r'C:\Users\mccol\Desktop\Sapienza\ADM\HW3\vocabulary.txt', 'w', encoding="utf-8") as op:
    op.write(json.dumps(vocabulary))

In [575]:
# Load the vocabulary (term -> term_id) saved in vocabulary.txt.
# `with` closes the file once it has been read.
with open(r'C:\Users\mccol\Desktop\Sapienza\ADM\HW3\vocabulary.txt', 'r', encoding="utf-8") as op:
    vocabulary = json.loads(op.read())

### 3.1) Conjunctive query
For now, we narrow our interest to the `description` and `title` of each document. This means that the first Search Engine will evaluate queries with respect to these two fields.

#### 3.1.1) Creating our index!

We want to create the Inverted Index. It will be a dictionary of this format:

```
{
term_id_1:[document_1, document_2, document_4],
term_id_2:[document_1, document_3, document_5, document_6],
...}
```

where _document\_i_ is the *id* of a document that contains the word.

We also want to store it in a separate file and load it in memory when needed.

In [None]:
# Build the inverted index: term_id -> list of ids ('doc_<n>') of the
# documents that contain the term.
inverted_index = {}

for file in range(18259):
    doc_id = 'doc_' + str(file)

    # `with` closes the document even if parsing a line raises.
    with open(r'C:\Users\mccol\Desktop\Sapienza\ADM\HW3\doc\doc_' + str(file) + '.tsv', 'r', encoding="utf-8") as op:
        for line in op:
            ou = line.strip().split('\t')
            sentence = ou[5] + ' ' + ou[8]
            # Replace literal escape sequences and separators with spaces.
            for junk in ('\\n', '/', '*', '\\r', '\\t'):
                sentence = sentence.replace(junk, ' ')

    # Stem the tokens and drop stopwords and punctuation.
    final = preprocess(word_tokenize(sentence))

    # dict.fromkeys() removes repeated terms while keeping first-occurrence
    # order, so each document is appended at most once per term without
    # scanning the posting list.
    for word in dict.fromkeys(final):
        inverted_index.setdefault(vocabulary[word], []).append(doc_id)

# Persist the index as JSON so later cells can reload it.
with open(r'C:\Users\mccol\Desktop\Sapienza\ADM\HW3\inverted_index.txt', 'w', encoding="utf-8") as op:
    op.write(json.dumps(inverted_index))

In [577]:
# Load the inverted index (term_id -> list of document ids) from disk.
# `with` closes the file once it has been read.
with open(r'C:\Users\mccol\Desktop\Sapienza\ADM\HW3\inverted_index.txt', 'r', encoding="utf-8") as op:
    inverted_index = json.loads(op.read())


#### 3.1.2) Execute the query
Now given a query, that we let the user enter:
```
queen netflix
```
we want the Search Engine to return a list of the documents that contain all the words in the query.

Query input:

In [578]:
# Ask the user for a free-text query.
user_query = input('What are you searching?\n')

# Tokenize the text, then stem it and drop stopwords and punctuation;
# `query` holds the processed terms.
query_tokens = word_tokenize(user_query)
query = preprocess(query_tokens)


What are you searching?
Room with Netflix and garden


Translating the query into our 'language':

In [579]:
# Translate each query term into its term_id and pull the matching posting
# list out of the inverted index.
#   voc                  : position in the query -> term_id
#   inverted_index_query : term_id -> list of document ids
voc = {}
inverted_index_query = {}
for i, word in enumerate(query):
    # A word that never appears in the corpus has no term_id. It is mapped to
    # None with an empty posting list, so a conjunctive search simply finds
    # no document instead of failing with a KeyError.
    term_id = vocabulary.get(word)
    voc[i] = term_id
    inverted_index_query[term_id] = inverted_index.get(term_id, [])


Finding list of docs that contain all the words in the query and printing them in the format that we want:

In [580]:
# Find the documents that contain every term of the query by intersecting the
# posting lists as sets, instead of testing each of the 18259 document ids
# against each list.
postings = [set(inverted_index_query[term_id]) for term_id in voc.values()]
if postings:
    matching = set.intersection(*postings)
    # Document ids look like 'doc_<n>'; keep the numeric part, ascending.
    docs = sorted(int(doc[4:]) for doc in matching)
else:
    # With no query terms there is no constraint: every document matches.
    docs = list(range(18259))

# Show the matching listings as a table.
df = f.filter(items = ['title', 'description', 'city', 'url']).loc[docs]
# Turn literal '\n' sequences in the descriptions into spaces for display.
df.description = [text.replace('\\n', ' ') for text in df.description.tolist()]
df.style.hide_index()

title,description,city,url
2 blocks to Rainey! Walk downtown!,"HOLIDAYS, LOCAL EVENTS, HIGH DEMAND WEEKENDS, DATES WITH LOW AVAILABILITY MAY BE SUBJECT TO HIGHER RATES, NIGHT MINIMUMS, AND/OR HIGHER FEES. AVAILABILITY SHOWN ON THIS CALENDAR'S WEBSITE IS NOT ALWAYS 100% ACCURATE. IT IS ALWAYS BEST TO INQUIRE WITH THE HOST/MANAGER/HOMEOWNER TO VERIFY AVAILABILTY AND A SPECIFIC QUOTE FOR YOUR STAY. QUOTES MAY DIFFER FROM THE BASE RATES, BASE FEES, NIGHT MINIMUMS AND/OR QUOTES PROVIDED BY THIS WEBSITE AND/OR DISPLAYED ON CALENDAR PLEASE READ ALL HOUSE RULES AVAILABLE ON WEBSITE AND/OR PROVIDED BEFORE BOOKING. ALL GUESTS ON RESERVATION MUST UNDERSTAND AND FOLLOW ALL RULES. Taylor House is in a residential neighborhood and all Guests will need to acknowledge they agree to respect my neighbors and the surrounding community to be accepted. Noise levels will be closely monitored. PARTIES ARE NOT ALLOWED. See below for more information. Welcome to my home! I look forward to hosting you! This is the perfect vacation rental home in downtown Austin! I strive to provide Guests with the perfect home base with all the creature comforts, in the perfect location for exploring what Austin has to offer. I have lived in Austin my entire life and I want to help make your trip one-of-a-kind! I’m available for suggestions of trendy new hot spots, and unique, only in Austin activities! Your perfect stay in the perfect location for the perfect vacation! A must stay! 2 blocks from the Rainey Street entertainment district! This area is the go-to spot for the Austin young professional nightlife scene. A short walk anywhere else downtown, including the convention center. The East 6th Street entertainment district is blocks away. Walk 2 blocks to Lady Bird Lake. Walk/Run/Bike the scenic Hike and Bike Trail or Kayak/Paddle Board on the lake as the sun sets. Kayak/Paddle Board/Small sailing and paddle boat rentals available blocks away. 
Walking distance to 6th Street, Congress, Paramount Theater, restaurants, bars, music venues, Texas Longhorn tailgating, DKR stadium, museums, a public pool, grocery store, coffee shops, comedy shows, improv shows, movie theaters (Alamo Ritz and Violet Crown), breweries (Hops & Grain), theater, art galleries, and other nightlife. The perfect location for the vacationer who wants to walk to everything unique that Austin has to offer. The home has tons of upgrades. Real wood floors. Large master bedroom with big bathroom. Modern kitchen with granite counters and stainless steel appliances. High ceilings. Huge granite island. The perfect fenced in yard for hanging out/re-cooperation. Ping pong table, a cornhole set, a washer set, tons of patio chairs, a Weber grill and tons of games are also included. Everything you need! (More about the home) -The home is very well maintained and very clean. -Tons of windows to let in light for a very open feel. -1st floor: Big fully furnished living room with a LED TV. Amazon Fire Stick with streaming Netflix subscription and other Apple TV content. There is an antenna that picks up the major networks and some other channels, but this is not guaranteed. There is no cable TV. Great place to recoup while watching some Netflix or grabbing a provided local magazine to find out more about what Austin has to offer. A large granite counter with several stools divides the kitchen and living area. Perfect for meal preparation, dining, a game of cards or just socializing. A state of the art kitchen complete with stainless steel appliances, gas stove, refrigerator/freezer, garbage disposal, dishwasher, microwave, toaster, blender, and coffee maker. Large surfaces for cooking and serving. All needed basic cooking ware and dining ware provided. Stained concrete flooring throughout. Large one half bath. -2nd floor: Master bedroom with private large bathroom. It has a double vanity and a large garden tub/shower. 
Some soap, shower gel, shampoo and conditioner typically provided. Hair dryer provided. Wood flooring on the stairs and landing. Energy efficient, front-loading Washer/Dryer. -3rd floor: All wood flooring. 2nd bedroom. Deck overlooking the neighborhood and the downtown skyline. Deck has plenty of seating and a table. -Yard: The perfect yard with privacy fence for rest, relaxation and socializing. Corhole set, washer set, tons of chairs, ping pong table, and a Weber grill. -Big front patio with chairs. -Central Energy Efficient HVAC. -1 parking spot on the lot with plenty of free street parking. Please do not park in front of neighboring homes. -High speed WIFI. -Fresh towels provided. -Cards and tons of games. -Fresh coffee. -BBQ tools. (Bring your own charcoal, lighter fluid and matches) -Hair Dryer. -Laundry detergent, softener and bleach provided for on-site laundry. -Iron and ironing board. Easy to Find! Arriving by car: 4 blocks off of Cesar Chavez. Very easy to find with plenty of parking in the area. Arriving by plane: Capitol Metro Bus Route '100-AIRPORT FLYER' drop offs downtown. 15 minute cab ride from the airport. Remember, Austin cabs only take 4 persons at one time. Uber/Lyft/Car2go all available in Austin. *15% LOCAL AND STATE TAXES ARE TYPICALLY INCLUDED IN THE ONLINE QUOTE . Austin City Code - Chapt – “Except as otherwise provided in this section, not more than six unrelated adult may reside in a dwelling unit.” (Ordinance No. ). THIS APPLIES TO ALL SHORT TERM/VACATION RENTALS IN AUSTIN. Please inquire further if you have any questions or concerns. ***Furnishings and amenities will be furnished as described whenever possible. In the event that an item is damaged/broken/lost/consumed the Host will restock as soon as possible. All items in description/pictures are not guaranteed.*** Let me help you have the perfect vacation! Thank you for looking! Have a great day! OL #",Austin,https://www.airbnb.com/rooms/1250575?location=Colorado%20River%2C%20TX
Gracious home in quiet neighborhood,"“PJ & Kay go the extra mile to make you feel like you're at a boutique hotel at a quarter of the cost.” (Christin, Feb 2017) A plush queen bed, fresh ground morning coffee, off street parking, personal refrigerator, and free WiFi provide a quiet, private retreat. Spacious library/dining with TV/Netflix set aside for guests. Kitchen, laundry room, & garden use included. Price is for one bedroom & private bath. For additional bedrooms, please book through Two Bedrooms in a Gracious Home.",Amarillo,https://www.airbnb.com/rooms/4936459?location=Amarillo%2C%20TX


### 3.2) Conjunctive query & Ranking score
In the new Search Engine, given a query, we want to get the *top-k* (the choice of *k* is up to you!) documents related to the query. In particular, we want to:

* Find all the documents that contain all the words in the query (as before...).
* Sort them by their similarity with the query
* Return in output *k* documents, or all the documents with non-zero similarity with the query when the results are less than _k_.

To solve this task, we use the *tfIdf* score, and the _Cosine similarity_. Let's see how.

First, we create a new inverted index that contains the `tfIdf` scores:

In [87]:
# Build the second inverted index: term_id -> list of (document id, tf-idf).
from collections import Counter

inverted_index_2 = {}

for file in range(18259):
    doc_id = 'doc_' + str(file)

    # `with` closes the document even if parsing a line raises.
    with open(r'C:\Users\mccol\Desktop\Sapienza\ADM\HW3\doc\doc_' + str(file) + '.tsv', 'r', encoding="utf-8") as op:
        for line in op:
            ou = line.strip().split('\t')
            sentence = ou[5] + ' ' + ou[8]
            # Replace literal escape sequences and separators with spaces.
            for junk in ('\\n', '/', '*', '\\r', '\\t'):
                sentence = sentence.replace(junk, ' ')

    # Stem the tokens and drop stopwords and punctuation.
    final = preprocess(word_tokenize(sentence))
    doc_length = len(final)

    # Counter tallies every term in a single pass and keeps first-occurrence
    # order, so each (term, document) pair is handled exactly once.
    for word, occurrences in Counter(final).items():
        index = vocabulary[word]

        # tf: share of the document's terms equal to this term.
        tf = occurrences / doc_length
        # idf: log of (number of documents / documents containing the term).
        idf = math.log( 18259 / len(inverted_index[index]))
        tf_idf = tf*idf

        inverted_index_2.setdefault(index, []).append((doc_id, tf_idf))

# Persist the index as JSON (the tuples are written as two-element lists).
with open(r'C:\Users\mccol\Desktop\Sapienza\ADM\HW3\inverted_index_2.txt', 'w', encoding="utf-8") as op:
    op.write(json.dumps(inverted_index_2))

In [581]:
# Load the tf-idf inverted index (term_id -> list of [document id, tf-idf]).
# `with` closes the file once it has been read.
with open(r'C:\Users\mccol\Desktop\Sapienza\ADM\HW3\inverted_index_2.txt', 'r', encoding="utf-8") as op:
    inverted_index_2 = json.loads(op.read())


Query input:

In [598]:
# Ask the user for a free-text query.
user_query = input('What are you searching?\n')

# Tokenize the text, then stem it and drop stopwords and punctuation.
query_tokens = word_tokenize(user_query)
query = preprocess(query_tokens)
# Display the processed terms as the cell output.
query

What are you searching?
Room with Netflix and garden


['room', 'netflix', 'garden']

Translating the query into our 'language':

In [623]:
# voc            : position in the query -> term_id
# inverted_query : term_id -> postings ([document id, tf-idf]) of that term
voc = {}
inverted_query = {}

# Look up the term_id of every query term first...
for position, word in enumerate(query):
    voc[position] = vocabulary[word]

# ...then keep only the postings of those terms from the tf-idf index.
for term_id in voc.values():
    inverted_query[term_id] = inverted_index_2[term_id]



Finding the numerator of the cosine similarity formula:

In [622]:
# n maps each document id to its vector of tf-idf weights, one slot per key
# of inverted_query, with 0 where the document lacks the term.
n = {}

for position, term_id in enumerate(inverted_query):
    for posting in inverted_query[term_id]:
        doc_id = posting[0]
        weight = posting[1]
        if position == 0 or doc_id not in n:
            # First weight for this document: zero-fill the earlier slots.
            n[doc_id] = [0]*position + [weight]
        else:
            # Zero-fill any slots skipped since the last term, then append.
            gap = position - len(n[doc_id])
            n[doc_id] = n[doc_id] + [0]*gap + [weight]

# Right-pad every vector with zeros up to the length of the query.
query_length = len(query)
for doc_id, weights in n.items():
    missing = query_length - len(weights)
    if missing > 0:
        n[doc_id] = weights + [0]*missing


Finding the tf-idf values of the query:

In [615]:
# tf-idf weight of every query term, in query order:
#   tf  = occurrences of the term in the query / number of query terms
#   idf = log(18259 / number of documents that contain the term)
query_length = len(query)
tfidf_query = [
    (query.count(word) / query_length)
    * math.log(18259 / len(inverted_index[vocabulary[word]]))
    for word in query
]


Finding the denominator of our dear formula: the norm of the query and the norm of each document:

In [616]:
# Euclidean norm of the query's tf-idf vector.
norm_query = numpy.sqrt(sum(weight**2 for weight in tfidf_query))

In [617]:
# Norm of each document restricted to the query terms: the square root of the
# sum of its squared tf-idf weights over the terms in inverted_query.
# One pass over the postings accumulates the squared weights per document,
# instead of rescanning every posting list for each of the 18259 document ids.
norm_doc = {}
for term_id in inverted_query:
    for posting in inverted_query[term_id]:
        doc = posting[0]
        norm_doc[doc] = norm_doc.get(doc, 0) + posting[1]**2

for doc in norm_doc:
    norm_doc[doc] = numpy.sqrt(norm_doc[doc])


Computing the cosine similarities and sorting them with a max-heap:

In [618]:
# Cosine similarity between the query and every candidate document, stored as
# (similarity, document id) pairs and rounded to 10 decimals.
cosines = []
for doc_id, doc_vector in n.items():
    dot_product = numpy.dot(tfidf_query, doc_vector)
    similarity = round(dot_product / (norm_doc[doc_id]*norm_query), 10)
    cosines.append((similarity, doc_id))


In [620]:
# Push every (cosine, document id) pair on a max-heap and pop the k best.
k = 5

heap_max = []
for scored_doc in cosines:
    heapq_max.heappush_max(heap_max, scored_doc)

best_cosine = []
docs = []
# Pop at most as many items as the heap holds: when fewer than k documents
# were scored, all of them are returned instead of raising IndexError.
for _ in range(min(k, len(heap_max))):
    cos = heapq_max.heappop_max(heap_max)
    best_cosine.append(cos[0])
    # Document ids look like 'doc_<n>'; keep the numeric part.
    docs.append(int(cos[1][4:]))
print('The best ' + str(k) + ' docs are: ', docs)

The best 5 docs are:  [17221, 9967, 6957, 17622, 1671]


In [621]:
# Table of the best k documents together with their cosine similarity.
shown_columns = ['title', 'description', 'city', 'url']
df = f.filter(items=shown_columns).loc[docs]
# Turn literal '\n' sequences in the descriptions into spaces for display.
df.description = [text.replace('\\n', ' ') for text in df.description.tolist()]
df['ranking'] = best_cosine
df.style.hide_index()

title,description,city,url,ranking
Gracious home in quiet neighborhood,"“PJ & Kay go the extra mile to make you feel like you're at a boutique hotel at a quarter of the cost.” (Christin, Feb 2017) A plush queen bed, fresh ground morning coffee, off street parking, personal refrigerator, and free WiFi provide a quiet, private retreat. Spacious library/dining with TV/Netflix set aside for guests. Kitchen, laundry room, & garden use included. Price is for one bedroom & private bath. For additional bedrooms, please book through Two Bedrooms in a Gracious Home.",Amarillo,https://www.airbnb.com/rooms/4936459?location=Amarillo%2C%20TX,1.0
Museum District Flat w/Garden Deck,Urban Hideaway with 81 WalkScore: MFAH/Menil/Rothko/Natural Science Museums Texas Medical Ctr House of Blues/Jones Hall/Wortham Center/Alley Theatre Rice & St. Thomas Universities George R. Brown & Toyota Ctrs Minute Maid & NRG by rail* WiFi Netflix/Amazon with Starz/NO cable Microwave/Keurig/Mini-Fridge Review photo CAPTIONS FYI Urban setting near 527 Spur. Ample Street Parking. Shared Deck & W/D with 2 Airbnb units. *Rail at Wheeler Station E of 59 attracts all kinds,Houston,https://www.airbnb.com/rooms/5968747?location=Bellaire%2C%20TX,0.976871
Museum District Flat w/Garden Deck,Urban Hideaway with 81 WalkScore: MFAH/Menil/Rothko/Natural Science Museums Texas Medical Ctr House of Blues/Jones Hall/Wortham Center/Alley Theatre Rice & St. Thomas Universities George R. Brown & Toyota Ctrs Minute Maid & NRG by rail* WiFi Netflix/Amazon with Starz/NO cable Microwave/Keurig/Mini-Fridge Review photo CAPTIONS FYI Urban setting near 527 Spur. Ample Street Parking. Shared Deck & W/D with 2 Airbnb units. *Rail at Wheeler Station E of 59 attracts all kinds,Houston,https://www.airbnb.com/rooms/5968747?location=Baytown%2C%20TX,0.976871
Museum District Flat w/Garden Deck,Urban Hideaway with 81 WalkScore: MFAH/Menil/Rothko/Natural Science Museums Texas Medical Ctr House of Blues/Jones Hall/Wortham Center/Alley Theatre Rice & St. Thomas Universities George R. Brown & Toyota Ctrs Minute Maid & NRG by rail* WiFi Netflix/Amazon with Starz/NO cable Microwave/Keurig/Mini-Fridge Review photo CAPTIONS FYI Urban setting near 527 Spur. Ample Street Parking. Shared Deck & W/D with 2 Airbnb units. *Rail at Wheeler Station E of 59 attracts all kinds,Houston,https://www.airbnb.com/rooms/5968747?location=Alvin%2C%20TX,0.976871
Museum District Flat w/Garden Deck,Urban Hideaway with 81 WalkScore: MFAH/Menil/Rothko/Natural Science Museums Texas Medical Ctr House of Blues/Jones Hall/Wortham Center/Alley Theatre Rice & St. Thomas Universities George R. Brown & Toyota Ctrs Minute Maid & NRG by rail* WiFi Netflix/Amazon with Starz/NO cable Microwave/Keurig/Mini-Fridge Review photo CAPTIONS FYI Urban setting near 527 Spur. Ample Street Parking. Shared Deck & W/D with 2 Airbnb units. *Rail at Wheeler Station E of 59 attracts all kinds,Houston,https://www.airbnb.com/rooms/5968747?location=Brazos%20River%2C%20TX,0.976871


### USER QUERIES

In [331]:
# Ask the user for the destination city. City names in the dataframe are
# lowercased before the lookup further down, so the answer is trimmed and
# lowercased too; otherwise an answer such as 'Dallas' would match no city.
user_city = input('Where do you want to go?\n').strip().lower()

# Ask the user for the nightly budget (kept as text, converted where used).
user_price = input('How much do you want to spend?\n')

# Ask the user for the wanted number of bedrooms (kept as text).
user_bedroom = input('How many bedrooms do you want?\n')

Where do you want to go?
dallas
How much do you want to spend?
100
How many bedrooms do you want?
2


#### CITY RANKING

In [624]:
# Lowercase the city names so that the lookups by city are consistent.
f['city'] = [name.lower() for name in f['city']]
# Manually set the coordinates of row 1216
# (presumably missing or wrong in the raw data - TODO confirm).
f.at[1216, 'latitude'] = 31.498774
f.at[1216, 'longitude'] = -94.345574
# Mean latitude/longitude per city. Selecting the two numeric columns before
# calling mean() avoids averaging text columns, which raises a TypeError in
# recent pandas versions.
geo_city = f.groupby('city')[['latitude', 'longitude']].mean()

In [625]:
# Mean (latitude, longitude) of the city requested by the user.
city_row = geo_city[geo_city.index == user_city]
coord_city = (city_row['latitude'].values[0], city_row['longitude'].values[0])
print('Coordinates for the request city: ', coord_city)

Coordinates for the request city:  (32.846866634629194, -96.78907793292947)


In [626]:
# Give the listings with a missing latitude the mean coordinates of their
# city. The values are read by column label rather than by position, so the
# result does not depend on the column order of geo_city.
for i in f[numpy.isnan(f.latitude)].index:
    city_center = geo_city.loc[f.loc[i, 'city']]

    f.at[i, 'latitude'] = city_center['latitude']
    f.at[i, 'longitude'] = city_center['longitude']

In [627]:
# Distance in km between every listing and the requested city, in row order.
# Iterating the two coordinate columns together avoids a per-row .loc lookup.
dist = [
    distance.distance((lat, lon), coord_city).km
    for lat, lon in zip(f['latitude'], f['longitude'])
]


In [628]:
# City ranking per listing: 1 for the closest listing, 0 for the farthest,
# scaled linearly on the distance.
# min/max are computed once instead of once per row.
nearest = min(dist, default=0)
farthest = max(dist, default=0)
span = farthest - nearest

rankings_city = {}
for i, d in enumerate(dist):
    if d >= 0:
        if span:
            rankings_city[i] = 1 - ((d - nearest) / span)
        else:
            # Every listing is equally distant: give them all the top rank
            # rather than dividing by zero.
            rankings_city[i] = 1.0
    else:
        # Not a valid distance (e.g. NaN): no rank.
        rankings_city[i] = None
f['city_rank'] = list(rankings_city.values())

In [629]:
# Rank for each city in Texas: the highest ranks go to the cities whose mean
# coordinates are closest to the requested city. The scale is the same
# min/max of the per-listing distances, computed once up front.
rankings_city = {}
nearest = min(dist)
rang = max(dist) - nearest

for city, lat, lon in zip(geo_city.index, geo_city['latitude'], geo_city['longitude']):
    rank = 1 - ((distance.distance((lat, lon), coord_city).km - nearest) / rang)
    rankings_city[city] = rank


#### PRICE RANKING

In [630]:
# Nightly rate as text with its first character (the '$') dropped.
f['price_rank'] = [str(rate)[1:] for rate in f['average_rate_per_night']]

In [631]:
# Absolute difference between each listing's nightly price and the user's
# budget; -1 marks a listing whose price cannot be compared.

# Parse the budget once. An unparsable budget keeps the original outcome:
# every listing gets -1.
try:
    budget = int(user_price)
except (TypeError, ValueError):
    budget = None

price_dist=[]
for raw_price in f.price_rank.values:
    try:
        gap = abs(int(raw_price) - budget) if budget is not None else -1
    except (TypeError, ValueError, OverflowError):
        # Only conversion failures are treated as "no price"; anything else
        # (e.g. KeyboardInterrupt) is no longer swallowed.
        gap = -1
    price_dist.append(gap)


In [632]:
# Range (min and max) of the valid price distances; -1 entries are "no price"
# markers and are ignored. Seeding the minimum with the first element would
# leave it stuck at -1 whenever that element is a marker.
valid_price_dist = [d for d in price_dist if d != -1]
if valid_price_dist:
    minimum = min(valid_price_dist)
    rang = max(valid_price_dist) - minimum
else:
    # No valid distance at all: same values the marker-only case produced.
    minimum = -1
    rang = 0

In [633]:
# Price ranking per listing. The distance is log-scaled because the min-max
# range of the prices is very wide; -1 ("no price") gives no rank.
rankings_price = {}
for position, gap in enumerate(price_dist):
    rankings_price[position] = 1-(numpy.log(gap+1))/10 if gap >= 0 else None
f['price_rank'] = rankings_price.values()

#### BEDROOM RANKING

In [634]:
# Absolute difference between each listing's bedroom count and the number the
# user asked for; -1 marks a listing whose count cannot be compared.
# A 'Studio' is counted as 1.5 bedrooms.

# Parse the request once. An unparsable request keeps the original outcome:
# every listing gets -1.
try:
    wanted_bedrooms = int(user_bedroom)
except (TypeError, ValueError):
    wanted_bedrooms = None

room_dist=[]
for bedrooms in f.bedrooms_count.values:
    try:
        if wanted_bedrooms is None:
            gap = -1
        elif bedrooms == 'Studio':
            gap = abs(1.5 - wanted_bedrooms)
        else:
            gap = abs(int(bedrooms) - wanted_bedrooms)
    except (TypeError, ValueError, OverflowError):
        # Only conversion failures (e.g. NaN) are treated as "no count".
        gap = -1
    room_dist.append(gap)

In [635]:
# Range (min and max) of the valid bedroom distances; -1 entries are
# "no count" markers and are ignored. Seeding the minimum with the first
# element would leave it stuck at -1 whenever that element is a marker.
valid_room_dist = [d for d in room_dist if d != -1]
if valid_room_dist:
    minimum = min(valid_room_dist)
    rang = max(valid_room_dist) - minimum
else:
    # No valid distance at all: same values the marker-only case produced.
    minimum = -1
    rang = 0

In [636]:
# Bedroom ranking per listing: 1 for the closest match, 0 for the farthest,
# scaled linearly with `minimum` and `rang` computed from room_dist.
rankings_bedroom = {}
for i, gap in enumerate(room_dist):
    if gap >= 0:
        if rang:
            rankings_bedroom[i] = 1 - ((gap - minimum) / rang)
        else:
            # Every valid listing is equally close: give them all the top
            # rank rather than dividing by zero.
            rankings_bedroom[i] = 1.0
    else:
        # -1 marker: no usable bedroom count, so no rank.
        rankings_bedroom[i] = None
f['bedroom_rank'] = list(rankings_bedroom.values())

#### FINAL RANKING

In [637]:
# Combine the three rankings into one weighted score per listing, stored as a
# percentage string such as '87.123%'.
W_PRICE = 0.15
W_CITY = 0.75
W_BEDROOM = 0.10

rank = []
for i in f.index:
    city_score = f.loc[i, 'city_rank']
    price_score = f.loc[i, 'price_rank']
    bedroom_score = f.loc[i, 'bedroom_rank']

    # pd.isna() catches both None and NaN. pandas can store a missing rank as
    # NaN, which `== None` never matches, so the fallbacks below would
    # otherwise be skipped and the score would come out as NaN.
    price_missing = pd.isna(price_score)
    bedroom_missing = pd.isna(bedroom_score)

    if price_missing and bedroom_missing:
        # Only the city ranking is available.
        r = city_score
    elif bedroom_missing:
        # Split the bedroom weight evenly between price and city.
        r = price_score*(W_PRICE + W_BEDROOM/2) + city_score*(W_CITY + W_BEDROOM/2)
    elif price_missing:
        # Split the price weight evenly between bedrooms and city.
        r = bedroom_score*(W_BEDROOM + W_PRICE/2) + city_score*(W_CITY + W_PRICE/2)
    else:
        r = price_score*W_PRICE + city_score*W_CITY + bedroom_score*W_BEDROOM
    rank.append(str(round(r*100, 3)) + '%')

f['final_rank'] = rank

In [638]:
# One (ranking, row index) pair per listing; the ranking is the final_rank
# percentage with its trailing '%' removed, converted to float.
tup = [(float(f.loc[row, 'final_rank'][:-1]), row) for row in f.index]

In [640]:
# Push every (final ranking, row index) pair on a max-heap and pop the k best.
k = 10

heap_max = []
for ranked_row in tup:
    heapq_max.heappush_max(heap_max, ranked_row)

best_doc = []
docs = []
# Pop at most as many items as the heap holds, so a dataset with fewer than k
# listings does not raise IndexError.
for _ in range(min(k, len(heap_max))):
    cos = heapq_max.heappop_max(heap_max)
    best_doc.append(cos[0])
    docs.append(int(cos[1]))

# Display the best k listings.
df = f.filter(items = ['title', 'description', 'city', 'url']).loc[docs]
# Turn literal '\n' sequences in the descriptions into spaces for display.
df.description = [text.replace('\\n', ' ') for text in df.description.tolist()]
df.style.hide_index()

title,description,city,url
Terrific Townhouse on Lower Greenville,"My townhouse is withing walking distance of the Lower Greenville and Knox Henderson neighborhoods. Convenient to restaurants and dining, family-friendly activities, and nightlife. You’ll love my place because of the neighborhood. My place is good for couples, solo adventurers, and business travelers.",dallas,https://www.airbnb.com/rooms/15952123?location=Balch%20Springs%2C%20TX
Dynamic Historic Studio in Uptown,Elegant studio in Uptown's premier historic building. Full of light and beautiful furniture.,dallas,https://www.airbnb.com/rooms/18391457?location=Carrollton%2C%20TX
Lone Star Darling,"Our beautiful, comfortable, historic Oak Cliff home is just 2 miles from the vibrant Bishop Arts District, 3.5 miles from Trinity Groves, and 3 miles from downtown Dallas, with a bus stop right outside the front door. The house is located on a busy street. That makes it easy to get around, but also means there is some traffic noise in the front of the house. The master bedroom is in the back and noise should not be an issue there.",dallas,https://www.airbnb.com/rooms/3230663?location=Cedar%20Hill%2C%20TX
Private Room and Full Bath,"This Dallas location is very close to Love Field Airport, great restaurants/dining. Good for couples, solo adventurers, and business travelers.",dallas,https://www.airbnb.com/rooms/13883549?location=Addison%2C%20TX
2BD Luxury Apartment In Central Upscale Area,"You will feel like you are at a resort at this luxury community which is located in one of the most desirable neighborhoods of Dallas. You'll only be a quick 15 minutes from DFW airport, 20 minutes to Love Field airport, and 15-20 Minutes from the Uptown/Downtown area. There is a bunch of shopping and restaurants nearby. You'll have 24 Hour access to the fully equipped on-site gym, spin/yoga room, and lounge where you can enjoy 6 large flat screen TVs, shuffleboard, billiards and a poker table.",irving,https://www.airbnb.com/rooms/13942575?location=Carrollton%2C%20TX
2BD Luxury Apartment In Central Upscale Area,"You will feel like you are at a resort at this luxury community which is located in one of the most desirable neighborhoods of Dallas. You'll only be a quick 15 minutes from DFW airport, 20 minutes to Love Field airport, and 15-20 Minutes from the Uptown/Downtown area. There is a bunch of shopping and restaurants nearby. You'll have 24 Hour access to the fully equipped on-site gym, spin/yoga room, and lounge where you can enjoy 6 large flat screen TVs, shuffleboard, billiards and a poker table.",irving,https://www.airbnb.com/rooms/13942575?location=Coppell%2C%20TX
2BD Luxury Apartment In Central Upscale Area,"You will feel like you are at a resort at this luxury community which is located in one of the most desirable neighborhoods of Dallas. You'll only be a quick 15 minutes from DFW airport, 20 minutes to Love Field airport, and 15-20 Minutes from the Uptown/Downtown area. There is a bunch of shopping and restaurants nearby. You'll have 24 Hour access to the fully equipped on-site gym, spin/yoga room, and lounge where you can enjoy 6 large flat screen TVs, shuffleboard, billiards and a poker table.",irving,https://www.airbnb.com/rooms/13942575?location=Colleyville%2C%20TX
Beautiful Cozy Hideout | SMU and North Park Mall,"Everything inside is new!! Close to over 50 restaurants, shopping, SMU, North Park Mall, UTD, lower Greenville and 10 min from downtown/uptown/west village. 28 min to AT&T Cowboys Stadium. Sleeps 6 comfortably. You’ll L-O-V-E this place because of the private patio, beds, kitchen, coziness, and it's the perfect location. It's a newly renovated condo unit, with brand new appliances and fully furnished with all necessities. Close to Interstate I-75. Walking distance to the Park Lane station.",dallas,https://www.airbnb.com/rooms/17761909?location=Addison%2C%20TX
SwissAve CozyAttic/Fits a Crowd,"GOT PEEPS? refinished attic space, en suite bath antique tub with shower. Main room with queen, small twin in cozy window dormer behind french doors. For extra sleepers, add xtra long twin and twin trundle beds. $20 each extra",dallas,https://www.airbnb.com/rooms/5027175?location=Balch%20Springs%2C%20TX
Modern East Dallas Garden Home,"Close to Downtown, Deep Ellum, Fair Park, Convention Center, AA Center, Arts District, Uptown, and Greenville Avenue. Please have your ID verified before submitting a reservation request. Guests with positive reviews are preferred.",dallas,https://www.airbnb.com/rooms/10518366?location=Balch%20Springs%2C%20TX
