In [1]:
import pandas as pd
import numpy as np
import pickle
import csv
import nltk
nltk.download('punkt')
nltk.download('stopwords')
import string
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize
from scipy.spatial.distance import cosine



from collections import defaultdict
from collections import Counter
from math import log




[nltk_data] Downloading package punkt to /Users/miguel/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/miguel/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


# Import data

In [5]:
texas1 = pd.read_csv("data/Airbnb_Texas_Rentals.csv")
texas1 = texas1.drop(['Unnamed: 0'], axis=1)

In [6]:
#pd.set_option('expand_frame_repr',False)
texas1.head()

Unnamed: 0,average_rate_per_night,bedrooms_count,city,date_of_listing,description,latitude,longitude,title,url
0,$27,2,Humble,May 2016,Welcome to stay in private room with queen bed...,30.020138,-95.293996,2 Private rooms/bathroom 10min from IAH airport,https://www.airbnb.com/rooms/18520444?location...
1,$149,4,San Antonio,November 2010,"Stylish, fully remodeled home in upscale NW – ...",29.503068,-98.447688,Unique Location! Alamo Heights - Designer Insp...,https://www.airbnb.com/rooms/17481455?location...
2,$59,1,Houston,January 2017,'River house on island close to the city' \nA ...,29.829352,-95.081549,River house near the city,https://www.airbnb.com/rooms/16926307?location...
3,$60,1,Bryan,February 2016,Private bedroom in a cute little home situated...,30.637304,-96.337846,Private Room Close to Campus,https://www.airbnb.com/rooms/11839729?location...
4,$75,2,Fort Worth,February 2017,Welcome to our original 1920's home. We recent...,32.747097,-97.286434,The Porch,https://www.airbnb.com/rooms/17325114?location...


# Create docuemnts as tsv file

In [2]:
path1 = 'data/docs/'
path2 = '.tsv'
stemmed_path='data/tokenized_docs/'
sp = string.punctuation+'“”–’'



In [3]:
def remove_step(doc):
    """
    takes as input the string of the document
    removes stopwords, punctuation and makes stemming 
    """
    
    # check if it's a nan value 

    if isinstance(doc, float):
        return str(doc)
    
    doc=doc.replace("\\n", " ")
    # punctuations
    doc = [ c if c not in sp else " "  for c in doc ]
    doc = ''.join(doc)
    # stopwords
    doc = [ word for word in doc.split() if word.lower() not in stopwords.words('english') ]
    doc = ' '.join(doc)
    
    # stemming
    ps = PorterStemmer()
    words = word_tokenize(doc)
    
    w_lst = []
    for w in words:
        w_lst.append(ps.stem(w))
    
    # something else
    
    return ' '.join(w_lst)

In [4]:
def save_obj(obj, name):
    with open('data/'+ name + '.pkl', 'wb') as f:
        pickle.dump(obj, f, pickle.HIGHEST_PROTOCOL)

def load_obj(name):
    with open('data/' + name + '.pkl', 'rb') as f:
        return pickle.load(f)

In [5]:
def preprocessing(data):
    n=len(data)

    for i in range(n):
        with open(path1 + 'doc_'+ str(i) + '.tsv', 'w') as doc:
            a = csv.writer(doc, delimiter='\t')
            a.writerow([data.iloc[i]['average_rate_per_night'],data.iloc[i]['bedrooms_count'] 
                      ,data.iloc[i]['city'] ,data.iloc[i]['date_of_listing'], data.iloc[i]['description']
                      ,data.iloc[i]['latitude'],data.iloc[i]['longitude'] ,data.iloc[i]['title'] ,data.iloc[i]['url']])

    return

def create_vocabulary_and_ii1 (data):
    n = len(data)
    vocabulary = {}
    ii1 = {}    
    cnt = 0
    
    for i in range(n):
        # creating a tokenized string with title and description
        tokenized_str = (remove_step(data.iloc[i]['title']) + ' ' 
                                     + remove_step(data.iloc[i]['description']))

        # creating the dictionary
        for term in tokenized_str.split(' '):
            if term in vocabulary.keys():
                term_id = vocabulary[term]
            else:
                vocabulary[term] = cnt
                term_id = cnt
                cnt+=1

            if term_id not in ii1.keys():
                ii1[term_id] = ['doc_'+str(i)]
            else:
                lista = ii1[term_id]
                document = 'doc_'+str(i)
                if document in lista:
                    continue
                else:        
                    ii1[term_id].append('doc_'+str(i))

    # store vocabulary in pickle format
    save_obj(vocabulary, 'vocabulary')
    save_obj(ii1, 'inverted_index_1')
    return
            

In [6]:
#create_vocabulary and inverted index 1 (for the first search engine)

#create_vocabulary_and_ii1(texas1)

In [None]:
# do not run if you don't want to waste time :-D 
#preprocessing(texas1)

In [7]:
def search_engine_1(query): 
    
    query = remove_step(query)
    query = list(set(query.split(' ')))
    
    lst_of_lst=[]
    
    vocabulary = load_obj('vocabulary')
    ii1 = load_obj('inverted_index_1')
    
    for w in query:
        if w not in vocabulary:
            print('No results')
            return
        i = vocabulary[w]
        lst_of_lst.append(ii1[i])


    doc_list = set.intersection(*[set(sublist) for sublist in lst_of_lst])
    doc_list = list(doc_list)
    dl = len(doc_list)
    
    if dl ==0:
        print('No results')
        return

    list_for_df=[]
    for i in range(dl):
        with open ("data/docs/" + doc_list[i] + '.tsv') as doc:
            row = doc.read()
            lst = row.split('\t')
            lst = [lst[7],lst[4],lst[2],lst[8]]
            list_for_df.append(lst)
        
    df=pd.DataFrame(list_for_df, columns=['Title', 'Description', 'City', 'Url'])
    
    return df.head(5)   
        

In [98]:
search_engine_1('private room bathroom')


Unnamed: 0,Title,Description,City,Url
0,Master room near SMU/Downtown/DLA AP/Train Sta...,1 PRIVATE clean comfy Master bedroom and an en...,Dallas,https://www.airbnb.com/rooms/12186608?location...
1,10 MIN WALK TO NRG: Private Room! Near Med Center,"Located less than a mile from NRG Stadium, thi...",Houston,https://www.airbnb.com/rooms/16649137?location...
2,Cool and Updated Space,Cozy and private bedroom and bathroom located ...,Austin,https://www.airbnb.com/rooms/14930777?location...
3,The Stuttgart Room (Schaefer Haus),This is the Stuttgart Room of the Schaefer Hau...,Galveston,https://www.airbnb.com/rooms/16917205?location...
4,G's Resort V 2.2,Convenient central location near SMU with priv...,Dallas,https://www.airbnb.com/rooms/8341908?location=...


In [None]:
#ii2 = dict_with_tf(texas1)

In [12]:
#temp = ii2

In [None]:
############ SPARK
for key,values in ii2.items():
    
    newlst = sc.parallelize(values)
    
    newlst = newlst.reduceByKey(lambda a,b:a+b)
    
    ii2[key] = newlst.take(len(values))
    

    II with TF:
    Taking each list for each term in the inverted index,
    we wanted to find the occurrencies for that term in each document.
    Thus, we created the inverted index with every document appended inside the value of each 
    term (the key of the dictionary), 
    and using in a loop the reduce_doc_list method.
    In this method we reduced the repetitions of the same docs in each list, summing them. 
    We created the  

In [27]:
def reduce_doc_list(doc_list):
    """
    function called by dict_TFIDF
    
    It reduces the list of documents into a list 
    of tuple with doc_id and its occurencies
    
    input:
    - list 
    output:
    - list 
    """
    tf_term_i = Counter(doc_list)
    doc_tf_lst = []
    doc_tf_lst = [tuple([key,value]) for key,value in tf_term_i.items()]
    return doc_tf_lst

def compute_ii2_TFIDF(ii2,n):
    """
    compute the ii2_TFIDF
    input:
    - inverted index matrix (with TF)
    - number of documents
    output:
    - ii2 
    """
    for key, value in ii2.items():
        N = len(value)
        new_list = []
        for item in value:
            new_list.append(tuple([item[0], round(float(item[1])* log(n/N),3)]))
            
        ii2[key] = new_list
    return ii2

In [28]:
def dict_with_TFIDF(data):
    """
    creates the TFIDF inverted index as dict
    and store it into a pickle file
    input:
    - data
    """
    vocabulary = load_obj('vocabulary')
    n = len(data)
    
    ii2 = defaultdict(list)
    
    for i in range(n):
        tokenized_str = (remove_step(data.iloc[i]['title']) + ' ' 
                                     + remove_step(data.iloc[i]['description']))
    
        for term in tokenized_str.split(' '):
            doc_name = 'doc_%s'%i
            ii2[vocabulary[term]].append(doc_name)
            
    
    for key,value in ii2.items():
        ii2[key] = reduce_doc_list(value)
    
    ii2 = compute_ii2_TFIDF(ii2,n)
    save_obj(ii2,'inverted_index_TFIDF')
    return 

In [None]:
import time
start = time.time()
dict_with_TFIDF(texas1)
print (time.time()-start)

In [29]:
ii2 = load_obj('inverted_index_TFIDF')

In [30]:
scrauso = {
    0: [('doc_0', 2), ('doc_1', 1), ('doc_3', 1), ('doc_7', 1)],
    1: [('doc_0', 2), ('doc_7', 1)]
       }

In [None]:
temp = ii2
temp = compute_ii2_TFIDF(temp, len(texas1))

In [None]:
temp = {
    0: ii2[0][0:5],
    1: ii2[1][:5],
    2: ii2[2][:5],
    3: ii2[3][:5]
}

In [None]:
# make the query
q = [1,2,3]



In [33]:
ii1 = load_obj('inverted_index_1')
ii1 = {i}

In [62]:
# initialize doc
docs = []

In [63]:
for i, values in enumerate(ii1.values()):
    if i == 0:
        docs += values
        docs = set(docs)
    else:
        docs.intersection(set(values))

In [63]:
#for values in ii1.values():
#    print(values)
    

In [None]:
def take_docs(inv_ind_1):
    #we need to take the indices from the numpy array - tricky
    
    docs = set.intersection(*[set(inv_ind_1[i]) for i in inv_ind_1])
    docs = list(docs)
    
    return docs
ᐧ

In [None]:
print(len(min(ii2.values())))

In [None]:
new_dict = {query:ii1[query] for query in q}

In [None]:
docs = []

for i,values in enumerate(new_dict.values()):
    
    if i == 0:
        docs = set(values)
    else:
        docs.intersection(values)
    
docs = list(set(first_set))

In [None]:
# doc
docs = filter(lambda x: x)

In [None]:
ii2 = load_obj('inverted_index_TFIDF')

In [None]:
ii2 = {query:ii2[query] for query in q}

In [None]:
len(ii2)

In [None]:
len(min(ii2.values()))

In [None]:
q = [1,2,3]
ii2 = load_obj('inverted_index_TFIDF')

new_dict = {query:ii1[query] for query in q}
rank_lst = []
query_a = np.ones(len(array), dtype=float)

for doc_id in docs:
    array = []
    for lst_value in ii2.values():
        array += (filter(lambda x:x[0]==doc_id, lst_value))
    
    if len(array) == 3:
        array = np.array([x[1] for x in array])
        rank_lst.append(tuple([doc_id,round(1-cosine(array,ways),3)]))
    

In [None]:
rank_lst.sort(key = lambda x:x[1],reverse=True)

In [None]:
#rank_lst

In [None]:
ways = np.ones(len(array), dtype=float)

In [173]:
from scipy.spatial.distance import cosine


In [None]:
np.dot(array,ways)/ np.mod()

In [None]:
rank_lst[list(['doc_1',np.dot()])]

In [None]:
np.dot()

In [None]:
array

In [None]:
print (q_new.values())

In [None]:
array = np.zeros(0,dtype=)



In [None]:
array

In [None]:
for i in range(len(q_vector)): 
    s = 0
    if q_vector[i] != 0:#in temp.keys():
        docs_tfidf = temp[q_vector[i]]
        
        print (temp[q_vector[i]])
        print('\n')

In [None]:
d1 = set([('doc1',2),('doc2',1)])

In [None]:
d2 = set(['doc1','doc3'])


In [None]:
docname = 'doc_7992'

In [None]:
len(docs)

In [None]:
r = list(filter(lambda x: x == docname,docs ))

In [None]:
q = [1,2,3]
ii2 = load_obj('inverted_index_TFIDF')
ii2 = {query:ii2[query] for query in q}

rank_lst = []
query_a = np.ones(len(array), dtype=float)

# filter the docs
docs = min(ii2.values())
for doc_id in docs:
    array = []
    for lst_value in ii2.values():
        array += (filter(lambda x:x[0]==doc_id, lst_value))

In [None]:
docs = min(ii2.values())



In [None]:
q = [1,2,3]
ii2 = load_obj('inverted_index_TFIDF')

new_dict = {query:ii2[query] for query in q}

rank_lst = []
query_a = np.ones(len(array), dtype=float)
docs = min(ii2.values())

for doc_id, _ in docs:
    array = []
    for lst_value in ii2.values():
        array += (filter(lambda x:x[0]==doc_id, lst_value))
    
    if len(array) == 3:
        array = np.array([x[1] for x in array])
        rank_lst.append(tuple([doc_id,round(1-cosine(array,ways),3)]))
    

In [None]:
q = [1,2,3]
ii2 = load_obj('inverted_index_TFIDF')

new_dict = {query:ii2[query] for query in q}

rank_lst = []
query_a = np.ones(len(array), dtype=float)
docs = min(ii2.values())




In [None]:
for docs_ID,_ in docs:
    print('ciao')
    break

In [None]:
docs

In [None]:
q = [1,2,3]
ii1 = load_obj('inverted_index_1')

ii1 = {query:ii1[query] for query in q}

rank_lst = []
query_a = np.ones(len(array), dtype=float)

docs = []

In [None]:
for i,lst in enumerate(ii1.values()):
    if i == 0:
        docs = set(lst)
    else:
        docs.intersection(lst)


In [None]:
ii2 = load_obj('inverted_index_TFIDF')
ii2 = {query:ii1[query] for query in q}

In [None]:
for doc_id in docs:
    array = []
    for lst_value in ii2.values():
        array += tuple(filter(lambda x:x[0]==doc_id, lst_value))
        
    if len(array) == len(ii2):
        array = np.array([x[1] for x in array])
        rank_lst.append(tuple([doc_id,round(1-cosine(array,ways),3)]))


if len(rank_lst) != 0:
    rank_lst.sort(key= lambda x:x[1], reverse=True )


In [None]:
a = [list(filter(lambda x: x[0] == 'doc1', [('doc1',1),('doc3',20)]))[0] for i in range(3)]


In [None]:
for doc_id in docs:
    array = []
    array += [list(filter(lambda x: x[0] == doc_id, lst_value))[0] for lst_value in ii2.values()]
  
    if len(array) == len(ii3):
        array = np.array([x[1] for x in array])
        rank_lst.append(tuple([doc_id,round(1-cosine(array,ways),3)]))


#if len(rank_lst) != 0:
#    rank_lst.sort(key= lambda x:x[1], reverse=True )



In [84]:
array

array([0, 1])

# prova

In [100]:
query = [1,2,3]

In [101]:
ii1 = load_obj('inverted_index_1')
ii1 = {query:ii1[query] for query in q}

In [None]:
#list(filter(lambda x: x[0] == 'doc_5153',prova))

In [None]:
#array += tuple(list(filter(lambda x: x[0] == 'doc_5153',prova)))

In [None]:
#array

In [97]:
vocabulary = load_obj('vocabulary')
vocabulary

{'2': 0,
 'privat': 1,
 'room': 2,
 'bathroom': 3,
 '10min': 4,
 'iah': 5,
 'airport': 6,
 'welcom': 7,
 'stay': 8,
 'queen': 9,
 'bed': 10,
 'detach': 11,
 'second': 12,
 'floor': 13,
 'anoth': 14,
 'bedroom': 15,
 'sofa': 16,
 'avail': 17,
 'addit': 18,
 'guest': 19,
 '10': 20,
 'pick': 21,
 'drop': 22,
 'trip': 23,
 'uniqu': 24,
 'locat': 25,
 'alamo': 26,
 'height': 27,
 'design': 28,
 'inspir': 29,
 'stylish': 30,
 'fulli': 31,
 'remodel': 32,
 'home': 33,
 'upscal': 34,
 'NW': 35,
 'area': 36,
 'amaz': 37,
 'hous': 38,
 'conveni': 39,
 'quiet': 40,
 'street': 41,
 'beauti': 42,
 'season': 43,
 'tree': 44,
 'prestigi': 45,
 'neighborhood': 46,
 'close': 47,
 '281': 48,
 '410': 49,
 'loop': 50,
 'town': 51,
 'featur': 52,
 'open': 53,
 'plan': 54,
 'origin': 55,
 'hardwood': 56,
 '3': 57,
 'full': 58,
 'independ': 59,
 'garden': 60,
 'TV': 61,
 'sleep': 62,
 'european': 63,
 'kitchen': 64,
 'top': 65,
 'line': 66,
 'decor': 67,
 'driveway': 68,
 'park': 69,
 '4': 70,
 'car': 71,
 '

# prova

In [102]:
q = [1,2,3]
ii1 = load_obj('inverted_index_1')
ii1 = {query:ii1[query] for query in q}

rank_lst = []

In [89]:
docs = []
for i,value_lst in enumerate(ii1.values()):
    if i == 0:
        docs = set(value_lst)
    else:
        docs.intersection(set(value_lst))

In [144]:
# find the smallest list values in dictionary:

smallest_value = []
docs = []
for i,key in enumerate(ii1.keys()):
    if i == 0:
        smallest_value = ii1[key]
        smallest_key = key
    else:
        if len(smallest_value) > len(ii1[key]):
            smallest_value = ii1[key]
            smallest_key = key

docs = set(ii1[smallest_key])

for key in ii1.keys():
    docs.intersection(ii1[key])

In [152]:
smallest_value = []
docs = []
for i,key in enumerate(ii1.keys()):
    if i == 0:
        smallest_value = ii1[key]
        smallest_key = key
    else:
        if len(smallest_value) > len(ii1[key]):
            smallest_value = ii1[key]
            smallest_key = key

docs = set(ii1[smallest_key])

for key in ii1.keys():
    docs.intersection(ii1[key])


In [188]:
docs = []
for i,key in enumerate(ii1.keys()):
    if i == 0:
        docs = set(ii1[key])
    else:
        docs.intersection(set(ii1[key]))

In [16]:
query = set([1,2,3])
docs = []
ii1 = load_obj('inverted_index_1')
ii1 = {query:ii1[query] for query in q}
docs = set.intersection(*[set(value) for value in ii1.values()])

In [17]:
len(docs)

1066

In [18]:
ii2 = load_obj('inverted_index_TFIDF')
ii2 = {query:ii2[query] for query in q}
query_array = np.ones(len(query), dtype=float)

In [19]:
rank_lst = []

In [29]:
for doc_id in docs:
    array = []
    for lst_value in ii2.values():
        array += tuple(filter(lambda x:x[0]==doc_id, lst_value))
        
    #if len(array) == len(ii2):
    array = np.array([x[1] for x in array])
    rank_lst.append(tuple([doc_id,(1-cosine(array,query_array))]))

rank_lst.sort(key = lambda x: x[1],reverse=True)

In [59]:
#rank_lst
from time import time

# point 3.2

In [8]:
from time import time

In [14]:
start = time()
query = 'room with private bathroom'
query = remove_step(query).split(' ')
vocabulary = load_obj('vocabulary')
query = filter(lambda x: x in vocabulary.keys(),query)
query = list(map(lambda x: vocabulary[x], query))


docs = []
ii1 = load_obj('inverted_index_1')
ii1 = {term_id:ii1[term_id] for term_id in query}
docs = set.intersection(*[set(value) for value in ii1.values()])

ii2 = load_obj('inverted_index_TFIDF')
ii2 = {term_id:ii2[term_id] for term_id in query}
query_array = np.ones(len(query), dtype=float)


rank_lst = []

for doc_id in docs:
    array = []
    for lst_value in ii2.values():
        array += tuple(filter(lambda x:x[0]==doc_id, lst_value))
        
    #if len(array) == len(ii2):
    array = np.array([x[1] for x in array])
    rank_lst.append(tuple([doc_id,(1-cosine(array,query_array))]))

rank_lst.sort(key = lambda x: x[1],reverse=True)
print(time()-start)

2.996943235397339


In [13]:
# BETTER ONE

In [15]:
def search_engine_2(query):

    query = remove_step(query).split(' ')
    vocabulary = load_obj('vocabulary')
    query = filter(lambda x: x in vocabulary.keys(),query)
    query = list(map(lambda x: vocabulary[x], query))


    docs = []
    ii1 = load_obj('inverted_index_1')
    ii1 = {term_id:ii1[term_id] for term_id in query}
    docs = set.intersection(*[set(value) for value in ii1.values()])

    ii2 = load_obj('inverted_index_TFIDF')
    ii2 = {term_id:ii2[term_id] for term_id in query}
    query_array = np.ones(len(query), dtype=float)


    rank_lst = []

    for doc_id in docs:
        array = []
        for item in query:
            array += tuple(filter(lambda x:x[0]==doc_id, ii2[item]))

        #if len(array) == len(ii2):
        array = np.array([x[1] for x in array])
        rank_lst.append(tuple([doc_id,(1-cosine(array,query_array))]))

    rank_lst.sort(key = lambda x: x[1], reverse = True)
    
    df = first_k_documents(rank_lst, 5)
    return df

In [16]:
def first_k_documents(rank_lst, k = 5):
    
    list_for_df = []
    for i in range(k):
        with open ("data/docs/" + rank_lst[i][0] + '.tsv') as doc:
            row = doc.read()
            lst = row.split('\t')
            lst = [lst[7],lst[4],lst[2],lst[8], round(rank_lst[i][1],3)]
            list_for_df.append(lst)

    return pd.DataFrame(list_for_df, columns=['Title', 'Description', 'City', 'Url', 'Similarity'])

In [17]:
start = time()
df = search_engine_2('room with private bathroom')
print(time()-start)
df

2.627304792404175


Unnamed: 0,Title,Description,City,Url,Similarity
0,Big room w/ private entrance and private bathroom,Enjoy a private room with your own dedicated e...,Denton,https://www.airbnb.com/rooms/14450478?location...,0.999
1,Pool/Hot Tub - 3 Room Suite - Private Bathroom,"Two bedrooms, a sitting room &amp; a private b...",DeSoto,https://www.airbnb.com/rooms/12791862?location...,0.999
2,Spacious Bedroom w/ Private Entrance and Bathroom,Spacious room with a private entrance and priv...,Austin,https://www.airbnb.com/rooms/17365892?location...,0.999
3,Pool/Hot Tub - 3 Room Suite - Private Bathroom,"Two bedrooms, a sitting room &amp; a private b...",DeSoto,https://www.airbnb.com/rooms/12791862?location...,0.999
4,Big room w/ private entrance and private bathroom,Enjoy a private room with your own dedicated e...,Denton,https://www.airbnb.com/rooms/14450478?location...,0.999


In [34]:
first_k_documents(rank_lst, 5)

Unnamed: 0,Title,Description,City,Url,Similarity
0,Big room w/ private entrance and private bathroom,Enjoy a private room with your own dedicated e...,Denton,https://www.airbnb.com/rooms/14450478?location...,0.999
1,Big room w/ private entrance and private bathroom,Enjoy a private room with your own dedicated e...,Denton,https://www.airbnb.com/rooms/14450478?location...,0.999
2,North Austin private room and bathroom,Private room in a second-floor flat of a two-y...,Austin,https://www.airbnb.com/rooms/17463665?location...,0.999
3,Pool/Hot Tub - 3 Room Suite - Private Bathroom,"Two bedrooms, a sitting room &amp; a private b...",DeSoto,https://www.airbnb.com/rooms/12791862?location...,0.999
4,Big room w/ private entrance and private bathroom,Enjoy a private room with your own dedicated e...,Denton,https://www.airbnb.com/rooms/14450478?location...,0.999


In [15]:
for i in range(10):
    print(rank_lst[i][0])

doc_15988
doc_16309
doc_7508
doc_544
doc_2470
doc_8424
doc_7859
doc_13970
doc_12679
doc_4869


In [None]:
('doc_8424', 0.9993531235127938)
('doc_16309', 0.9993531235127938)
('doc_15988', 0.9993531235127938)
('doc_7859', 0.9993531235127938)
('doc_544', 0.9993531235127938)
('doc_7508', 0.9993531235127938)
('doc_2470', 0.9993531235127938)
('doc_4869', 0.9894524449320425)
('doc_3374', 0.9894524449320425)
('doc_8859', 0.9894524449320425)

In [68]:
queryarray = np.array([2,1,3])

In [82]:
array = np.array([14,10,2,0])
array = np.take(array, [2,1,3])
array = np.nonzero(array)[0]

In [83]:
array

array([0, 1])

In [187]:
vocabulary

{'2': 0,
 'privat': 1,
 'room': 2,
 'bathroom': 3,
 '10min': 4,
 'iah': 5,
 'airport': 6,
 'welcom': 7,
 'stay': 8,
 'queen': 9,
 'bed': 10,
 'detach': 11,
 'second': 12,
 'floor': 13,
 'anoth': 14,
 'bedroom': 15,
 'sofa': 16,
 'avail': 17,
 'addit': 18,
 'guest': 19,
 '10': 20,
 'pick': 21,
 'drop': 22,
 'trip': 23,
 'uniqu': 24,
 'locat': 25,
 'alamo': 26,
 'height': 27,
 'design': 28,
 'inspir': 29,
 'stylish': 30,
 'fulli': 31,
 'remodel': 32,
 'home': 33,
 'upscal': 34,
 'NW': 35,
 'area': 36,
 'amaz': 37,
 'hous': 38,
 'conveni': 39,
 'quiet': 40,
 'street': 41,
 'beauti': 42,
 'season': 43,
 'tree': 44,
 'prestigi': 45,
 'neighborhood': 46,
 'close': 47,
 '281': 48,
 '410': 49,
 'loop': 50,
 'town': 51,
 'featur': 52,
 'open': 53,
 'plan': 54,
 'origin': 55,
 'hardwood': 56,
 '3': 57,
 'full': 58,
 'independ': 59,
 'garden': 60,
 'TV': 61,
 'sleep': 62,
 'european': 63,
 'kitchen': 64,
 'top': 65,
 'line': 66,
 'decor': 67,
 'driveway': 68,
 'park': 69,
 '4': 70,
 'car': 71,
 '