In [1]:
import numpy as np
import pandas as pd
import csv
import nltk
import string
import re
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import stopwords
sno = nltk.stem.SnowballStemmer('english')

In [2]:
import json

with open('regular_index.json', encoding='utf-8') as fp:
    indx = json.loads(fp.read())

with open('inverted_index.json', encoding='utf-8') as fp:
    inv_indx = json.loads(fp.read())

with open('words_transformation.json', encoding='utf-8') as fp:
    final_dict = json.loads(fp.read())

with open('words_inverse_transformation.json', encoding='utf-8') as fp:
    final_dict_inv = json.loads(fp.read())

In [3]:
import os

path, dirs, files = next(os.walk("directory"))
file_count = len(files)

# Creating a dictionary of the inverse index with tf_idf score in order to compute the cosine similarity

In [30]:
from math import log
inv_indx_cosine = {i:[] for i in list(final_dict.keys())}
for word in list(final_dict.keys()):
    for i,l in enumerate(list(indx.values())):
        if int(word) in l:
            tf = l.count(int(word))/len(l)
            id_f = log(file_count/len(inv_indx[word]))
            inv_indx_cosine[word].append((i+1,tf*id_f))

# Creating the respective index dictionary

In [33]:
indx_cosine = {}
for doc in range(file_count):
    indx_cosine[str(doc+1)] = []
    for term in list(final_dict.keys()):
        if int(doc+1) in [r for r in inv_indx[str(term)]]:
            k  = inv_indx[term].index(doc+1)
            indx_cosine[str(doc+1)].append((term,inv_indx_cosine[term][k][1]))

In [39]:
with open('inverted_index_cosine.json', 'w') as fp:
    json.dump(inv_indx_cosine, fp)
    
with open('index_cosine.json', 'w') as fp:
    json.dump(indx_cosine, fp)   

In [40]:
query = 'a beautiful house with garden and beach'

In [41]:
m_raw = query
#replace $ with dollar
m = re.sub('[$]','dollar',m_raw)
# replace new line symbols with whitespace
m = m.replace('\\r', ' ').replace('\\n', ' ')
#remove punctuations with 2 ways
#m = ''.join([c for c in m if c not in string.punctuation])
m = re.sub('[%s]' % re.escape(string.punctuation), ' ', m)
# seperate numbers from words
m = re.sub(r'(?<=[\d+])(?=[a-zA-Z_])', r' ', m)
# remove digits from messages
#m = ''.join(c for c in m if not c.isdigit())
m_new = nltk.tokenize.word_tokenize(m)
#remove stopwords
m_new = [word for word in m_new if word.lower() not in stopwords.words('english')]
#stemming of words
q = [sno.stem(word) for word in m_new]

In [43]:
from collections import Counter
q_new = Counter([final_dict_inv[str(element)] for element in q if element in list(final_dict.values())])
documents = [inv_indx_cosine[str(term)] for term in list(q_new.keys())]
documents_final = set([y[0] for x in documents for y in x]) # in order to find the cosine similarity different to zero and save time

## Computing of the cosine similarity between the query and each document and saving into a heap data structure

In [46]:
from heapq import heappush, heappop
from math import sqrt
similarity = []
for doc in documents_final:
    s = 0
    for term in list(q_new.keys()):
# in order to compute the inner product between the documents and the query in which the terms of the query exists
        if doc in inv_indx[str(term)]:  
            k  = inv_indx[str(term)].index(doc)
            s += inv_indx_cosine[str(term)][k][1]*(q_new[term]/len(q_new))
            #s += inv_indx_cosine[term][k][1]*(q_new[term])
    qm = sqrt(sum([k/len(q_new)**2 for k in q_new.values()]))
    #qm = sqrt(sum([k**2 for k in q_new.values()]))  ## compute without dividing with the length of the query
    dm = sqrt(sum([k[1]**2 for k in indx_cosine[str(doc)]]))
    #similarity.append((doc,s/(sqrt(qm)*sqrt(dm))))  ## ------------//-----------//-----------//--------
    heappush(similarity,(s/(sqrt(qm)*sqrt(dm)),doc))
    

In [47]:
#keeping only top-k documents
k = 100
for i in range(len(similarity)-k):
    heappop(similarity)

In [48]:
# sorting of the documents according the similarity with the query
similarity = sorted(similarity)[::-1]
cols=['Title','Description','City','Url']
df = pd.DataFrame([])
docs = [k[1] for k in similarity]
for i in range(len(docs)):
    filename = 'directory/doc_{}.tsv'.format(docs[i])
    #[7,4,2,8] taking only the elements that I was=nt to show to the user --> ['Title','Description','City','Url']
    df = df.append(pd.read_csv(filename,sep='\t',header=None).T.loc[:,[7,4,2,8]],ignore_index=True)
#df['Similarity']. = similarity[i][0]
df.columns = cols
df['Similarity'] = [k[0] for k in similarity]
df.index = list(range(1,101))

In [49]:
df

Unnamed: 0,Title,Description,City,Url,Similarity
1,peacful garden guest room,beautiful garden room with larg closet.,San Antonio,https://www.airbnb.com/rooms/18235720?location...,0.333165
2,Charming 4 bedrooms beach house,"Beautiful beach house at surfside beach , grea...",Surfside Beach,https://www.airbnb.com/rooms/16841588?location...,0.268824
3,"""Cottage by the Beach"" condo",\,Galveston,https://www.airbnb.com/rooms/9383668?location=...,0.236307
4,"Beautiful, New Beach House","Our house is close to restaurants and dining, ...",Crystal Beach,https://www.airbnb.com/rooms/15068378?location...,0.232600
5,Beach View Home - Ideal Location. Crystal Beach,\,Crystal Beach,https://www.airbnb.com/rooms/13837834?location...,0.227137
6,Beach View Home - Ideal Location. Crystal Beach,\,Crystal Beach,https://www.airbnb.com/rooms/13837834?location...,0.227137
7,Queen Overlooking the Gardens,Queen room upstairs overlooking the gardens wi...,Glen Rose,https://www.airbnb.com/rooms/11982728?location...,0.212739
8,Queen Overlooking the Gardens,Queen room upstairs overlooking the gardens wi...,Glen Rose,https://www.airbnb.com/rooms/11982664?location...,0.212739
9,Queen Overlooking the Gardens,Queen room downstairs overlooking the gardens ...,Glen Rose,https://www.airbnb.com/rooms/11968178?location...,0.210935
10,Queen Overlooking the Gardens,Queen room downstairs overlooking the gardens ...,Glen Rose,https://www.airbnb.com/rooms/11967653?location...,0.210935
