In [1]:
import numpy as np
import pandas as pd
import csv
import nltk
import string
import re
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import stopwords
# English Snowball stemmer instance, reused wherever tokens are stemmed.
sno = nltk.stem.SnowballStemmer('english')

In [2]:
import os

# Top-level listing of the data directory (sub-directories are not descended into).
path, dirs, files = next(os.walk("directory"))
# Count only the document files (doc_<number>.tsv). Counting every file would let a
# stray file (editor backup, .DS_Store, ...) inflate the count, and the loading loop,
# which opens doc_1.tsv ... doc_<file_count>.tsv, would then look for a file that
# does not exist.
file_count = sum(1 for name in files if re.fullmatch(r'doc_\d+\.tsv', name))

# Creating a full corpus with all the words that appear in the descriptions and the titles

In [3]:
# Build one text per document: the description followed by the title.
# Each .tsv file is read as a list of rows; row 4 holds the description and row 7 the
# title (the same positions are labelled 'Description' and 'Title' when the results
# table is built).
corpus = []
for i in range(file_count):
    filename = 'directory/doc_{}.tsv'.format(i+1)
    # newline='' is what the csv module expects from its input file; utf-8 matches the
    # default pandas uses when it reads these same files for the results table.
    with open(filename, newline='', encoding='utf-8') as tsvfile:
        document = list(csv.reader(tsvfile, delimiter='\t'))
    # Join with a space: plain concatenation fused the last word of the description
    # with the first word of the title into a single bogus token.
    corpus.append(document[4][0] + ' ' + document[7][0])

In [4]:
# Data cleaning of the corpus and tokenization.

# Load the stop words once into a set. Calling stopwords.words('english') inside the
# per-token filter builds the whole list again for every token and scans it linearly.
english_stopwords = set(stopwords.words('english'))


def clean_and_stem(text):
    """Turn a raw text into a list of stemmed tokens without stop words.

    Steps, in order: replace '$' with 'dollar', blank out the two-character
    sequences backslash-r and backslash-n, replace ASCII punctuation with spaces,
    insert a space between a digit and a following letter, tokenize, drop English
    stop words (case-insensitively) and stem each remaining token.

    Parameters
    ----------
    text : str
        Raw text of one document (or of a query).

    Returns
    -------
    list of str
        The stemmed tokens, in their original order.
    """
    # replace $ with dollar
    text = re.sub('[$]', 'dollar', text)
    # replace new line symbols with whitespace
    text = text.replace('\\r', ' ').replace('\\n', ' ')
    # remove punctuation
    text = re.sub('[%s]' % re.escape(string.punctuation), ' ', text)
    # separate numbers from words
    text = re.sub(r'(?<=[\d+])(?=[a-zA-Z_])', r' ', text)
    tokens = nltk.tokenize.word_tokenize(text)
    # remove stopwords
    tokens = [word for word in tokens if word.lower() not in english_stopwords]
    # stemming of words
    return [sno.stem(word) for word in tokens]


lista = []
for i, raw_text in enumerate(corpus):
    try:
        lista.append(clean_and_stem(raw_text))
    except Exception as exc:
        # Report the failing position but keep an empty placeholder: positions in
        # `lista` are later used as document numbers, so skipping an entry would
        # shift every following document onto the wrong file.
        print(i, repr(exc))
        lista.append([])

In [5]:
# Flatten the per-document token lists into one list holding every word occurrence.
lista_flatten = []
for tokens in lista:
    lista_flatten.extend(tokens)

In [6]:
# Word -> number of occurrences, ordered from the most to the least frequent word.
from collections import Counter
word_counts = Counter(lista_flatten)
c = {word: count for word, count in word_counts.most_common()}

In [7]:
# final_dict:     position of the word in `c`, 1-based and stored as a string -> word
# final_dict_inv: word -> the same position as an integer
final_dict = {}
final_dict_inv = {}
for rank, word in enumerate(c, start=1):
    final_dict[str(rank)] = word
    final_dict_inv[str(word)] = rank

## Index dictionary with all the words that appear in each document

In [8]:
# Document number (1-based, as a string) -> integer ids of the document's words,
# in the order the words occur.
indx = {
    str(doc_number): [final_dict_inv[str(word)] for word in document]
    for doc_number, document in enumerate(lista, start=1)
}

## Inverted index dictionary with all the documents in which each word appears

In [16]:
# Word id (as a string) -> list of the numbers of the documents containing that word.
# Every word id starts with an empty list; the keys keep the order of final_dict.
inv_indx = {str(i): [] for i in final_dict}
# One pass over the documents instead of scanning every document once per word.
# Documents are visited in ascending number, so each list stays sorted, and set()
# makes a document appear once even when it contains the word several times.
for i in range(len(indx)):
    for word_id in set(indx[str(i+1)]):
        inv_indx[str(word_id)].append(i+1)

In [None]:
import json

# Persist the four lookup structures as JSON, one file per structure.
json_outputs = [
    ('regular_index.json', indx),
    ('inverted_index.json', inv_indx),
    ('words_transformation.json', final_dict),
    ('words_inverse_transformation.json', final_dict_inv),
]
for json_name, payload in json_outputs:
    with open(json_name, 'w') as fp:
        json.dump(payload, fp)

In [19]:
# Free-text search query; it is cleaned and tokenized in the following cell.
query = 'a beautiful house with garden and beach'

# Cleaning and tokenization of the query

In [20]:
# The query goes through the same cleaning steps as the corpus texts, so that query
# terms can match the terms stored in the index.
# replace $ with dollar
query_text = re.sub('[$]', 'dollar', query)
# replace new line symbols with whitespace
for escaped_newline in ('\\r', '\\n'):
    query_text = query_text.replace(escaped_newline, ' ')
# remove punctuation
punctuation_pattern = '[%s]' % re.escape(string.punctuation)
query_text = re.sub(punctuation_pattern, ' ', query_text)
# separate numbers from words
query_text = re.sub(r'(?<=[\d+])(?=[a-zA-Z_])', r' ', query_text)
query_tokens = nltk.tokenize.word_tokenize(query_text)
# remove stopwords (list fetched once), then stem what is left
query_stopwords = set(stopwords.words('english'))
q = [sno.stem(word) for word in query_tokens if word.lower() not in query_stopwords]


# Mapping the query words to the final_dictionary integer values

In [21]:
# Map the query terms to their integer ids, dropping terms that are not in the
# vocabulary. Membership is tested on the keys of final_dict_inv (the words) instead
# of rebuilding and scanning a list of every vocabulary word for each query term.
q_new = [final_dict_inv[str(element)] for element in q if str(element) in final_dict_inv]
# Posting list (document numbers) of every matched term.
documents = [inv_indx[str(term)] for term in q_new]
# Union of the posting lists: documents containing at least one query term.
# The result is an empty set when no query term is known.
documents_final = set().union(*documents)

# Create the final dataframe for the search engine

In [22]:
cols=['Title','Description','City','Url']
# [7,4,2,8]: positions of the only fields shown to the user, in the order of `cols`
# --> ['Title','Description','City','Url']
field_positions = [7, 4, 2, 8]
# Collect one small frame per matching document and concatenate once at the end.
# DataFrame.append was removed in pandas 2.0, and growing a frame inside a loop
# copies all accumulated rows on every iteration.
frames = []
for i in documents_final:
    filename = 'directory/doc_{}.tsv'.format(i)
    frames.append(pd.read_csv(filename,sep='\t',header=None).T.loc[:,field_positions])
if frames:
    df = pd.concat(frames, ignore_index=True)
    df.columns = cols
else:
    # No document matched the query: show an empty table with the expected columns
    # instead of failing when the column names are assigned.
    df = pd.DataFrame(columns=cols)
# Number the results starting from 1.
df.index = list(range(1,len(df)+1))

In [23]:
# Display at most the first 20 rows of the results table.
df.head(20)

Unnamed: 0,Title,Description,City,Url
1,Unique Location! Alamo Heights - Designer Insp...,"Stylish, fully remodeled home in upscale NW – ...",San Antonio,https://www.airbnb.com/rooms/17481455?location...
2,River house near the city,'River house on island close to the city' \nA ...,Houston,https://www.airbnb.com/rooms/16926307?location...
3,Private Room Close to Campus,Private bedroom in a cute little home situated...,Bryan,https://www.airbnb.com/rooms/11839729?location...
4,Friendly Private Room in َQuiet Neighborhood,This is a beautiful bedroom with a queen size ...,Fort Worth,https://www.airbnb.com/rooms/18977363?location...
5,Quiet Cozy Room,Gated Estate in the heart of the DFW metro. Se...,Euless,https://www.airbnb.com/rooms/4251773?location=...
6,A Cozy Home with a Beautiful Nature Views,"This home is on the North Side of San Antonio,...",San Antonio,https://www.airbnb.com/rooms/19190311?location...
7,"Cozy cottage charm,right in Kerrville!",My place is close to downtown Kerrville. Beaut...,Kerrville,https://www.airbnb.com/rooms/14694333?location...
8,2 bedrooms 1.5 bath in a great location.,This is new house with a hint of Texas Country...,New Braunfels,https://www.airbnb.com/rooms/18683743?location...
9,Ocean Getaway with Magnificent Views,"My place is close to the beach, restaurants an...",Port Aransas,https://www.airbnb.com/rooms/17402254?location...
10,New Super Bowl House & Luxury Car,4 Bedroom 3 Baths Sleeps 10-12 Theater Room Pl...,Katy,https://www.airbnb.com/rooms/17005150?location...
