In [208]:
import pandas as pd
import pickle
import csv
import nltk
nltk.download('punkt')
nltk.download('stopwords')
import string
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize

[nltk_data] Downloading package punkt to /Users/miguel/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/miguel/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


# Import data

In [3]:
# Load the Texas Airbnb listings and discard the 'Unnamed: 0' column
# in the same expression.
texas1 = pd.read_csv("data/Airbnb_Texas_Rentals.csv").drop(columns=['Unnamed: 0'])

In [11]:
# Optional display tweak, left disabled: uncomment to change how pandas
# prints wide frames (see the pandas `display.expand_frame_repr` option).
#pd.set_option('expand_frame_repr',False)
# Preview the first rows of the listings table.
texas1.head()

Unnamed: 0,average_rate_per_night,bedrooms_count,city,date_of_listing,description,latitude,longitude,title,url
0,$27,2,Humble,May 2016,Welcome to stay in private room with queen bed...,30.020138,-95.293996,2 Private rooms/bathroom 10min from IAH airport,https://www.airbnb.com/rooms/18520444?location...
1,$149,4,San Antonio,November 2010,"Stylish, fully remodeled home in upscale NW – ...",29.503068,-98.447688,Unique Location! Alamo Heights - Designer Insp...,https://www.airbnb.com/rooms/17481455?location...
2,$59,1,Houston,January 2017,'River house on island close to the city' \nA ...,29.829352,-95.081549,River house near the city,https://www.airbnb.com/rooms/16926307?location...
3,$60,1,Bryan,February 2016,Private bedroom in a cute little home situated...,30.637304,-96.337846,Private Room Close to Campus,https://www.airbnb.com/rooms/11839729?location...
4,$75,2,Fort Worth,February 2017,Welcome to our original 1920's home. We recent...,32.747097,-97.286434,The Porch,https://www.airbnb.com/rooms/17325114?location...


# Create documents as TSV files

In [26]:
# Directory prefix and file extension for the per-listing documents.
path1 = 'data/docs/'
path2 = '.tsv'
# Directory prefix for tokenized documents.
stemmed_path = 'data/tokenized_docs/'
# Characters treated as punctuation: the ASCII set plus the typographic
# quotes, en dash and apostrophe listed here.
sp = ''.join([string.punctuation, '“”–’'])



In [24]:
def remove_step(doc):
    """
    Normalize a document string for indexing and searching.

    Steps, in order: replace the two-character sequence backslash + "n" with
    a space, replace every character found in `sp` with a space, drop English
    stopwords (case-insensitive comparison), then Porter-stem what is left.

    Parameters
    ----------
    doc : str or float
        The text to normalize. A float is returned as `str(doc)` without any
        further processing.

    Returns
    -------
    str
        The stemmed tokens joined by single spaces (possibly the empty string).
    """

    # Floats are returned as their string form instead of being processed.
    # NOTE(review): presumably this guards against NaN from missing cells; it
    # means the literal token "nan" reaches callers - confirm this is intended.
    if isinstance(doc, float):
        return str(doc)

    # The text contains a literal backslash followed by "n" (not a real
    # newline character); treat it as a word separator.
    doc = doc.replace("\\n", " ")

    # punctuation -> spaces, so words glued by punctuation are split apart
    doc = ''.join(" " if c in sp else c for c in doc)

    # stopwords: fetch the list once per call and use a set, instead of
    # calling stopwords.words('english') and scanning a list for every word
    stop_words = set(stopwords.words('english'))
    kept = [word for word in doc.split() if word.lower() not in stop_words]

    # stemming
    ps = PorterStemmer()
    words = word_tokenize(' '.join(kept))

    return ' '.join(ps.stem(w) for w in words)

In [12]:
def store_ii1_voc(vocabulary, ii1):
    """
    Dump the vocabulary and the inverted index as two-column CSV files.

    Writes 'data/vocabulary.csv' and 'data/inverted_index_1.csv', one
    `key,value` row per dictionary entry. Values that are not strings are
    written using their string form (the csv module's default).

    Parameters
    ----------
    vocabulary : dict
        Mapping written to 'data/vocabulary.csv'.
    ii1 : dict
        Mapping written to 'data/inverted_index_1.csv'.

    Returns
    -------
    None
    """
    targets = (
        ('data/vocabulary.csv', vocabulary),
        ('data/inverted_index_1.csv', ii1),
    )
    for path, mapping in targets:
        # newline='' lets the csv module control line endings (otherwise an
        # extra blank row appears between records on Windows); an explicit
        # encoding keeps non-ASCII terms writable regardless of locale.
        with open(path, 'w', newline='', encoding='utf-8') as fp:
            csv.writer(fp).writerows(mapping.items())
    return

In [None]:
def save_obj(obj, name):
    """Pickle `obj` into 'data/<name>.pkl' using the highest pickle protocol."""
    target = 'data/' + name + '.pkl'
    with open(target, 'wb') as handle:
        pickle.Pickler(handle, pickle.HIGHEST_PROTOCOL).dump(obj)

def load_obj(name):
    """Return the object unpickled from 'data/<name>.pkl'."""
    source = 'data/' + name + '.pkl'
    # Unpickling can execute arbitrary code: only load files you produced.
    with open(source, 'rb') as handle:
        payload = pickle.load(handle)
    return payload

In [214]:
def preprocessing(data):
    """
    Write one TSV document per row of `data` and build the search index.

    For row i, a file path1 + 'doc_<i>.tsv' is written holding a single
    tab-separated record with, in order: average_rate_per_night,
    bedrooms_count, city, date_of_listing, description, latitude, longitude,
    title, url. The title and description are normalized with `remove_step`
    and every resulting term is registered in:

    * vocabulary: term -> integer term_id, assigned in order of first appearance
    * ii1 (inverted index): term_id -> list of 'doc_<i>' names containing it,
      each document listed once, in increasing row order

    Both dictionaries are pickled through `save_obj` under the names
    'vocabulary' and 'inverted_index_1'.

    Parameters
    ----------
    data : pandas.DataFrame
        Must provide the nine columns listed above; rows are addressed by
        position, so the index labels do not matter.

    Returns
    -------
    None
    """
    columns = ['average_rate_per_night', 'bedrooms_count', 'city',
               'date_of_listing', 'description', 'latitude', 'longitude',
               'title', 'url']
    vocabulary = {}
    ii1 = {}

    for i in range(len(data)):
        # Materialize the row once instead of once per field access.
        row = data.iloc[i]
        doc_name = 'doc_' + str(i)

        # newline='' lets the csv module control the line terminator.
        with open(path1 + doc_name + '.tsv', 'w', newline='') as doc:
            csv.writer(doc, delimiter='\t').writerow([row[col] for col in columns])

        # creating a tokenized string with title and description
        tokenized_str = remove_step(row['title']) + ' ' + remove_step(row['description'])

        # add new words into the vocabulary and ii1 (inverted index 1);
        # split() without an argument skips the empty token that appears when
        # either half of the string is empty, so '' is never indexed as a term
        for term in tokenized_str.split():
            # len(vocabulary) is the next unused id because ids are handed
            # out consecutively starting at 0
            term_id = vocabulary.setdefault(term, len(vocabulary))
            postings = ii1.setdefault(term_id, [])
            # Rows are visited in order, so if this document is already
            # listed for the term it is necessarily the last entry.
            if not postings or postings[-1] != doc_name:
                postings.append(doc_name)

    # store vocabulary and inverted index in pickle format
    save_obj(vocabulary, 'vocabulary')
    save_obj(ii1, 'inverted_index_1')
    return
            

In [217]:
# Slow cell: writes one TSV file per listing and rebuilds the pickled
# vocabulary and inverted index. Skip it unless those files need regenerating.
preprocessing(texas1)

nan
nan
nan
nan
nan


In [223]:
def search_engine_1(query, max_results=3):
    """
    Conjunctive search: return listings that contain every term of `query`.

    The query is normalized with `remove_step` (the same normalization used
    when the index was built) and each distinct term is looked up in the
    pickled vocabulary / inverted index. Only documents present in the
    posting list of every term are kept.

    Parameters
    ----------
    query : str
        Free-text query.
    max_results : int, optional
        Maximum number of rows to return (default 3).

    Returns
    -------
    pandas.DataFrame or None
        A frame with columns 'Title', 'Description', 'City', 'Url' holding at
        most `max_results` matches, ordered by document number. When the query
        has no usable term, a term is unknown, or no document contains all
        terms, 'No results' is printed and None is returned.
    """
    # split() without an argument yields no token at all for an empty
    # normalized query, instead of the single empty-string token
    terms = set(remove_step(query).split())
    if not terms:
        print('No results')
        return

    vocabulary = load_obj('vocabulary')
    ii1 = load_obj('inverted_index_1')

    postings = []
    for w in terms:
        if w not in vocabulary:
            print('No results')
            return
        postings.append(set(ii1[vocabulary[w]]))

    matches = set.intersection(*postings)
    if not matches:
        print('No results')
        return

    # Set iteration order is arbitrary; sort by the numeric suffix of
    # 'doc_<n>' so the same query always returns the same rows, and only
    # open the files that will actually be returned.
    ordered = sorted(matches, key=lambda name: int(name.rsplit('_', 1)[1]))

    list_for_df = []
    for doc_name in ordered[:max_results]:
        # Parse with csv.reader rather than str.split('\t'): fields containing
        # tabs or line breaks are quoted by the writer, and the reader also
        # strips the line terminator that would otherwise stick to the URL.
        with open("data/docs/" + doc_name + '.tsv', newline='') as doc:
            fields = next(csv.reader(doc, delimiter='\t'))
        list_for_df.append([fields[7], fields[4], fields[2], fields[8]])

    return pd.DataFrame(list_for_df, columns=['Title', 'Description', 'City', 'Url'])
        

In [248]:
# Example query: lists up to three listings whose title/description contain
# every (normalized) term of the query.
search_engine_1('yellow cabs')


Unnamed: 0,Title,Description,City,Url
0,The Little House on the Hill,"Hi, I'm Adam from Austin, Texas and I am excit...",Austin,https://www.airbnb.com/rooms/895403?location=B...
1,The Little House on the Hill,"Hi, I'm Adam from Austin, Texas and I am excit...",Austin,https://www.airbnb.com/rooms/895403?location=B...
2,The Little House on the Hill,"Hi, I'm Adam from Austin, Texas and I am excit...",Austin,https://www.airbnb.com/rooms/895403?location=C...
