In [1]:
import pandas as pd
import pickle
import csv
import nltk
nltk.download('punkt')
nltk.download('stopwords')
import string
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize

from collections import defaultdict
from collections import Counter
from math import log
import numpy as np

from pyspark import SparkContext as sc
import pyspark as sp

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Alessandra\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Alessandra\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [107]:
from scipy import spatial
import numpy as np

# Import data

In [3]:
texas1 = pd.read_csv("data/Airbnb_Texas_Rentals.csv")
texas1 = texas1.drop(['Unnamed: 0'], axis=1)

In [4]:
#pd.set_option('expand_frame_repr',False)
texas1.head()

Unnamed: 0,average_rate_per_night,bedrooms_count,city,date_of_listing,description,latitude,longitude,title,url
0,$27,2,Humble,May 2016,Welcome to stay in private room with queen bed...,30.020138,-95.293996,2 Private rooms/bathroom 10min from IAH airport,https://www.airbnb.com/rooms/18520444?location...
1,$149,4,San Antonio,November 2010,"Stylish, fully remodeled home in upscale NW – ...",29.503068,-98.447688,Unique Location! Alamo Heights - Designer Insp...,https://www.airbnb.com/rooms/17481455?location...
2,$59,1,Houston,January 2017,'River house on island close to the city' \nA ...,29.829352,-95.081549,River house near the city,https://www.airbnb.com/rooms/16926307?location...
3,$60,1,Bryan,February 2016,Private bedroom in a cute little home situated...,30.637304,-96.337846,Private Room Close to Campus,https://www.airbnb.com/rooms/11839729?location...
4,$75,2,Fort Worth,February 2017,Welcome to our original 1920's home. We recent...,32.747097,-97.286434,The Porch,https://www.airbnb.com/rooms/17325114?location...


# Create docuemnts as tsv file

In [2]:
path1 = 'data/docs/'
path2 = '.tsv'
stemmed_path='data/tokenized_docs/'
sp = string.punctuation+'“”–’'

In [3]:
def remove_step(doc):
    """
    takes as input the string of the document
    removes stopwords, punctuation and makes stemming 
    """
    
    # check if it's a nan value 

    if isinstance(doc, float):
        return str(doc)
    
    doc=doc.replace("\\n", " ")
    # punctuations
    doc = [ c if c not in sp else " "  for c in doc ]
    doc = ''.join(doc)
    # stopwords
    doc = [ word for word in doc.split() if word.lower() not in stopwords.words('english') ]
    doc = ' '.join(doc)
    
    # stemming
    ps = PorterStemmer()
    words = word_tokenize(doc)
    
    w_lst = []
    for w in words:
        w_lst.append(ps.stem(w))
    
    # something else
    
    return ' '.join(w_lst)

In [4]:
def save_obj(obj, name):
    with open('data/'+ name + '.pkl', 'wb') as f:
        pickle.dump(obj, f, pickle.HIGHEST_PROTOCOL)

def load_obj(name):
    with open('data/' + name + '.pkl', 'rb') as f:
        return pickle.load(f)

Preprocessing create the tsv files.
We created vocabulary parsing every document and updating it when the algorithm finds a new word. 
Format of vocabulary => 'string':integer.
Creation of ii1:
The first inverted index is built, updating it every time a word is find in a document.
At the end we stored vocabulary and the inverted index using pickle.

In [8]:
def preprocessing(data):
    n=len(data)

    for i in range(n):
        with open(path1 + 'doc_'+ str(i) + '.tsv', 'w') as doc:
            a = csv.writer(doc, delimiter='\t')
            a.writerow([data.iloc[i]['average_rate_per_night'],data.iloc[i]['bedrooms_count'] 
                      ,data.iloc[i]['city'] ,data.iloc[i]['date_of_listing'], data.iloc[i]['description']
                      ,data.iloc[i]['latitude'],data.iloc[i]['longitude'] ,data.iloc[i]['title'] ,data.iloc[i]['url']])

    return

def create_vocabulary_and_ii1 (data):
    n = len(data)
    vocabulary = {}
    ii1 = {}    
    cnt = 0
    
    for i in range(n):
        # creating a tokenized string with title and description
        tokenized_str = (remove_step(data.iloc[i]['title']) + ' ' 
                                     + remove_step(data.iloc[i]['description']))

        # creating the dictionary
        for term in tokenized_str.split(' '):
            if term in vocabulary.keys():
                term_id = vocabulary[term]
            else:
                vocabulary[term] = cnt
                term_id = cnt
                cnt+=1

            if term_id not in ii1.keys():
                ii1[term_id] = ['doc_'+str(i)]
            else:
                lista = ii1[term_id]
                document = 'doc_'+str(i)
                if document in lista:
                    continue
                else:        
                    ii1[term_id].append('doc_'+str(i))

    # store vocabulary in pickle format
    save_obj(vocabulary, 'vocabulary')
    save_obj(ii1, 'inverted_index_1')
    return
            

In [9]:
#create_vocabulary and inverted index 1 (for the first search engine)
create_vocabulary_and_ii1(texas1)

In [10]:
# do not run if you don't want to waste time :-D 
#preprocessing(texas1)

In [11]:
def search_engine_1(query): 
    
    query = remove_step(query)
    query = list(set(query.split(' ')))
    
    lst_of_lst=[]
    
    vocabulary = load_obj('vocabulary')
    ii1 = load_obj('inverted_index_1')
    
    for w in query:
        if w not in vocabulary:
            print('No results')
            return
        i = vocabulary[w]
        lst_of_lst.append(ii1[i])


    doc_list = set.intersection(*[set(sublist) for sublist in lst_of_lst])
    doc_list = list(doc_list)
    dl = len(doc_list)
    
    if dl ==0:
        print('No results')
        return

    list_for_df=[]
    for i in range(dl):
        with open ("data/docs/" + doc_list[i] + '.tsv') as doc:
            row = doc.read()
            lst = row.split('\t')
            lst = [lst[7],lst[4],lst[2],lst[8]]
            list_for_df.append(lst)
        
    df=pd.DataFrame(list_for_df, columns=['Title', 'Description', 'City', 'Url'])
    
    return df.head(5)   
        

In [5]:
vocabulary = load_obj('vocabulary')

In [9]:
search_engine_1('beach')


Unnamed: 0,Title,Description,City,Url
0,"Padre Beach View Home, Walk to Beach","This 4BR, 3BA Padre Beach View Home is just st...",Corpus Christi,https://www.airbnb.com/rooms/18451940?location...
1,Key Lime Cabin,If you need a relaxing place to stay that is h...,Matagorda,https://www.airbnb.com/rooms/11312175?location...
2,Isla Del Sol beachfront condo,"Isla Del Sol is a small, quiet beachfront comp...",South Padre Island,https://www.airbnb.com/rooms/12188975?location...
3,Beach house on private resort,"Relaxing, comfortable, and very clean home wit...",Port Isabel,https://www.airbnb.com/rooms/16338158?location...
4,Looks like Coastal Living Magazine,Gorgeous- Old world charm w modern convenience...,Galveston,https://www.airbnb.com/rooms/6119177?location=...


In [27]:
def dict_with_tf(data):
    vocabulary = load_obj('vocabulary')
    n = len(data)
    
    ii2 = defaultdict(list)
    
    
    for i in range(n):
        tokenized_str = (remove_step(data.iloc[i]['title']) + ' ' 
                                     + remove_step(data.iloc[i]['description']))
    
        for term in tokenized_str.split(' '):
            doc_name = 'doc_%s'%i
            ii2[term].append((doc_name,1))
        
    return ii2

    

In [28]:
ii2 = dict_with_tf(texas1)

In [None]:
############ SPARK
nSlices = 5
for key,values in ii2.items():
    
    sc = sp.SparkContext(appName = 'parallelization')
    
    newlst = sc.parallelize(values, nSlices)
    
    newlst = newlst.reduceByKey(lambda a,b:a+b)
    
    ii2[key] = newlst.take(len(values))
#     need to normalize by the len of the document

    sc.close()

    

In [85]:
save_obj(ii2,'inverted_index_onlyTF')

    II with TF:
    Taking each list for each term in the inverted index,
    we wanted to find the occurrencies for that term in each document.
    Thus, we created the inverted index with every document appended inside the value of each 
    term (the key of the dictionary), 
    and using in a loop the reduce_doc_list method.
    In this method we reduced the repetitions of the same docs in each list, summing them. 
   

In [30]:
def reduce_doc_list(doc_list):
    """
    function called by dict_TFIDF
    
    It reduces the list of documents into a list 
    of tuple with doc_id and its occurencies
    
    input:
    - list 
    output:
    - list 
    """
    tf_term_i = Counter(doc_list)
    doc_tf_lst = []
    doc_tf_lst = [tuple([key,value]) for key,value in tf_term_i.items()]
    return doc_tf_lst

def compute_ii2_TFIDF(ii2,n):
    """
    compute the ii2_TFIDF
    input:
    - inverted index matrix (with TF)
    - number of documents
    output:
    - ii2 
    """
    for key, value in ii2.items():
        N = len(value)
        new_list = []
        for item in value:
            new_list.append(tuple([item[0], round(float(item[1])* log(n/N),3)]))
            
        ii2[key] = new_list
    return ii2

In [31]:
def dict_with_TFIDF(data):
    """
    creates the TFIDF inverted index as dict
    and store it into a pickle file
    input:
    - data
    """
    vocabulary = load_obj('vocabulary')
    n = len(data)
    
    ii2 = defaultdict(list)
    
    for i in range(n):
        tokenized_str = (remove_step(data.iloc[i]['title']) + ' ' 
                                     + remove_step(data.iloc[i]['description']))
    
        for term in tokenized_str.split(' '):
            doc_name = 'doc_%s'%i
            ii2[vocabulary[term]].append(doc_name)
            
    
    for key,value in ii2.items():
        ii2[key] = reduce_doc_list(value)
    
    ii2 = compute_ii2_TFIDF(ii2,n)
    save_obj(ii2,'inverted_index_TFIDF')
    return 

In [23]:
import time
start = time.time()
dict_with_TFIDF(texas1)
print (time.time()-start)

242.7152338027954


In [7]:
ii2 = load_obj('inverted_index_TFIDF')

prove



In [133]:
scrauso = {
    0: [('doc_0', 2), ('doc_1', 1), ('doc_3', 1), ('doc_7', 1)],
    1: [('doc_0', 2), ('doc_7', 1)]
       }

In [137]:
temp = ii2
temp = compute_ii2_TFIDF(temp, len(texas1))

In [10]:
def reduce_doc_list(doc_list):
    tf_term_i = Counter(doc_list)
    doc_tf_lst = []
    doc_tf_lst = [tuple([key,value]) for key,value in tf_term_i.items()]
    return doc_tf_lst

In [27]:
temp = {
    0: [('doc_1',2.78), ('doc_2', 1.364), ('doc_3',2.729)],
    1: [('doc_1',2.78), ('doc_2', 1.364)],
    2: [('doc_3',1.354)]
}

In [29]:
query = [0,1,1]





In [76]:
reduce_doc_list(ii2['river'])

[('doc_2', 4)]

In [11]:
reduce_doc_list(['d1','d1','d3','d4'])

[('d1', 2), ('d3', 1), ('d4', 1)]

In [14]:
Counter(['d1','d1','d3','d4'])

Counter({'d1': 2, 'd3': 1, 'd4': 1})

We gonna create a 'truth' matrix that has determined rows index as the terms of vocabulary (dicted by integers), and as columns index the documents ids.
The matrix has stored 0 or tf-idf value in relation to the presence of i-th term in the i-th document.
After the construction of this matrix, it will be easier to make the operations for Cosine Similarity for every query we could have.

In [20]:
rows = len(vocabulary)
cols = 18259#len(texas1)
ii1 = load_obj('inverted_index_1')

In [67]:
def build_matrix(cols,rows, inv_ind):
    matrix = np.zeros(shape = (rows,cols), dtype=float)
    for key in inv_ind.keys():
        res_term_id = inv_ind[key]
        for doc in res_term_id:
            doc_tfidf = doc[1]
            doc_id = int(doc[0][4:])
            matrix[doc_id][key] = doc_tfidf
    return matrix

In [68]:
matrix = build_matrix(rows, cols, ii2)

In [163]:
def compute_query_vector_and_take_docs(query, inv_ind_1):
    query = remove_step(query)
    query = query.split(' ')
    query_vector = np.zeros(len(vocabulary))
    iiL = len(inv_ind_1)
    # query_vect = [  for i in range(len(vocabulary))]
    # creating the vector of the query in [0, 1, 0, 2], the numbers depends on the occurrences of terms in the query, 
    #putted in the i-th position depending on the vocabulary
    for el in query:
        query_vector[vocabulary[el]] += 1        
        compares = len(inv_ind_1[vocabulary[el]])
        query_vector[vocabulary[el]] *= 1+log(iiL/compares)
    #return query_vect
    #we need to take the indices from the numpy array - tricky
    non_zero = [el.tolist() for el in query_vector.nonzero()][0]
    docs = set.intersection(*[set(inv_ind_1[i]) for i in non_zero])
    docs = list(docs)
    return query_vector, docs

In [103]:
'''
def take_docs(query_vector, inv_ind_1):
    #we need to take the indices from the numpy array - tricky
    non_zero = [el.tolist() for el in query_vector.nonzero()][0]
    docs = set.intersection(*[set(inv_ind_1[i]) for i in non_zero])
    docs = list(docs)

    return docs
'''

In [164]:
vec, lista = compute_query_vector_and_take_docs("beautiful room with view", ii2)

Now we have the matrix, the vectorized query and docs. 
We will use the matrix to take the vectorized document with all the tfidf related to each term.
Docs need to have the presence of all the words present in the query.
This check is done using the row of the matrix (that is the vector of the i-th document taken in consideration). 
Thus, it will be possible to compute the Cosine Similarity and first ranking of documents related to the query.

In [166]:
#vec
#matrix
#lista
def cs(query_vector, matrix, list_of_documents):  
    docs_int_lst = []
    #take the non zero values indexes into the query vector 
    non_zero = [el.tolist() for el in query_vector.nonzero()][0]
     
    for doc in list_of_documents:
        # for each doc that contains one term of the query
        doc_id = int(doc[4:])
        # I take the row vector from the matrix 
        doc_vector = matrix[doc_id]
        
        # Taking the indexes of the ( document ) row vector related to the position of the query
        query_words_doc = np.take(doc_vector, non_zero)
        # take off the zero values from the row vector (I check where the document doesn't have a term of the query)
        query_words_doc = query_words_doc.nonzero()
        
        query_words_doc = (query_words_doc)[0]
        
        if len(query_words_doc) == len(non_zero):
            #compute cosine similarity
            cs = 1 - spatial.distance.cosine(query_words_doc,non_zero)
            docs_int_lst.append((doc_id, cs))
    return docs_int_lst

In [153]:
cosines = cs(vec, matrix)

In [156]:
cosines.sort(key= lambda x:x[1], reverse =True)

In [157]:
cosines

[(8476, 0.9734549745236509),
 (6209, 0.9734549745236509),
 (5573, 0.9734549745236509),
 (2648, 0.9734549745236509),
 (18243, 0.9734549745236509),
 (9902, 0.9734549745236509),
 (9220, 0.9734549745236509),
 (1945, 0.9734549745236509),
 (2908, 0.9734549745236509),
 (14378, 0.9734549745236509),
 (15505, 0.9734549745236509),
 (8480, 0.9734549745236509),
 (14887, 0.9734549745236509),
 (8354, 0.9734549745236509),
 (5551, 0.9734549745236509),
 (17578, 0.9734549745236509),
 (17605, 0.9734549745236509),
 (15282, 0.9734549745236509),
 (14665, 0.9734549745236509),
 (10450, 0.9734549745236509),
 (4662, 0.9734549745236509),
 (787, 0.9734549745236509),
 (12230, 0.9734549745236509),
 (7085, 0.9734549745236509),
 (1238, 0.9734549745236509),
 (9547, 0.9734549745236509),
 (8073, 0.9734549745236509),
 (14612, 0.9734549745236509),
 (1563, 0.9734549745236509),
 (10781, 0.9734549745236509),
 (2113, 0.9734549745236509),
 (9256, 0.9734549745236509),
 (8495, 0.9734549745236509),
 (12862, 0.9734549745236509),
 (

In [176]:
%whos

Variable                             Type                          Data/Info
----------------------------------------------------------------------------
Counter                              type                          <class 'collections.Counter'>
PorterStemmer                        ABCMeta                       <class 'nltk.stem.porter.PorterStemmer'>
build_matrix                         function                      <function build_matrix at 0x0000023322069950>
cols                                 int                           18259
compute_query_vector_and_take_docs   function                      <function compute_query_v<...>cs at 0x000002331D2F42F0>
cs                                   function                      <function cs at 0x000002331D2F4510>
csv                                  module                        <module 'csv' from 'C:\\U<...>\Anaconda3\\lib\\csv.py'>
defaultdict                          type                          <class 'collections.defaultdict'>
ii1  

In [174]:
del texas1, path1, path2, stemmed_path, sp