In [1]:
import requests as rq
import bs4 as bs
import re
import time

In [71]:
import json
import os
from datetime import datetime
from multiprocessing import Pool, cpu_count
from re import search

import numpy as np
import pandas as pd

from func import *

# 1. Data collection
First, collect the 7200 links listed on the first 400 pages of the most popular places (18 places per page).

In [None]:
# Collect the relative URL of every place listed on the "most liked" index pages.
cities_url = []
headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36'}
for page in range(1, 401):
    url = f"https://www.atlasobscura.com/places?page={page}&sort=likes_count"
    raw = None  # reset per page so the error report never shows a stale or undefined response
    try:
        raw = rq.get(url, headers=headers, timeout=30)
        if raw.status_code != 200:
            # Back off once and retry the same page.
            print("Sleeping")
            time.sleep(120)
            raw = rq.get(url, headers=headers, timeout=30)
        # If the retry also failed, report it instead of silently parsing an error page.
        raw.raise_for_status()

        # Explicit parser: avoids the "no parser specified" warning and
        # keeps the result independent of which parsers are installed.
        soup = bs.BeautifulSoup(raw.content, "html.parser")

        # Place cards are the anchors carrying a numeric data-place-id attribute.
        for d in soup.find_all("a", attrs={'data-place-id': re.compile(r"\d+")}):
            cities_url.append(d["href"])

    except Exception as e:
        # Best effort: log the failing page and carry on with the next one.
        print(page)
        print(e)
        print(raw.status_code if raw is not None else "no response")

In [26]:
# Sanity check: number of place links gathered in cities_url by the crawl.
len(cities_url)

7200

Save the links file

In [28]:
# Persist the crawl result: one absolute place URL per line in links.txt.
base_url = "https://www.atlasobscura.com"
with open("links.txt","w") as f:
    f.writelines(base_url + link + "\n" for link in cities_url)

## 1.2 Crawl places

In [None]:
!pip install fake-useragent

We use the _fake-useragent_ module to randomize the request headers.

In [2]:
import os
from concurrent.futures import ThreadPoolExecutor, wait
from fake_useragent import UserAgent
ua = UserAgent()
import random

In [3]:
# Build a pool of 1000 values drawn from fake_useragent's `ua.random`
# (random User-Agent strings) to use as fake request headers.
# Note: this rebinds `headers`, which was a single-header dict in the crawl cell of section 1.
# NOTE(review): `headers` is not passed to parallel_html_downloader below — confirm
# how the downloader in func.py obtains its User-Agent values.
headers = [ua.random for _ in range(1000)]

In [4]:
# Load the saved place URLs back, one per line, dropping the trailing newline.
with open("links.txt","r") as f:
    links = [line.rstrip("\n") for line in f]

We used multithreading to download the pages in parallel at the maximum achievable speed.\
The bottleneck then became the server's "_too many requests_" status responses.

In [7]:
%%time
# Download every page listed in `links`; %%time reports how long the cell took.
# parallel_html_downloader is not defined in this notebook (presumably provided by func.py via the star import).
parallel_html_downloader(links)

CPU times: user 1min 37s, sys: 7.84 s, total: 1min 45s
Wall time: 1h 56min 13s


## 1.3 Parse downloaded pages

In [None]:
dir = "downloads/"

# Every downloaded page except Jupyter's checkpoint folder (which may or may not exist).
# Sorted so that the row order of res.tsv — and therefore the row positions used as
# document ids by the indexes built from it — is reproducible across runs.
files = sorted(name for name in os.listdir(dir) if name != ".ipynb_checkpoints")
if not files:
    raise FileNotFoundError(f"no downloaded pages found in {dir!r}")

# Parse each page into a single-row frame (extract_single_place comes from func.py).
res = [pd.DataFrame(extract_single_place(os.path.join(dir, file)), index=[0]) for file in files]

# Write the table once, header included, instead of appending to the file row by row.
pd.concat(res, ignore_index=True).to_csv("res.tsv", index=False, sep="\t", encoding="utf-8")

# 2 Search Engine

We processed the data with the function *text_cleaner_2* defined in the _func.py_ file

In [7]:
# Load the parsed places table produced in section 1.3.
df_res = pd.read_csv("res.tsv", sep="\t")

# Add the cleaned versions of the long and short descriptions as new columns
# (text_cleaner_2 is defined in func.py).
df_res["cleanedDesc"] = df_res["placeDesc"].apply(text_cleaner_2)
df_res["cleanedShortDesc"] = df_res["placeShortDesc"].apply(text_cleaner_2)

# Set of all distinct words appearing in any cleaned description (None entries are skipped).
s = {
    word
    for tokens in df_res["cleanedDesc"]
    if tokens is not None
    for word in tokens
}

Creating the _vocabulary_ file:

In [None]:
# Build vocabulary.json from the word set `s`; later cells load this file and look entries up by word.
# vocabulary_maker is not defined in this notebook (presumably provided by func.py).
vocabulary_maker(s, name="vocabulary.json")

### 2.1.1 Creating the first index

In [15]:
with open("vocabulary.json", "r") as f:
    vocabulary = json.load(f)

# Inverted index: vocabulary entry of a word -> row labels of the documents containing it.
# Built in a single pass over the documents instead of one full DataFrame scan per word.
postings = {w: [] for w in s}
for doc_id, tokens in zip(df_res.index.tolist(), df_res.cleanedDesc.tolist()):
    if tokens is None:            # document without a usable description
        continue
    for w in set(tokens):         # each document is listed at most once per word
        if w in postings:
            postings[w].append(doc_id)

reverse_index = {vocabulary[w]: docs for w, docs in postings.items()}

with open("reverse_index.json", "w") as f:    # save the index in a json file
    json.dump(reverse_index, f)

### 2.1.2 Execute the query

Using the *query_function* from _func.py_

In [None]:
# Run the one-word query "system" and keep the returned table for inspection.
result = query_function("system")

In [14]:
# Preview the first rows of the query result.
result.head()

Unnamed: 0,placeName,placeDesc,placeURL
1,The Snow Hole,"In a rare geophysical phenomenon, snow and ic...",https://www.atlasobscura.com/places/the-snow-h...
2049,Webster Place,Brooklyn has an abundance of spectacular arch...,https://www.atlasobscura.com/places/webster-place
8,Eisbachwelle,When you think of wave surfing you probably h...,https://www.atlasobscura.com/places/eisbachwelle
4108,Rothschild Patent Model Collection,From 1790 to 1880 the United States required...,https://www.atlasobscura.com/places/rothschild...
2062,Atlantic Avenue Tunnel,UPDATE: Currently closed by New York departme...,https://www.atlasobscura.com/places/atlantic-a...


## 2.2 Conjunctive query & Ranking score

Creating the second version of the reverse index with _tf-idf_ scores

In [None]:
with open("vocabulary.json", "r") as f:
    vocabulary = json.load(f)

n_docs = len(df_res)

# Pass 1: for every word, collect (document, term frequency) pairs for the documents
# that contain it. tf = occurrences of the word / number of tokens in the description.
term_freqs = {w: [] for w in s}
for doc_id, tokens in zip(df_res.index.tolist(), df_res.cleanedDesc.tolist()):
    if tokens is None:            # document without a usable description
        continue
    for w in set(tokens):
        if w in term_freqs:
            term_freqs[w].append((doc_id, tokens.count(w) / len(tokens)))

# Pass 2: scale by idf = log(N / number of documents containing the word).
# The idf is computed once per word rather than once per row.
# Documents that do not contain a word are deliberately not stored (their score is 0).
reverse_index_v2 = {}
for w, pairs in term_freqs.items():
    idf = np.log(n_docs / len(pairs)) if pairs else 0.0
    reverse_index_v2[vocabulary[w]] = [(doc_id, float(tf * idf)) for doc_id, tf in pairs]

with open("reverse_index_v2.json", "w") as f:     # save the file
    json.dump(reverse_index_v2, f)

### 2.2.2 Execute the query

Using the *query_function_v2* from _func.py_

In [9]:
# Run the one-word query "campo" with the second query function; the output table includes a "Cosine similarity" column.
query_function_v2("campo")

Unnamed: 0,placeName,placeDesc,placeURL,Cosine similarity
5282,Islet of Vila Franca do Campo,If swimming pools and ponds have become too m...,https://www.atlasobscura.com/places/islet-of-v...,1.0
5040,Ponte dei Pugni (Bridge of Fists),For generations of Venetians epic fist fights...,https://www.atlasobscura.com/places/ponte-dei-...,1.0
5872,Drowned Village of Vilarinho da Furna,"In a rare occurrence, during a particularly d...",https://www.atlasobscura.com/places/vilarinho-...,1.0
437,Campo de Fiori,"Flower stalls, cafes, and throngs of tourists...",https://www.atlasobscura.com/places/campo-de-f...,1.0
5943,Scuola Grande di San Marco,Once one of the six “great schools” of Venic...,https://www.atlasobscura.com/places/scuola-gra...,1.0
4248,Grand Hotel Campo dei Fiori,With all the charm of the Grand Budapest Hote...,https://www.atlasobscura.com/places/grand-hote...,1.0
2841,Bordallo Pinheiro Garden,A beautiful small garden in the Museum of the...,https://www.atlasobscura.com/places/bordallo-p...,1.0
6652,Scala Contarini del Bovolo,"Hidden in the center of Venice , unknown to m...",https://www.atlasobscura.com/places/scala-cont...,1.0


# 3 Define the new score

For this point we decided to leverage the _cosine similarity_ score but with a twist.\
We introduced the ability to exclude documents containing specific words by prefixing those words with a minus sign "-" in the query.

The words that must be excluded receive a negative score in the corresponding components of the _tf-idf query vector_.\
This way, when the similarity is computed, documents that have a non-zero value in those components (which means they contain the excluded words) are penalized with respect to documents that contain only the rest of the query.

We also take into account the "Name" and "Country" fields in the creation of the inverted index

In [None]:
# Derive the two extra fields used by the third index: the country taken from the
# address and the cleaned place name (both helpers come from func.py).
df_res["country"] = df_res["placeAddress"].apply(extract_country)
df_res["stemmed_name"] = df_res["placeName"].apply(text_cleaner_2)

Create the new set of words, which also includes the country and place-name words

In [13]:
# Rebuild the word set from three fields of every document: the cleaned
# description, the stemmed place name and the country. None entries are skipped.
s = set()
for column in ("cleanedDesc", "stemmed_name", "country"):
    for tokens in df_res[column]:
        if tokens is not None:
            s.update(tokens)

Create the vocabulary and index files for our version of the query:

In [29]:
# Build vocabulary_q3.json from the extended word set `s`; the index cell that follows loads this file.
vocabulary_maker(s,"vocabulary_q3.json")

In [43]:
with open("vocabulary_q3.json", "r") as f:
    vocabulary = json.load(f)

n_docs = len(df_res)
fields = ["cleanedDesc", "stemmed_name", "country"]
columns = [df_res[name].tolist() for name in fields]

# Pass 1: treat description + name + country of a place as one bag of tokens and
# collect, for every word, the (document, term frequency) pairs of the documents containing it.
# A missing (None) field counts as empty instead of raising, matching the None
# handling used when the word set was built.
# Assumes every non-missing field is a list of tokens — TODO confirm for `country`.
term_freqs = {w: [] for w in s}
for doc_id, parts in zip(df_res.index.tolist(), zip(*columns)):
    tokens = [tok for part in parts if part is not None for tok in part]
    for w in set(tokens):
        if w in term_freqs:
            term_freqs[w].append((doc_id, tokens.count(w) / len(tokens)))

# Pass 2: scale by idf = log(N / number of documents containing the word),
# computed once per word.
reverse_index_v3 = {}
for w, pairs in term_freqs.items():
    idf = np.log(n_docs / len(pairs)) if pairs else 0.0
    reverse_index_v3[vocabulary[w]] = [(doc_id, float(tf * idf)) for doc_id, tf in pairs]

with open("reverse_index_v3.json", "w") as f:
    json.dump(reverse_index_v3, f)

**Comparison:**

In [9]:
# Baseline for the comparison: the plain "campo" query with the second query function.
query_function_v2("campo")

Unnamed: 0,placeName,placeDesc,placeURL,Cosine similarity
5282,Islet of Vila Franca do Campo,If swimming pools and ponds have become too m...,https://www.atlasobscura.com/places/islet-of-v...,1.0
5040,Ponte dei Pugni (Bridge of Fists),For generations of Venetians epic fist fights...,https://www.atlasobscura.com/places/ponte-dei-...,1.0
5872,Drowned Village of Vilarinho da Furna,"In a rare occurrence, during a particularly d...",https://www.atlasobscura.com/places/vilarinho-...,1.0
437,Campo de Fiori,"Flower stalls, cafes, and throngs of tourists...",https://www.atlasobscura.com/places/campo-de-f...,1.0
5943,Scuola Grande di San Marco,Once one of the six “great schools” of Venic...,https://www.atlasobscura.com/places/scuola-gra...,1.0
4248,Grand Hotel Campo dei Fiori,With all the charm of the Grand Budapest Hote...,https://www.atlasobscura.com/places/grand-hote...,1.0
2841,Bordallo Pinheiro Garden,A beautiful small garden in the Museum of the...,https://www.atlasobscura.com/places/bordallo-p...,1.0
6652,Scala Contarini del Bovolo,"Hidden in the center of Venice , unknown to m...",https://www.atlasobscura.com/places/scala-cont...,1.0


In [56]:
# Same query with the third query function, with "fiori" marked for exclusion by the "-" sign.
query_function_v3("campo - fiori ")

Unnamed: 0,placeName,placeDesc,placeURL,Cosine similarity
6652,Scala Contarini del Bovolo,"Hidden in the center of Venice , unknown to m...",https://www.atlasobscura.com/places/scala-cont...,0.626336
2338,Casa Sperimentale,Casa Sperimentale has been crumbling within a...,https://www.atlasobscura.com/places/casa-speri...,0.626336
5872,Drowned Village of Vilarinho da Furna,"In a rare occurrence, during a particularly d...",https://www.atlasobscura.com/places/vilarinho-...,0.626336
5943,Scuola Grande di San Marco,Once one of the six “great schools” of Venic...,https://www.atlasobscura.com/places/scuola-gra...,0.626336
5040,Ponte dei Pugni (Bridge of Fists),For generations of Venetians epic fist fights...,https://www.atlasobscura.com/places/ponte-dei-...,0.626336
787,Uyuni Salt Flat,"Spreading out over 11,000-sq.-km, Salar de Uy...",https://www.atlasobscura.com/places/salar-de-u...,0.626336
2841,Bordallo Pinheiro Garden,A beautiful small garden in the Museum of the...,https://www.atlasobscura.com/places/bordallo-p...,0.626336
5282,Islet of Vila Franca do Campo,If swimming pools and ponds have become too m...,https://www.atlasobscura.com/places/islet-of-v...,0.626336
437,Campo de Fiori,"Flower stalls, cafes, and throngs of tourists...",https://www.atlasobscura.com/places/campo-de-f...,-0.215405
4248,Grand Hotel Campo dei Fiori,With all the charm of the Grand Budapest Hote...,https://www.atlasobscura.com/places/grand-hote...,-0.215405
