In [5]:
import requests as rq
import bs4 as bs
import re
import time

In [6]:
import pandas as pd
from datetime import datetime
from re import search
import os
import numpy as np
from multiprocessing import  Pool, cpu_count

In [3]:
def extract_single_place(page):
    """
    Extract the requested features from a local html page
    """
    
    # Read local files
    with open(page,encoding="utf-8") as f:
        soup = bs.BeautifulSoup(f)
    
    placeName = soup.find_all('h1', {'class':'DDPage__header-title'})[0].contents[0]
    
    placeTags = list()
    for tags in soup.find_all('a', {'class':'itemTags__link js-item-tags-link'}):
        wordlen=len(tags.text)-2
        tag = tags.text[1:wordlen]
        placeTags.append(str(tag))
    
    been = soup.find_all('div', {'class':'col-xs-4X js-submit-wrap js-been-to-top-wrap action-btn-col hidden-print'})[0]
    num_been = been.get_text().split()
    numPeopleVisited = int(num_been[2])
    if numPeopleVisited==0:
        numPeopleVisited = ''
    
    want = soup.find_all('div', {'class':'col-xs-4X js-submit-wrap js-like-top-wrap action-btn-col hidden-print'})[0]
    num_want = want.get_text().split()
    numPeopleWant = int(num_want[3])
    if numPeopleWant==0:
        numPeopleWant = ''
    
    description = soup.find('div', class_='DDP__body-copy')
    allowlist = ['p', 'span', 'a', 'i']
    text_elements = [t for t in description.find_all(text=True) if t.parent.name in allowlist]
    placeDesc = str(' '.join(text_elements))
    placeDesc = placeDesc.replace(u'\xa0',u' ')
    
    
    placeShortDesc = soup.find_all('h3', {'class':'DDPage__header-dek'})[0].contents[0]
    placeShortDesc = placeShortDesc.replace(u'\xa0',u' ')
    placeShortDesc = str(placeShortDesc)
    
    placeNearby=list()
    for places in soup.find_all('div', {'class':'DDPageSiderailRecirc__item-title'}):
        placeNearby.append(str(places.text))
    if len(placeNearby) == 0:
        placeNearby = ''
    
    
    placeRaw= soup.find_all('address', class_='DDPageSiderail__address')[0]
    place = placeRaw.find_all('div')[0].contents[0:5:2]
    place = " ".join(place)
    placeAddress = place.replace('\n', '')
    
    
    coordinates = soup.find_all('div', class_='DDPageSiderail__coordinates')[0]
    coordinates = coordinates.get_text().split()
    Alt = coordinates[0]
    Altlen = len(Alt)
    placeAlt = float(Alt[0:Altlen-1])
    placeLong = float(coordinates[1])
    

    editors = soup.find_all('li', {'class':'DDPContributorsList__item'})
    if len(editors)==0:
        #placeEditors = soup.find_all('div', {'class':'DDPContributorsList'})[1].get_text().split()
        #TODO: check the line below
        listEditor = soup.find_all('div', {'class':'DDPContributorsList'})
        if len(listEditor) == 0:
            placeEditors=[""]
        else:
            placeEditors = listEditor[0].get_text().split()
    else:
        placeEditors = list()
        for place in editors:
            names = place.find('span').getText()
            placeEditors.append(names)
    
    
    date_time = soup.find_all('div', {'class':'DDPContributor__name'})
    if len(date_time) >0:
        time = date_time[0].get_text()
        placePubDate = datetime.strptime(time, '%B %d, %Y')
    else:
        placePubDate = ""
    
    
    titles = soup.find_all('h3', class_='Card__heading --content-card-v2-title js-title-content')  
    placeRelatedPlaces = list()
    for title in titles:
        big_check = title.parent.parent.parent.parent.parent.parent
        check = big_check.find('div', class_="CardRecircSection__title").get_text()
        if check == 'Related Places':
            placeRelatedPlaces.append(str(title.get_text().strip()))
    
    placeRelatedLists = list()
    for title in titles:
        big_check = title.parent.parent.parent.parent.parent.parent
        check = big_check.find('div', class_="CardRecircSection__title").get_text()
        if search("Appears in", check):
            placeRelatedLists.append(str(title.get_text().strip()))
    if len(placeRelatedLists)==0:
        placeRelatedLists.append('')
    
    find_url = soup.find('link', {"rel": "canonical"})
    placeURL = find_url['href']
    
    #print("placeName: "+str(len(placeName)))
    #print("placetags "+str(len(placeTags)))
    #print("address "+str(len(placeAddress)))
    #print("editors "+str(len(placeEditors)))
    #print("relatedplaces "+str(len(placeRelatedPlaces)))
    #print("relatedlists "+str(len(placeRelatedLists)))

    
    return {'placeName': placeName,
            'placeTags': str(placeTags),
            'numPeopleVisited': numPeopleVisited,
            'numPeopleWant': numPeopleWant,
            'placeDesc': placeDesc,
            'placeShortDesc':placeShortDesc,
            'placeNearby':str(placeNearby),
            'placeAddress': placeAddress,
            'placeAlt': placeAlt,
            'placeLong': placeLong,
            'placeEditors': str(placeEditors),
            'placePubDate': placePubDate,
            'placeRelatedPlaces': str(placeRelatedPlaces),
            'placeRelatedLists': str(placeRelatedLists),
            'placeURL': placeURL}
    

In [None]:
dir="../../downloads/"
    
res = [] 
files = os.listdir(dir)
files.remove(".ipynb_checkpoints")   # remove this junk

# create first row of the file (with the index)
pd.DataFrame(extract_single_place(f"{dir}/{files[0]}"),index=[0]).to_csv("res.tsv",index=None,sep="\t")

for file in files[1:]:
    df = pd.DataFrame(extract_single_place(f"{dir}/{file}"),index=[0])  # append every row to the file creted before
    if len(df)>1:
         
    df.to_csv("res.tsv",index=None,mode="a",sep="\t",header=None,encoding='utf-8')

### END OF QUESTION 1
---

reference:\
https://towardsdatascience.com/make-your-own-super-pandas-using-multiproc-1c04f41944a1

In [1]:
def table_maker(pages:list ,dir="../../downloads/"):
    raws = []
    
    for page in pages:
        d = extract_single_place(f"{dir}/{page}")
        raws.append(d)
    return raws

def parallel_table(dir="downloads"):
    files = os.listdir(dir)
    files.remove(".ipynb_checkpoints")   # remove this junk
    chunks = np.split(np.array(files), n_cores)
    
    with Pool() as p:
        l = p.map(table_maker, files)

    return l
    

In [None]:
dir="../../downloads/"
    
res = [] 
files = os.listdir(dir)
files.remove(".ipynb_checkpoints")   # remove this junk

pd.DataFrame(extract_single_place(f"{dir}/{files[0]}"),index=[0]).to_csv("res.tsv",index=None,sep="\t")
for file in files[1:]:
    pd.DataFrame(extract_single_place(f"{dir}/{file}"),index=[0]).to_csv("res.tsv",index=None,mode="a",sep="\t",header=None)

In [None]:
df_res =pd.DataFrame(res)

In [None]:
all_pages.to_csv()

## Only this method below seems to work

In [5]:
def table_maker(pages:list ,dir="downloads"):
    index = extract_single_place(f"{dir}/{pages[0]}") # create index
    with open("result.tsv","w",encoding="utf-8") as f:
        f.write("\t".join(list(map(str,index.keys()))) + "\n")
    for page in pages:
        cols = extract_single_place(f"{dir}/{page}")
        with open("result.tsv","a",encoding="utf-8") as f:
            f.write("\t".join(list(map(str,cols.values()))) + "\n")
    print("done")

In [15]:
files = os.listdir("../../downloads/")
files.remove(".ipynb_checkpoints")

table_maker(files,"../../downloads")

done


In [None]:
files = os.listdir("../../downloads/")
files.remove(".ipynb_checkpoints")

res = table_maker(files,"../../downloads")

#### Done with pandas to reduce compatibility errors

In [None]:
dir="../../downloads/"
    
res = [] 
files = os.listdir(dir)
files.remove(".ipynb_checkpoints")   # remove this junk

pd.DataFrame(extract_single_place(f"{dir}/{files[0]}"),index=[0]).to_csv("res.tsv",index=None,sep="\t")
for file in files[1:]:
    df = pd.DataFrame(extract_single_place(f"{dir}/{file}"),index=[0])
    if len(df)>1:
         
    df.to_csv("res.tsv",index=None,mode="a",sep="\t",header=None,encoding='utf-8')

### Debugging di extract_single_pages

In [None]:
import traceback

files = os.listdir("downloads/")
files.remove(".ipynb_checkpoints")   # remove this junk
for file in files:
    try:
        extract_single_place(f"downloads/{file}")
    except Exception as e:
        print(file,"-->" ,traceback.format_exc())

# 2

In [3]:
import nltk
import string

import json

In [4]:
from nltk.tokenize import word_tokenize
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
from nltk.stem.snowball import SnowballStemmer
nltk.download('stopwords')
nltk.download('punkt')

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/ec2-user/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /home/ec2-user/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [8]:
def text_cleaner_2(description):
    """
    stemming function
    """
    if type(description) != str:
        return None
    
    stop_words = set(stopwords.words('english'))
    snow_stemmer = SnowballStemmer(language='english')
    punct =  set(string.punctuation)
    punct.add("“")
    punct.add("”")
    punct.add("’")

    filtered_sentence = []
    word_tokens = word_tokenize(description)
    for w in word_tokens:
        if w not in stop_words :
            filtered_sentence.append(w)

    stemmed_desc = []
    for w in filtered_sentence:
        x = snow_stemmer.stem(w)
        stemmed_desc.append(x)

    filtered_desc = []
    for s in stemmed_desc:
        if s not in punct:
            filtered_desc.append(s)

    return filtered_desc

In [7]:
df_res = pd.read_csv("res.tsv", delimiter = '\t')

In [8]:
df_res["cleanedDesc"] = df_res.placeDesc.apply(text_cleaner_2)
df_res["cleanedShortDesc"] = df_res.placeShortDesc.apply(text_cleaner_2)

In [9]:
# create the set of words contained in all the documents
s = set()
for desc in df_res.cleanedDesc:
    if desc is not None:
        for word in desc:
            s.add(word)

Creating the _vocabulary_ file:

In [18]:
# assign a number to each word in the set
d, i = {}, 0
for w in s:
    d[w] = i
    i += 1
with open("vocabulary.json", "w") as f:
    f.write(json.dumps(d))

In [15]:
with open("vocabulary.json", "r") as f:
    vocabulary = json.load(f)

reverse_index = {}
for w in s:
    filtered = df_res.cleanedDesc.apply(lambda x: x != None and w in x)   # filter the documents that cointain the word
    docs = df_res[filtered].index.tolist()                                # save the index of the filtered files
    reverse_index[vocabulary[w]] = docs

with open("reverse_index.json", "w") as f:                                # save the index in a json file
    f.write(json.dumps(reverse_index))

### execute query

In [10]:
def query_function(query, voc="vocabulary.json", reverse_index="reverse_index.json", df_name="res.tsv"):
    df = pd.read_csv(df_name, delimiter = '\t')
    with open(reverse_index, "r") as f:
        reverse_index = json.load(f)
    with open(voc, "r") as f:
        vocabulary = json.load(f)
    
    idx = vocabulary[query.split()[-1]]                       # find the id of the first word
    s = set(reverse_index[str(idx)])
    for w in query.split()[:-1]:
        idx = vocabulary[w]                                  # find the id for the remainig words
        s.intersection(set(reverse_index[str(idx)]))  # perform the intersection on the sets of documents
    return df.iloc[list(s)][["placeName","placeDesc","placeURL"]]

In [13]:
query = "system"

result = query_function(query)

In [14]:
result.head()

Unnamed: 0,placeName,placeDesc,placeURL
1,The Snow Hole,"In a rare geophysical phenomenon, snow and ic...",https://www.atlasobscura.com/places/the-snow-h...
2049,Webster Place,Brooklyn has an abundance of spectacular arch...,https://www.atlasobscura.com/places/webster-place
8,Eisbachwelle,When you think of wave surfing you probably h...,https://www.atlasobscura.com/places/eisbachwelle
4108,Rothschild Patent Model Collection,From 1790 to 1880 the United States required...,https://www.atlasobscura.com/places/rothschild...
2062,Atlantic Avenue Tunnel,UPDATE: Currently closed by New York departme...,https://www.atlasobscura.com/places/atlantic-a...


# 2.2

In [13]:
with open("vocabulary.json", "r") as f:
    vocabulary = json.load(f)

reverse_index_v2 = {}

for w in s:
    filtered = df_res.cleanedDesc.apply(lambda x: x != None and w in x)     # filter the documents that cointain the word
    docs = df_res[filtered].cleanedDesc.apply(lambda x: x.count(w)/len(x) * np.log(len(df_res)/len(df_res[filtered])))  # compute the idf only for the files that contains the word
    tfidf= list(zip(docs.index.tolist(), docs.values.tolist()))

    #tfidf.append(list(zip(df_res[-filtered].index.tolist(), [0 for i in range(len(df_res[-filtered].index.tolist()))])))    # 0 for all the files that do not contain the word
    
    reverse_index_v2[vocabulary[w]] = tfidf
    
with open("reverse_index_v2.json", "w") as f:
    f.write(json.dumps(reverse_index_v2))

In [87]:
def cosine_similarity(v1,v2):
    return np.dot(v1,v2)/(np.linalg.norm(v1)*np.linalg.norm(v2))

def query_function_v2(query, voc="vocabulary.json", reverse_index="reverse_index_v2.json", df_name="res.tsv"):
    df = pd.read_csv(df_name, delimiter = '\t')
    with open(reverse_index, "r") as f:
        reverse_index = json.load(f)
    with open(voc, "r") as f:
        vocabulary = json.load(f)
    stemmed_query = text_cleaner_2(query)
    token_query = list(map(lambda w: vocabulary[w],stemmed_query ))
    
    filtered_dict = { token: reverse_index[str(token)] for token in token_query }

    tfidf_vec = list(map(lambda w: token_query.count(w)/len(token_query) * np.log(len(df)/len(filtered_dict[w])), token_query))
    
    
    docs = set([tup[0] for l in filtered_dict.values() for tup in l ])
    
    
    tfidf_docs = {}
    for doc in docs:
        v=[]
        for word in token_query:
            for t in reverse_index[str(word)]:
                if t[0]==doc:
                    v.append(t[1])
                    break
            else:
                v.append(0)
        tfidf_docs[doc] = v
    
    res = df.filter(items=list(docs), axis=0)
    res["Cosine similarity"] = res.index.map(lambda x: cosine_similarity(tfidf_vec,tfidf_docs[x] ))
    
    return res[["placeName","placeDesc","placeURL","Cosine similarity"]].sort_values("Cosine similarity",ascending=False)

In [88]:
query_function_v2("fiori Roma").head()

Unnamed: 0,placeName,placeDesc,placeURL,Cosine similarity
4248,Grand Hotel Campo dei Fiori,With all the charm of the Grand Budapest Hote...,https://www.atlasobscura.com/places/grand-hote...,0.774663
437,Campo de Fiori,"Flower stalls, cafes, and throngs of tourists...",https://www.atlasobscura.com/places/campo-de-f...,0.774663
3395,Servian Wall at McDonald's,McDonald’s may be one of the last places you’...,https://www.atlasobscura.com/places/servian-wa...,0.632375
6822,Stolpersteine Holocaust Memorials,"All over Berlin and other parts of Europe, ar...",https://www.atlasobscura.com/places/stolperste...,0.632375
4600,Relics of Zlătari Church,"At a small church in Bucharest , Romania , a...",https://www.atlasobscura.com/places/relics-of-...,0.632375
