In [1]:
import requests as rq
import bs4 as bs
import re
import time

In [2]:
import pandas as pd
from datetime import datetime
from re import search
import os
import numpy as np
from multiprocessing import  Pool, cpu_count

In [4]:
def extract_single_place(page):
    
    # Read local files
    with open(page,encoding="utf-8") as f:
        soup = bs.BeautifulSoup(f)
    
    placeName = soup.find_all('h1', {'class':'DDPage__header-title'})[0].contents[0]
    
    placeTags = list()
    for tags in soup.find_all('a', {'class':'itemTags__link js-item-tags-link'}):
        wordlen=len(tags.text)-2
        tag = tags.text[1:wordlen]
        placeTags.append(str(tag))
    
    been = soup.find_all('div', {'class':'col-xs-4X js-submit-wrap js-been-to-top-wrap action-btn-col hidden-print'})[0]
    num_been = been.get_text().split()
    numPeopleVisited = int(num_been[2])
    if numPeopleVisited==0:
        numPeopleVisited = ''
    
    want = soup.find_all('div', {'class':'col-xs-4X js-submit-wrap js-like-top-wrap action-btn-col hidden-print'})[0]
    num_want = want.get_text().split()
    numPeopleWant = int(num_want[3])
    if numPeopleWant==0:
        numPeopleWant = ''
    
    description = soup.find('div', class_='DDP__body-copy')
    allowlist = ['p', 'span', 'a', 'i']
    text_elements = [t for t in description.find_all(text=True) if t.parent.name in allowlist]
    placeDesc = str(' '.join(text_elements))
    placeDesc = placeDesc.replace(u'\xa0',u' ')
    
    
    placeShortDesc = soup.find_all('h3', {'class':'DDPage__header-dek'})[0].contents[0]
    placeShortDesc = placeShortDesc.replace(u'\xa0',u' ')
    placeShortDesc = str(placeShortDesc)
    
    placeNearby=list()
    for places in soup.find_all('div', {'class':'DDPageSiderailRecirc__item-title'}):
        placeNearby.append(str(places.text))
    if len(placeNearby) == 0:
        placeNearby = ''
    
    
    placeRaw= soup.find_all('address', class_='DDPageSiderail__address')[0]
    place = placeRaw.find_all('div')[0].contents[0:5:2]
    place = " ".join(place)
    placeAddress = place.replace('\n', '')
    
    
    coordinates = soup.find_all('div', class_='DDPageSiderail__coordinates')[0]
    coordinates = coordinates.get_text().split()
    Alt = coordinates[0]
    Altlen = len(Alt)
    placeAlt = float(Alt[0:Altlen-1])
    placeLong = float(coordinates[1])
    

    editors = soup.find_all('li', {'class':'DDPContributorsList__item'})
    if len(editors)==0:
        #placeEditors = soup.find_all('div', {'class':'DDPContributorsList'})[1].get_text().split()
        #TODO: check the line below
        listEditor = soup.find_all('div', {'class':'DDPContributorsList'})
        if len(listEditor) == 0:
            placeEditors=[""]
        else:
            placeEditors = listEditor[0].get_text().split()
    else:
        placeEditors = list()
        for place in editors:
            names = place.find('span').getText()
            placeEditors.append(names)
    
    
    date_time = soup.find_all('div', {'class':'DDPContributor__name'})[0].get_text()
    placePubDate = datetime.strptime(date_time, '%B %d, %Y')
    
    
    titles = soup.find_all('h3', class_='Card__heading --content-card-v2-title js-title-content')  
    placeRelatedPlaces = list()
    for title in titles:
        big_check = title.parent.parent.parent.parent.parent.parent
        check = big_check.find('div', class_="CardRecircSection__title").get_text()
        if check == 'Related Places':
            placeRelatedPlaces.append(str(title.get_text().strip()))
    
    placeRelatedLists = list()
    for title in titles:
        big_check = title.parent.parent.parent.parent.parent.parent
        check = big_check.find('div', class_="CardRecircSection__title").get_text()
        if search("Appears in", check):
            placeRelatedLists.append(str(title.get_text().strip()))
    if len(placeRelatedLists)==0:
        placeRelatedLists.append('')
    
    find_url = soup.find('link', {"rel": "canonical"})
    placeURL = find_url['href']
    
    #print("placeName: "+str(len(placeName)))
    #print("placetags "+str(len(placeTags)))
    #print("address "+str(len(placeAddress)))
    #print("editors "+str(len(placeEditors)))
    #print("relatedplaces "+str(len(placeRelatedPlaces)))
    #print("relatedlists "+str(len(placeRelatedLists)))

    
    return {'placeName': placeName,
            'placeTags': str(placeTags),
            'numPeopleVisited': numPeopleVisited,
            'numPeopleWant': numPeopleWant,
            'placeDesc': placeDesc,
            'placeShortDesc':placeShortDesc,
            'placeNearby':str(placeNearby),
            'placeAddress': placeAddress,
            'placeAlt': placeAlt,
            'placeLong': placeLong,
            'placeEditors': str(placeEditors),
            'placePubDate': placePubDate,
            'placeRelatedPlaces': str(placeRelatedPlaces),
            'placeRelatedLists': str(placeRelatedLists),
            'placeURL': placeURL}
    

reference:\
https://towardsdatascience.com/make-your-own-super-pandas-using-multiproc-1c04f41944a1

In [4]:
def table_maker(pages:list ,dir="downloads"):
    raws = []
    
    for page in pages:
        df = pd.DataFrame(extract_single_place(f"{dir}/{page}"), index=[0])
        raws.append(df)
    return pd.concat(raws)

def parallel_table(dir="downloads"):
    n_cores = cpu_count()
    files = os.listdir(dir)
    files.remove(".ipynb_checkpoints")   # remove this junk
    chunks = np.split(np.array(files), n_cores)
    
    pool = Pool(n_cores)
    df = pd.concat(pool.map(table_maker, chunks))
    pool.close()
    pool.join()
    return df
    

In [None]:
%%time
all_pages = parallel_table("./../html/downloads/")

## Only this method below seems to work

In [5]:
def table_maker(pages:list ,dir="downloads"):
    index = extract_single_place(f"{dir}/{pages[0]}") # create index
    with open("result.tsv","w",encoding="utf-8") as f:
        f.write("\t".join(list(map(str,index.keys()))) + "\n")
    for page in pages:
        cols = extract_single_place(f"{dir}/{page}")
        with open("result.tsv","a",encoding="utf-8") as f:
            f.write("\t".join(list(map(str,cols.values()))) + "\n")
    print("done")

In [6]:
files = os.listdir("./../html/downloads/")
files.remove(".ipynb_checkpoints")

table_maker(files,"./../html/downloads")

IndexError: list index out of range

### Debugging di extract_single_pages

In [None]:
import traceback

files = os.listdir("downloads/")
files.remove(".ipynb_checkpoints")   # remove this junk
for file in files:
    try:
        extract_single_place(f"downloads/{file}")
    except Exception as e:
        print(file,"-->" ,traceback.format_exc())

# 2

In [6]:
import nltk
import string

import json

In [7]:
from nltk.tokenize import word_tokenize
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
from nltk.stem.snowball import SnowballStemmer
nltk.download('stopwords')
nltk.download('punkt')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Fedes\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Fedes\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [9]:
def text_cleaner_2(description):
    if type(description) != str:
        return None
    
    stop_words = set(stopwords.words('english'))
    snow_stemmer = SnowballStemmer(language='english')
    punct =  set(string.punctuation)
    punct.add("“")
    punct.add("”")
    punct.add("’")

    filtered_sentence = []
    word_tokens = word_tokenize(description)
    for w in word_tokens:
        if w not in stop_words :
            filtered_sentence.append(w)

    stemmed_desc = []
    for w in filtered_sentence:
        x = snow_stemmer.stem(w)
        stemmed_desc.append(x)

    filtered_desc = []
    for s in stemmed_desc:
        if s not in punct:
            filtered_desc.append(s)

    return filtered_desc

In [39]:
df_res = pd.read_csv("result.tsv", delimiter = '\t', error_bad_lines=False)



  exec(code_obj, self.user_global_ns, self.user_ns)
b'Skipping line 3724: expected 15 fields, saw 16\n'


In [17]:
df_res["cleanedDesc"] = df_res.placeDesc.apply(text_cleaner_2)
df_res["cleanedShortDesc"] = df_res.placeShortDesc.apply(text_cleaner_2)

In [18]:
s = set()
for desc in df_res.cleanedDesc:
    if desc is not None:
        for word in desc:
            s.add(word)

Creating the _vocabulary_ file:

In [19]:
d, i = {}, 0
for w in s:
    d[w] = i
    i += 1
with open("vocabulary.json", "w") as f:
    f.write(json.dumps(d))

In [20]:
with open("vocabulary.json", "r") as f:
    vocabulary = json.load(f)

reverse_index = {}
for w in s:
    filtered = df_res.cleanedDesc.apply(lambda x: x != None and w in x)
    docs = df_res[filtered].index.tolist()
    reverse_index[vocabulary[w]] = docs

with open("reverse_index.json", "w") as f:
    f.write(json.dumps(reverse_index))

### execute query

In [36]:
def query_function(query, voc, reverse_index, df):
    idx = voc[query.split()[-1]]
    s = set(reverse_index[str(idx)])
    for w in query.split()[:-1]:
        idx = voc[w]
        s.intersection(set(reverse_index[str(idx)]))
    return df.iloc[list(s)]

In [37]:
query = "system"

with open("reverse_index.json", "r") as f:
    reverse_index = json.load(f)
with open("vocabulary.json", "r") as f:
    vocabulary = json.load(f)

result = query_function(query, vocabulary, reverse_index, df_res)

In [38]:
result

Unnamed: 0,placeName,placeTags,numPeopleVisited,numPeopleWant,placeDesc,placeShortDesc,placeNearby,placeAddress,placeAlt,placeLong,placeEditors,placePubDate,placeRelatedPlaces,placeRelatedLists,placeURL,cleanedDesc,cleanedShortDesc
642,Bluespring Caverns,"['subterranea', 'river', 'subterranean site', ...",340,1964,The river that flows through the Bluespring C...,The longest navigable underground river in the...,"['Virgil I. Gus Grissom Rocket Monument', 'Sta...","1459 Blue Springs Cavern Road Bedford, Indiana...",38.7962,-86.546,"['Mkwolfor', 'mikermnz', 'rebecaschandel', 'la...",2014-07-14 00:00:00,"['Pertosa Caves', 'Caves of Monte Castillo', '...",[''],https://www.atlasobscura.com/places/bluespring...,"[the, river, flow, bluespr, cavern, 21-mile-lo...","[the, longest, navig, underground, river, unit..."
134,Alamogordo Landfill,['video game'],99,469,While there is some speculation as to whether...,Buried beneath the New Mexico sands are believ...,"['Sunspot Solar Observatory', 'New Mexico Muse...","5530 US-54 Alamogordo, New Mexico, 88310 Unite...",32.7426,-105.9894,['EricGrundhauser'],2014-04-14 00:00:00,"[""Bergsala's Super Mario"", 'Game Boy that Surv...",['8 Places to Experience Video Game History IRL'],https://www.atlasobscura.com/places/alamogordo...,"[while, specul, whether, stori, true, alamogor...","[buri, beneath, new, mexico, sand, believ, mil..."
904,Calvert Cliffs State Park,"['par', 'fossil', 'rock formation', 'rock', 'c...",501,667,The Chesapeake Bay and its shores were explor...,Captain John Smith thought these cliffs were a...,"['Moll Dyer Rock', 'Old Trinity Church', 'Stan...","10540 Hg Trueman Road Lusby, Maryland, 20657 U...",38.3956,-76.4293,"['morjolee', 'rugby007', 'Kusary', 'hana', 'de...",2015-10-26 00:00:00,"['Candy Cane Mountains', 'Neptuni Åkrar', 'Tuc...",[''],https://www.atlasobscura.com/places/calvert-cl...,"[the, chesapeak, bay, shore, explor, chart, ca...","[captain, john, smith, thought, cliff, amaz, 1..."
394,Baldwin Street Houses,"['world recor', 'urban plannin', 'road', 'hous...",481,465,"Thanks to some blunt city planning Dunedin ,...",One of the world's steepest streets seems to h...,"['Beverly Clock', 'Dunedin Museum of Natural M...","Baldwin Street Dunedin, 9010 New Zealand",-45.8493,170.5344,"['FluidNick', 'spersephone', 'narissa71', 'lin...",2013-12-05 00:00:00,"['Blenheim Gardens Estate', 'Kampong Lorong Bu...","['8 Geometrical Wonders That Are Also Houses',...",https://www.atlasobscura.com/places/baldwin-st...,"[thank, blunt, citi, plan, dunedin, new, zeala...","[one, world, 's, steepest, street, seem, hous,..."
524,Benbulben Barite Mine,"['abandoned mine', 'mine', 'industria', 'geolog']",31,410,"In a beautiful, remote part of Ireland stands...","The remains of this deep, dark mine have left ...","['Grave of W. B. Yeats', ""Medb's Cairn"", 'The ...",Unnamed Road Sligo Ireland,54.3622,-8.4579,['UrbexJunkie'],2015-11-05 00:00:00,"['Mazama Queen Mine', 'Alder Mine', 'Searles L...",[''],https://www.atlasobscura.com/places/benbulben-...,"[in, beauti, remot, part, ireland, stand, rema...","[the, remain, deep, dark, mine, left, behind, ..."
398,Carnegie Public Library,"['repositories of knowledg', 'librarie', 'Uniq...",360,472,At one time the earliest branch of the Seattl...,Always a new use for this old building.,"['Majestic Bay Theatre', 'Full Tilt Ice Cream ...","2026 NW Market Street Seattle, Washington, 981...",47.6689,-122.3834,"['Leslie', 'McIntyre']",2012-08-28 00:00:00,"['Suzzallo Library Reading Room', 'Biblioteca ...",[''],https://www.atlasobscura.com/places/ballard-ca...,"[at, one, time, earliest, branch, seattl, publ...","[alway, new, use, old, build]"
654,Boca do Inferno,"['aleister crowle', 'magi', 'geological odditi...",473,772,Boca do Inferno (in English: “Mouth of Hell”)...,A unique seaside cave where Aleister Crowley f...,"['Convent of the Capuchos', 'Pena National Pal...","Av. Rei Humberto II de Itália Cascais, 2750-64...",38.6914,-9.4306,"['matteogamba', 'majeur', 'Evilpad', 'Molly Mc...",2012-08-23 00:00:00,"['Kong Lor Cave', 'Marble Caves of Chile Chico...",['16 Hidden Gems to Visit in Portugal'],https://www.atlasobscura.com/places/boca-do-in...,"[boca, inferno, english, mouth, hell, scenic, ...","[a, uniqu, seasid, cave, aleist, crowley, fake..."
18,45 X 90 Geographical Marker,"['location marker', 'geographic marker', 'intr...",261,835,Tracing the earth are invisible lines. An inf...,The center of the only hemisphere that awards ...,"['Jurustic Park', 'Timms Hill', 'Rudolph Grott...","5651 Meridian Rd Athens, Wisconsin, 54411 Unit...",44.9992,-90.004,"['DebG', 'mkprator', 'carlyannh24', 'Sara Anun...",2013-02-20 00:00:00,"['Holme Fen Posts', ""'Krblin Jihn Kabin'"", 'Th...",[''],https://www.atlasobscura.com/places/45-x-90-ge...,"[trace, earth, invis, line, an, infinit, fine,...","[the, center, hemispher, award, visitor, comme..."
152,Aleister Crowley's Abbey of Thelema,"['aleister crowle', 'cult', 'drug', 'occul', '...",56,623,"In 1919 Aleister Crowley, a mystic and occul...","Where Aleister Crowley taught magick in ""La Ch...","['Lavatoio Medievale', 'Villa Palagonia', 'San...","Abbey of Thelema Cefalù, 90015 Italy",38.0316,14.0271,"['prakva', 'Collector of Experiences', 'Molly ...",2012-04-30 00:00:00,"[""Aleister Crowley's Magickal Retirement"", 'Th...",['33 Places Famous for the Scandals That Occur...,https://www.atlasobscura.com/places/aleister-c...,"[in, 1919, aleist, crowley, mystic, occultist,...","[where, aleist, crowley, taught, magick, ``, l..."
792,Buck Atom,"['route 6', 'shop', 'statue']",280,436,Standing over Route 66 in Tulsa is a 21-foot ...,"Standing at 21-feet tall, this space cowboy ad...","['The Church Studio', 'Boston Avenue Methodist...","1347 E 11th St Tulsa, Oklahoma, 74120 United S...",36.1481,-95.9737,"['mandamcg13', 'biodieselbarry', 'Darrell Powe...",2019-09-12 00:00:00,"['Captain America Statue', 'Via dei Presepi', ...",[''],https://www.atlasobscura.com/places/buck-atom,"[stand, rout, 66, tulsa, 21-foot, tall, space,...","[stand, 21-feet, tall, space, cowboy, add, leg..."
