In [1]:
import requests as rq
import bs4 as bs
import re
import time

In [2]:
import pandas as pd
from datetime import datetime
from re import search
import os
import numpy as np
from multiprocessing import  Pool, cpu_count

In [1]:
def _count_from_button(soup, css_class, token_index):
    """Read an integer counter out of an action-button wrapper.

    Takes the first ``div`` whose class attribute is exactly ``css_class``,
    splits its text on whitespace and converts the token at ``token_index``
    to ``int``. A count of zero is mapped to ``''``.

    Raises ``IndexError`` if the div or the token is missing and
    ``ValueError`` if the token is not an integer.
    """
    button = soup.find_all('div', {'class': css_class})[0]
    count = int(button.get_text().split()[token_index])
    return count if count != 0 else ''


def extract_single_place(page):
    """Parse one locally saved place page into a flat dict of fields.

    Parameters
    ----------
    page : str
        Path of the saved HTML file.

    Returns
    -------
    dict
        Keys: placeName, placeTags, numPeopleVisited, numPeopleWant,
        placeDesc, placeShortDesc, placeNearby, placeAddress, placeAlt,
        placeLong, placeEditors, placePubDate, placeRelatedPlaces,
        placeRelatedLists, placeURL. List-valued fields are stored as the
        ``str()`` of the list; the two counters are ``''`` when zero;
        ``placePubDate`` is a ``datetime``.

    Raises
    ------
    IndexError, AttributeError, TypeError, ValueError
        When an expected element is missing or has an unexpected shape.
        Nothing is caught here, so callers can see which page failed.
    """
    # Binary mode lets BeautifulSoup detect the document encoding itself
    # instead of depending on the platform's default text codec.
    # NOTE(review): no parser is pinned, so bs4 uses the best one installed;
    # pin one explicitly once it is known which parser the data was built with.
    with open(page, 'rb') as f:
        soup = bs.BeautifulSoup(f)

    # str() detaches the title from the parse tree: a NavigableString keeps a
    # reference to the whole soup, which wastes memory and is costly to pickle
    # when results travel back from worker processes.
    placeName = str(soup.find_all('h1', {'class': 'DDPage__header-title'})[0].contents[0])

    # Each tag link's text loses its first character and its last two
    # (presumably surrounding whitespace -- TODO confirm against a saved page).
    placeTags = [
        str(link.text[1:-2])
        for link in soup.find_all('a', {'class': 'itemTags__link js-item-tags-link'})
    ]

    # The counter is the 3rd / 4th whitespace-separated token of the button text.
    numPeopleVisited = _count_from_button(
        soup,
        'col-xs-4X js-submit-wrap js-been-to-top-wrap action-btn-col hidden-print',
        2)
    numPeopleWant = _count_from_button(
        soup,
        'col-xs-4X js-submit-wrap js-like-top-wrap action-btn-col hidden-print',
        3)

    # Long description: keep only text nodes whose direct parent is one of
    # the allowed inline/paragraph tags, joined with single spaces.
    description = soup.find('div', class_='DDP__body-copy')
    allowlist = ['p', 'span', 'a', 'i']
    text_elements = [
        node for node in description.find_all(string=True)
        if node.parent.name in allowlist
    ]
    placeDesc = ' '.join(text_elements).replace(u'\xa0', u' ')

    short_desc = soup.find_all('h3', {'class': 'DDPage__header-dek'})[0].contents[0]
    placeShortDesc = str(short_desc.replace(u'\xa0', u' '))

    placeNearby = [
        str(item.text)
        for item in soup.find_all('div', {'class': 'DDPageSiderailRecirc__item-title'})
    ]
    if not placeNearby:
        placeNearby = ''

    # Address: children 0, 2 and 4 of the first div inside the address block
    # (presumably the text lines between <br> tags -- TODO confirm).
    address_block = soup.find_all('address', class_='DDPageSiderail__address')[0]
    address_parts = address_block.find_all('div')[0].contents[0:5:2]
    placeAddress = " ".join(address_parts).replace('\n', '')

    # Coordinates: two whitespace-separated numbers; the first carries one
    # trailing character (presumably a comma) that is dropped before parsing.
    coordinates = soup.find_all('div', class_='DDPageSiderail__coordinates')[0]
    coordinates = coordinates.get_text().split()
    placeAlt = float(coordinates[0][:-1])
    placeLong = float(coordinates[1])

    # Editors: prefer the <li> items; otherwise fall back to the words of the
    # first contributors container, or [""] when there is none.
    editors = soup.find_all('li', {'class': 'DDPContributorsList__item'})
    if editors:
        placeEditors = [item.find('span').getText() for item in editors]
    else:
        # TODO (kept from the original): check this fallback
        listEditor = soup.find_all('div', {'class': 'DDPContributorsList'})
        placeEditors = listEditor[0].get_text().split() if listEditor else [""]

    date_time = soup.find_all('div', {'class': 'DDPContributor__name'})[0].get_text()
    placePubDate = datetime.strptime(date_time, '%B %d, %Y')

    # Related cards: one pass over the card headings, routed by the title of
    # the section found six ancestors up.
    placeRelatedPlaces = []
    placeRelatedLists = []
    for title in soup.find_all('h3', class_='Card__heading --content-card-v2-title js-title-content'):
        section = title.parent.parent.parent.parent.parent.parent
        section_title = section.find('div', class_="CardRecircSection__title").get_text()
        if section_title == 'Related Places':
            placeRelatedPlaces.append(str(title.get_text().strip()))
        if "Appears in" in section_title:
            placeRelatedLists.append(str(title.get_text().strip()))
    if not placeRelatedLists:
        placeRelatedLists.append('')

    placeURL = soup.find('link', {"rel": "canonical"})['href']

    return {'placeName': placeName,
            'placeTags': str(placeTags),
            'numPeopleVisited': numPeopleVisited,
            'numPeopleWant': numPeopleWant,
            'placeDesc': placeDesc,
            'placeShortDesc': placeShortDesc,
            'placeNearby': str(placeNearby),
            'placeAddress': placeAddress,
            'placeAlt': placeAlt,
            'placeLong': placeLong,
            'placeEditors': str(placeEditors),
            'placePubDate': placePubDate,
            'placeRelatedPlaces': str(placeRelatedPlaces),
            'placeRelatedLists': str(placeRelatedLists),
            'placeURL': placeURL}
    

Reference for the chunk-and-pool approach used below:\
https://towardsdatascience.com/make-your-own-super-pandas-using-multiproc-1c04f41944a1

In [11]:
def table_maker(pages: list, dir="downloads"):
    """Parse a batch of saved pages into one DataFrame, one row per page.

    Parameters
    ----------
    pages : list
        File names (not full paths) of the pages to parse; any iterable of
        names works.
    dir : str
        Directory that contains those files.

    Returns
    -------
    pandas.DataFrame
        One row per entry of ``pages``, with the keys of the dict returned by
        ``extract_single_place`` as columns and rows numbered 0..n-1.
        An empty DataFrame when ``pages`` is empty.
    """
    # Collect plain dicts and build the frame once. This avoids creating a
    # one-row DataFrame per page, gives every row its own index label, and
    # makes an empty batch yield an empty frame instead of the ValueError
    # that pd.concat raises on an empty list.
    records = [extract_single_place(f"{dir}/{page}") for page in pages]
    return pd.DataFrame(records)

def parallel_table(dir="downloads"):
    """Parse every saved page in ``dir`` in parallel and return one DataFrame.

    The file names are split into near-equal chunks, each chunk is handed to
    ``table_maker`` in a worker process, and the per-chunk frames are
    concatenated.

    Parameters
    ----------
    dir : str
        Directory holding the saved pages. A ``.ipynb_checkpoints`` entry,
        if present, is ignored.

    Returns
    -------
    pandas.DataFrame
        One row per page with a fresh 0..n-1 index; an empty DataFrame when
        the directory holds no pages.
    """
    # Filter rather than list.remove(): remove() raises ValueError when the
    # checkpoint folder does not exist. Sorting makes the row order reproducible.
    files = sorted(name for name in os.listdir(dir) if name != ".ipynb_checkpoints")
    if not files:
        return pd.DataFrame()

    # Never start more workers than there are files, so no chunk is empty.
    n_workers = min(cpu_count(), len(files))
    # array_split tolerates a file count that is not a multiple of the worker
    # count; np.split raises in that case.
    chunks = [chunk.tolist() for chunk in np.array_split(np.array(files), n_workers)]

    # The context manager releases the pool even if a worker raises.
    # starmap forwards `dir`, so a non-default directory is honoured by the workers.
    with Pool(n_workers) as pool:
        frames = pool.starmap(table_maker, [(chunk, dir) for chunk in chunks])
    return pd.concat(frames, ignore_index=True)
    

In [None]:
%%time
# Build the full table from every saved page in the default "downloads"
# directory; %%time reports how long the parallel run takes.
all_pages = parallel_table()

### Debugging `extract_single_place`

In [None]:
import traceback

# Run the parser over every saved page sequentially (no worker pool) so each
# failure is reported next to the file that caused it.
# Filtering instead of list.remove() avoids a ValueError when the
# .ipynb_checkpoints folder does not exist.
files = [name for name in os.listdir("downloads/") if name != ".ipynb_checkpoints"]
for file in files:
    try:
        extract_single_place(f"downloads/{file}")
    except Exception:
        # Keep going: the purpose of this cell is to list every failing page.
        print(file,"-->" ,traceback.format_exc())