# Web Crawler

In [1]:
import requests
from bs4 import BeautifulSoup as soup  # HTML data structure

# extract article hyperlinks from an index page
def extractArtLinks(url):
    r = requests.get(url, cookies={'over18':'1'})
    page_soup = soup(r.text, "html.parser")
    containers = page_soup.findAll("div", {"class": "r-ent"})
    art_links = []
    for container in containers:
        # Finds all link tags "a" from within the first div.
        art_link = container.find('a')
        if art_link:
            #print(art_link['href'])
            #print(container.find('div',{'class':'title'}).get_text())
            art_meta = container.find('div',{'class':'meta'})
            #print(art_meta.find('div',{'class':'author'}).get_text())
            #print(art_meta.find('div',{'class':'date'}).get_text())

            art_links.append({
                'push': container.find('div',{'class':'nrec'}).get_text(),
                'title': container.find('div',{'class':'title'}).get_text().strip(),
                'date': art_meta.find('div',{'class':'date'}).get_text(),
                'author': art_meta.find('div',{'class':'author'}).get_text(),
                'link': art_link['href'],
                'text': extractArtText('https://www.ptt.cc' + art_link['href'])
            })

    return(art_links)

# find the previous index page link
def findPrevIndex(url):
    r = requests.get(url, cookies={'over18':'1'})
    page_soup = soup(r.text,"html.parser")
    btn = page_soup.select('div.btn-group > a')
    up_page_href = btn[3]['href']
    next_page_url = 'https://www.ptt.cc' + up_page_href
    return(next_page_url)

# extract article contents from  the article hyperlink
def extractArtText(url):
    r = requests.get(url, cookies={'over18':'1'})
    page_soup = soup(r.text, "lxml")
    #print(page_soup.find("div",{"id":"main-content"}).get_text())
    art_text=page_soup.select('div#main-content', limit=1)[0].text
    return(art_text)

In [2]:
# main()
num_of_index_page = 2
board_name = 'Food'
url = 'https://www.ptt.cc/bbs/{}/index.html'.format(board_name)
all_links =[]
for page in range(1,num_of_index_page):
    all_links = all_links + extractArtLinks(url)
    url = findPrevIndex(url)
len(all_links)

17

In [3]:
type(all_links[2])
print(all_links[2])

{'push': '', 'title': '[食記][台北市] Toasteria Cafe 吐司利亞 敦南店', 'date': ' 9/21', 'author': 'JeremyKSKGA', 'link': '/bbs/Food/M.1600701041.A.C41.html', 'text': '作者JeremyKSKGA (Jeremy以食為天)看板Food標題[食記][台北市] Toasteria Cafe 吐司利亞 敦南店時間Mon Sep 21 23:10:33 2020\n圖文網誌：https://jeremyckt2.pixnet.net/blog/post/229693142\n\nToasteria Cafe 吐司利亞 敦南店\n用餐日期：2020.3.20、9.16\n地址：台北市大安區敦化南路一段169巷3號\n鄰近捷運站：台北捷運板南線BL16忠孝敦化站\n電話：(02) 2752-0033\n營業時間：星期一~五 11:00~隔日凌晨1:00；星期六、日 9:00~隔日凌晨1:00\n官網： https://www.toasteriacafe.com/\nFB粉專： https://www.facebook.com/toasteriacafe/\n\n\n3/20(五)晚上下班後約了好友要去哈根達斯敦南旗艦店買一送一，\n\n我們打算先就近在「Toasteria Cafe 吐司利亞」吃晚餐。\n\n店門口就種了不少盆栽，\n\n綠意盎然令人感到賞心悅目。\n\n\n店裡充滿濃濃的異國風情，\n\n布置和裝潢很走地中海風格，\n\n現場又播放著異國情調音樂。\n\n廚房是開放式的。\n\n\n2樓的座位區(9/16那天中午二訪時補拍的XD)，\n\n空間更寬敞又舒適愜意，\n\n陽台區種了許多植栽前面也有一排座位。\n\n\n\n「Toasteria Cafe 吐司利亞」主打地中海料理，\n\n最具特色的包括西班牙小菜、中東胡姆斯(Hummus)、夏卡蘇卡(Shakshuka)等，\n\n還有地中海風格義大利麵、沙拉、早午餐等，\n\n此外這家的經典帕尼尼三明治也是主打之一，\n\n其他還有甜點、咖啡、茶飲、果汁、氣泡飲、酒精飲料等。\n\n(注意：未成年請勿飲酒！)\n\n完整菜單可參見我的相簿：https://jeremyckt

In [4]:
print('Push: {push:s} \n'
      'title: {title:s} \n'
      'date: {date:s} \n'
      'author: {author:s} \n'
      'link: {link:s} \n'
      'text: {text:.5} \n'.format(**all_links[2]))

Push:  
title: [食記][台北市] Toasteria Cafe 吐司利亞 敦南店 
date:  9/21 
author: JeremyKSKGA 
link: /bbs/Food/M.1600701041.A.C41.html 
text: 作者Jer 



:::{admonition} Exercise
How to seperate post texts from push texts?
:::
