# Web Crawler

In [1]:
import requests
from bs4 import BeautifulSoup as soup  # HTML data structure

# extract article hyperlinks from an index page
def extractArtLinks(url):
    r = requests.get(url, cookies={'over18':'1'})
    page_soup = soup(r.text, "html.parser")
    containers = page_soup.findAll("div", {"class": "r-ent"})
    art_links = []
    for container in containers:
        # Finds all link tags "a" from within the first div.
        art_link = container.find('a')
        if art_link:
            #print(art_link['href'])
            #print(container.find('div',{'class':'title'}).get_text())
            art_meta = container.find('div',{'class':'meta'})
            #print(art_meta.find('div',{'class':'author'}).get_text())
            #print(art_meta.find('div',{'class':'date'}).get_text())

            art_links.append({
                'push': container.find('div',{'class':'nrec'}).get_text(),
                'title': container.find('div',{'class':'title'}).get_text().strip(),
                'date': art_meta.find('div',{'class':'date'}).get_text(),
                'author': art_meta.find('div',{'class':'author'}).get_text(),
                'link': art_link['href'],
                'text': extractArtText('https://www.ptt.cc' + art_link['href'])
            })

    return(art_links)

# find the previous index page link
def findPrevIndex(url):
    r = requests.get(url, cookies={'over18':'1'})
    page_soup = soup(r.text,"html.parser")
    btn = page_soup.select('div.btn-group > a')
    up_page_href = btn[3]['href']
    next_page_url = 'https://www.ptt.cc' + up_page_href
    return(next_page_url)

# extract article contents from  the article hyperlink
def extractArtText(url):
    r = requests.get(url, cookies={'over18':'1'})
    page_soup = soup(r.text, "lxml")
    #print(page_soup.find("div",{"id":"main-content"}).get_text())
    art_text=page_soup.select('div#main-content', limit=1)[0].text
    return(art_text)

In [2]:
# main()
num_of_index_page = 2
board_name = 'Food'
url = 'https://www.ptt.cc/bbs/{}/index.html'.format(board_name)
all_links =[]
for page in range(1,num_of_index_page):
    all_links = all_links + extractArtLinks(url)
    url = findPrevIndex(url)
len(all_links)

12

In [3]:
type(all_links[2])
print(all_links[2])

{'push': '', 'title': '[食記] 台北車站商業午餐-【樂雅樂】', 'date': '10/10', 'author': 'hellolego', 'link': '/bbs/Food/M.1602332970.A.C66.html', 'text': '作者hellolego (hellolego)看板Food標題[食記] 台北車站商業午餐-【樂雅樂】 時間Sat Oct 10 20:29:28 2020\n店家資料：樂雅樂(站前店)\n用餐時間：2020年6月\n營業時間：週一至週四07:00-21:30/ 週五至週日07:00-22:00\n電話：02 2371 3128\n付現、刷卡、一成服務費\n地址：台北市中正區許昌街19號一樓(捷運台北車站8號出口)\n醬廖食被秀：https://reurl.cc/ygDa2y\n---------------\n以前高中在北車補習的時候就看過這間樂雅樂，\n\n就在YMCA旁邊，\n\n而且全台很多分店，\n\n但直到現在都3X歲了，才來第一次造訪，\n\n\n\n\n\n我們臨時起意，所以沒有預約，\n\n現場倒是不少組客人，\n\n有些是預定席，當我們開始用餐時，人都陸陸續續到，\n\n\n\n\n\n有非常多的菜單可以選，\n\n我們光選菜就選了5分鐘，\n\n類別很多，包含漢堡排、蛋包飯、焗烤、義大利麵、排餐、咖哩飯、御膳等，\n\n我們當天是平日，所以有商業午餐，\n\n我選了蒜味雞排香酥魚餐(310元)，另外還附湯品、麵包(或白飯)、百元飲品，\n\n\n\n\n\n朋友點的是海鮮青醬義大利麵套餐(310元)，\n\n另外也有附湯品跟百元飲品(可是沒麵包)\n\n\n\n另外還有很多品項，\n\n詳細菜單就請大家到樂雅樂官網看個仔細吧\n\n\n\n坐下後不會等太久餐點就陸續登場，\n\n首先是當日例湯：奶油濃湯\n\n這款真的有濃~算是濃稠型的湯品，\n\n味道不錯、留著等等可以沾麵包(聰明)\n\n\n\n\n\n再來上桌的是剛烤好的麵包!\n\n香香軟軟，還附上有鹽奶油根本絕配，\n\n當然也可以沾上面的濃湯，口感也完全不同，\n\n\n\n\n\n我的主餐蒜味雞排香酥魚!\n\n\n\n\n\n除了有生菜沙拉加百香果醬之外，\n\n還有煎的嫩雞腿佐炸蒜片，\n\n\n\n\n

In [4]:
print('Push: {push:s} \n'
      'title: {title:s} \n'
      'date: {date:s} \n'
      'author: {author:s} \n'
      'link: {link:s} \n'
      'text: {text:.5} \n'.format(**all_links[2]))

Push:  
title: [食記] 台北車站商業午餐-【樂雅樂】 
date: 10/10 
author: hellolego 
link: /bbs/Food/M.1602332970.A.C66.html 
text: 作者hel 



:::{admonition} Exercise
How to seperate post texts from push texts?
:::
