# Web Crawler

In [1]:
import requests
from bs4 import BeautifulSoup as soup  # HTML data structure

# extract article hyperlinks from an index page
def extractArtLinks(url):
    r = requests.get(url, cookies={'over18':'1'})
    page_soup = soup(r.text, "html.parser")
    containers = page_soup.findAll("div", {"class": "r-ent"})
    art_links = []
    for container in containers:
        # Finds all link tags "a" from within the first div.
        art_link = container.find('a')
        if art_link:
            #print(art_link['href'])
            #print(container.find('div',{'class':'title'}).get_text())
            art_meta = container.find('div',{'class':'meta'})
            #print(art_meta.find('div',{'class':'author'}).get_text())
            #print(art_meta.find('div',{'class':'date'}).get_text())

            art_links.append({
                'push': container.find('div',{'class':'nrec'}).get_text(),
                'title': container.find('div',{'class':'title'}).get_text().strip(),
                'date': art_meta.find('div',{'class':'date'}).get_text(),
                'author': art_meta.find('div',{'class':'author'}).get_text(),
                'link': art_link['href'],
                'text': extractArtText('https://www.ptt.cc' + art_link['href'])
            })

    return(art_links)

# find the previous index page link
def findPrevIndex(url):
    r = requests.get(url, cookies={'over18':'1'})
    page_soup = soup(r.text,"html.parser")
    btn = page_soup.select('div.btn-group > a')
    up_page_href = btn[3]['href']
    next_page_url = 'https://www.ptt.cc' + up_page_href
    return(next_page_url)

# extract article contents from  the article hyperlink
def extractArtText(url):
    r = requests.get(url, cookies={'over18':'1'})
    page_soup = soup(r.text, "lxml")
    #print(page_soup.find("div",{"id":"main-content"}).get_text())
    art_text=page_soup.select('div#main-content', limit=1)[0].text
    return(art_text)

In [2]:
# main()
num_of_index_page = 2
board_name = 'Food'
url = 'https://www.ptt.cc/bbs/{}/index.html'.format(board_name)
all_links =[]
for page in range(1,num_of_index_page):
    all_links = all_links + extractArtLinks(url)
    url = findPrevIndex(url)
len(all_links)

21

In [3]:
type(all_links[2])
print(all_links[2])

{'push': '', 'title': '[食記] 屏東琉球—小琉球脆皮蛋餅｜咔滋作響脆口蛋餅', 'date': '10/05', 'author': 'shinyban', 'link': '/bbs/Food/M.1601877418.A.D46.html', 'text': '作者shinyban (甩尼班)看板Food標題[食記] 屏東琉球—小琉球脆皮蛋餅｜咔滋作響脆口蛋餅時間Mon Oct  5 13:56:49 2020\n   餐廳名稱：小琉球脆皮蛋餅\n   消費時間：2020/9\n   地址：屏東縣琉球鄉中正路186號\n   電話：0981-621-756\n   營業時間：週一至週五06:00-10:30；週六週日05:30-10:30\n   每人平均價位：100\n   可否刷卡：否\n   有無包廂：無\n\n[*BLOG：https://shinyban.pixnet.net/blog/post/531454885 ]\n\n\n小琉球是臺灣唯一的珊瑚島，天然奇景甚美、海底生態資源豐富，是著名的旅遊、潛水景\n點。\n\n由於今年疫情影響無法出國，所以目標鎖定探索台灣之美，除了宜蘭、高雄、花蓮之外，\n也首次踏上了小琉球這塊美麗的島嶼。\n\n從屏東東港搭船約三十分鐘即可抵達，環島一圈僅二十公里不到，騎機車不用一小時即可\n跑透透，而與海龜同遊也是一大亮點，當然也是這次的重頭戲。\n\n除了玩樂之外，吃的部分也不能馬虎，由於小琉球面積不大，餐廳店家還算好選擇，不過\n旅遊旺季每間幾乎都是人潮滿滿，包含早餐店也是。\n\n--\n\n\n「小琉球脆皮蛋餅」位於小琉球中間位置，不論住哪到這都是挺方便的。\n\n室內室外皆有座位，但因為人潮太多就沒有多加拍攝，而店員則是不停的翻動平底鍋，才\n能應付絡繹不絕的訂單。\n\n不過或許是因為客人多，所以店家服務態度不算是挺好，甚至有點不耐煩，上桌速度不快\n且有漏單狀況，建議事先打電話預訂較保險些。\n\n--\n\n▲奶茶（小） $20\n\n▲薯條 $25\n\n▲雞塊 $30\n\n奶茶跟炸物表現不優，奶茶淡、薯條份量少、雞塊炸得不夠酥脆，來這還是乖乖吃招牌脆\n皮蛋餅就好。\n\n▲脆皮鮪魚蛋餅 $45\n\n小琉球早餐店販售的蛋餅多為粉漿蛋餅，但我更偏好脆皮蛋餅多些。\n\n「小

In [4]:
print('Push: {push:s} \n'
      'title: {title:s} \n'
      'date: {date:s} \n'
      'author: {author:s} \n'
      'link: {link:s} \n'
      'text: {text:.5} \n'.format(**all_links[2]))

Push:  
title: [食記] 屏東琉球—小琉球脆皮蛋餅｜咔滋作響脆口蛋餅 
date: 10/05 
author: shinyban 
link: /bbs/Food/M.1601877418.A.D46.html 
text: 作者shi 



:::{admonition} Exercise
How to seperate post texts from push texts?
:::
