# Web Crawler

In [1]:
import requests
from bs4 import BeautifulSoup as soup  # HTML data structure

# extract article hyperlinks from an index page
def extractArtLinks(url):
    r = requests.get(url, cookies={'over18':'1'})
    page_soup = soup(r.text, "html.parser")
    containers = page_soup.findAll("div", {"class": "r-ent"})
    art_links = []
    for container in containers:
        # Finds all link tags "a" from within the first div.
        art_link = container.find('a')
        if art_link:
            #print(art_link['href'])
            #print(container.find('div',{'class':'title'}).get_text())
            art_meta = container.find('div',{'class':'meta'})
            #print(art_meta.find('div',{'class':'author'}).get_text())
            #print(art_meta.find('div',{'class':'date'}).get_text())

            art_links.append({
                'push': container.find('div',{'class':'nrec'}).get_text(),
                'title': container.find('div',{'class':'title'}).get_text().strip(),
                'date': art_meta.find('div',{'class':'date'}).get_text(),
                'author': art_meta.find('div',{'class':'author'}).get_text(),
                'link': art_link['href'],
                'text': extractArtText('https://www.ptt.cc' + art_link['href'])
            })

    return(art_links)

# find the previous index page link
def findPrevIndex(url):
    r = requests.get(url, cookies={'over18':'1'})
    page_soup = soup(r.text,"html.parser")
    btn = page_soup.select('div.btn-group > a')
    up_page_href = btn[3]['href']
    next_page_url = 'https://www.ptt.cc' + up_page_href
    return(next_page_url)

# extract article contents from  the article hyperlink
def extractArtText(url):
    r = requests.get(url, cookies={'over18':'1'})
    page_soup = soup(r.text, "lxml")
    #print(page_soup.find("div",{"id":"main-content"}).get_text())
    art_text=page_soup.select('div#main-content', limit=1)[0].text
    return(art_text)

In [2]:
# main()
num_of_index_page = 2
board_name = 'Food'
url = 'https://www.ptt.cc/bbs/{}/index.html'.format(board_name)
all_links =[]
for page in range(1,num_of_index_page):
    all_links = all_links + extractArtLinks(url)
    url = findPrevIndex(url)
len(all_links)

22

In [3]:
type(all_links[2])
print(all_links[2])

{'push': '', 'title': '[食記] 桃園中原大學早餐推薦 活采快樂廚房', 'date': '10/03', 'author': 'dong1104', 'link': '/bbs/Food/M.1601698289.A.D1E.html', 'text': '作者dong1104 (東蛙)看板Food標題[食記] 桃園中原大學早餐推薦 活采快樂廚房時間Sat Oct  3 12:11:26 2020\n   餐廳名稱：活采快樂廚房\n   消費時間：2020.09\n   地址：桃園市中壢區大仁二街8號\n   電話：03-438-1070\n   營業時間：07:00~15:00(星期天公休)\n   每人平均價位：60元\n   可否刷卡：否\n   有無包廂：無\n   推薦菜色：黃金牛肉捲餅\n\n\n好讀圖文版：https://dong1104.pixnet.net/blog/post/42692972\n\n\n這間位於中原大學商圈大仁二街上的活采快樂廚房，是我們相當喜歡的中原大學早餐店之一\n\n活采快樂廚房的價格非常實惠，套餐組合的價格才60~65元，真的是CP值超高\n\n推薦活采快樂廚房的黃金牛肉捲餅以及勁辣雞腿米漢堡，捲餅會煎到酥酥脆脆的，裡面還有牛肉和起司超好吃\n\n米漢堡也會煎到像是鍋巴一樣脆脆的，大口咬下超滿足，各位朋友如果想找中原早餐，來這裡就對了\n\n活采快樂廚房的位置就在中原大仁二街上，來這裡吃早餐騎車會比較方便\n\n門口廚房上面的套餐組合以及單價菜單價目表\n\n店內環境\n\n門口處有滿滿的漫畫，喜歡吃早餐配漫畫的朋友可以來\n\n活采快樂廚房的套餐組合有10種，通通才60元~65元，還附中杯奶茶或濃湯乙杯\n\n早餐單點的價格也很便宜，豬肉堡25元、煎蛋吐司20元，真的是銅板價\n\n套餐的中杯奶茶要自取\n\n黃金牛肉捲餅(60元)\n\n一共有四大塊，份量十足，而且外皮煎到恰恰的，看起來就超美味\n\n裡面捲的是牛肉片、起司和蛋，超厚的斷面讓人口水直流\n\n勁辣雞腿米漢堡(65元)\n\n第一次吃到米漢堡是如此酥脆，就像是吃鍋巴一樣脆脆的超好吃\n\n大口咬下還有又香又辣的雞腿排，真的是超過癮的啦！\n\n-----\n\n--\n※ 發信站: 批踢踢實業坊(ptt.cc), 來自: 60.2

In [4]:
print('Push: {push:s} \n'
      'title: {title:s} \n'
      'date: {date:s} \n'
      'author: {author:s} \n'
      'link: {link:s} \n'
      'text: {text:.5} \n'.format(**all_links[2]))

Push:  
title: [食記] 桃園中原大學早餐推薦 活采快樂廚房 
date: 10/03 
author: dong1104 
link: /bbs/Food/M.1601698289.A.D1E.html 
text: 作者don 



:::{admonition} Exercise
How to seperate post texts from push texts?
:::
