# Web Crawler

In [1]:
import requests
from bs4 import BeautifulSoup as soup  # HTML data structure

# extract article hyperlinks from an index page
def extractArtLinks(url):
    r = requests.get(url, cookies={'over18':'1'})
    page_soup = soup(r.text, "html.parser")
    containers = page_soup.findAll("div", {"class": "r-ent"})
    art_links = []
    for container in containers:
        # Finds all link tags "a" from within the first div.
        art_link = container.find('a')
        if art_link:
            #print(art_link['href'])
            #print(container.find('div',{'class':'title'}).get_text())
            art_meta = container.find('div',{'class':'meta'})
            #print(art_meta.find('div',{'class':'author'}).get_text())
            #print(art_meta.find('div',{'class':'date'}).get_text())

            art_links.append({
                'push': container.find('div',{'class':'nrec'}).get_text(),
                'title': container.find('div',{'class':'title'}).get_text().strip(),
                'date': art_meta.find('div',{'class':'date'}).get_text(),
                'author': art_meta.find('div',{'class':'author'}).get_text(),
                'link': art_link['href'],
                'text': extractArtText('https://www.ptt.cc' + art_link['href'])
            })

    return(art_links)

# find the previous index page link
def findPrevIndex(url):
    r = requests.get(url, cookies={'over18':'1'})
    page_soup = soup(r.text,"html.parser")
    btn = page_soup.select('div.btn-group > a')
    up_page_href = btn[3]['href']
    next_page_url = 'https://www.ptt.cc' + up_page_href
    return(next_page_url)

# extract article contents from  the article hyperlink
def extractArtText(url):
    r = requests.get(url, cookies={'over18':'1'})
    page_soup = soup(r.text, "lxml")
    #print(page_soup.find("div",{"id":"main-content"}).get_text())
    art_text=page_soup.select('div#main-content', limit=1)[0].text
    return(art_text)

In [2]:
# main()
num_of_index_page = 2
board_name = 'Food'
url = 'https://www.ptt.cc/bbs/{}/index.html'.format(board_name)
all_links =[]
for page in range(1,num_of_index_page):
    all_links = all_links + extractArtLinks(url)
    url = findPrevIndex(url)
len(all_links)

15

In [3]:
type(all_links[2])
print(all_links[2])

{'push': '2', 'title': '[廣宣] 台北 Leone Restaurant&Bar~台味地中海', 'date': '10/01', 'author': 'clairegarden', 'link': '/bbs/Food/M.1601558242.A.828.html', 'text': '作者clairegarden (簡單的幸福)看板Food標題[廣宣] 台北 Leone Restaurant&Bar~台味地中海時間Thu Oct  1 21:17:16 2020\n\n     店名：Leone Restaurant & Bar\n\n     地址：104台北市中山區林森北路353巷20號\n     電話：02 2581 3109\n     試吃日期：2020.09\n\n網誌圖文版\nhttps://clairegarden.pixnet.net/blog/post/539689720\n\n很難想像在林森北路的巷弄裡\n\n竟然藏了一棟這樣藍白異國風的建築物\n\n牆上畫著聖托里尼的風景讓人彷彿來到了希臘\n\nLeone是希臘文的獅子\n\n有別於其他店家的logo店家是半隻獅子的臉\n\n反而更讓人映象深刻呢\n\n小黑板上的台式地中海讓人好讓人好奇是甚麼樣的料理喔\n\n\n\n走進店內馬上就注意到一整排的調酒吧\n\n感覺店家的調酒非常厲害呢\n\n大包廂的座位區\n\n還有靠牆的座位區\n\n\n\n先上桌的迎賓飲料\n\n是用非常華麗的樹葉藤蔓裝著的桂花梅子釀\n\n喝起來有淡淡桂花香氣和梅子釀的愛玉\n\n非常清爽又開胃\n\n\n\n店家的餐點主要以台灣的各地特色食材為基準\n\n再搭配地中海的清爽式料理調味\n\n激盪出中西碰撞的料理花火\n\n\n\n首先上桌的是流沙芝心芋棗球\n\n外圍是一層厚厚的大甲芋頭泥\n\n鹹蛋黃醬加蛋黃醬輕輕一壓就要爆餡\n\n口感外酥內軟外\n\n還吃的到小顆芋頭塊喔\n\n\n\n奶香焦糖海鮮義大利麵\n\n義大利麵用高湯燉煮的很軟爛\n\n並沒有義大利麵條吃起來偏硬的口感\n\n奶香味非常濃郁吃起來讓人一口接著一口\n\n裡面有花枝、蝦子、淡菜、蛤蠣\n\n淡菜是用烤的吃起來有點香烤風味\n\n蝦子和花枝非常Q彈好吃\n\n蛤蠣則是鮮美多汁\n\n來自基隆的新鮮海產可是個

In [4]:
print('Push: {push:s} \n'
      'title: {title:s} \n'
      'date: {date:s} \n'
      'author: {author:s} \n'
      'link: {link:s} \n'
      'text: {text:.5} \n'.format(**all_links[2]))

Push: 2 
title: [廣宣] 台北 Leone Restaurant&Bar~台味地中海 
date: 10/01 
author: clairegarden 
link: /bbs/Food/M.1601558242.A.828.html 
text: 作者cla 



:::{admonition} Exercise
How to seperate post texts from push texts?
:::
