# Web Crawler

In [1]:
import requests
from bs4 import BeautifulSoup as soup  # HTML data structure

# extract article hyperlinks from an index page
def extractArtLinks(url):
    r = requests.get(url, cookies={'over18':'1'})
    page_soup = soup(r.text, "html.parser")
    containers = page_soup.findAll("div", {"class": "r-ent"})
    art_links = []
    for container in containers:
        # Finds all link tags "a" from within the first div.
        art_link = container.find('a')
        if art_link:
            #print(art_link['href'])
            #print(container.find('div',{'class':'title'}).get_text())
            art_meta = container.find('div',{'class':'meta'})
            #print(art_meta.find('div',{'class':'author'}).get_text())
            #print(art_meta.find('div',{'class':'date'}).get_text())

            art_links.append({
                'push': container.find('div',{'class':'nrec'}).get_text(),
                'title': container.find('div',{'class':'title'}).get_text().strip(),
                'date': art_meta.find('div',{'class':'date'}).get_text(),
                'author': art_meta.find('div',{'class':'author'}).get_text(),
                'link': art_link['href'],
                'text': extractArtText('https://www.ptt.cc' + art_link['href'])
            })

    return(art_links)

# find the previous index page link
def findPrevIndex(url):
    r = requests.get(url, cookies={'over18':'1'})
    page_soup = soup(r.text,"html.parser")
    btn = page_soup.select('div.btn-group > a')
    up_page_href = btn[3]['href']
    next_page_url = 'https://www.ptt.cc' + up_page_href
    return(next_page_url)

# extract article contents from  the article hyperlink
def extractArtText(url):
    r = requests.get(url, cookies={'over18':'1'})
    page_soup = soup(r.text, "lxml")
    #print(page_soup.find("div",{"id":"main-content"}).get_text())
    art_text=page_soup.select('div#main-content', limit=1)[0].text
    return(art_text)

In [2]:
# main()
num_of_index_page = 2
board_name = 'Food'
url = 'https://www.ptt.cc/bbs/{}/index.html'.format(board_name)
all_links =[]
for page in range(1,num_of_index_page):
    all_links = all_links + extractArtLinks(url)
    url = findPrevIndex(url)
len(all_links)

24

In [3]:
type(all_links[2])
print(all_links[2])

{'push': '1', 'title': '[廣宣] 台中西屯 越好吃越南料理黎明店', 'date': ' 9/25', 'author': 'matchstick', 'link': '/bbs/Food/M.1601003051.A.E08.html', 'text': '作者matchstick (ccc)看板Food標題[廣宣] 台中西屯 越好吃越南料理黎明店時間Fri Sep 25 11:04:09 2020\n\n     店名：越好吃越南料理 黎明店\n\n     地址：台中市西屯區黎明路二段898號B室\n     電話：(04)2259-1186\n     試吃日期：2020/9\n\n     食記：https://mercury0314.pixnet.net/blog/post/523900357\n\n\n大里超人氣平價越南料理\n越好吃插旗台中西屯七期！\n專賣越式手作餐盒，\n必吃爆料大份量越南法國麵包，\n七期商辦大樓附近美食，\n配合Ubereats、foodpanda外送。\n\n\n越好吃越南料理黎明店\n位於西屯區黎明路二段，\n鄰近市政七期商辦、百貨公司，\n網友喻為「越式網美店」的越好吃，\n二代店延續清新風格，\n讓越南小吃也能很文青。\n\n\n\n木質調的門面裝潢，\n有著木頭暖暖的溫度，\n前方放有兩張長椅，\n餐點以外帶和外送為主，\n沒有提供內用座位～\n\n\n\n雖然空間不大，\n但佈置溫馨有特色，\n彩色菜單設計得像本美食雜誌，\n附上照片可供點餐參考。\n\n\n越好吃越南料理黎明店菜單\n越式米線、越南法國麵包、\n炸物小品、涼拌小食、\n越式飲品、氣泡飲和套餐，\n可勾選去除不愛吃的配料，\n店員也會口頭詢問是否吃香菜。\n\n\n\n網美風外帶越南美食，\n顏值和美味兼具，\n餐盒設計超有質感，\n內容物更是澎湃豐盛。\n\n\n\n朋友來家裡聚餐，開盒即吃，\n不需要更換自家碗盤，\n擺盤就已經很美了～\n也很適合作為會議便當，\n近期會推出團訂方案，可來電詢問。\n\n\n\n經典的越南法國麵包，\n大顆份量餡料多，\n有越南火腿(原味)、越式炸排骨、\n越式醬燒牛、綠咖哩雞，\n多達四種口味選擇！\n\n越南火腿麵包 90元\n法國麵包烤到外層酥脆，\n口感紮實有嚼勁，料多爆餡，\n

In [4]:
print('Push: {push:s} \n'
      'title: {title:s} \n'
      'date: {date:s} \n'
      'author: {author:s} \n'
      'link: {link:s} \n'
      'text: {text:.5} \n'.format(**all_links[2]))

Push: 1 
title: [廣宣] 台中西屯 越好吃越南料理黎明店 
date:  9/25 
author: matchstick 
link: /bbs/Food/M.1601003051.A.E08.html 
text: 作者mat 



:::{admonition} Exercise
How to seperate post texts from push texts?
:::
