# Web Crawler

In [1]:
import requests
from bs4 import BeautifulSoup as soup  # HTML data structure

# extract article hyperlinks from an index page
def extractArtLinks(url, timeout=10):
    """Collect metadata and full text for every article linked on an index page.

    Parameters
    ----------
    url : str
        Address of a board index page.
    timeout : float or tuple, optional
        Forwarded to ``requests.get``. Without a timeout a stalled connection
        would block the crawl indefinitely. Defaults to 10 seconds.

    Returns
    -------
    list of dict
        One dict per entry that contains a link, with the keys ``push``,
        ``title``, ``date``, ``author``, ``link`` (the href exactly as found
        in the page) and ``text`` (the value returned by ``extractArtText``).

    Notes
    -----
    Every linked entry costs one extra HTTP request (made by
    ``extractArtText``), so run time grows with the number of entries.
    """
    # The 'over18' cookie presumably skips the site's age-confirmation page
    # -- NOTE(review): confirm against the site's behavior.
    r = requests.get(url, cookies={'over18': '1'}, timeout=timeout)
    page_soup = soup(r.text, "html.parser")

    art_links = []
    for container in page_soup.find_all("div", {"class": "r-ent"}):
        # Only the first anchor inside the entry is used; entries that have
        # no anchor at all are skipped.
        art_link = container.find('a')
        if art_link is None:
            continue

        art_meta = container.find('div', {'class': 'meta'})
        art_links.append({
            'push': container.find('div', {'class': 'nrec'}).get_text(),
            'title': container.find('div', {'class': 'title'}).get_text().strip(),
            'date': art_meta.find('div', {'class': 'date'}).get_text(),
            'author': art_meta.find('div', {'class': 'author'}).get_text(),
            'link': art_link['href'],
            # The href is site-relative, so prefix the host before fetching.
            'text': extractArtText('https://www.ptt.cc' + art_link['href'])
        })

    return art_links

# find the previous index page link
def findPrevIndex(url, timeout=10):
    """Return the absolute URL of the index page that precedes ``url``.

    Parameters
    ----------
    url : str
        Address of a board index page.
    timeout : float or tuple, optional
        Forwarded to ``requests.get`` so a stalled connection cannot hang the
        crawl. Defaults to 10 seconds.

    Returns
    -------
    str
        ``'https://www.ptt.cc'`` followed by the href of the fourth anchor
        found directly under a ``div.btn-group`` element.

    Raises
    ------
    IndexError
        If the page has fewer than four such anchors.
    KeyError
        If the selected anchor carries no ``href`` attribute.
    """
    r = requests.get(url, cookies={'over18': '1'}, timeout=timeout)
    page_soup = soup(r.text, "html.parser")
    nav_buttons = page_soup.select('div.btn-group > a')
    # NOTE(review): position 3 is presumably the "previous page" button; this
    # relies on the site's current button order -- verify if the layout changes.
    prev_page_href = nav_buttons[3]['href']
    prev_page_url = 'https://www.ptt.cc' + prev_page_href
    return prev_page_url

# extract article contents from the article hyperlink
def extractArtText(url, timeout=10):
    """Download one article page and return the text of its main content block.

    Parameters
    ----------
    url : str
        Absolute address of an article page.
    timeout : float or tuple, optional
        Forwarded to ``requests.get`` so a stalled connection cannot hang the
        crawl. Defaults to 10 seconds.

    Returns
    -------
    str
        All text inside the first ``div#main-content`` element, or an empty
        string when the page has no such element, so that a single
        unavailable article does not abort the whole crawl.
    """
    r = requests.get(url, cookies={'over18': '1'}, timeout=timeout)
    # Use the built-in parser, like the other helpers in this notebook; the
    # optional lxml parser needs a separate install and fails when missing.
    page_soup = soup(r.text, "html.parser")
    main_content = page_soup.select_one('div#main-content')
    if main_content is None:
        return ''
    return main_content.get_text()

In [2]:
# main()
# Number of index pages to crawl, starting from the newest one.
# One page matches the outputs recorded in this notebook.
num_of_index_page = 1
board_name = 'Food'
url = 'https://www.ptt.cc/bbs/{}/index.html'.format(board_name)
all_links = []
# 1-based page counter: the loop body runs exactly num_of_index_page times.
for page in range(1, num_of_index_page + 1):
    all_links.extend(extractArtLinks(url))
    # Step back to the preceding index page for the next round.
    url = findPrevIndex(url)
len(all_links)

22

In [3]:
# Peek at one crawled record; every entry of all_links is a dict with the
# keys push, title, date, author, link and text.
print(all_links[2])

{'push': '', 'title': '[食記] 高雄 老周冷熱飲- 60年老店/20元好吃麻糬', 'date': ' 9/24', 'author': 'vhygdih', 'link': '/bbs/Food/M.1600949101.A.5F2.html', 'text': '作者vhygdih (ATJ)看板Food標題[食記] 高雄 老周冷熱飲- 60年老店/20元好吃麻糬時間Thu Sep 24 20:04:58 2020\n   餐廳名稱：老周冷熱飲\n   消費時間：2020年/08月\n   地址：高雄市三民區三民街126號\n   電話：07 281 6780\n   營業時間：09:30–00:00\n   每人平均價位：$20-50\n   可否刷卡：否\n   有無包廂：無\n   推薦菜色：\n\n\n\n完整圖文介紹:\nhttps://vhygdih0412.pixnet.net/blog/post/352934567\n\n\n\n這家位於位於三民市場的八寶冰\n已經是60年的老店~\n在朋友推薦之下過來吃麻糬,\n沒有聽錯就是麻糬XD\n趕快來看看為什麼, 八寶冰會想推麻糬吧\n\n\n\n這附近很多停車位\n至少晚上不會太難停,\n畢竟沒有捷運站, 自己來會比較方便\n晚餐過後是吃冰熱門時間\n還蠻多人過來外帶~\n內用外帶排隊都是直接點餐付錢\n\n\n\n夏天招牌是八寶冰\n另外也會提供湯的選擇,\n不過夏天不會準備好,\n有人點才會做需要等~\n提供室內的空間可以內用\n位置還蠻多的~\n\n\n\n燒麻糬芝麻, 花生 $40/2顆\n\n先介紹老周最多人吃的麻糬燒\n有花生跟芝麻口味\n建議兩個都要點,\n一口花生一口芝麻配著吃\n\n\n這邊麻糬口感是軟綿帶著Q\n完全不會黏牙, 本身就非常香\n個人還蠻喜歡這口感,\n有點溫度的麻糬還是比較好吃\n兩顆份量還蠻多,p果有吃別的東西\n推薦可以兩個人分兩顆\n\n\n\n八寶冰 $50\n\n據說已經有漲價\n現在是50元一碗,\n相比流行的水果冰, 韓式冰品\n這個價錢還是很便宜\n而且是古早味還懷念的味道\n\n\n\n八寶冰配料有紅豆, 小湯圓, 地瓜, 鳳梨\n芋頭, 愛玉, 珍珠,等等, 都是現煮現熬\n我個人就還蠻喜歡鳳梨跟地瓜的搭配\n帶有天然的甜

In [4]:
# Field-by-field summary of one crawled record. The `.5` precision keeps only
# the first five characters of the article text.
article_summary = (
    'Push: {push:s} \n'
    'title: {title:s} \n'
    'date: {date:s} \n'
    'author: {author:s} \n'
    'link: {link:s} \n'
    'text: {text:.5} \n'
)
print(article_summary.format(**all_links[2]))

Push:  
title: [食記] 高雄 老周冷熱飲- 60年老店/20元好吃麻糬 
date:  9/24 
author: vhygdih 
link: /bbs/Food/M.1600949101.A.5F2.html 
text: 作者vhy 



:::{admonition} Exercise
How can we separate the post text from the push (comment) texts?
:::
