## Crawler class setup

In [4]:
import requests
import re
from bs4 import BeautifulSoup
from datetime import datetime

URL = "https://www.ptt.cc/bbs/Gossiping/search"
TYPE = ["新聞", "爆卦"]
KEY = ["疫情", "指揮中心", "確診", "新冠", "疫苗", "醫院", "本土"]
PAGES = 30

payload = {
'from': '/bbs/Gossiping/index.html',
'yes': 'yes'
}
news_d = {}


class PttCrawler:
    def __init__(self, url=URL, keyword=KEY):
        self.url = url
        self.keyword = keyword
        self.articleIndex = 1
        self.rs = requests.session()      
        self.passWarning()
        for t in TYPE:
            self.startSearching(f"{URL}index.html", t)


    def __str__(self):
        return f"{self.keyword}"
    
    def passWarning(self):
        self.rs.post('https://www.ptt.cc/ask/over18',data = payload)

    def startSearching(self, url, articleType):
        res = self.rs.get(url, params={'q': articleType})
        soup = BeautifulSoup(res.text, "html.parser")
        startingIndex = 1

        for i in range(1, PAGES):
            newUrl = f"{URL}?page={startingIndex}&q={articleType}"
            print(newUrl)
            self.getPageArticles(newUrl, articleType)
            startingIndex += 1
            
        print(news_d)


    def getPageArticles(self, url, newsType):
        res = self.rs.get(url)
        soup = BeautifulSoup(res.text, "html.parser")
        
        for i in soup.find_all("div", {"class": "r-ent"}):
            d = i.find("div", {"class": "date"})
            l = i.find("div", {"class": "title"}).a
            # select today's news
            
            if self.isCurDate(d.text) and l is not None:
                title = i.select('.title')[0].text.strip()
                link = l["href"]
                trending = i.select('.nrec')[0].text
                
                if self.hasKeyword(title) and self.getTrending(trending) > 20:
                    print(title)
                    self.insertData(
                        self.articleIndex, title, link, d.text.strip(), int(trending), newsType)
                    self.articleIndex += 1
    
    def insertData(self, index, title, link, date, trending, newsType):
        newData = {
            "title": title,
            "type": newsType,
            "link": f"https://www.ptt.cc/{link}",
            "date": date,
            "trending": trending
        }
        
        news_d[index] = newData
        

    def getPageIndex(self, data):
        target = data.find_all("a", {"class": "btn wide"})
        for i in target:
            if i.text == "‹ 上頁":
                target_id = i["href"].split("/")[-1].split(".")[0]
            
        target_id = re.sub(r"[a-zA-z]+", "", target_id)
                
        return target_id

    
    def isCurDate(self, d):
        curDate = datetime.now().strftime("%-m/%d")
        return curDate == d.strip(" ")
    
    def getTrending(self, t):
        if not re.match('[0-9]', t):
            return 0

        return int(t)
    
    def hasKeyword(self, t):
        if "RE" in t or "Re" in t:
            return False

        for k in self.keyword:
            if re.search(k, t):
                return True
        return False

## Run

In [5]:
test = PttCrawler()

https://www.ptt.cc/bbs/Gossiping/search?page=1&q=新聞
https://www.ptt.cc/bbs/Gossiping/search?page=2&q=新聞
[新聞] 333本土創新高！侯友宜：朝蓋方艙醫院的
https://www.ptt.cc/bbs/Gossiping/search?page=3&q=新聞
https://www.ptt.cc/bbs/Gossiping/search?page=4&q=新聞
[新聞] 馬偕醫院宣布：即起萬華、板橋居民「禁入
https://www.ptt.cc/bbs/Gossiping/search?page=5&q=新聞
[新聞] 新北市1警察確診
https://www.ptt.cc/bbs/Gossiping/search?page=6&q=新聞
[新聞] 本土疫情燒進中央部會 經濟部證實一名員
https://www.ptt.cc/bbs/Gossiping/search?page=7&q=新聞
https://www.ptt.cc/bbs/Gossiping/search?page=8&q=新聞
https://www.ptt.cc/bbs/Gossiping/search?page=9&q=新聞
https://www.ptt.cc/bbs/Gossiping/search?page=10&q=新聞
https://www.ptt.cc/bbs/Gossiping/search?page=11&q=新聞
https://www.ptt.cc/bbs/Gossiping/search?page=12&q=新聞
[新聞] 韓國在便利店出售新冠肺炎病毒自我檢測
https://www.ptt.cc/bbs/Gossiping/search?page=13&q=新聞
[新聞] 周玉蔻等名人擅自公布疫情資訊
https://www.ptt.cc/bbs/Gossiping/search?page=14&q=新聞
https://www.ptt.cc/bbs/Gossiping/search?page=15&q=新聞
https://www.ptt.cc/bbs/Gossiping/search?page=16&q=新聞
https://www.ptt.cc/bbs/Goss

## print data

In [6]:
for k, v in sorted(news_d.items(), key=lambda x: x[1]["trending"], reverse=True):
    print(v)
    print(v["link"])

{'title': '[爆卦] 本土+333 境外+2', 'type': '爆卦', 'link': 'https://www.ptt.cc//bbs/Gossiping/M.1621231490.A.A43.html', 'date': '5/17', 'trending': 77}
https://www.ptt.cc//bbs/Gossiping/M.1621231490.A.A43.html
{'title': '[新聞] 333本土創新高！侯友宜：朝蓋方艙醫院的', 'type': '新聞', 'link': 'https://www.ptt.cc//bbs/Gossiping/M.1621237071.A.B36.html', 'date': '5/17', 'trending': 68}
https://www.ptt.cc//bbs/Gossiping/M.1621237071.A.B36.html
{'title': '[新聞] 馬偕醫院宣布：即起萬華、板橋居民「禁入', 'type': '新聞', 'link': 'https://www.ptt.cc//bbs/Gossiping/M.1621230910.A.C35.html', 'date': '5/17', 'trending': 45}
https://www.ptt.cc//bbs/Gossiping/M.1621230910.A.C35.html
{'title': '[新聞] 周玉蔻等名人擅自公布疫情資訊', 'type': '新聞', 'link': 'https://www.ptt.cc//bbs/Gossiping/M.1621187019.A.E8F.html', 'date': '5/17', 'trending': 31}
https://www.ptt.cc//bbs/Gossiping/M.1621187019.A.E8F.html
{'title': '[新聞] 新北市1警察確診', 'type': '新聞', 'link': 'https://www.ptt.cc//bbs/Gossiping/M.1621228477.A.87F.html', 'date': '5/17', 'trending': 26}
https://www.ptt.cc//bbs/Go