In [1]:
# 目標: 
# 1. 爬巴哈姆特的遊戲排行資料
# 2. 找出黑沙討論版的討論概況
# source: https://pala.tw/python-web-crawler/

import time
import requests
import re
from bs4 import BeautifulSoup
from urllib.request import urlopen, Request
import random
import json    
import pandas as pd
import numpy as np
import datetime as dt

# Google Sheets API
import gspread
from oauth2client.service_account import ServiceAccountCredentials

In [2]:
# One independent accumulator list per scraped field; the scraping cell
# appends to these.
(title_list, gp_list, reply_list, click_list,
 lastReply_list, author_list, posttime_list, url_list) = ([] for _ in range(8))

In [3]:
# Collect, for every thread on the board listing: title, GP, reply count,
# click count, last-reply time, author, post time and thread URL.
# Results are appended to the module-level *_list accumulators.

# Regexes matched against the class attribute of the tags we look for.
pattern1 = re.compile("^b-list__main")
pattern2 = re.compile(r"^b-list__summary__gp b-gp")

# Scrape the first five listing pages: range(5) -> page=1..5.
for page in range(5):
    # Download and parse one listing page.
    url = "https://forum.gamer.com.tw/B.php?page={}&bsn=19017".format(page+1)
    req = Request(url,headers={'User-Agent': 'Mozilla/5.0'})
    html = urlopen(req).read().decode("utf-8")
    soup = BeautifulSoup(html, features='lxml')

    # Titles. When a thread has no articles left its title cell holds a
    # <span> instead of an <a>, so the two cases are read separately.
    for title_cell in soup.find_all("td",{"class":pattern1}):
        if title_cell.find_all("a") != []:
            title_list.append(title_cell.find("a").contents[0])
        else:
            title_list.append(title_cell.find("span").contents[0])

    # Thread URL and post time, the latter read from the thread's own page.
    # Per the original author's note, threads titled "本討論串已無文章" are
    # not covered here; placeholders for them are added after the page loop.
    for summary_cell in soup.find_all("td",{"class":"b-list__summary"}):
        # The thread id is the text of the first <a> tag's markup with its
        # first 9 and last 6 characters stripped.
        title_id = str(summary_cell.find("a"))[9:-6]
        url2 = "https://forum.gamer.com.tw/C.php?bsn=19017&snA=" + title_id
        url_list.append(url2)

        req2 = Request(url2,headers={'User-Agent': 'Mozilla/5.0'})
        html2 = urlopen(req2).read().decode("utf-8")
        soup2 = BeautifulSoup(html2, features='lxml')
        post_tag = soup2.find_all("div",{"class":"c-post__header__info"},limit=1)

        # Append exactly one post time per URL so both lists stay in step.
        # "NA" is used when the header block or its data-mtime value is
        # missing; it later becomes NaT via to_datetime(errors="coerce").
        posttime = "NA"
        if post_tag:
            temp = str(post_tag[0]).replace("data-mtime=","/").replace(" href="," /")
            parts = temp.split('" /"')
            if len(parts) > 1:
                posttime = parts[1]
        posttime_list.append(posttime)

    # GP. The index arithmetic was derived by the original author from the
    # page's actual <td> layout: after the first five cells, every fourth
    # <td> (index 5, 9, 13, ...) is the one inspected for a GP badge.
    td_tag = soup.find_all( "td")
    count = 1
    for idx, tag in enumerate(td_tag,0):
        if idx > 4 and idx - 4 * count == 1:
            gp_spans = tag.find_all("span", {"class":pattern2})
            # Use the badge value when present, otherwise 0.
            if gp_spans != []:
                gp_list.append(int(gp_spans[0].contents[0]))
            else:
                gp_list.append(0)
            count = count + 1

    # Reply and click counts: strip the surrounding <span>...</span> markup
    # from the 2nd and 4th child nodes by slicing.
    for counts in soup.find_all("p",{"class":"b-list__count__number"}):
        reply_list.append(str(counts.contents[1])[6:-7])
        click_list.append(str(counts.contents[3])[6:-7])

    # Last reply time: text of the <a> inside the edit-time paragraph.
    for time_cell in soup.find_all("p",{"class":"b-list__time__edittime"}):
        lastReply_list.append(time_cell.find("a").contents[0])

    # Thread author.
    for author_cell in soup.find_all("p",{"class":"b-list__count__user"}):
        author_list.append(author_cell.find("a").contents[0])

# Insert "NA" placeholders at the positions of threads titled
# "本討論串已無文章" so posttime_list and url_list line up with title_list.
# A list is padded only while it is still shorter than title_list, so
# entries that already exist are never duplicated.
for idx, title in enumerate(title_list):
    if title =="本討論串已無文章":
        if len(posttime_list) < len(title_list):
            posttime_list.insert(idx,"NA")
        if len(url_list) < len(title_list):
            url_list.insert(idx,"NA")

In [4]:
# Sanity check: every scraped list should have the same length.
# Original author's note: a 100-page run once produced unequal lengths and
# the cause was not identified.
print(
    *map(len, (title_list, gp_list, reply_list, click_list,
               lastReply_list, author_list, posttime_list)),
    sep="\n",
)


150
150
150
150
150
150
150


In [5]:
# Build the DataFrame from the scraped lists.

# Display floats with two decimals. The fully qualified option name is used
# because the bare pattern "precision" matches more than one option in
# recent pandas versions and set_option then raises OptionError.
pd.set_option('display.precision', 2)

# Column order matters: the upload step later reads columns by position.
df = pd.DataFrame({
    "title": title_list,
    "author": author_list,
    "posttime": posttime_list,
    "last_reply": lastReply_list,
    "click": click_list,
    "reply": reply_list,
    "gp": gp_list,
})

# click / reply were scraped as text; convert them to integers.
df = df.astype({"click": "int64", "reply": "int64"})

# Helper used to derive the "type" column from a thread title.
def type_list(word):
    """Return the two-character category tag of a thread title.

    Titles that start with "【" yield the two characters following the
    bracket (fewer if the title is shorter). Any other title, including an
    empty or missing one, yields "NA".

    Parameters
    ----------
    word : str-like or None
        The thread title.

    Returns
    -------
    str
        The category tag, or "NA" when the title has no leading "【".
    """
    # Normalise to a plain str so that empty or missing titles cannot raise.
    text = "" if word is None else str(word)
    if not text.startswith("【"):
        return "NA"
    return text[1:3]
    
# Category tag taken from the title.
df["type"] = df.title.apply(type_list)


# Read the clock once so that today / yesterday and both age columns are
# all computed from the same instant.
now = dt.datetime.now()

# Replace the relative markers "今日" / "昨日" in last_reply with the
# corresponding month/day.
today = now.strftime("%m/%d")
yesterday = (now - dt.timedelta(days=1)).strftime('%m/%d')
df["last_reply"] = df["last_reply"].replace("今日" , today, regex=True).replace("昨日" , yesterday, regex=True)

# Parse the post time; values that do not match the format (e.g. the "NA"
# placeholder) become NaT.
df["posttime"] = pd.to_datetime(df["posttime"],format = "%Y-%m-%d %H:%M:%S",errors= "coerce")

# Age of the post as a timedelta (NaT where posttime is NaT).
df["times_AF_post"] = now - df["posttime"]
# The same age as a float number of days (NaN where posttime is NaT).
df["days_AF_post"] = df["times_AF_post"] / dt.timedelta(days=1)



# Importance indicators: average replies per day and average clicks per day.
# A thread younger than one day is treated as exactly one day old.
def avg_reply(df):
    """Return the replies per day for one row, rounded to two decimals.

    `df` is a single row (any mapping with "reply" and "days_AF_post").
    When the row is less than one day old the raw reply count is used.
    """
    age_in_days = df["days_AF_post"]
    replies = df["reply"]
    per_day = replies if age_in_days < 1 else replies / age_in_days
    return round(per_day, 2)
    
# Row-wise: average replies per day for every thread.
df["avg_reply"] = df.apply(avg_reply, axis="columns")


def avg_click(df):
    """Return the clicks per day for one row, rounded to two decimals.

    `df` is a single row (any mapping with "click" and "days_AF_post").
    When the row is less than one day old the raw click count is used.
    """
    age_in_days = df["days_AF_post"]
    clicks = df["click"]
    per_day = clicks if age_in_days < 1 else clicks / age_in_days
    return round(per_day, 2)
    
# Row-wise: average clicks per day for every thread.
df["avg_click"] = df.apply(avg_click, axis="columns")

# Attach the thread URLs collected while scraping.
df["url"] = url_list

# Drop the rows whose title is "本討論串已無文章".
df = df.loc[~df["title"].eq("本討論串已無文章")]



In [6]:
# Export for Brian: threads posted within the last two days, so nobody has
# to browse the board manually to find today's posts.
# Rows with an unknown post time (NaT) do not pass the comparison.
temp_df = df.loc[df["times_AF_post"].lt(dt.timedelta(days=2))]

# Most-clicked first; reset_index() keeps the old index as column 0.
output = temp_df.sort_values(by="click", ascending=False).reset_index()

In [8]:
# Use the service-account credentials to create a client for the Google
# Drive / Sheets API.
scope = ['https://spreadsheets.google.com/feeds',
         'https://www.googleapis.com/auth/drive']
creds = ServiceAccountCredentials.from_json_keyfile_name('client_id.json', scope)
client = gspread.authorize(creds)

# Open the workbook by name and take its first sheet.
# Make sure the workbook name is correct.
sheet = client.open("daily_article").sheet1

# Stamp the run time into cell B1.
t = time.strftime("%Y-%m-%d-%H-%M",time.localtime())
sheet.update_cell(1,2,t)

# Write the scraped rows into columns A..M, starting at sheet row 3.
# The whole block is fetched and written back with one request each instead
# of one write request per row.
# NOTE(review): rows left over from a previous, longer run are not cleared
# (the original did not clear them either) - confirm whether that is wanted.
first_data_row = 3
n_rows = len(output)
if n_rows > 0:
    cell_list = sheet.range('A{}:M{}'.format(first_data_row, first_data_row + n_rows - 1))

    for cell in cell_list:
        # cell.row / cell.col are 1-based sheet coordinates. Column 0 of
        # `output` is the old index added by reset_index(), so sheet column
        # A (col == 1) maps to output column 1, B to 2, and so on.
        cell.value = str(output.iloc[cell.row - first_data_row, cell.col])

    sheet.update_cells(cell_list)

    