# Project 02 - No.8
## 網路股票討論度與波動度關係之探討

In [1]:
import requests
import time
from datetime import datetime
from bs4 import BeautifulSoup
import os
import re
import urllib.request
import json
import jieba
import jieba.analyse
import pandas as pd
from pandas.core.frame import DataFrame

In [2]:
PTT_URL = 'https://www.ptt.cc'

In [3]:
def get_web_page(url):
    time.sleep(0.5)  # 每次爬取前暫停 0.5 秒以免被 PTT 網站判定為大量惡意爬取
    resp = requests.get(url=url)
    if resp.status_code != 200:
        print('Invalid url:', resp.url)
        return None
    else:
        return resp.text

In [4]:
def get_articles(dom, date):
    soup = BeautifulSoup(dom, 'html.parser')

    # 取得上一頁的連結
    paging_div = soup.find('div', 'btn-group btn-group-paging')
    prev_url = paging_div.find_all('a')[1]['href']

    articles = []  # 儲存取得的文章資料
    divs = soup.find_all('div', 'r-ent')
    for d in divs:
        if d.find('div', 'date').string.strip() == date:  # 發文日期正確

            # 取得文章連結及標題
            if d.find('a'):  # 有超連結，表示文章存在，未被刪除
                href = d.find('a')['href']
                title = d.find('a').string
                articles.append(title+"~"+date)
    return articles, prev_url

In [5]:
def parse(dom):
    soup = BeautifulSoup(dom, 'html.parser')
    links = soup.find(id='main-content').find_all('a')
    img_urls = []
    for link in links:
        if re.match(r'^https?://(i.)?(m.)?imgur.com', link['href']):
            img_urls.append(link['href'])
    return img_urls


In [6]:
def get_datelist(beginDate, endDate):
    # beginDate, endDate是形如‘20160601’的字符串或datetime格式
    date_list=[datetime.strftime(x,'%m/%d') for x in list(pd.date_range(start=beginDate, end=endDate))]
    return date_list

In [8]:
if __name__ == '__main__':
    current_page = get_web_page(PTT_URL + '/bbs/Stock/index.html')
    if current_page:
        articles = []  #全部的文章
        
    dates = (get_datelist('20171207','20180107')) #endDate需為當天
    for c in dates[::-1]:#倒序
        date = time.strftime(c).lstrip('0') # 去掉開頭的 '0' 以符合 PTT 網站日期格式
        current_articles, prev_url = get_articles(current_page, date)  # 目前頁面的文章
        while current_articles:  # 若目前頁面有文章則加入 articles，並回到上一頁繼續尋找是否有符合日期的文章
            articles += current_articles
            current_page = get_web_page(PTT_URL + prev_url)
            current_articles, prev_url = get_articles(current_page, date)        

In [9]:
articles=DataFrame(articles)

In [10]:
Title = articles[0].str.split('~',expand=True).rename(columns={0:'articles',1:'date'})

In [29]:
def get_keywords(content):
    keywords=jieba.analyse.extract_tags(str(content),topK=30)
    return "/".join(keywords)

In [30]:
keywords_sort=Title.groupby('date').apply(get_keywords)

In [32]:
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
keywords_sort.str.split('/',expand=True)

Unnamed: 0_level_0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1
,12,articles,date,983,低檔,買進,終於有點,成績,心得,持有,,,,,,,,,,,,,,,,,,,,
1/01,1,,請益,追高殺,低長,期能,Re,新聞,articles,date,222,223,陸徵環,保稅,負擔,變重,224,嚴凱泰,無所適,225,3450,聯鈞,226,8039,台虹,227,228,飛利浦,LED,101
1/02,2,,新聞,Re,請益,01,2017,股匯,雙漲,台幣,開盤,大升,9.3,心得,閒聊,168,107,未來,大手筆,國際,電玩展,108,攤位,宏達電,VR,賞析,2018,台股,八萬元,辦法
1/03,3,,新聞,Re,請益,閒聊,2018,01,榮海,日電貿,11,資金,股利,好奇,玩死,其他,短空,articles,date,126,2233,宇隆,103,進獲利,108,127,盤子,怒花,200,紫變
1/04,4,,新聞,Re,美股,進入,終章,專家,標普融,50%,01,2018,請益,怎麼,獲利,107,穩懋,閒聊,泡沫,市場,電動,機車,電池,規格,Gogoro,大盤,請問,停利點,3105,3D
1/05,5,,新聞,Re,請益,閒聊,2018,01,心得,穩懋,張表打,臉大老,企業,4915,致伸,大盤,ETF,國產車,盤子,269,萬終,紫變,美股,進入,終章,專家,標普融,50%,KY,天堂
1/06,6,,新聞,Re,請益,心得,告白,違反板規,本魯自,師的,投資,水桶,財經,強勢,16,一週,台灣,分析,股票,公告,破百,可判,articles,date,14,觀測,著大股,東進,出準,沒錯
1/07,7,,新聞,Re,告白,25000,點還,low,川普自,曝美股,驚人數,本魯自,師的,投資,公股,心得,慶富案,提呆,銀上,獲利大減,分析,股票,articles,date,不幹,元大,大眾,銀爆,760,人離職
12/07,12,07,,Re,新聞,請益,心得,投資,11,一樣,輕人,營收,股價,鴻海,有關,玉晶光,進光,業績,EPS,1.41,KY,3227,原相,觀望,學習,2317,6180,橘子,新手,玉山
12/08,12,08,,Re,請益,新聞,11,群創,營收,比特,指期,EPS,南亞科,1909,榮成,1906,寶隆,發言人,投資,有關,106,這樣,隱波,調太多,大盤,空軍日,幣破,關卡,5000,6180
