In [None]:
# Patch asyncio so an event loop can be re-entered while the notebook's own
# loop is already running (presumably required by PTTLibrary — TODO confirm).
import nest_asyncio
nest_asyncio.apply()

In [None]:
from PTTLibrary import PTT

In [None]:
import json
def read_settings(path='./settings.json'):
    """Load the notebook's settings from a JSON file.

    Args:
        path: Location of the JSON settings file. Defaults to
            './settings.json' in the current working directory.

    Returns:
        The decoded JSON document (whatever ``json.load`` produces for the
        file's top-level value).

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the file is not valid JSON.
    """
    # Decode explicitly as UTF-8: open()'s default encoding is locale
    # dependent, which breaks on non-ASCII values on some platforms.
    with open(path, encoding='utf-8') as file:
        return json.load(file)

In [None]:
# Load the settings once; the login cell reads 'ID' and 'Password' from it.
setting = read_settings()

In [None]:
import sys  # sys.exit() is used on the login-failure path below

# Create the PTT client. Pass LogLevel=PTT.LogLevel.DEBUG to PTT.Library()
# for verbose logging.
PTTBot = PTT.Library()
try:
    # Credentials come from the settings loaded earlier.
    PTTBot.login(
        ID=setting.get('ID'),
        Password=setting.get('Password'),
        KickOtherLogin=True
    )
except PTT.Exceptions.LoginError:
    # Nothing else in the notebook can run without a session, so stop here.
    PTTBot.log('登入失敗')
    sys.exit()
PTTBot.log('登入成功')

In [None]:
import string
# `punctuation` from zhon.hanzi is used by preprocess() to strip full-width
# punctuation in addition to the ASCII set in string.punctuation.
from zhon.hanzi import punctuation
import jieba
import re
# Load the custom word list from 'user_dict.txt' into jieba before segmenting.
jieba.load_userdict('user_dict.txt')

In [None]:
# Preprocessing
def preprocess(push_content_list):
    """Clean raw push texts before word segmentation.

    Each string is processed in this order:
      1. hyperlinks (anything matching ``http\\S+``) are removed;
      2. ASCII space characters are removed;
      3. ASCII punctuation (``string.punctuation``) and full-width
         punctuation (``zhon.hanzi.punctuation``) are removed.

    Strings that end up empty after *all* of the steps above are dropped, so
    pushes consisting solely of links, spaces or punctuation never reach the
    segmenter as empty strings.

    Args:
        push_content_list: Iterable of push content strings.

    Returns:
        A new list of cleaned, non-empty strings in the original order.
    """
    cleaned = []
    for content in push_content_list:
        # Strip URLs first, while the whitespace that delimits them is intact.
        text = re.sub(r"http\S+", "", content).replace(' ', '')
        text = ''.join(
            ch for ch in text
            if ch not in string.punctuation and ch not in punctuation
        )
        # Filter after every cleaning step so nothing empty slips through.
        if text:
            cleaned.append(text)
    return cleaned

In [None]:
# Word segmentation
def segment(push_content_list_preprocessed):
    """Segment every cleaned push into words and flatten the result.

    Args:
        push_content_list_preprocessed: Iterable of cleaned push strings.

    Returns:
        A single flat list holding the tokens of all pushes, in input order,
        as produced by ``jieba.lcut(..., cut_all=False)``.
    """
    seg_list = []
    for push_content in push_content_list_preprocessed:
        # extend() grows the list in place; concatenating with `+` on every
        # iteration would re-copy all tokens collected so far.
        seg_list.extend(jieba.lcut(push_content, cut_all=False))
    return seg_list

In [None]:
def get_push_corpus(push_list):
    """Build the flat token list for one post's pushes.

    The content of each push object is read via ``getContent()``, cleaned
    with ``preprocess`` and then tokenised with ``segment``.

    Args:
        push_list: Iterable of push objects exposing ``getContent()``.

    Returns:
        The list of tokens returned by ``segment``.
    """
    raw_texts = []
    for push in push_list:
        raw_texts.append(push.getContent())
    return segment(preprocess(raw_texts))

In [None]:
# Accumulator filled by crawlHandler: one token list per crawled post.
documents = []

In [None]:
def crawlHandler(post):
    """Crawl callback: record the tokenised pushes of one post.

    Posts whose delete status is anything other than
    ``PTT.PostDeleteStatus.NotDeleted`` are ignored. For every other post the
    token list built from its push list is appended to the module-level
    ``documents`` list.

    Args:
        post: Post object exposing ``getDeleteStatus()`` and ``getPushList()``.
    """
    is_live = post.getDeleteStatus() == PTT.PostDeleteStatus.NotDeleted
    if is_live:
        # The list is mutated in place, never rebound, so no `global` needed.
        documents.append(get_push_corpus(post.getPushList()))

In [None]:
# Crawl configuration: which board, how posts are searched, and how many of
# the newest matching posts to fetch.
Board = 'Stock'
SearchType = PTT.PostSearchType.Keyword
Condition = '盤中閒聊'
TestRange = 100

# Index of the newest post that matches the search condition.
NewestIndex = PTTBot.getNewestIndex(
    PTT.IndexType.Board,
    Board,
    SearchType=SearchType,
    SearchCondition=Condition,
)
# Clamp to 1 so the range stays valid when fewer than TestRange posts match
# (assumes PTT post indices are 1-based — confirm against the library docs).
StartIndex = max(1, NewestIndex - TestRange + 1)
print(f'預備爬行 {Board} 編號 {StartIndex} ~ {NewestIndex} 文章')

# crawlHandler is invoked for each fetched post; the call returns the posts
# that errored and the posts that were deleted.
ErrorPostList, DelPostList = PTTBot.crawlBoard(
    crawlHandler,
    Board,
    StartIndex=StartIndex,
    EndIndex=NewestIndex,
    SearchType=SearchType,
    SearchCondition=Condition,
)
if len(ErrorPostList) > 0:
    print('Error Post: \n' + '\n'.join(str(x) for x in ErrorPostList))
if len(DelPostList) > 0:
    print('Del Post: \n' + '\n'.join(str(x) for x in DelPostList))
    print(f'共有 {len(DelPostList)} 篇文章被刪除')

In [None]:
from gensim import corpora

In [None]:
# Build the gensim dictionary (vocabulary) from this crawl's documents
# and save it to 'ptt_stock_corpus.dict'.
dictionary = corpora.Dictionary(documents)
dictionary.save('ptt_stock_corpus.dict')

In [None]:
# Build the corpus: convert each document's token list to its bag-of-words
# representation using the dictionary.
corpus = list(map(dictionary.doc2bow, documents))
print(len(corpus))
# Save the corpus to disk via MmCorpus.serialize.
corpora.MmCorpus.serialize("ptt_stock_corpus.mm", corpus)