In [1]:
import pandas as pd
import numpy as np
import datetime

import warnings
warnings.filterwarnings("ignore")

## 1. 處理股價

In [2]:
stock = pd.read_excel('./bda2023_mid_dataset/stock_data_2019-2023.xlsx', sheet_name = '上市2022')

* 我們選擇日月光 (3711) 作為此次預測標的

In [3]:
ase = stock[stock['證券代碼'].str.contains('3711')]
ase['年月日'] = pd.to_datetime(ase['年月日']).dt.date
ase = ase.sort_values(by = '年月日').reset_index(drop = True)

* 首先先整理表格內容，增加兩個欄位分別為
    1. 單日漲跌幅(元)：表示今日與前日收盤價的真實價格變化
    2. label：表示今日與前日收盤價的漲跌標籤，我們直接以 >0 或 <=0 作為判別依據

In [4]:
ase = ase[['證券代碼', '年月日', '收盤價(元)']]
ase['單日漲跌幅(%)'] = ase['收盤價(元)'].rolling(window=2).apply(lambda x: (x.iloc[1]-x.iloc[0])/x.iloc[0])
ase['label'] = ase['單日漲跌幅(%)'].apply(lambda x: (x>0 and '漲') or '跌')

In [5]:
ase

Unnamed: 0,證券代碼,年月日,收盤價(元),單日漲跌幅(%),label
0,3711 日月光投控,2022-01-03,97.3846,,跌
1,3711 日月光投控,2022-01-04,98.7692,0.014218,漲
2,3711 日月光投控,2022-01-05,102.4615,0.037383,漲
3,3711 日月光投控,2022-01-06,100.1539,-0.022522,跌
4,3711 日月光投控,2022-01-07,98.3077,-0.018434,跌
...,...,...,...,...,...
241,3711 日月光投控,2022-12-26,94.0000,0.002132,漲
242,3711 日月光投控,2022-12-27,94.5000,0.005319,漲
243,3711 日月光投控,2022-12-28,93.5000,-0.010582,跌
244,3711 日月光投控,2022-12-29,93.8000,0.003209,漲


* 最後整理出在 2022 年 246 個交易日中，漲跌變化的個數

In [5]:
ase['label'].value_counts()

跌    126
漲    120
Name: label, dtype: int64

## 2. 處理文章資料

In [6]:
bbs1921 = pd.read_csv('./bda2023_mid_dataset/bda2023_mid_bbs_2019-2021.csv') # 104466
bbs2223 = pd.read_csv('./bda2023_mid_dataset/bda2023_mid_bbs_2022-2023.csv') #  34630
bbs23 = pd.concat([bbs1921, bbs2223]).reset_index().drop(["index"], axis=1)
bbs23

Unnamed: 0,id,p_type,s_name,s_area_name,post_time,title,author,content,page_url
0,1546274852018_PTT02R,bbs,Ptt,Stock,2019-01-01 00:31:32,[公告] n199808m HitMaker 警告一次,eyespot,1. 主旨：n199808m 違反板規4-2-1 警告一次 HitMake...,http://www.ptt.cc/bbs/Stock/M.1546273895.A.81F...
1,1546278287622_PTT02R,bbs,Ptt,Stock,2019-01-01 01:28:28,Re: [新聞] 貿戰讓台商錢匯不出？ 海基會：漣漪效應,CGDGAD,小弟有個想法不知可不可行 如果有人民幣想洗出來 出國一趟，比方去歐洲 用海外刷卡買黃金，存在...,http://www.ptt.cc/bbs/Stock/M.1546277311.A.1D3...
2,1546278288500_PTT02R,bbs,Ptt,Stock,2019-01-01 01:32:39,Re: [新聞] 貿易戰搶出口 透支效應2019衝擊中國經濟!,americ,分身帳號好像要連坐水桶 《ＩＤ暱稱》tangolosss (配息配股變成大富翁)《經濟狀況...,http://www.ptt.cc/bbs/Stock/M.1546277562.A.F7E...
3,1546298530556_PTT02R,bbs,Ptt,Stock,2019-01-01 07:07:37,Re: [新聞] 陸媒：俄羅斯想聯手中國去美元化,taco13,所以說不要小看俄羅斯的險惡奸詐 俄國一直鼓勵中國發展人民幣石油 去美元化的種種行為 俄羅...,http://www.ptt.cc/bbs/Stock/M.1546297660.A.928...
4,1546299585726_PTT02R,bbs,Ptt,Stock,2019-01-01 07:35:29,[標的] (伺機作多)日元正二,hrma,1. 標的：元大日元指數正二 2. 分類：(伺機作多)多 3. 分析/正文： (...,http://www.ptt.cc/bbs/Stock/M.1546299333.A.8D3...
...,...,...,...,...,...,...,...,...,...
139091,1679410888379_PTT02R,bbs,Ptt,Stock,2023-03-21 23:00:47,[情報] 3/21全市場當沖虧損最多的股票,offeree,3/21 全市場當沖虧損最多的股票 1. 創意 -999.8萬 2. 華孚 -...,http://www.ptt.cc/bbs/Stock/M.1679410849.A.80F...
139092,1679411198168_PTT02R,bbs,Ptt,Stock,2023-03-21 23:05:30,[新聞] 葉倫一番話提振市場信心 美股道瓊早盤漲2,humbler,原文標題： 葉倫一番話提振市場信心 美股道瓊早盤漲230點 原文連結： https://m...,http://www.ptt.cc/bbs/Stock/M.1679411133.A.D55...
139093,1679412591612_PTT02R,bbs,Ptt,Stock,2023-03-21 23:28:59,[新聞] 社子近三萬戶無預警停電！王美花道歉了,DrowningPool,原文標題：社子近三萬戶無預警停電！王美花道歉了：要求台電提早換設備 原文連結：https:...,http://www.ptt.cc/bbs/Stock/M.1679412541.A.011...
139094,1679413649631_PTT02R,bbs,Ptt,Stock,2023-03-21 23:45:22,[新聞] 聯電新加坡擴廠 亞翔再接82.59億元大單,changjie,原文标题：聯電新加坡擴廠 亞翔再接82.59億元大單 原文连结：https://amp-...,http://www.ptt.cc/bbs/Stock/M.1679413525.A.B90...


* 篩選出標題及內文提及「日月光」三字的文章

In [7]:
bbs23_ase = bbs23[bbs23['title'].str.contains('日月光') | bbs23['content'].str.contains('日月光')].reset_index(drop = True)
bbs23_ase

Unnamed: 0,id,p_type,s_name,s_area_name,post_time,title,author,content,page_url
0,1546390288567_PTT02R,bbs,Ptt,Stock,2019-01-02 08:35:42,[新聞] 挖礦ASIC商機曇花一現　半導體供應鏈看淡,kaube,1.原文連結： 挖礦ASIC商機曇花一現　半導體供應鏈看淡後市 https://goo.gl...,http://www.ptt.cc/bbs/Stock/M.1546389344.A.A3C...
1,1546410609352_PTT02R,bbs,Ptt,Stock,2019-01-02 14:20:07,[其他] 108/01/02 加權股價指數成分股暨市值比重,BreezeCat,大家新年快樂～ 雖然今天大盤有點糟就是了= = 然後我因為空手限制，期貨什麼空單都沒有=...,http://www.ptt.cc/bbs/Stock/M.1546410010.A.7EE...
2,1546440294641_PTT02R,bbs,Ptt,Stock,2019-01-02 22:27:02,[其他] 1/2 台灣集中市場三大法人買賣超前30名,l75cm,1/2 台灣集中市場三大法人買賣超前30名 買超 代碼 名稱 ...,http://www.ptt.cc/bbs/Stock/M.1546439226.A.4AB...
3,1546495227622_PTT02R,bbs,Ptt,Stock,2019-01-03 13:47:31,[其他] 108/01/03 加權股價指數成分股暨市值比重,BreezeCat,慘 拉高又殺低 到底今年怎麼走呢？ 經過了一天，資料應該是準的了 但還是一樣 以下...,http://www.ptt.cc/bbs/Stock/M.1546494454.A.E68...
4,1546507261126_PTT02R,bbs,Ptt,Stock,2019-01-03 17:19:04,[新聞] 中美貿易戰蘋果賣不動 在陸台廠迎寒冬,mayingnine,-------------------------------發文提醒-----------...,http://www.ptt.cc/bbs/Stock/M.1546507148.A.FE6...
...,...,...,...,...,...,...,...,...,...
1618,1679308229702_PTT02R,bbs,Ptt,Stock,2023-03-20 18:28:43,[情報] 112/03/20 八大公股銀行買賣超排行,q1a1,1. 標題：112/03/20 八大公股銀行買賣超排行 2. 來源：玩股網 3. 網址：...,http://www.ptt.cc/bbs/Stock/M.1679308125.A.BD9...
1619,1679354010757_PTT02R,bbs,Ptt,Stock,2023-03-21 07:11:21,[新聞] 宣明智大膽預測未來5年台IC設計市佔衝5成,dosiris,原文標題： 宣明智大膽預測：未來5年台IC設計市佔衝5成 「台灣將成全世界最安全的地方」 ...,http://www.ptt.cc/bbs/Stock/M.1679353884.A.02A...
1620,1679382265145_PTT02R,bbs,Ptt,Stock,2023-03-21 15:02:27,[情報] 0321 上市投信買賣超排行,saber50116,1. 標題：投信買賣超排行 2. 來源： 富邦證券 3. 網址：https://fub...,http://www.ptt.cc/bbs/Stock/M.1679382149.A.B2E...
1621,1679386643213_PTT02R,bbs,Ptt,Stock,2023-03-21 16:16:09,[情報] 0321 上市外資買賣超排行,saber50116,1. 標題：外資買賣超排行 2. 來源：台灣證交所 3. 網址：https://www....,http://www.ptt.cc/bbs/Stock/M.1679386571.A.A16...


* 將第 n 天的文章與第 n+1 的股市漲跌標籤合併，並且只保留我們需要的欄位資訊

In [8]:
bbs23_ase['post_time'] = pd.to_datetime(bbs23_ase['post_time']).dt.date
ase['年月日-1'] = ase['年月日'] - datetime.timedelta(days=1)
bbs23_ase = pd.merge(bbs23_ase, ase, left_on='post_time', right_on='年月日-1')[['post_time', 'title', 'content', 'label']]
bbs23_ase

Unnamed: 0,post_time,title,content,label
0,2022-01-03,[情報] 0103上市外資買賣超排行,圖片好讀版： 買超 ...,漲
1,2022-01-03,[情報] 0103八大公股銀行買賣超排行,作者: addy7533967 (火爆刺香腸) 看板: Stock 標題: [情報] 121...,漲
2,2022-01-04,[新聞] 不只有你賠錢 2021年當沖客總計賠掉一個,原文標題：不只有你賠錢 2021年當沖客總計賠掉一個日月光 原文連結：https://bit...,漲
3,2022-01-04,[情報] 0104上市外資買賣超排行,圖片好讀版： 買超 ...,漲
4,2022-01-04,[情報] 0104八大公股銀行買賣超排行,作者: addy7533967 (火爆刺香腸) 看板: Stock 標題: [情報] 121...,漲
...,...,...,...,...
443,2022-12-22,[情報] 1222 上市投信買賣超排行,1. 標題：投信買賣超排行 2. 來源： 富邦證券 3. 網址：https://fub...,跌
444,2022-12-22,[情報] 1222 上市外資買賣超排行,1. 標題：外資買賣超排行 2. 來源：台灣證交所 3. 網址：https://www....,跌
445,2022-12-22,[情報] 1222八大公股銀行買賣超排行,手機介面圖片好讀版： 以下資訊依張數排列 買超 ...,跌
446,2022-12-28,[新聞] 美股陰影籠罩 那指挫1.3% 特斯拉瀉11%,美股陰影籠罩 那指挫1.3% 特斯拉瀉11% 工商時報 呂佳恩 2022.12.28 美...,漲


# 3. 訓練集文章向量化處理

* 首先先進行斷詞處理，以下我們會先實作幾個步驟：
    1. 正則表示法清除多餘字元：先移除文章中符號、英數字，只保留中文字元
    2. 斷句：由於 monpa 在處理 200 字以上字串的斷詞時可能會出現錯誤結果，因此我們統一對長文章先進行斷句拆成較短的句子組成的 list
    3. 斷詞：透過 monpa 對斷句結果中的所有句子進行斷詞

In [9]:
import monpa
from monpa import utils
import re

+---------------------------------------------------------------------+
  Welcome to MONPA: Multi-Objective NER POS Annotator for Chinese
+---------------------------------------------------------------------+
已找到 model檔。Found model file.


In [10]:
# 這個function用來將字串以正則化處理去除中文字元以外的字元
def clearSentence(sentence):
    return re.sub(r'[^\u4e00-\u9fa5]+', '', sentence)

# 我們從stopwords_zh.txt這個檔案中匯入繁體中文的停用詞
with open('stopwords_zh.txt', 'r') as file:
    stopwords = file.read().splitlines() 
file.close()

* 我們想嘗試用 2022 年 7 月至 9 月的文章資料來訓練一個分類器，並用於 10 月的股價變動預測當中

In [11]:
# 設定訓練資料集的開始日期與結束日期
train_startDate = datetime.date(2022,7,1)
train_endDate = datetime.date(2022,9,30)

# 透過monpa對文章進行斷詞處理，並將個別斷詞結果寫在tokenStr這個字串變數中，最後再將訓練集中所有字串存在tokenStr_list中
train_tokenStr_list = []
for i in list(bbs23_ase[bbs23_ase['post_time'].between(train_startDate, train_endDate)].index):
    try:
#         txt = clearSentence(bbs23_ase['content'][i])
        sentence_list = utils.short_sentence(bbs23_ase['content'][i])
        tokenStr = str()
        for sentence in sentence_list:
            sentence = clearSentence(sentence)
            tokens = monpa.cut(sentence)
            tokenStr += ' '.join(tokens)
        train_tokenStr_list.append(tokenStr)
    except:
        train_tokenStr_list.append('')

In [12]:
train_tokenStr_list

['標題 上市 投信 買賣 超 排行 來源 富邦 證券 網址 內文 投信 避風港 電信 大哥 外面 的 世界 好 恐怖 抖 圖片版 買 超 賣 超 名次 股票 名稱 超 張 數 名次 股票稱 超 張數 中華電 金像電 漢翔 新唐 台灣 大順德 遠南亞科 開發金 台 化 台泥 欣興 統一 泰永豐金 啟碁 台達電 裕民 廣達 富邦 元宇宙 仁寶友達元晶 台肥 華航 正隆 英業 達智原第一 金南電 宏碁 台 勝科元大 金台 表科 中信金達 邁南亞 同欣電 遠東新智易 飛捷中砂虹堡華邦電 安集 嘉澤台塑 亞泥振樺電慧洋 日月光 投控 緯創 臻鼎 全新 元大 台灣 大成鋼大聯大長榮航 上海商銀 長榮鈺齊美食 華通中鴻光寶科華新富邦金聯發科 致 伸緯穎可成億光聯 強健策 智伸科京元電子 美利達橋 椿兆豐金陽明 台積電 景 碩建 大技 嘉世紀 鋼文曄 乙盛智邦中興電 豐泰貿聯遠雄力積電東和 鋼鐵 宇隆旭暉 應材 昇陽 半導體 帆宣 華碩 超豐',
 '買 超 賣 超 名次 股票 名稱 超 張數 收盤價 漲跌 名次 股票 名稱 超 張數 收盤價 漲跌 長榮航 元大 台灣 反 友達開發 金國泰金國泰 臺灣 加權 反 華新中鋼亞太電群 創 臺企銀 元大 台灣華航 中信 金智原 鴻海 萬海新光金 宏達電 南亞金像電台泥 新 唐陽明 宏碁 元大 滬深正晶豪科日月光 投控 遠東新台積電台灣 大 緯創 台玻 第一 金 東鹼台塑 中華電 元大金 欣興京元電子 台肥國泰 台灣聯茂聯電中石化 仁寶 技嘉 英業達四維航 元大 台灣 正三陽 工業 寶成美 時遠傳 榮運 中鴻 聯合 再生國泰智能 電動車 富邦 基因 免疫 生技 華通 國泰基 因 免疫 革命 永豐金元晶中電光 寶科漢翔興富發 亞泥 榮成 玉山 金 正 新台 新金 中信 電池 及 儲能 台達電 耿鼎和 碩偉 詮電 旺宏 富邦 未來車 廣達敦泰 國票 金華新科國 泰費城 半導體泰鼎 期元 大 原油 反 兆豐金華孚 遠東銀矽格集 盛康舒上曜彩晶 台揚 潤泰 新台 表科 合庫金新興 金寶',
 '原文 標題 台灣 晶圓 製造 指數 上路 台積電聯電 列 大 成分股 原文 連結 發布 時間 記者 署名 中央社 記者 潘智義 台北日電 原文 內容 台灣晶圓 製造 指數 上路 台積電聯電 列大 成分股 中央社 記者 潘智義 台北日電 台灣指數公司 今天 

* 文章向量化處理：我們透過 sklearn 套件中 TfidfVectorizer 將斷詞結果去除停用詞後轉為空間向量

In [13]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2

In [14]:
vectorizer = TfidfVectorizer(stop_words=stopwords)
X_train = vectorizer.fit_transform(train_tokenStr_list)
X_train = pd.DataFrame(X_train.toarray(),columns=vectorizer.get_feature_names())
X_train

Unnamed: 0,一一三四,一一八三一,一一六,一七五七八,一三一六三三,一三九三八,一九二九四,一二三五七,一五一,一五四二五,...,鼎反,鼎寶,鼎智原,鼓舞,齊揚,齊敦陽科,龍彩霖,龍日月光,龍頭,龍頭廠
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.000000,0.0,0.0,0.000000,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.000000,0.0,0.0,0.000000,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.000000,0.0,0.0,0.000000,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.000000,0.0,0.0,0.000000,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.000000,0.0,0.0,0.000000,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
85,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.000000,0.0,0.0,0.000000,0.0,0.0,0.0
86,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.000000,0.0,0.0,0.000000,0.0,0.0,0.0
87,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.040123,0.0,0.0,0.040123,0.0,0.0,0.0
88,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.000000,0.0,0.0,0.000000,0.0,0.0,0.0


* 可以看到使用全部的斷詞結果去組成空間向量時稀疏性會非常大，在後續預測時效率會很低，因此我們需要選擇對分類結果有較顯著影響的詞彙作為向量空間的維度，以下我們透過 Chi-square 計算各詞彙與漲跌標籤的獨立性作為選擇向量空間維度的依據。

In [15]:
y_train = bbs23_ase[bbs23_ase['post_time'].between(train_startDate, train_endDate)]['label']

chi2_selector = SelectKBest(chi2, k = 2000)
chi2_selector.fit(X_train, y_train)
kbest_vocabs = X_train.columns[chi2_selector.get_support()]
X_train = X_train[kbest_vocabs]
X_train

Unnamed: 0,三商,三商壽寶,三晃力成,三陽工業,上乙盛,上半,上半年,上南亞科,上品,上國碩鋼聯,...,麗正國泰永高,麗清,麗訊舟,麗遠,鼎中砂,鼎力,鼎智原,齊敦陽科,龍日月光,龍頭
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
85,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
86,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
87,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
88,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


# 4. 測試集文章向量化處理

* 接著我們可以透過 7-9 月的向量維度將 10 月份的文章也轉成 tf-idf 的向量空間

In [16]:
# 首先先計算4月份的文章個別的tfidf
test_startDate = datetime.date(2022,10,1)
test_endDate = datetime.date(2022,10,31)

test_tokenStr_list = []
for i in list(bbs23_ase[bbs23_ase['post_time'].between(test_startDate, test_endDate)].index):
    try:
        txt = clearSentence(bbs23_ase['content'][i])
        sentence_list = utils.short_sentence(txt)
        tokenStr = str()
        for sentence in sentence_list:
            tokens = monpa.cut(sentence)
            tokenStr += ' '.join(tokens)
        test_tokenStr_list.append(tokenStr)
    except:
        test_tokenStr_list.append('')

In [17]:
# 接著將4月份的結果透過df.reindex這個方法映射到訓練集的向量空間中
y_test = bbs23_ase[bbs23_ase['post_time'].between(test_startDate, test_endDate)]['label']

vectorizer = TfidfVectorizer(stop_words=stopwords)
X_test = vectorizer.fit_transform(test_tokenStr_list)
X_test = pd.DataFrame(X_test.toarray(),columns=vectorizer.get_feature_names())
X_test = X_test.reindex(kbest_vocabs, axis=1, fill_value=0)
X_test

Unnamed: 0,三商,三商壽寶,三晃力成,三陽工業,上乙盛,上半,上半年,上南亞科,上品,上國碩鋼聯,...,麗正國泰永高,麗清,麗訊舟,麗遠,鼎中砂,鼎力,鼎智原,齊敦陽科,龍日月光,龍頭
0,0,0,0,0,0,0.0,0,0,0.0,0,...,0,0,0,0,0,0,0,0,0,0.0
1,0,0,0,0,0,0.0,0,0,0.0,0,...,0,0,0,0,0,0,0,0,0,0.0
2,0,0,0,0,0,0.0,0,0,0.0,0,...,0,0,0,0,0,0,0,0,0,0.0
3,0,0,0,0,0,0.0,0,0,0.0,0,...,0,0,0,0,0,0,0,0,0,0.0
4,0,0,0,0,0,0.0,0,0,0.0,0,...,0,0,0,0,0,0,0,0,0,0.0
5,0,0,0,0,0,0.0,0,0,0.0,0,...,0,0,0,0,0,0,0,0,0,0.0
6,0,0,0,0,0,0.0,0,0,0.0,0,...,0,0,0,0,0,0,0,0,0,0.0
7,0,0,0,0,0,0.0,0,0,0.0,0,...,0,0,0,0,0,0,0,0,0,0.0
8,0,0,0,0,0,0.0,0,0,0.0,0,...,0,0,0,0,0,0,0,0,0,0.0
9,0,0,0,0,0,0.0,0,0,0.0,0,...,0,0,0,0,0,0,0,0,0,0.0


# 5. 建立預測模型

In [18]:
from sklearn.ensemble import GradientBoostingClassifier

In [19]:
clf = GradientBoostingClassifier(n_estimators=100, learning_rate=1, max_depth=7, random_state=0)
clf.fit(X_train, y_train)
clf.score(X_train, y_train)

1.0

# 6. 檢視預測結果

In [21]:
test_label = ase[ase['年月日-1'].between(test_startDate, test_endDate)]['label']

test_data = bbs23_ase[bbs23_ase['post_time'].between(test_startDate, test_endDate)]
test_data['predict_label'] = clf.predict(X_test)
predict_label = pd.merge(
    ase[ase['年月日-1'].between(test_startDate, test_endDate)], 
    test_data.groupby(['post_time', 'predict_label']).count().sort_values('label', ascending = False).sort_index(level=[0], sort_remaining=False).groupby(level=0).head(1).reset_index(), 
    left_on='年月日-1', right_on='post_time', how='left').fillna(method='ffill').fillna(method='bfill')['predict_label']

from sklearn.metrics import accuracy_score
print('{}月份預測準確率:'.format(test_startDate.month), accuracy_score(test_label, predict_label))

10月份預測準確率: 0.7142857142857143


In [22]:
pd.DataFrame([test_label.reset_index(drop=True), predict_label]).T

Unnamed: 0,label,predict_label
0,跌,跌
1,漲,跌
2,漲,跌
3,漲,跌
4,跌,跌
5,跌,跌
6,跌,跌
7,跌,跌
8,漲,漲
9,漲,漲


# 4. 同學們可以嘗試調整

1. 漲跌標籤的判斷%數（重要！！）
2. 文章與股價時間區間的移動天數（小時數）
3. 使用不同斷詞工具（推薦中研院CKIPTransformer）
4. 特徵選擇的其他方法（lift、、MI、、LLR...）
5. 特徵選擇的數量（太少會有很高的 false positive，太高則效率差）
6. 嘗試用看看不同分類模型
7. 改變投票方法，漲跌平三者的權重應該一樣嗎？

      GOOD LUCK!!!