### Packages

In [7]:
import pickle
from datetime import datetime
from collections import Counter

import numpy as np
import pandas as pd

from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score, balanced_accuracy_score, confusion_matrix
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
from sklearn.naive_bayes import BernoulliNB

import monpa
from monpa import utils
from collections import Counter
import math

import datetime

### Data process

In [8]:
ptt_df = pd.read_csv("bda2024_202203-202402_討論數據_ptt.csv")
transaction_df = pd.read_csv("bda2024_微股力_個股交易數據-2年.csv")
with open('stopwords_zh_ptt.txt', 'r') as file:
    stopwords = file.read().splitlines()
file.close()

  transaction_df = pd.read_csv("bda2024_微股力_個股交易數據-2年.csv")


In [9]:
transaction_df['stock_symbol'] = transaction_df['stock_symbol'].astype(str)
newsSet = ptt_df[ptt_df['title'].str.contains('聯發科') | ptt_df['content'].str.contains('聯發科')].reset_index(drop = True)

### Post Cat.

In [10]:
def get_next_date(input_date, dates_list):
    index = sum(1 for d in dates_list if d < input_date)
    if index >= len(dates_list):
        return -1, -1
    else:
        return index, dates_list[index]

def get_answer(stock_name, input_date, n_days_after, threshold):
    dates_list = sorted(transaction_df.loc[(transaction_df['stock_name']==stock_name)]['date'])
    i, _ = get_next_date(input_date, dates_list)

    if i == -1 or i >= len(dates_list) or i + n_days_after >= len(dates_list):
        return 2
    
    open_date = dates_list[i]
    close_date = dates_list[i + n_days_after]
    
    open_price = transaction_df.loc[(transaction_df['date']==open_date) & (transaction_df['stock_name']==stock_name)]['open'].iloc[0]
    close_price = transaction_df.loc[(transaction_df['date']==close_date) & (transaction_df['stock_name']==stock_name)]['close'].iloc[0]
    price_change = (close_price - open_price) / open_price

    if price_change >= threshold:
        return 1     # buy
    elif price_change <= -threshold:
        return -1    # sell
    else:
        return 0     # hold

In [11]:
newsSet['stock_name'] = "聯發科"
newsSet['category'] = newsSet.apply(lambda r: get_answer(r['stock_name'], r['post_time'], 10, 0.02), axis = 1)

In [12]:
with open('newsSet.pkl', 'wb') as file: 
    pickle.dump(newsSet, file)

In [13]:
# 分出漲跌資料集
upSet = newsSet[newsSet['category'] == 1]
downSet = newsSet[newsSet['category'] == -1]

### tf-idf

In [14]:
up_tf_counter=Counter()
up_df_counter=Counter()

for index, row in upSet.iterrows():
    query = row['title'] + ' ' + row['content']
    df_tmp = Counter()
    sentence_list = utils.short_sentence(query)

    for item in sentence_list:
      result_cut = monpa.cut(item) 
      for term in result_cut:
        term = term.strip()
        if (len(term) > 1):
          up_tf_counter[term] += 1 
          if(df_tmp[term] == 0): 
            df_tmp[term] = 1 
    up_df_counter += df_tmp

up_tf_idf = {}
d = len(upSet)
for term, freq in up_tf_counter.items():
    up_tf_idf[term] = freq * math.log(d / up_tf_counter[term])

In [15]:
down_tf_counter=Counter()
down_df_counter=Counter()

for index, row in downSet.iterrows():
    query = row['title'] + ' ' + row['content']
    df_tmp = Counter()
    sentence_list = utils.short_sentence(query)

    for item in sentence_list:
      result_cut = monpa.cut(item) 
      for term in result_cut:
        term = term.strip()
        if (len(term) > 1):
          down_tf_counter[term] += 1 
          if(df_tmp[term] == 0): 
            df_tmp[term] = 1 
    down_df_counter += df_tmp

down_tf_idf = {}
d = len(downSet)
for term, freq in down_tf_counter.items():
    down_tf_idf[term] = freq * math.log(d / down_tf_counter[term])

### Likelihood Ratio selection

In [16]:
frames = [upSet, downSet]
fullSet = pd.concat(frames).sort_values('post_time').reset_index()
fullSet

Unnamed: 0,index,id,p_type,s_name,s_area_name,post_time,title,author,content,page_url,stock_name,category
0,0,1646132549195_PTT02R,bbs,Ptt,Stock,2022-03-01 18:56:10.000,[情報] 2323 中環處分有價證券(發哥,addy7533967,1. 標題：公告本公司處分有價證券\n\n2. 來源：公開資訊觀測站\n\n3. 網址：ht...,http://www.ptt.cc/bbs/Stock/M.1646132174.A.167...,聯發科,-1
1,1,1646140407607_PTT02R,bbs,Ptt,Stock,2022-03-01 21:11:34.000,[新聞] 聯發科全力搶攻5G手機 天璣8100、天璣800,Ruo5566,聯發科全力搶攻5G手機 天璣8100、天璣8000登場\nhttps://ec.ltn.co...,http://www.ptt.cc/bbs/Stock/M.1646140301.A.983...,聯發科,-1
2,2,1646204116939_PTT02R,bbs,Ptt,Stock,2022-03-02 14:52:08.000,[情報] 0302上市投信買賣超排行,MOMO0478,買超 賣超\n名次 股票名稱 超張...,http://www.ptt.cc/bbs/Stock/M.1646203930.A.51D...,聯發科,-1
3,3,1646209462036_PTT02R,bbs,Ptt,Stock,2022-03-02 16:20:45.000,[情報] 0302上市外資買賣超排行,MOMO0478,買超 賣超\n名次 股票名稱 ...,http://www.ptt.cc/bbs/Stock/M.1646209247.A.D2B...,聯發科,-1
4,4,1646217643615_PTT02R,bbs,Ptt,Stock,2022-03-02 18:37:59.000,[情報] 0302八大公股銀行買賣超排行,addy7533967,手機介面圖片好讀版：\n\n\n以下資訊依張數排列\n買超 ...,http://www.ptt.cc/bbs/Stock/M.1646217502.A.CF6...,聯發科,-1
...,...,...,...,...,...,...,...,...,...,...,...,...
1345,1856,1706688439506_PTT02R,bbs,Ptt,Stock,2024-01-31 16:06:51.000,[情報] 0131 上市外資買賣超排行,paidzou,1. 標題：0131 上市外資買賣超排行\n\n2. 來源：TWSE\n\n3. 網址：ht...,http://www.ptt.cc/bbs/Stock/M.1706688414.A.A04...,聯發科,1
1346,1857,1706693007636_PTT02R,bbs,Ptt,Stock,2024-01-31 17:22:06.000,[新聞] 〈聯發科法說〉去年Q4獲利登五季新高,xlaws1987,原文連結：https://reurl.cc/lgzzXd\n發布時間： 2024-01-31...,http://www.ptt.cc/bbs/Stock/M.1706692928.A.B95...,聯發科,1
1347,1858,1706706929898_PTT02R,bbs,Ptt,Stock,2024-01-31 21:14:40.000,[新聞] 聯發科旗下奕微科車規晶片、MEMS 相繼問,NerfMePls,原文標題：聯發科旗下奕微科車規晶片、MEMS 相繼問世 搶攻全球車廠訂單\n\n原文連結：h...,http://www.ptt.cc/bbs/Stock/M.1706706882.A.E24...,聯發科,1
1348,1859,1706708637116_PTT02R,bbs,Ptt,Stock,2024-01-31 21:42:06.000,[情報] 113年01月31日信用交易統計,steward135,1. 標題：113年01月31日信用交易統計\n2. 來源：臺灣證券交易所、證券櫃檯買賣中心...,http://www.ptt.cc/bbs/Stock/M.1706708529.A.276...,聯發科,1


In [17]:
import re
def clearSentence(sentence):
    return re.sub(r'[^\u4e00-\u9fa5]+', '', sentence)

In [18]:
def cal_dict_docs(docsFrame):
  df_counter=Counter()

  for index, row in docsFrame.iterrows():
      query = row['title'] + ' ' + row['content']
      df_tmp = Counter()
      sentence_list = utils.short_sentence(query)

      for item in sentence_list:
        item = clearSentence(item)
        result_cut = monpa.cut(item) 
        for term in result_cut:
          term = term.strip()
          if (len(term) > 1):
            if(df_tmp[term] == 0): 
              df_tmp[term] = 1 
      df_counter += df_tmp
  return df_counter

In [19]:
def cal_term_docs(docsRow):
  tf_counter=Counter()

  for index, row in docsRow.iterrows():
      query = row['title'] + ' ' + row['content']
      sentence_list = utils.short_sentence(query)
      
      for item in sentence_list:
        item = clearSentence(item)
        result_cut = monpa.cut(item) 
        for term in result_cut:
          term = term.strip()
          if (len(term) > 1):
            tf_counter[term]+=1
  return tf_counter

In [20]:
def likeliratio_selection(docsFrame, trainCateg=[], nFeatrues=500):
    tfDocsList = {}
    vocabDict = cal_dict_docs(docsFrame)
    ScoreList = {}
        
    for i in range(len(docsFrame)):
        tfDocsList[i] = cal_term_docs(docsFrame.loc[[i]])
        
    for term in vocabDict:
        Score = 0
        for categ in trainCateg:
            real = np.zeros((2,2))
            for index, row in docsFrame.iterrows():
                if (row['category'] == trainCateg[categ]):
                    if (term in tfDocsList[index]):
                        real[0][0] += 1
                    else:
                        real[0][1] += 1
                else:
                    if (term in tfDocsList[index]):
                        real[1][0] += 1
                    else:
                        real[1][1] += 1
            sumTotal = real.sum()
            pt = (real[0][0] + real[1][0]) / sumTotal
            p1 = real[0][0] / (real[0][0] + real[0][1])
            p2 = real[1][0] / (real[1][0] + real[1][1])
            lambdaVal = ((pt ** real[0][0]) * ((1 - pt) ** real[0][1]) * (pt ** real[1][0]) * ((1 - pt) ** real[1][1])) / ((p1 ** real[0][0]) * ((1 - p1) ** real[0][1]) * (p2 ** real[1][0]) * ((1 - p2) ** real[1][1]))
            Score += -2 * math.log(lambdaVal)
        ScoreList[term] = Score
    selected = sorted(ScoreList, key=ScoreList.get, reverse=True)[:nFeatrues]
    totalList = sorted(ScoreList, key=ScoreList.get, reverse=True)
    return selected, totalList

In [21]:

result_keywords, total_keywords = likeliratio_selection(fullSet, trainCateg = [1, -1], nFeatrues = 2000)

  lambdaVal = ((pt ** real[0][0]) * ((1 - pt) ** real[0][1]) * (pt ** real[1][0]) * ((1 - pt) ** real[1][1])) / ((p1 ** real[0][0]) * ((1 - p1) ** real[0][1]) * (p2 ** real[1][0]) * ((1 - p2) ** real[1][1]))


In [22]:
# save the result
with open('result_keywords.pkl', 'wb') as file: 
    pickle.dump(result_keywords, file)
with open('total_keywords.pkl', 'wb') as file: 
    pickle.dump(total_keywords, file)

In [23]:
with open('result_keywords.pkl', 'rb') as f:
    result_keywords = pickle.load(f)
with open('total_keywords.pkl', 'rb') as f:
    total_keywords = pickle.load(f)

In [24]:
result_keywords = total_keywords[:1000]

# 將全部看漲與看跌隨機分類，並預測準確率

In [25]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2

In [26]:
import pandas as pd
from sklearn.model_selection import train_test_split

# 合併上漲和下跌的資料集
df = pd.concat([upSet, downSet], ignore_index=True)

# 分割訓練集和測試集
train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)

print("訓練集大小：", len(train_df))
print("測試集大小：", len(test_df))


訓練集大小： 1080
測試集大小： 270


* 建立訓練資料空間向量

In [27]:
train_tokenStr_list = []
for i in list(train_df.index):
    try:
        txt = clearSentence(train_df['content'][i])
        sentence_list = utils.short_sentence(txt)
        tokenStr = str()
        for sentence in sentence_list:
            tokens = monpa.cut(sentence)
            tokenStr += ' '.join(tokens)
        train_tokenStr_list.append(tokenStr)
    except:
        train_tokenStr_list.append('')

In [28]:
vectorizer = TfidfVectorizer(stop_words=stopwords)
X_train = vectorizer.fit_transform(train_tokenStr_list)
X_train = pd.DataFrame(X_train.toarray(),columns=vectorizer.get_feature_names_out())
X_train



Unnamed: 0,一一一,一一三,一一九五億,一一二,一一定,一三七六,一三四六六七,一下子,一世代,一二,...,龍華城健,龍豐達科,龍順德,龍頭,龍頭廠,龍頭料,龍頭標,龍頭股,龐大,龔明鑫
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.00000,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.07627,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.00000,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.00000,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.00000,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1075,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.00000,0.0,0.0,0.0,0.0,0.0
1076,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.00000,0.0,0.0,0.0,0.0,0.0
1077,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.00000,0.0,0.0,0.0,0.0,0.0
1078,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.00000,0.0,0.0,0.0,0.0,0.0


* 為避免太多建立的矩陣太稀疏，影響後續預測的效率，故此做chi-square方法

In [29]:
y_train = train_df['category']

chi2_selector = SelectKBest(chi2, k = 1000)
chi2_selector.fit(X_train, y_train)
kbest_vocabs = X_train.columns[chi2_selector.get_support()]
X_train = X_train[kbest_vocabs]
X_train

Unnamed: 0,一半,一段,一陣子,三成,三成分股,三福化,三陽,三陽工業,上品,上季,...,高檔,鮑爾,鴻海元大,鴻海南電,鴻海廣達聯電,鴻海晶豪科,鴻海雄獅,黃仁勳,黃金粽,點擊者
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1075,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1076,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1077,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1078,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


* 建立訓練資料空間向量

In [30]:
test_tokenStr_list = []
for i in list(test_df.index):
    try:
        txt = clearSentence(test_df['content'][i])
        sentence_list = utils.short_sentence(txt)
        tokenStr = str()
        for sentence in sentence_list:
            tokens = monpa.cut(sentence)
            tokenStr += ' '.join(tokens)
        test_tokenStr_list.append(tokenStr)
    except:
        test_tokenStr_list.append('')

In [31]:
y_test = test_df['category']

vectorizer = TfidfVectorizer(stop_words=stopwords)
X_test = vectorizer.fit_transform(test_tokenStr_list)
X_test = pd.DataFrame(X_test.toarray(),columns=vectorizer.get_feature_names_out())
X_test = X_test.reindex(kbest_vocabs, axis=1, fill_value=0)
X_test



Unnamed: 0,一半,一段,一陣子,三成,三成分股,三福化,三陽,三陽工業,上品,上季,...,高檔,鮑爾,鴻海元大,鴻海南電,鴻海廣達聯電,鴻海晶豪科,鴻海雄獅,黃仁勳,黃金粽,點擊者
0,0.0,0,0.0,0.0,0,0,0.0,0.0,0.0,0.0,...,0.0,0,0.0,0,0,0,0,0.0,0,0.0
1,0.0,0,0.0,0.0,0,0,0.0,0.0,0.0,0.0,...,0.0,0,0.0,0,0,0,0,0.0,0,0.0
2,0.0,0,0.0,0.0,0,0,0.0,0.0,0.0,0.0,...,0.0,0,0.0,0,0,0,0,0.0,0,0.0
3,0.0,0,0.0,0.0,0,0,0.0,0.0,0.0,0.0,...,0.0,0,0.0,0,0,0,0,0.0,0,0.0
4,0.0,0,0.0,0.0,0,0,0.0,0.0,0.0,0.0,...,0.0,0,0.0,0,0,0,0,0.0,0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
265,0.0,0,0.0,0.0,0,0,0.0,0.0,0.0,0.0,...,0.0,0,0.0,0,0,0,0,0.0,0,0.0
266,0.0,0,0.0,0.0,0,0,0.0,0.0,0.0,0.0,...,0.0,0,0.0,0,0,0,0,0.0,0,0.0
267,0.0,0,0.0,0.0,0,0,0.0,0.0,0.0,0.0,...,0.0,0,0.0,0,0,0,0,0.0,0,0.0
268,0.0,0,0.0,0.0,0,0,0.0,0.0,0.0,0.0,...,0.0,0,0.0,0,0,0,0,0.0,0,0.0


* 建立預測模型-GDBoost

In [32]:
from sklearn.ensemble import GradientBoostingClassifier
clf = GradientBoostingClassifier(n_estimators=100, learning_rate=1, max_depth=7, random_state=0)
clf.fit(X_train, y_train)
clf.score(X_train, y_train)

0.9990740740740741

In [33]:
# 進行預測
predictions = clf.predict(X_test)

# 評估準確率
conf_matrix = confusion_matrix(y_test, predictions)
print("Confusion Matrix:")
print(conf_matrix)

accuracy = (conf_matrix[0][0] + conf_matrix[1][1]) / np.sum(conf_matrix)
print("Accuracy:", accuracy)

Confusion Matrix:
[[ 55  68]
 [ 39 108]]
Accuracy: 0.6037037037037037


* 建立預測模型-NB

In [34]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import confusion_matrix

# Naive Bayes 分類器
classifier = MultinomialNB()

#  使用訓練數據訓練分類器
classifier.fit(X_train, y_train)

#  使用分類器進行預測
predictions = classifier.predict(X_test)

# 評估準確率
conf_matrix = confusion_matrix(y_test, predictions)
print("Confusion Matrix:")
print(conf_matrix)

accuracy = (conf_matrix[0][0] + conf_matrix[1][1]) / np.sum(conf_matrix)
print("Accuracy:", accuracy)


Confusion Matrix:
[[  9 114]
 [  7 140]]
Accuracy: 0.5518518518518518


* 建立預測模型-SVC

In [35]:
from sklearn.svm import SVC

# 创建支持向量机分类器
classifier = SVC(kernel='linear')

# 使用训练数据和标签训练分类器
classifier.fit(X_train, y_train)

# 使用训练好的分类器对测试数据进行预测
predictions = classifier.predict(X_test)

# 生成混淆矩阵
conf_matrix = confusion_matrix(y_test, predictions)
print("Confusion Matrix:")
print(conf_matrix)

# 计算准确率
accuracy = (conf_matrix[0][0] + conf_matrix[1][1]) / np.sum(conf_matrix)
print("Accuracy:", accuracy)

Confusion Matrix:
[[ 37  86]
 [ 27 120]]
Accuracy: 0.5814814814814815


* 建立預測模型-DF

In [36]:
from sklearn.tree import DecisionTreeClassifier

# decision tree 分類器
classifier = DecisionTreeClassifier()

classifier.fit(X_train, y_train)

predictions = classifier.predict(X_test)

conf_matrix = confusion_matrix(y_test, predictions)
print("Confusion Matrix:")
print(conf_matrix)

accuracy = (conf_matrix[0][0] + conf_matrix[1][1]) / np.sum(conf_matrix)
print("Accuracy:", accuracy)

Confusion Matrix:
[[52 71]
 [52 95]]
Accuracy: 0.5444444444444444


* 建立預測模型-Random Forest

In [37]:
from sklearn.ensemble import RandomForestClassifier

classifier = RandomForestClassifier()

classifier.fit(X_train, y_train)

predictions = classifier.predict(X_test)

conf_matrix = confusion_matrix(y_test, predictions)
print("Confusion Matrix:")
print(conf_matrix)

accuracy = (conf_matrix[0][0] + conf_matrix[1][1]) / np.sum(conf_matrix)
print("Accuracy:", accuracy)

Confusion Matrix:
[[ 50  73]
 [ 35 112]]
Accuracy: 0.6


### 回測

In [38]:
fullSet['post_time'] = pd.to_datetime(fullSet['post_time']).dt.date

In [39]:
start_date = datetime.date(2022, 6, 1)
end_date = datetime.date(2024, 1, 31)
current_date = start_date
Score = []
result = []
Date = []
while current_date <= end_date - datetime.timedelta(days=26):
    date_lst = []
    train_startDate = current_date - datetime.timedelta(days=90)
    train_endDate = current_date - datetime.timedelta(days=1)
    print(f"Train: {train_startDate} ~ {train_endDate}")
    test_startDate = current_date
    if current_date.strftime("%m") in ['01','03','05','07','08','10','12']:
        test_endDate = test_startDate + datetime.timedelta(days=30)
    elif current_date.strftime("%m") in ['04','06','09','11']:
        test_endDate = test_startDate + datetime.timedelta(days=29)
    else:
        test_endDate = test_startDate + datetime.timedelta(days=27)
    print(f"Test: {test_startDate} ~ {test_endDate}")

    #train
    train_tokenStr_list = []

    for i in list(fullSet[fullSet['post_time'].between(train_startDate, train_endDate)].index):
        try:
            sentence_list = utils.short_sentence(fullSet['content'][i])
            tokenStr = str()
            for sentence in sentence_list:
                sentence = clearSentence(sentence)
                tokens = monpa.cut(sentence)
                tokenStr += ' '.join(tokens)
            train_tokenStr_list.append(tokenStr)
        except:
            train_tokenStr_list.append('')

    vectorizer = TfidfVectorizer(stop_words=stopwords)
    y_train = fullSet[fullSet['post_time'].between(train_startDate, train_endDate)]['category']

    X_train = vectorizer.fit_transform(train_tokenStr_list)
    X_train = pd.DataFrame(X_train.toarray(),columns=vectorizer.get_feature_names_out())
    X_train[[col for col in X_train.columns if col in result_keywords]]

    X_trainN = pd.DataFrame(index=range(0,len(X_train)), columns=result_keywords).fillna(0)
    X_trainN[[col for col in X_train.columns if col in result_keywords]] = X_train[[col for col in X_train.columns if col in result_keywords]]

    #test
    test_tokenStr_list = []
    date_list = []
    for i in list(fullSet[fullSet['post_time'].between(test_startDate, test_endDate)].index):
        date_list.append(fullSet['post_time'][i])
        try:
            sentence_list = utils.short_sentence(fullSet['content'][i])
            tokenStr = str()
            for sentence in sentence_list:
                sentence = clearSentence(sentence)
                tokens = monpa.cut(sentence)
                tokenStr += ' '.join(tokens)
            test_tokenStr_list.append(tokenStr)
        except:
            test_tokenStr_list.append('')
    
    y_test = fullSet[fullSet['post_time'].between(test_startDate, test_endDate)]['category']

    vectorizer = TfidfVectorizer(stop_words=stopwords)
    X_test = vectorizer.fit_transform(test_tokenStr_list)
    X_test = pd.DataFrame(X_test.toarray(),columns=vectorizer.get_feature_names_out())

    X_testN = pd.DataFrame(index=range(0,len(X_test)), columns=result_keywords).fillna(0)
    X_testN[[col for col in X_test.columns if col in result_keywords]] = X_test[[col for col in X_test.columns if col in result_keywords]]

    #model
    clf = BernoulliNB()
    clf.fit(X_trainN, y_train)
    
    Score.append(clf.score(X_testN, y_test))
    print(Score)
    Predict = clf.predict(X_testN)
    print(Predict)
    result.append(Predict)
    Date.append(date_list)

    print(current_date.strftime("%Y/%m/%d"))
    if current_date.strftime("%m") in ['01','03','05','07','08','10','12']:
        current_date = current_date + datetime.timedelta(days=31)
    elif current_date.strftime("%m") in ['04','06','09','11']:
        current_date = current_date + datetime.timedelta(days=30)
    else:
        current_date = current_date + datetime.timedelta(days=28)

Train: 2022-03-03 ~ 2022-05-31
Test: 2022-06-01 ~ 2022-06-30




[1.0]
[-1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1
 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1
 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1
 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1]
2022/06/01
Train: 2022-04-02 ~ 2022-06-30
Test: 2022-07-01 ~ 2022-07-31




[1.0, 0.14516129032258066]
[ 1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1  1 -1  1
 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1  1  1  1  1 -1
 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1]
2022/07/01
Train: 2022-05-03 ~ 2022-07-31
Test: 2022-08-01 ~ 2022-08-31




[1.0, 0.14516129032258066, 0.8524590163934426]
[ 1 -1  1 -1 -1  1 -1  1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1
 -1 -1  1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1  1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1
 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1  1]
2022/08/01
Train: 2022-06-03 ~ 2022-08-31
Test: 2022-09-01 ~ 2022-09-30




[1.0, 0.14516129032258066, 0.8524590163934426, 0.8125]
[-1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1
 -1  1 -1 -1  1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1
 -1 -1 -1 -1 -1  1 -1 -1  1 -1 -1 -1 -1 -1 -1 -1]
2022/09/01
Train: 2022-07-03 ~ 2022-09-30
Test: 2022-10-01 ~ 2022-10-31




[1.0, 0.14516129032258066, 0.8524590163934426, 0.8125, 0.23809523809523808]
[-1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1  1 -1  1 -1 -1 -1 -1 -1  1 -1 -1  1
 -1 -1 -1  1  1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1  1 -1 -1 -1 -1 -1 -1 -1
 -1 -1 -1 -1  1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1]
2022/10/01
Train: 2022-08-03 ~ 2022-10-31
Test: 2022-11-01 ~ 2022-11-30




[1.0, 0.14516129032258066, 0.8524590163934426, 0.8125, 0.23809523809523808, 0.12244897959183673]
[-1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1
 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1  1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1
 -1]
2022/11/01
Train: 2022-09-02 ~ 2022-11-30
Test: 2022-12-01 ~ 2022-12-31




[1.0, 0.14516129032258066, 0.8524590163934426, 0.8125, 0.23809523809523808, 0.12244897959183673, 0.35294117647058826]
[1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1]
2022/12/01
Train: 2022-10-03 ~ 2022-12-31
Test: 2023-01-01 ~ 2023-01-31




[1.0, 0.14516129032258066, 0.8524590163934426, 0.8125, 0.23809523809523808, 0.12244897959183673, 0.35294117647058826, 1.0]
[1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1]
2023/01/01
Train: 2022-11-03 ~ 2023-01-31
Test: 2023-02-01 ~ 2023-02-28




[1.0, 0.14516129032258066, 0.8524590163934426, 0.8125, 0.23809523809523808, 0.12244897959183673, 0.35294117647058826, 1.0, 0.46808510638297873]
[1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1]
2023/02/01
Train: 2022-12-01 ~ 2023-02-28
Test: 2023-03-01 ~ 2023-03-31




[1.0, 0.14516129032258066, 0.8524590163934426, 0.8125, 0.23809523809523808, 0.12244897959183673, 0.35294117647058826, 1.0, 0.46808510638297873, 0.2289156626506024]
[1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1]
2023/03/01
Train: 2023-01-01 ~ 2023-03-31
Test: 2023-04-01 ~ 2023-04-30




[1.0, 0.14516129032258066, 0.8524590163934426, 0.8125, 0.23809523809523808, 0.12244897959183673, 0.35294117647058826, 1.0, 0.46808510638297873, 0.2289156626506024, 0.6486486486486487]
[-1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1  1 -1 -1 -1 -1 -1 -1  1 -1
 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1
 -1  1 -1 -1  1 -1 -1 -1 -1 -1 -1 -1 -1  1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1
 -1 -1]
2023/04/01
Train: 2023-01-31 ~ 2023-04-30
Test: 2023-05-01 ~ 2023-05-31




[1.0, 0.14516129032258066, 0.8524590163934426, 0.8125, 0.23809523809523808, 0.12244897959183673, 0.35294117647058826, 1.0, 0.46808510638297873, 0.2289156626506024, 0.6486486486486487, 0.015384615384615385]
[-1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1
 -1 -1 -1 -1 -1 -1 -1 -1  1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1
 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1]
2023/05/01
Train: 2023-03-03 ~ 2023-05-31
Test: 2023-06-01 ~ 2023-06-30




[1.0, 0.14516129032258066, 0.8524590163934426, 0.8125, 0.23809523809523808, 0.12244897959183673, 0.35294117647058826, 1.0, 0.46808510638297873, 0.2289156626506024, 0.6486486486486487, 0.015384615384615385, 0.16326530612244897]
[ 1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1 -1 -1  1  1
  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1 -1  1  1  1  1 -1  1  1
  1]
2023/06/01
Train: 2023-04-02 ~ 2023-06-30
Test: 2023-07-01 ~ 2023-07-31




[1.0, 0.14516129032258066, 0.8524590163934426, 0.8125, 0.23809523809523808, 0.12244897959183673, 0.35294117647058826, 1.0, 0.46808510638297873, 0.2289156626506024, 0.6486486486486487, 0.015384615384615385, 0.16326530612244897, 0.5384615384615384]
[-1 -1 -1 -1  1 -1 -1  1 -1 -1  1 -1  1]
2023/07/01
Train: 2023-05-03 ~ 2023-07-31
Test: 2023-08-01 ~ 2023-08-31




[1.0, 0.14516129032258066, 0.8524590163934426, 0.8125, 0.23809523809523808, 0.12244897959183673, 0.35294117647058826, 1.0, 0.46808510638297873, 0.2289156626506024, 0.6486486486486487, 0.015384615384615385, 0.16326530612244897, 0.5384615384615384, 0.7021276595744681]
[1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1]
2023/08/01
Train: 2023-06-03 ~ 2023-08-31
Test: 2023-09-01 ~ 2023-09-30




[1.0, 0.14516129032258066, 0.8524590163934426, 0.8125, 0.23809523809523808, 0.12244897959183673, 0.35294117647058826, 1.0, 0.46808510638297873, 0.2289156626506024, 0.6486486486486487, 0.015384615384615385, 0.16326530612244897, 0.5384615384615384, 0.7021276595744681, 0.03508771929824561]
[-1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1
 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1
 -1 -1 -1 -1 -1 -1 -1  1  1]
2023/09/01
Train: 2023-07-03 ~ 2023-09-30
Test: 2023-10-01 ~ 2023-10-31




[1.0, 0.14516129032258066, 0.8524590163934426, 0.8125, 0.23809523809523808, 0.12244897959183673, 0.35294117647058826, 1.0, 0.46808510638297873, 0.2289156626506024, 0.6486486486486487, 0.015384615384615385, 0.16326530612244897, 0.5384615384615384, 0.7021276595744681, 0.03508771929824561, 0.9777777777777777]
[1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1]
2023/10/01
Train: 2023-08-03 ~ 2023-10-31
Test: 2023-11-01 ~ 2023-11-30




[1.0, 0.14516129032258066, 0.8524590163934426, 0.8125, 0.23809523809523808, 0.12244897959183673, 0.35294117647058826, 1.0, 0.46808510638297873, 0.2289156626506024, 0.6486486486486487, 0.015384615384615385, 0.16326530612244897, 0.5384615384615384, 0.7021276595744681, 0.03508771929824561, 0.9777777777777777, 0.9137931034482759]
[1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1]
2023/11/01
Train: 2023-09-02 ~ 2023-11-30
Test: 2023-12-01 ~ 2023-12-31




[1.0, 0.14516129032258066, 0.8524590163934426, 0.8125, 0.23809523809523808, 0.12244897959183673, 0.35294117647058826, 1.0, 0.46808510638297873, 0.2289156626506024, 0.6486486486486487, 0.015384615384615385, 0.16326530612244897, 0.5384615384615384, 0.7021276595744681, 0.03508771929824561, 0.9777777777777777, 0.9137931034482759, 0.38461538461538464]
[1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1]
2023/12/01
Train: 2023-10-03 ~ 2023-12-31
Test: 2024-01-01 ~ 2024-01-31




[1.0, 0.14516129032258066, 0.8524590163934426, 0.8125, 0.23809523809523808, 0.12244897959183673, 0.35294117647058826, 1.0, 0.46808510638297873, 0.2289156626506024, 0.6486486486486487, 0.015384615384615385, 0.16326530612244897, 0.5384615384615384, 0.7021276595744681, 0.03508771929824561, 0.9777777777777777, 0.9137931034482759, 0.38461538461538464, 0.7236842105263158]
[ 1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1
  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1
  1  1  1  1 -1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1
  1  1  1  1]
2024/01/01


### 預測每日漲跌

In [40]:
def daily_predictions(time_list, prediction_list):
    daily_results = {}
    
    for time, prediction in zip(time_list, prediction_list):
        # 將日期轉換為每日日期
        daily_date = time
        
        # 如果這一天還沒有預測結果的列表，就建立一個新的列表
        if daily_date not in daily_results:
            #print(daily_date)
            daily_results[daily_date] = []
        
        # 將預測結果加入到該日期的列表中
        daily_results[daily_date].append(prediction)
    
    # 進行投票
    daily_votes = {}
    for date, predictions in daily_results.items():
        # 計算每個日期的投票結果
        positive_count = sum(1 for result in predictions if result == 1)
        negative_count = sum(1 for result in predictions if result == -1)
        
        # 決定每日的投票結果
        if positive_count > negative_count:
            daily_votes[date] = 1
        elif positive_count < negative_count:
            daily_votes[date] = -1
        else:
            # 如果正負票數相等，你可以採取一個預設的決策，比如0或者None
            daily_votes[date] = 0
    
    return daily_votes

In [41]:
#flat Date
Date = [item for sublist in Date for item in sublist]
result = [item for sublist in result for item in sublist]

In [42]:
daily_votes = daily_predictions(Date, result)

In [43]:
#找出所有交易日
trade_days = sorted(transaction_df.loc[(transaction_df['stock_name']=='聯發科')]['date'])
#turn elements in trade_days to date
trade_days = pd.to_datetime(trade_days).date
trade_days = trade_days[62:]

#if a date it not in daily_votes, add it to daily_votes and set its value to 0
for date in trade_days:
    if date not in daily_votes:
        daily_votes[date] = 0

#if a date is in daily_votes but not in trade_days, remove it from daily_votes
for date in list(daily_votes.keys()):
    if date not in trade_days:
        daily_votes.pop(date)
#trade_days = [datetime.datetime.strptime(date[:10], "%Y-%m-%d") for date in trade_days]
#trade_days

### 預測結果輸出

In [44]:
vote_list = [(str(key), value) for key, value in daily_votes.items()]
vote_list = sorted(vote_list, key = lambda x: x[0])
with open('Prediction_20220601_20240131_PTT.csv', 'w') as file:
    for key, value in vote_list:
        file.write('%s,%s\n' % (key, value))