In [10]:
import pandas as pd
import numpy as np

import warnings
# 關閉所有警告
warnings.filterwarnings("ignore")

import datetime
import monpa
from monpa import utils
import re

from collections import Counter
import math
import heapq
import pickle

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2

from sklearn.svm import SVC
from sklearn.model_selection import cross_val_score, cross_val_predict
from sklearn.metrics import confusion_matrix, classification_report

from sklearn.svm import SVC
from sklearn.model_selection import cross_val_score

In [11]:
df_MTK_content = pd.read_csv('MTK_2%_2023_content_v2.csv')
df_MTK_content['post_time_update'] = pd.to_datetime(df_MTK_content['post_time_update']).dt.date
df_MTK_content.shape

(12183, 5)

In [12]:
df_MTK_raw_data_content = df_MTK_content[['title','content','post_time_update','s_name']]
df_MTK_raw_data_content.shape

(12183, 4)

In [13]:
df_MTK_content['label'].value_counts()

label
持平    8483
漲     1893
跌     1807
Name: count, dtype: int64

In [14]:
def clearSentence(sentence):
    return re.sub(r'[^\u4e00-\u9fa5]+', '', sentence)

with open('stopwords_zh.txt', 'r') as file:
    stopwords = file.read().splitlines()
file.close()
# 停用詞
len(stopwords)

1893

In [15]:
def cutTerm(articles, start_date, end_date):
    tokenStr_list = []
    date_list = []  # 新增一個列表來保存日期
    # 確保範圍內的文章都被處理
    filtered_articles = articles[articles['post_time_update'].between(start_date, end_date)]
    
    for index, row in filtered_articles.iterrows():

        title = str(row['title'])
        content = str(row['content'])
        query = title + content
        
        try:
            tokenStr = ""
            sentence_list = utils.short_sentence(query)  # 斷句(一句不能超過200字)
            for sentence in sentence_list:
                sentence = clearSentence(sentence)  # 只留下中文
                tokens = monpa.cut(sentence)  # 斷詞
                tokens = [token.strip() for token in tokens if len(token.strip()) > 1]  # 去除多餘空白，只留長度大於1的token
                tokenStr += ' '.join(tokens) + ' '  # 確保每個token之後有空格

            tokenStr_list.append(tokenStr)
            date_list.append(row['post_time_update'])  # 保存對應的日期
        except Exception as e:
            print(f'處理文章時出錯: {e}')  # 印出錯誤訊息
            tokenStr_list.append('')  # 在錯誤的情況下加入空字符串
            date_list.append(row['post_time_update'])  # 仍然要保存日期

    return tokenStr_list, date_list  # 返回token字符串列表和日期列表

In [16]:
def TF_IDF(articles_term): #要放入切詞後的
    tf_counter = Counter() #預備統計tf用
    df_counter = Counter() #預備統計df用
    for terms in articles_term:
        terms = terms.split(' ')
        df_tmp=Counter() #暫存本篇df用
        for term in terms:
            tf_counter[term]+=1 #tf加1
            if(df_tmp[term]==0): #若本篇之前不曾出現
                df_tmp[term]=1 #df標為1
        
        df_counter += df_tmp
    return tf_counter,df_counter

In [17]:
def calculate_tfidf_keywords(tokenStr_list_up, tokenStr_list_down):
    tf_counter_up, df_counter_up = TF_IDF(tokenStr_list_up)
    tf_counter_down, df_counter_down = TF_IDF(tokenStr_list_down)

    tf_dict_up = dict(tf_counter_up)
    df_dict_up = dict(df_counter_up)
    tf_dict_down = dict(tf_counter_down)
    df_dict_down = dict(df_counter_down)

    # Calculate TF-IDF for "up"
    tfidf_dict_up = {}
    N_up = len(tokenStr_list_up)
    for word in tf_dict_up:
        tf = float(tf_dict_up[word])
        idf = float(df_dict_up[word])
        tfidf = (1 + math.log(tf)) * (math.log(N_up / idf)) if tf > 0 and idf > 0 else 0.0
        tfidf_dict_up[word] = tfidf

    # Calculate TF-IDF for "down"
    tfidf_dict_down = {}
    N_down = len(tokenStr_list_down)
    for word in tf_dict_down:
        tf = float(tf_dict_down[word])
        idf = float(df_dict_down[word])
        tfidf = (1 + math.log(tf)) * (math.log(N_down / idf)) if tf > 0 and idf > 0 else 0.0
        tfidf_dict_down[word] = tfidf

    # Select the top 200 by value
    top_200_up = heapq.nlargest(200, tfidf_dict_up.items(), key=lambda x: x[1])
    top_200_down = heapq.nlargest(200, tfidf_dict_down.items(), key=lambda x: x[1])

    keyword_list = [word for word, value in top_200_up + top_200_down]
    keyword_set = set(keyword_list)  # Remove duplicates
    return list(keyword_set)


---

## 開始移動回測

In [18]:
final_results = pd.DataFrame()

# 開始日期和結束日期
start_date = datetime.date(2022, 3, 1)
end_date = datetime.date(2023, 12, 20)

while start_date < end_date:
    # 設定訓練和測試的日期範圍
    train_startDate = start_date
    train_endDate = train_startDate + datetime.timedelta(days=90)  # 加三個月
    test_startDate = train_endDate
    test_endDate = test_startDate + datetime.timedelta(days=30)  # 加一個月

    print(f"======== 正在處理從 {train_startDate} 到 {train_endDate} 的訓練數據 ========")
    print(f"======== 將使用從 {test_startDate} 到 {test_endDate} 的數據進行預測 ========")

    # 訓練數據準備
    train_data = df_MTK_content[df_MTK_content['post_time_update'].between(train_startDate, train_endDate)].reset_index(drop=True)
    train_stock_sep = train_data.copy().groupby('label')
    train_stock_up = train_stock_sep.get_group('漲').reset_index(drop=True)
    train_stock_down = train_stock_sep.get_group('跌').reset_index(drop=True)
    train_stock_total = pd.concat([train_stock_up, train_stock_down], axis=0, ignore_index=True)

    train_tokenStr_list_up, date_list_up = cutTerm(train_stock_up, train_startDate, train_endDate)
    train_tokenStr_list_down, date_list_down = cutTerm(train_stock_down, train_startDate, train_endDate)
    print("訓練集切詞完畢！")
    keyword_list = calculate_tfidf_keywords(train_tokenStr_list_up, train_tokenStr_list_down)
    print(f"生成的關鍵字列表長度：{len(keyword_list)}")

    # 建立空間向量模型
    vectorizer = TfidfVectorizer(vocabulary=keyword_list, use_idf=True, stop_words=stopwords)
    train_content = train_tokenStr_list_up + train_tokenStr_list_down

    y_train = train_stock_total['label']
    X_train = vectorizer.fit_transform(train_content)
    X_train = pd.DataFrame(X_train.toarray(), columns=vectorizer.get_feature_names_out())
    k_n = 150
    chi2_selector = SelectKBest(chi2, k=k_n)
    chi2_selector.fit(X_train, y_train)
    kbest_vocabs = X_train.columns[chi2_selector.get_support()]
    X_train = X_train[kbest_vocabs]

    classifier = SVC(kernel='linear')
    scores = cross_val_score(classifier,X_train,y_train,cv=5,scoring='accuracy') #交叉驗證，計算準確率
    print("Average Accuracy:", scores.mean())

    classifier = SVC(kernel='linear')
    classifier.fit(X_train, y_train)

    # 測試數據準備和預測
    test_data = df_MTK_raw_data_content[df_MTK_raw_data_content['post_time_update'].between(test_startDate, test_endDate)].reset_index(drop=True)
    test_data_tokenStr, date_list = cutTerm(test_data, test_startDate, test_endDate)
    print("測試集切詞完畢！")
    X_test = vectorizer.transform(test_data_tokenStr)
    X_test = pd.DataFrame(X_test.toarray(), columns=vectorizer.get_feature_names_out())[kbest_vocabs]
    y_pred = classifier.predict(X_test)

    # 儲存當前月份的結果
    results = pd.DataFrame({
        'date': date_list,
        'prediction': y_pred
    })
    results = results.sort_values(by='date')
    results = results.groupby('date')['prediction'].value_counts(normalize=True).unstack(fill_value=0)

    if '漲' not in results.columns:
        results['漲'] = 0.0
    if '跌' not in results.columns:
        results['跌'] = 0.0
    # Adding proportion columns
    results['漲'] = (results['漲']).round(4)
    results['跌'] = (results['跌']).round(4)
    results['pred_result'] = results.apply(lambda row: '漲' if row['漲'] > row['跌'] else '跌', axis=1)
    results.reset_index(inplace=True)

    file_name = f"E:\BDA_2024\mid\SVM_result\{test_startDate}_{test_endDate}.csv"
    results.to_csv(file_name,index=False)

    final_results = pd.concat([final_results, results], ignore_index=True)
    # 更新日期為下一個循環
    start_date = start_date + datetime.timedelta(days=30)
    print("---------------------進入下一批訓練--------------------")




訓練集切詞完畢！
生成的關鍵字列表長度：368
Average Accuracy: 0.6025688073394495
測試集切詞完畢！
---------------------進入下一批訓練--------------------
訓練集切詞完畢！
生成的關鍵字列表長度：366
Average Accuracy: 0.6463323201621074
測試集切詞完畢！
---------------------進入下一批訓練--------------------
訓練集切詞完畢！
生成的關鍵字列表長度：357
Average Accuracy: 0.6272727272727272
測試集切詞完畢！
---------------------進入下一批訓練--------------------
訓練集切詞完畢！
生成的關鍵字列表長度：365
Average Accuracy: 0.6205977710233029
測試集切詞完畢！
---------------------進入下一批訓練--------------------
訓練集切詞完畢！
生成的關鍵字列表長度：372
Average Accuracy: 0.6590483827853515
測試集切詞完畢！
---------------------進入下一批訓練--------------------
訓練集切詞完畢！
生成的關鍵字列表長度：373
Average Accuracy: 0.6517647058823529
測試集切詞完畢！
---------------------進入下一批訓練--------------------
訓練集切詞完畢！
生成的關鍵字列表長度：374
Average Accuracy: 0.6884210526315789
測試集切詞完畢！
---------------------進入下一批訓練--------------------
訓練集切詞完畢！
生成的關鍵字列表長度：370
Average Accuracy: 0.6299553484760241
測試集切詞完畢！
---------------------進入下一批訓練--------------------
訓練集切詞完畢！
生成的關鍵字列表長度：375
Average Accuracy: 0.6667

In [19]:
final_results.to_csv("移動回測結果_SVM.csv",index=False)

In [20]:
final_results

prediction,date,漲,跌,pred_result
0,2022-05-30,0.4286,0.5714,跌
1,2022-05-31,0.2750,0.7250,跌
2,2022-06-01,0.2222,0.7778,跌
3,2022-06-02,0.1765,0.8235,跌
4,2022-06-06,0.0000,1.0000,跌
...,...,...,...,...
433,2024-02-20,0.8519,0.1481,漲
434,2024-02-21,0.7391,0.2609,漲
435,2024-02-22,0.8571,0.1429,漲
436,2024-02-23,0.7097,0.2903,漲
