## Part 1. Data Preprocessing

In [12]:
# Import packages and functions for part 1
import pandas as pd
from ckiptagger import WS, POS, NER
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_selection import SelectKBest, chi2
from scipy import sparse

# 先把我們需要的函數載入
ws = WS("./data_ckip") # 斷詞
pos = POS("./data_ckip") # 詞性標注
ner = NER("./data_ckip") # 命名實體識別

  cell = tf.compat.v1.nn.rnn_cell.LSTMCell(hidden_d, name=name)
2024-04-19 19:38:30.086547: I tensorflow/compiler/mlir/mlir_graph_optimization_pass.cc:388] MLIR V1 optimization pass is not enabled
  cell = tf.compat.v1.nn.rnn_cell.LSTMCell(hidden_d, name=name)
  cell = tf.compat.v1.nn.rnn_cell.LSTMCell(hidden_d, name=name)


In [26]:
def clean(sentence_ws, sentence_pos):
    '''
    - sentence_ws: 經過斷詞的句子
    - sentence_pos: 經過詞性標注的句子
    - 留下特定的詞性（名詞、動詞、形容詞）、排除一個字的詞、專有名詞
    '''
    cleaned = []
    for word, pos in zip(sentence_ws, sentence_pos):
        is_Na_or_V_or_A_or_D = pos.startswith("Na") or pos.startswith("V") or pos.startswith("A")
        if is_Na_or_V_or_A_or_D and len(word) > 1:
            cleaned.append(word)
    return " ".join(cleaned)

# Word segmentation
def word_segmentation(contents, ws_driver, pos_driver, ner_driver):
    '''
    - contents: 一個 list，每個元素是一篇文章的全文內容
    - ws: 斷詞模型
    '''
    ws_results = ws_driver(contents)
    pos_results = pos_driver(ws_results)
    contents_cleaned = []
    for sentence, sentence_ws, sentence_pos in zip(contents, ws_results, pos_results):
        sentence_cleaned = clean(sentence_ws, sentence_pos)
        contents_cleaned.append(sentence_cleaned)

    return contents_cleaned

In [28]:
data_news = pd.read_csv('../data/news_filtered_merged.csv')
contents = data_news['content'].tolist()
contents_cleaned = word_segmentation(contents, ws, pos, ner)
# add the cleaned content back to the dataframe as a column named 'content_cleaned'
data_news['content_cleaned'] = contents_cleaned
data_news.to_csv('../data/news_cleaned.csv', index=False)

In [81]:
# function to generate cleaned data and save
def generate_cleaned_data(name, data, ws, pos, ner):
    contents = data['content'].tolist()
    contents_cleaned = word_segmentation(contents, ws, pos, ner)
    data['content_cleaned'] = contents_cleaned
    data.to_csv(f'../data/{name}_cleaned.csv', index=False)

In [95]:
# generate cleaned data for ptt and dcard
data_ptt = pd.read_csv('../data/ptt_filtered_labeled.csv')
data_dcard = pd.read_csv('../data/dcard_filtered_labeled.csv')
data_to_clean = {'ptt': data_ptt, 'dcard': data_dcard}
for name, data in data_to_clean.items():
    generate_cleaned_data(name, data, ws, pos, ner)

## Part 2. Feature Engineering

In [1]:
# import packages 
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_selection import SelectKBest, chi2
from scipy import sparse

# parameters 
n_features = {'ptt': 10, 'dcard': 10, 'news': 10, 'all': 10}

In [2]:
def feature_extraction(source, data, n_features, days_ahead):
    '''
    - source: the source of the data
    - data: a pandas dataframe with a column named 'content_cleaned'
    - n_features: the number of features to select
    - days_ahead: the number of days ahead to predict
    '''
    # Drop the rows where label = -1
    data = data[data['label_day'+str(days_ahead)] != -1]
    # TF-IDF
    vectorizer = TfidfVectorizer(max_df=0.95, min_df=2, ngram_range=(1, 3))
    X = vectorizer.fit_transform(data['content_cleaned'])
    # select the top 1000 features
    ch2 = SelectKBest(chi2, k=min(n_features[source],X.shape[1] - 30)) 
    X = ch2.fit_transform(X, data['label_day'+str(days_ahead)])
    # get the target variable
    y = data['label_day'+str(days_ahead)]
    return X, y

## Part 3. Model Training for All

In [3]:
# import packages
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.ensemble import StackingClassifier
from sklearn.model_selection import GridSearchCV, StratifiedKFold
from scipy.stats import randint
from sklearn.metrics import confusion_matrix
import numpy as np
import pandas as pd
import tqdm

# initialize all parameters
params = {
    'Naive Bayes': {},
    'Random Forest': {
        'n_estimators': [50, 100, 150],
        'max_depth': [None, 10, 20, 30],
        'min_samples_split': [2, 5, 10],
        'min_samples_leaf': [1, 2, 4]
    },
    'SVM': {
        'C': [0.1],
        #'kernel': ['linear', 'rbf', 'poly'],
        'kernel': ['rbf'],
        'gamma': ['scale'],
    },
    'XGBoost': {
        'learning_rate': [0.01, 0.1, 0.2],
        'n_estimators': [100, 200, 300],
        'max_depth': [3, 4, 5],
        'subsample': [0.8, 0.9, 1.0],
    }
}

In [4]:
def train(source, data, n_features, days_ahead, backtest=False, period = -1):
    '''
    - source: the source of the data
    - data: a pandas dataframe with a column named 'content_cleaned'
    - n_features: the number of features to select
    - days_ahead: the number of days ahead to predict
    '''
    result = {}
    # feature extraction
    X, y = feature_extraction(source, data, n_features, days_ahead)
    # densify the sparse matrix
    X = X.toarray()
    # split the data into training and testing sets
    if backtest:
        # 因為後面有照日期排序，所以這邊可以照順序分出 test 和 train(上一個方啊會報錯)
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, shuffle=False)
    else:
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    # kfold
    kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
    models = {
        'Naive Bayes': GaussianNB(),
        'Random Forest': RandomForestClassifier(),
        'SVM': SVC(),
        'XGBoost': XGBClassifier()
    }
    # search for the optimal parameters for each model by grid search
    best_models = {}
    for model_name, model in models.items():
        grid_search = GridSearchCV(model, params[model_name], cv=kfold, scoring='accuracy', n_jobs=-1)
        grid_search.fit(X_train, y_train)
        best_models[model_name] = grid_search.best_estimator_
    # stack the models
    stack = StackingClassifier(estimators=[(name, model) for name, model in best_models.items()], final_estimator=LogisticRegression())
    stack.fit(X_train, y_train)
    # predict the test set and calculate the accuracy and confusion matrix, and save the results
    # best_models (respectively)
    for name, model in best_models.items():
        y_pred = model.predict(X_test)
        accuracy = accuracy_score(y_test, y_pred)
        confusion = confusion_matrix(y_test, y_pred)
        result[name] = {'accuracy': accuracy, 'confusion': confusion, 'prediction': y_pred}
    # stack
    y_pred = stack.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    confusion = confusion_matrix(y_test, y_pred)
    result['stack'] = {'accuracy': accuracy, 'confusion': confusion, 'prediction': y_pred}

    return result

In [5]:
# Split the data into training and testing sets
datas = {
    'ptt': pd.read_csv('../data/ptt_cleaned.csv'),
    'dcard': pd.read_csv('../data/dcard_cleaned.csv'),
    'news': pd.read_csv('../data/news_cleaned.csv'),
}
# Train the models for each source media and get the results
results = {}
for name, data in datas.items():
    # drop nan for content_cleaned
    data = data.dropna(subset=['content_cleaned'])
    print(f'Training models for {name}')
    results[name] = {}
    for days_ahead in tqdm.tqdm(range(1, 6)):
        results[name][days_ahead] = train(name, data, n_features, days_ahead)

Training models for ptt


100%|██████████| 5/5 [00:42<00:00,  8.58s/it]


Training models for dcard


100%|██████████| 5/5 [00:52<00:00, 10.56s/it]


Training models for news


100%|██████████| 5/5 [01:19<00:00, 15.94s/it]


In [6]:
# flatten the results
results_flat = {}
for name, result in results.items():
    for days_ahead, models in result.items():
        for model_name, model_result in models.items():
            result_name = f'{name}_{days_ahead}_{model_name}'
            results_flat[result_name] = model_result

# save the flattened results
results_df = pd.DataFrame(results_flat).T
results_df.to_csv('../data/prob2_results.csv')

In [7]:
results_df.sort_values(by='accuracy', ascending=False, inplace=True)
results_df
results_df.to_csv('../data/prob2_sorted_f10.csv')

## Part 4. Backtesting

In [9]:
# import packages
from tqdm import tqdm

In [48]:
def backtest(source, data, period):
    '''
    - source: the source of the data (news, ptt, dcard)
    - data: a pandas dataframe with a column named 'content_cleaned'
    - period: the number of days to backtest
    '''
    data.loc[:, 'date'] = pd.to_datetime(data['date'])
    data = data.sort_values(by='date')
    result = {} # key is the period, value is the accuracy and confusion matrix of the best model
    start_date = data['date'].min()
    
    while start_date + pd.Timedelta(days=int(period*0.2) - 1) <= data['date'].max():
        end_date = start_date + pd.Timedelta(days=period - 1)
        period_data = data[(data['date'] >= start_date) & (data['date'] <= end_date)]
        date_interval = f'{start_date.strftime("%Y-%m-%d")}_{end_date.strftime("%Y-%m-%d")}'
        result[date_interval] = {}
        for days_ahead in range(1, 6):
            result_name = f'day_{days_ahead}'
            result[date_interval][result_name] = train(source, period_data, n_features, days_ahead, backtest=True, period=period)
        start_date = start_date + pd.Timedelta(days=int(period*0.2))
    return result

In [49]:
datas = {
    'ptt': pd.read_csv('../data/ptt_cleaned.csv'),
    'dcard': pd.read_csv('../data/dcard_cleaned.csv'),
    'news': pd.read_csv('../data/news_cleaned.csv'),
}
# Train the models for each source media and get the results
results_backtest = {}
# 初始化混淆矩陣
total_conf_matrix = np.zeros((2, 2))
for name, data in datas.items():
    # drop nan for content_cleaned
    data = data.dropna(subset=['content_cleaned'])
    print(f'Training models for {name}')
    results_backtest[name] = {}
    results_backtest[name] = backtest(name, data, 90)
    print(results_backtest[name])

Training models for ptt




ValueError: 
All the 5 fits failed.
It is very likely that your model is misconfigured.
You can try to debug the error by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
5 fits failed with the following error:
Traceback (most recent call last):
  File "/Users/chengliang/anaconda3/lib/python3.11/site-packages/sklearn/model_selection/_validation.py", line 732, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/Users/chengliang/anaconda3/lib/python3.11/site-packages/sklearn/base.py", line 1151, in wrapper
    return fit_method(estimator, *args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/chengliang/anaconda3/lib/python3.11/site-packages/sklearn/svm/_base.py", line 199, in fit
    y = self._validate_targets(y)
        ^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/chengliang/anaconda3/lib/python3.11/site-packages/sklearn/svm/_base.py", line 747, in _validate_targets
    raise ValueError(
ValueError: The number of classes has to be greater than one; got 1 class


In [None]:
# flatten the results
results_backtest_flat = {}
total_confusion_matrix = {}
total_accuracy = {}

# 初始化 total confusion matrix
for name, result in results_backtest.items():
    total_confusion_matrix[name] = {}
    total_accuracy[name] = {}
    for date_interval, days_ahead in result.items():
        for day_ahead, model in days_ahead.items():
            total_confusion_matrix[name][day_ahead] = {}
            total_accuracy[name][day_ahead] = {}
            for model_name, model_result in model.items():
                total_confusion_matrix[name][day_ahead][model_name] = np.zeros((2, 2))
                total_accuracy[name][day_ahead][model_name] = 0

for name, result in results_backtest.items():
    for date_interval, days_ahead in result.items():
        for day_ahead, model in days_ahead.items():
            for model_name, model_result in model.items():
                #計算total confusion matrix
                total_confusion_matrix[name][day_ahead][model_name] += model_result["confusion"]
                total_accuracy[name][day_ahead][model_name] += model_result["accuracy"]

for name, result in results_backtest.items():
    for date_interval, days_ahead in result.items():
        for day_ahead, model in days_ahead.items():
            for model_name, model_result in model.items():
                result_name = f'{name}_{day_ahead}_{model_name}'
                total_acc_conf = {"accuracy": total_accuracy[name][day_ahead][model_name],"confusion_matrix": total_confusion_matrix[name][day_ahead][model_name]}
                results_backtest_flat[result_name] = total_acc_conf

# save the flattened results
results_df = pd.DataFrame(results_backtest_flat).T
results_df.sort_values(by='accuracy', ascending=False, inplace=True)
results_df.to_csv('../data/results/backtest_result.csv')


## Playground

In [27]:
data = pd.read_csv('../data/news_filtered_merged.csv')
contents = data['content'].tolist()
contents_cleaned = word_segmentation(contents[:100], ws, pos, ner)
print(contents_cleaned[0])

相比 新興 債券 面臨 悲觀 預期 認為 新興 上漲 空間 抵擋 貨幣 縮減 政策 通膨 觸頂 原物料 出口 有利 財務 狀況 新興 國家 看好 標的 新興 主權 債券 需要 選擇性 投資 主動式 投資 機會 超越 指數 表現 特別 看好 投資 主題 原物料 商品 旅遊業 相對 落後 國家 主題 國家 債券 殖利率 債券 利差 約為 基本點 疫情 擴大 基本點 具有 吸引力 相較 投資級 債券 利差 疫情 水平 相近 約為 基本點 利差 收歛 收斂 空間 原物料 商品 價格 走高 相關 出口國 可望 受惠 出口 綠色 轉型 金屬 國家 計畫 布局 標的 綠色 轉型 過程 石油 基礎 設施 投資 不足 意味 綠色 能源 接手 石油 價格 維持 高檔 石油 出口國 吸引力 國家 出現 巨額 財政 赤字 疫情 惡化 盈餘 現金 用來 支撐 經濟 認為 投資 石油 出口國 支持 綠色 轉型 目標 衝突 遠離 排放 長期 趨勢 實現 石油 出口國 利用 債券 收益 確保 發展 主題 旅遊業 國家 收入 疫情 封鎖 期間 出現 下滑 疫苗 接種 檢測 技術 改進 旅人 回歸 旅遊業 回升 時間 問題 方式 旅遊 收入 增加 事實 國家 看到 相關 跡象 萌芽 國家 承受 病毒 肆虐 密切 關注 資源 不足 疫苗 接種 經濟 重啟 方面 進展 緩慢 國家 沙漠 國家 疫苗 接種率 相信 基本面 改善 國家 迎頭趕上 創造 良好 投資 機會


In [17]:
import pandas as pd
df_news = pd.read_csv('../data/dcard_cleaned.csv')
contents_news = df_news['content_cleaned'].tolist()
print(contents_news[:20])


['教學', '奈米 韭菜 這樣 站上 大關 小弟 成本 獲利 高\n\n 除息 知道 填息 賣掉 獲利 股息\n\n\n\n 小弟 知道 謝謝', nan, '通報 文章 想法 來去 看看', '除息稅 除息 差\n 看跌', '機會 準備 位數', '刪除 內容 一樣 錯過 相見', '文章 分析 基本面 預測 參考', '高歌 離席', '參加', '成本', '關注 航運 報價 營收 營收 準備 營收 公布', '權息', '刪除 內容 一樣 錯過 相見', '你我 持續 大膽 加碼', '傾向 賣掉 疫情 結果 實現 收益 收益 股息 賣掉去 殖利率 標的', nan, '停利 目標 區間 開始 波段 倉位 停利 動作 預計 停利 出場 嘗試 摸頭 收盤 十日線 作為 停利 標準 方法 擇時 交易 實際 時間 成本 操作 策略 保持 長期 操作 倉位 原文 停止 操作 條件 達成 停下 開始 使用 股票 研究 報告 出來 發文 發文 發現 持有 投資 充滿 絕望 決定 分享出來 股票 操作 感到 手足無措 出來 分享 兼顧 身心健康 大錢', '這樣 啦\n 攤到 繼續 機會', '賣掉 想說 低檔 接回 噴到 時機 難過']


In [112]:
datas['dcard']['label_day3'].value_counts()

label_day3
-1.0    2724
 0.0    2594
 1.0    1343
Name: count, dtype: int64