In [1]:
import os
import pickle
from datetime import datetime, timedelta
from collections import Counter
from tqdm import tqdm

import numpy as np
import pandas as pd

## function

In [12]:
# Preprocessing: reduce every timestamp to a midnight calendar date.
def label_preprocessing(data_list, transaction):
    """Normalise the date columns of the post datasets and the price table.

    Args:
        data_list: list of DataFrames, each with a ``post_time`` column.
            Every frame is modified in place: ``post_time`` is parsed to
            datetime and a ``date`` column (time-of-day dropped) is added.
        transaction: DataFrame with ``date`` and ``stock_name`` columns.
            Its ``date`` column is overwritten in place with the
            time-of-day dropped.

    Returns:
        ``(data_list, sorted_trans)`` where ``data_list`` is the same list
        object that was passed in and ``sorted_trans`` is a new frame sorted
        by ``date`` then ``stock_name`` with a fresh 0..n-1 index.
    """
    trade_stamps = pd.to_datetime(transaction['date'])
    transaction['date'] = pd.to_datetime(trade_stamps.dt.date)

    for posts in data_list:
        posted_at = pd.to_datetime(posts['post_time'])
        posts['post_time'] = posted_at
        # Going through .dt.date strips the time-of-day component.
        posts["date"] = pd.to_datetime(posted_at.dt.date)

    ordered = transaction.sort_values(by=["date", "stock_name"])
    sorted_trans = ordered.reset_index(drop=True)
    return data_list, sorted_trans
    

In [3]:
# Return a per-date DataFrame (date, score, rise_fall) marking whether the
# tracked stocks, weighted by market value, rose, fell or stayed flat.
def calculate_score(sorted_trans, companylist, company_share,
                    share_switch_date="2022-09-19", threshold=0.009):
    """Compute a market-value-weighted daily return and label each day.

    For every row the day-over-day change of ``close`` is multiplied by the
    stock's share of that day's total market value (``close`` times the share
    count from ``company_share``); the products are summed per date.

    Args:
        sorted_trans: DataFrame with ``date``, ``stock_name`` and ``close``
            columns, ordered so that each stock's rows are chronological.
            The columns ``change``, ``partition`` and ``score`` are written
            onto this frame (as the previous implementation did).
        companylist: names of the stocks that may appear in ``stock_name``.
        company_share: two ``{stock_name: share_count}`` mappings; index 0 is
            used for dates before ``share_switch_date``, index 1 from that
            date onwards.
        share_switch_date: first date on which ``company_share[1]`` applies.
        threshold: absolute daily score at or beyond which a day is labelled
            as a rise (1) or a fall (0); anything in between is flat (-1).

    Returns:
        DataFrame with one row per date and columns ``date``, ``score`` and
        ``rise_fall``.

    Raises:
        KeyError: if a ``stock_name`` is missing from ``companylist`` or from
            the share table that applies to its date.
    """
    names = sorted_trans["stock_name"]
    close = sorted_trans["close"]
    dates = pd.to_datetime(sorted_trans["date"])

    unknown = set(names.unique()) - set(companylist)
    if unknown:
        raise KeyError(f"stock_name not in companylist: {sorted(map(str, unknown))}")

    # One cutoff for both the numerator and the daily total. The earlier code
    # compared against 2022-09-19 for the total but 2022-09-16 for the
    # per-stock value, so weights on 2022-09-16 did not sum to 1.
    # NOTE(review): 2022-09-19 is assumed to be the intended date - confirm.
    before_switch = dates < pd.Timestamp(share_switch_date)
    shares = names.map(company_share[0]).where(before_switch, names.map(company_share[1]))
    if shares.isna().any():
        missing = sorted(map(str, names[shares.isna()].unique()))
        raise KeyError(f"no share count available for: {missing}")

    market_cap = close * shares
    # The total covers every stock traded that day, including stocks seen
    # for the first time.
    total_cap = market_cap.groupby(dates).transform("sum")

    # Previous close of the same stock. A stock's first record (or a previous
    # close of 0) has no baseline, so change/partition/score are all 0 there.
    prev_close = close.groupby(names).shift()
    has_baseline = prev_close.notna() & (prev_close != 0)

    # Computed for every row with a baseline. Previously the first such row
    # of each day was skipped and dropped out of the daily sum as NaN.
    change = ((close - prev_close) / prev_close).where(has_baseline, 0)
    partition = (market_cap / total_cap).where(has_baseline, 0)

    sorted_trans["change"] = change
    sorted_trans["partition"] = partition
    # Weighted score: day-over-day change times market-value weight.
    sorted_trans["score"] = change * partition

    # Daily sector score: sum of the weighted scores per date.
    date_score = pd.pivot_table(
        data=sorted_trans,
        values="score",
        index="date",
        aggfunc="sum",
    ).reset_index()

    # Days at or beyond +/-threshold are rise (1) / fall (0). Days in between
    # are flat (-1) and, per the original note, excluded from model training.
    # NOTE(review): 0.9% was chosen from statistics of the earlier scores;
    # re-check it now that every stock contributes.
    date_score["rise_fall"] = np.where(
        date_score["score"] >= threshold,
        1,
        np.where(date_score["score"] <= -threshold, 0, -1),
    )

    return date_score

In [4]:
# Label every post with the market outcome N calendar days after its date.
def text_label(data_list, date_score, day_option=(1, 2, 3, 4, 5)):
    """Attach ``label_day<N>`` columns to each dataset.

    For each post and each ``N`` in ``day_option`` the target date is the
    post's ``date`` plus ``N`` calendar days. The label is the ``rise_fall``
    value of the first trading day on or after the target date (a closed
    market therefore rolls forward to the next open day). Posts whose target
    lies after the last trading day cannot be verified and get -1.

    Args:
        data_list: list of DataFrames with a ``date`` column; each frame is
            modified in place.
        date_score: DataFrame with ``date`` (trading days) and ``rise_fall``.
        day_option: day offsets to label; one column is added per offset.

    Returns:
        The same ``data_list`` object, with the label columns added.
    """
    # Trading calendar sorted by date. If a date appears more than once the
    # first occurrence is kept.
    market = (
        date_score.assign(date=pd.to_datetime(date_score["date"]))
        .dropna(subset=["date"])
        .drop_duplicates(subset="date")
        .sort_values("date")
    )
    open_dates = market["date"].to_numpy()
    outcomes = market["rise_fall"].to_numpy()

    for data in data_list:
        post_dates = pd.to_datetime(data["date"])
        has_date = post_dates.notna().to_numpy()

        for day_after in day_option:
            targets = (post_dates + pd.Timedelta(days=day_after)).to_numpy()
            # Position of the first trading day >= target.
            pos = np.searchsorted(open_dates, targets, side="left")
            resolved = (pos < len(open_dates)) & has_date

            # Float labels (1.0 / 0.0 / -1.0) match the dtype produced by the
            # previous row-by-row assignment into a new column.
            labels = np.full(len(data), -1.0)
            labels[resolved] = outcomes[pos[resolved]]

            # Positional assignment: correct with duplicate index labels and
            # creates the column even for an empty frame.
            data["label_day" + str(day_after)] = labels

    return data_list

In [5]:
# Provided with the sample code: read a CSV and report what was loaded.
def load_df(filepath, preview=True):
    """Read ``filepath`` with ``pd.read_csv`` and print a short summary.

    Args:
        filepath: path of the CSV file to read.
        preview: when true, also print the first rows of the frame.

    Returns:
        The loaded DataFrame.
    """
    print(f"\n----- Loading {filepath}... -----")
    frame = pd.read_csv(filepath)

    dims, column_names = frame.shape, list(frame.columns)
    print(f"Size of dataframe: {dims}")
    print(f"Columns: {column_names}")

    if not preview:
        return frame
    print(frame.head())
    return frame

In [6]:
# Top-level entry point: returns the list of datasets with labels attached.
def labeling_all(init_data_list, transaction):
    """Run preprocessing, daily scoring and post labelling in sequence.

    Args:
        init_data_list: list of post DataFrames, handed to
            ``label_preprocessing``.
        transaction: per-stock price DataFrame, handed to
            ``label_preprocessing``.

    Returns:
        The list returned by ``text_label``.
    """
    tracked_companies = [
        '長榮航太', '台灣高鐵', '龍德造船', '亞航', '宅配通', '建新國際', '台船', '台驊投控', '漢翔',
        '慧洋-KY', '遠雄港', '志信', '華航', '裕民', '新興', '陽明', '台航', '中櫃', '嘉里大榮',
        '四維航', '萬海', '長榮航', '榮運', '長榮', '中航',
    ]

    # Per-company multipliers applied to the closing price by calculate_score
    # (presumably shares outstanding - verify against the data source).
    # The first table is passed as company_share[0], the second as [1].
    shares_first = {
        '長榮航太': 352806962,
        '台灣高鐵': 5628293058,
        '龍德造船': 80000000,
        '亞航': 150300385,
        '宅配通': 95467000,
        '建新國際': 81131938,
        '台船': 931787296,
        '台驊投控': 144752230,
        '漢翔': 941867101,
        '慧洋-KY': 746409199,
        '遠雄港': 229979100,
        '志信': 171820247,
        '華航': 5740484368,
        '裕民': 845055712,
        '新興': 585353297,
        '陽明': 3492104270,
        '台航': 417294487,
        '中櫃': 148423458,
        '嘉里大榮': 467000498,
        '四維航': 279267090,
        '萬海': 2440127212,
        '長榮航': 5138538667,
        '榮運': 1067141094,
        '長榮': 5290848436,
        '中航': 197484593,
    }
    shares_second = {
        '長榮航太': 352806962,
        '台灣高鐵': 5628293058,
        '龍德造船': 98000000,
        '亞航': 161928530,
        '宅配通': 87731938,
        '建新國際': 81131938,
        '台船': 931787296,
        '台驊投控': 143391230,
        '漢翔': 941867101,
        '慧洋-KY': 746409199,
        '遠雄港': 254979100,
        '志信': 189002272,
        '華航': 6013537444,
        '裕民': 845055712,
        '新興': 585353297,
        '陽明': 3492104270,
        '台航': 417294487,
        '中櫃': 148423458,
        '嘉里大榮': 467000498,
        '四維航': 329267090,
        '萬海': 2806146293,
        '長榮航': 5346225996,
        '榮運': 1067141094,
        '長榮': 2116420082,
        '中航': 197484593,
    }

    prepared_list, trans_by_day = label_preprocessing(init_data_list, transaction)
    daily_outcome = calculate_score(
        trans_by_day, tracked_companies, [shares_first, shares_second]
    )
    return text_label(prepared_list, daily_outcome)

## main

In [7]:
# Switch into the data directory. The path is relative, so a second run of
# this cell in the same kernel would fail once the working directory has
# already moved; only change directory when we are not there yet.
data_subdir = "./bda-midterm-project/data"
if not os.path.normpath(os.getcwd()).endswith(os.path.normpath(data_subdir)):
    os.chdir(data_subdir)
retval = os.getcwd()
print("当前工作目录为 %s" % retval + "\n")

当前工作目录为 c:\Users\d1073\OneDrive\桌面\bda\bda-midterm-project\data



In [8]:
# Load the four post datasets without a preview, then the price table with one.
post_files = (
    "news_filtered.csv",
    "mobile01_filtered.csv",
    "dcard_filtered.csv",
    "ptt_filtered.csv",
)
news, mobile01, dcard, ptt = [load_df(csv_name, preview=False) for csv_name in post_files]

transaction = load_df("個股交易數據-2年_filtered.csv", preview=True)


----- Loading news_filtered.csv... -----
Size of dataframe: (8617, 9)
Columns: ['id', 'p_type', 's_name', 's_area_name', 'post_time', 'title', 'author', 'content', 'page_url']

----- Loading mobile01_filtered.csv... -----
Size of dataframe: (683, 10)
Columns: ['id', 'p_type', 's_name', 's_area_name', 'post_time', 'title', 'author', 'content', 'page_url', 'content_type']

----- Loading dcard_filtered.csv... -----
Size of dataframe: (6665, 10)
Columns: ['id', 'forum', 's_name', 's_area_name', 'post_time', 'title', 'author', 'content', 'page_url', 'content_type']

----- Loading ptt_filtered.csv... -----
Size of dataframe: (2013, 9)
Columns: ['id', 'p_type', 's_name', 's_area_name', 'post_time', 'title', 'author', 'content', 'page_url']

----- Loading 個股交易數據-2年_filtered.csv... -----
Size of dataframe: (11614, 8)
Columns: ['stock_name', 'stock_symbol', 'open', 'high', 'low', 'close', 'volume', 'date']
  stock_name  stock_symbol  open  high   low  close  volume  \
0       長榮航太          2645

In [9]:
# Collect the four post datasets in one list (order: news, mobile01, dcard, ptt).
init_data_list = [news, mobile01, dcard, ptt]

In [13]:
# Label all four datasets; the result keeps the order of init_data_list.
labeled_data_list = labeling_all(
    init_data_list=init_data_list,
    transaction=transaction,
)

In [14]:
# Writing the labelled datasets is opt-in so that a full re-run does not
# overwrite existing CSV files. Set SAVE_LABELED = True to export.
SAVE_LABELED = False
if SAVE_LABELED:
    labeled_data_list[0].to_csv('news_filtered_labeled.csv')
    labeled_data_list[1].to_csv('mobile01_filtered_labeled.csv')
    labeled_data_list[2].to_csv('dcard_filtered_labeled.csv')
    labeled_data_list[3].to_csv('ptt_filtered_labeled.csv')