In [1]:
import pandas as pd
import numpy as np

import scipy.stats as stats
from scipy.stats import kendalltau
from scipy.spatial.distance import pdist, squareform

from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV, PredefinedSplit
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.preprocessing import MinMaxScaler

from imblearn.over_sampling import RandomOverSampler
from imblearn.under_sampling import RandomUnderSampler

from math import sqrt
import matplotlib.pyplot as plt
from pyEDM import *

import warnings
warnings.filterwarnings('ignore')

# Data preprocessing

1.1.Import data

In [2]:
#############
TICKER = 2382
TP = 1
TARGET=f'y_{TP}'
#############

### import data ###
data = pd.read_csv('/Users/yitsung/Desktop/MasterThesis/data/TaiwanStockData_Top100_EMA')
ticker_data = data[data['ticker']==TICKER].reset_index(drop=True)
ticker_data = ticker_data.drop(columns=['ticker'])

### generate y ###
# (SMA-P/P, 2class) #
ticker_data[f'y_{TP}'] = ticker_data['close'].rolling(window=TP).mean()
ticker_data[f'y_{TP}'] = ticker_data[f'y_{TP}'].shift(-TP)
ticker_data = ticker_data.dropna().reindex()
ticker_data[f'y_{TP}'] = ((ticker_data[f'y_{TP}'] - ticker_data['close']) >= 0).astype(int)

# 對答案用 #
origi_data = ticker_data.copy()

# 還原成不洩露資訊的df #
ticker_data[f'y_{TP}'] = ticker_data[f'y_{TP}'].shift(TP)
ticker_data = ticker_data.dropna().reindex()

ticker_data

Unnamed: 0,Date,open,high,low,close,volume,financing,fi,ii,di,rp,capital,EMA9,EMA12,EMA26,MACD,Signal,RSI14,y_1
1,2021-01-05,81.3,84.8,81.2,84.7,17034.0,515.0,2672.0,-306.0,661.0,5356.0,0.4065,81.195156,80.827490,80.021409,0.806082,0.453364,94.836385,1.0
2,2021-01-06,84.0,85.8,83.1,83.9,14793.0,-227.0,-2037.0,128.0,398.0,2440.0,0.2848,81.736300,81.301343,80.327906,0.973436,0.567153,95.783305,0.0
3,2021-01-07,83.8,84.9,83.8,83.9,8900.0,176.0,-435.0,537.0,-224.0,318.0,0.2189,82.169152,81.701965,80.608793,1.093172,0.680120,96.272379,1.0
4,2021-01-08,85.0,85.0,83.7,84.6,9497.0,365.0,1490.0,-184.0,-186.0,-220.0,0.1997,82.655423,82.148598,80.921212,1.227386,0.795940,96.701253,1.0
5,2021-01-11,84.6,85.0,83.6,84.9,7993.0,-155.0,129.0,292.0,-555.0,-1643.0,0.2043,83.104413,82.572518,81.231355,1.341163,0.910001,96.994399,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
703,2023-11-24,204.0,205.0,200.0,203.5,31771.0,431.0,-1249.0,-5220.0,-635.0,-5372.0,2.5242,207.921343,208.194608,210.625828,-2.431221,-2.975214,91.109592,1.0
704,2023-11-27,203.0,203.0,193.0,196.5,51001.0,-148.0,-2832.0,-7563.0,-1608.0,-12535.0,3.5869,205.637074,206.395437,209.579471,-3.184034,-3.016978,81.774727,0.0
705,2023-11-28,196.0,198.0,194.0,197.0,34568.0,96.0,148.0,-4751.0,227.0,-2933.0,2.2737,203.909659,204.949985,208.647658,-3.697673,-3.153117,71.964659,1.0
706,2023-11-29,198.5,202.0,196.0,201.0,37752.0,518.0,5533.0,-4474.0,-321.0,-485.0,2.3036,203.327728,204.342295,208.081165,-3.738870,-3.270267,63.565773,1.0


In [3]:
origi_data[origi_data['Date']>='2023-07-01']

Unnamed: 0,Date,open,high,low,close,volume,financing,fi,ii,di,rp,capital,EMA9,EMA12,EMA26,MACD,Signal,RSI14,y_1
603,2023-07-03,155.0,164.5,155.0,160.5,70996.0,2464.0,-5374.0,1897.0,-63.0,-4744.0,3.3186,148.334934,145.815379,134.563296,11.252083,11.014040,71.963704,1
604,2023-07-04,161.5,175.5,160.5,170.5,101935.0,465.0,-5847.0,4343.0,574.0,-2931.0,4.4679,152.767947,149.613013,137.225274,12.387739,11.288779,74.980737,0
605,2023-07-05,168.5,171.5,162.0,169.5,54131.0,2047.0,-3608.0,200.0,-359.0,132.0,2.6471,156.114358,152.672549,139.615994,13.056555,11.642335,74.460785,0
606,2023-07-06,166.5,172.5,163.5,165.0,58575.0,-1375.0,-4027.0,869.0,-280.0,-5246.0,2.7694,157.891486,154.569080,141.496291,13.072789,11.928425,72.010928,0
607,2023-07-07,162.0,166.5,160.0,160.5,51610.0,-1526.0,5111.0,101.0,-1032.0,-1786.0,2.7042,158.413189,155.481529,142.903973,12.577556,12.058252,62.511810,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
703,2023-11-24,204.0,205.0,200.0,203.5,31771.0,431.0,-1249.0,-5220.0,-635.0,-5372.0,2.5242,207.921343,208.194608,210.625828,-2.431221,-2.975214,91.109592,0
704,2023-11-27,203.0,203.0,193.0,196.5,51001.0,-148.0,-2832.0,-7563.0,-1608.0,-12535.0,3.5869,205.637074,206.395437,209.579471,-3.184034,-3.016978,81.774727,1
705,2023-11-28,196.0,198.0,194.0,197.0,34568.0,96.0,148.0,-4751.0,227.0,-2933.0,2.2737,203.909659,204.949985,208.647658,-3.697673,-3.153117,71.964659,1
706,2023-11-29,198.5,202.0,196.0,201.0,37752.0,518.0,5533.0,-4474.0,-321.0,-485.0,2.3036,203.327728,204.342295,208.081165,-3.738870,-3.270267,63.565773,1


1.2.Splite data into train(Library) and test(Prediction)

In [4]:
Library = ticker_data[ticker_data['Date'] <= '2023-06-30'] # the last prediction from Library is 6/30
# Prediction = ticker_data[(ticker_data['Date'] >= '2023-07-01')&(ticker_data['Date'] <= '2023-07-20')] # test
Prediction = ticker_data[(ticker_data['Date'] >= '2023-07-01')&(ticker_data['Date'] <= '2023-10-31')] 

In [5]:
Library

Unnamed: 0,Date,open,high,low,close,volume,financing,fi,ii,di,rp,capital,EMA9,EMA12,EMA26,MACD,Signal,RSI14,y_1
1,2021-01-05,81.3,84.8,81.2,84.7,17034.0,515.0,2672.0,-306.0,661.0,5356.0,0.4065,81.195156,80.827490,80.021409,0.806082,0.453364,94.836385,1.0
2,2021-01-06,84.0,85.8,83.1,83.9,14793.0,-227.0,-2037.0,128.0,398.0,2440.0,0.2848,81.736300,81.301343,80.327906,0.973436,0.567153,95.783305,0.0
3,2021-01-07,83.8,84.9,83.8,83.9,8900.0,176.0,-435.0,537.0,-224.0,318.0,0.2189,82.169152,81.701965,80.608793,1.093172,0.680120,96.272379,1.0
4,2021-01-08,85.0,85.0,83.7,84.6,9497.0,365.0,1490.0,-184.0,-186.0,-220.0,0.1997,82.655423,82.148598,80.921212,1.227386,0.795940,96.701253,1.0
5,2021-01-11,84.6,85.0,83.6,84.9,7993.0,-155.0,129.0,292.0,-555.0,-1643.0,0.2043,83.104413,82.572518,81.231355,1.341163,0.910001,96.994399,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
598,2023-06-26,148.0,148.0,143.0,146.5,52415.0,-2007.0,-9650.0,-4093.0,-227.0,-10313.0,2.5169,142.900556,139.751733,127.499044,12.252689,11.103599,90.326209,0.0
599,2023-06-27,141.0,143.5,138.0,141.5,48037.0,-7908.0,9218.0,1137.0,-98.0,3463.0,2.2584,142.620445,140.020697,128.536152,11.484545,11.179789,81.275570,0.0
600,2023-06-28,144.5,148.0,143.0,144.0,37927.0,-810.0,4102.0,-174.0,-584.0,2612.0,2.0783,142.896356,140.632898,129.681622,10.951275,11.134086,75.661364,1.0
601,2023-06-29,145.5,147.0,144.0,146.5,28093.0,258.0,-307.0,141.0,-63.0,962.0,1.4951,143.617084,141.535529,130.927428,10.608101,11.028889,70.762733,1.0


In [6]:
Prediction

Unnamed: 0,Date,open,high,low,close,volume,financing,fi,ii,di,rp,capital,EMA9,EMA12,EMA26,MACD,Signal,RSI14,y_1
603,2023-07-03,155.0,164.5,155.0,160.5,70996.0,2464.0,-5374.0,1897.0,-63.0,-4744.0,3.3186,148.334934,145.815379,134.563296,11.252083,11.014040,71.963704,1.0
604,2023-07-04,161.5,175.5,160.5,170.5,101935.0,465.0,-5847.0,4343.0,574.0,-2931.0,4.4679,152.767947,149.613013,137.225274,12.387739,11.288779,74.980737,1.0
605,2023-07-05,168.5,171.5,162.0,169.5,54131.0,2047.0,-3608.0,200.0,-359.0,132.0,2.6471,156.114358,152.672549,139.615994,13.056555,11.642335,74.460785,0.0
606,2023-07-06,166.5,172.5,163.5,165.0,58575.0,-1375.0,-4027.0,869.0,-280.0,-5246.0,2.7694,157.891486,154.569080,141.496291,13.072789,11.928425,72.010928,0.0
607,2023-07-07,162.0,166.5,160.0,160.5,51610.0,-1526.0,5111.0,101.0,-1032.0,-1786.0,2.7042,158.413189,155.481529,142.903973,12.577556,12.058252,62.511810,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
681,2023-10-25,210.0,214.0,207.0,210.5,42225.0,286.0,-5459.0,1181.0,439.0,-864.0,3.7028,213.919402,216.945239,224.719080,-7.773842,-4.787146,19.846001,1.0
682,2023-10-26,202.0,206.0,201.0,202.5,43216.0,-869.0,6935.0,-12672.0,-3076.0,-76.0,3.5988,211.635522,214.722894,223.073222,-8.350328,-5.499783,12.569983,0.0
683,2023-10-27,206.0,206.5,201.5,203.0,36587.0,-660.0,-3086.0,-86.0,133.0,-3749.0,3.4407,209.908417,212.919372,221.586317,-8.666945,-6.133215,7.244508,1.0
684,2023-10-30,204.0,208.0,203.5,206.0,24975.0,-405.0,1895.0,356.0,34.0,3048.0,2.3477,209.126734,211.854853,220.431775,-8.576922,-6.621956,0.718921,1.0


In [7]:
print(len(Library))
print(Library['y_1'].sum())

602
328.0


1.3. Over-Sampling Library

In [8]:
def over_sampling(Library, target):
    """
    注意: 重複抽取的樣本都會加在最後面
    """
    oversample = RandomOverSampler(random_state=87)
    features = Library.drop(columns=['Date', target])
    df_target = Library[target]

    features_resampled, target_resampled = oversample.fit_resample(features, df_target)

    resampled_df = features_resampled.copy()
    resampled_df[target] = target_resampled
    resampled_df = resampled_df.reset_index(drop=True)

    resampled_df.insert(0, 'Date', range(1, len(resampled_df) + 1))

    return resampled_df

In [9]:
def under_sampling(Library, target):
    """
    注意: 時間序列順序已完全被打亂, 不能用來找view, 只能用來進行logistic
    """
    undersample = RandomUnderSampler(random_state=87)
    features = Library.drop(columns=['Date', target])
    df_target = Library[target]

    features_resampled, target_resampled = undersample.fit_resample(features, df_target)

    resampled_df = features_resampled.copy()
    resampled_df[target] = target_resampled
    resampled_df = resampled_df.reset_index(drop=True)
    
    resampled_df.insert(0, 'Date', range(1, len(resampled_df) + 1))

    return resampled_df

In [10]:
# Library = over_sampling(Library=Library, target=TARGET)
# Library

1.4. Concate dataframe function

In [11]:
def concate_Lib_Pred(Library, Prediction, th): 
    """
    th=-1 為Library
    th=n 為第n個Pred concate進Lib
    """
    if th < 0:
        Lib_Pred_df = Library
    
    else:
        row_to_add = Prediction.iloc[th]
        Lib_Pred_df = pd.concat([Library, row_to_add.to_frame().T], ignore_index=True)
        
    # 這種concate方法會有非數值問題, 要這樣修正
    Lib_Pred_df[Library.columns.to_list()[1:]] = Lib_Pred_df[Library.columns.to_list()[1:]].apply(pd.to_numeric, errors='coerce')
    Lib_Pred_df['Date'] = pd.to_datetime(Lib_Pred_df['Date'])

    return Lib_Pred_df

In [12]:
# Lib_Pred_df = concate_Lib_Pred(Library=Library, Prediction=Prediction, th=10)
# Lib_Pred_df

1.5. Data normalize function

In [13]:
def data_normalize(Lib_Pred_df):

    feature_to_standardize = Lib_Pred_df.columns.to_list()
    feature_to_standardize.remove(Lib_Pred_df.columns[0]) # 排除Date
    feature_to_standardize.remove(Lib_Pred_df.columns[-1])

    scaler = MinMaxScaler() 
    Lib_Pred_df[feature_to_standardize] = scaler.fit_transform(Lib_Pred_df[feature_to_standardize])

    return Lib_Pred_df

In [14]:
# Lib_Pred_df = data_normalize(Lib_Pred_df)
# Lib_Pred_df

# MDRSmap I

2.1. 製作可餵入EDM格式的train_feature

In [15]:
def find_train_target_feature(data, target):

    df_columns = list(data.columns)
    train_feature = df_columns.copy()
    train_feature.remove('Date') # 先拿掉日期
    
    formatted_columns = ' '.join(df_columns[1:]) # 變成可以餵給 EDM function 參數 'columns' 的形式 
    train_feature.remove(target) # 再拿掉 target_feature

    return formatted_columns, train_feature

# formatted_columns, train_feature = find_train_target_feature(data=Library, target=TARGET)
# formatted_columns

2.2. 找出target_feature最佳嵌入維度

In [16]:
def find_target_OED(data, target):

    target_OED = EmbedDimension(dataFrame=data, lib=f'1 {len(data)}', pred=f'{len(data)-21} {len(data)-1}', columns=target, showPlot=False) # 4. lib訓練全部, pred看最後20筆

    target_OED_rho = target_OED['rho'].max()
    target_OED = int(target_OED['E'][target_OED['rho'] == target_OED['rho'].max()].iloc[0])

    return target_OED, target_OED_rho

# target_OED, target_OED_rho = find_target_OED(data=Library, target=TARGET)
# print(f'target_OED: {target_OED}, target_OED_rho: {target_OED_rho}')

2.3. 找出所有有因果關係的train_feature

In [17]:
def find_rho_sig_df(data, ticker, target, target_OED, train_feature, E_max):
    
    crirho = stats.t.ppf(0.95, len(data) - 1) / (len(data) - 2 + stats.t.ppf(0.95, len(data) - 1) ** 2)
    ccm_libSizes = f'{target_OED+10} {len(data)-10} 10'
    # ccm_libSizes = list(range(10, len(data) + 1, 10)) + [len(data)]  # sequence of library size # original

    rho_sig_df = pd.DataFrame(columns=train_feature)
    for train in train_feature:

        ### 找出該train_feature最好的ccm_OED ###
        ccm_E_termRHO = pd.DataFrame(columns=['E', 'term_rho'])
        """
        這裡假設用 term_rho 來選 ccm_OED
        """
        for e in range(1, E_max+1):
            ccm_result = CCM(dataFrame=data, E=e, columns=train, target=target,
                            libSizes=ccm_libSizes, random=False, showPlot=False)
            # print(e, ccm_result[f'{target}:open'].iloc[-1]) # 有時候會有warning, 測試用
            new_data = {'E': e, 'term_rho': ccm_result[f'{target}:{train}'].iloc[-1]}
            ccm_E_termRHO.loc[len(ccm_E_termRHO)] = new_data

        max_term_rho_index = ccm_E_termRHO['term_rho'].idxmax()
        ccm_OED = ccm_E_termRHO.at[max_term_rho_index, 'E']

        ### 用最好的ccm_OED來做該feature的因果檢定 ###
        ccm_result = CCM(dataFrame=data, E=ccm_OED, columns=train, target=target, 
                        libSizes=ccm_libSizes, random=False, showPlot=False)
        """
        這裡假設用 target:train 、 LibSize 來做 kendalltau 檢定
        """
        ccm_result = ccm_result[['LibSize', f'{target}:{train}']]
        ccm_result[f'{target}:{train}'][ccm_result[f'{target}:{train}'] < 0] = 0
        term_rho = ccm_result[f'{target}:{train}'].iloc[-1]

        tau, p_value = kendalltau(ccm_result['LibSize'], ccm_result[f'{target}:{train}']) # 進行 kendalltau 相關檢定

        alpha = 0.05
        if (p_value < alpha) and (term_rho > crirho): # 顯著相關
            rho_sig_df[train] = [term_rho]

        else: # "不" 顯著相關
            rho_sig_df[train] = [0]

    rho_sig_df.index = pd.Index([f'{ticker}_{target}']) 
    
    return rho_sig_df

# rho_sig_df = find_rho_sig_df(data=Library, ticker=TICKER, target=TARGET, 
#                              target_OED=target_OED, train_feature=train_feature, E_max=10)
# rho_sig_df

2.4. 用有因果關係的train_feature建立Embed_df

In [18]:
def make_Embed_df(data, max_lag, target, rho_sig_df):

    #用有因果關係的 train feature + target feature 製作 Embed_df #
    non_zero_columns = rho_sig_df.loc[:, (rho_sig_df != 0).any(axis = 0)] # 選取值非0的column
    train_feature_ls = list(non_zero_columns.columns)
    formatted_columns = ' '.join(train_feature_ls) # 轉成 EDM column 的 input
    columns_to_lag = formatted_columns + f' {target}' # 加入 target 本身

    Embed_df = Embed(dataFrame=data, E=max_lag, tau=-1, columns=columns_to_lag) # 製作 Embed_df
    Embed_df['Date'] = data['Date'] # 加入Date來看index, 才可以防simplex func的bug
    Embed_df.dropna(inplace=True) # 把包含NaN的資料拿掉
    Embed_df = Embed_df.reset_index(drop=True)
    Embed_df = Embed_df[['Date'] + [col for col in Embed_df.columns if col != 'Date']]

    ML_df_date = Embed_df.copy()
    ML_df_date['Date'] = pd.to_datetime(ML_df_date['Date']) # 將index設為日期
    ML_df_date.set_index('Date', inplace=True)
    ML_df_date = ML_df_date.filter(like="(t-0)") # 只留下(t-0)的column

    return Embed_df, ML_df_date

# Embed_df, ML_df_date = make_Embed_df(data=Lib_Pred_df, max_lag=10, target=TARGET, rho_sig_df=rho_sig_df)
# Embed_df

In [19]:
# ML_df_date

2.5. 用simplex randomsearch找出最佳的view

In [20]:
def make_random_simplex(Embed_df, target, target_OED, kmax, kn):

    Embed_for_train = Embed_df.drop(columns='Date') # 先把 Date 拿掉
    Embed_for_train = Embed_for_train.drop(columns=f'{target}(t-0)') # 先把 target 拿掉
    train_f_ls = list(Embed_for_train.columns) # train_feature
    train_f_num = len(Embed_for_train.columns) # train_feature 的個數

    rho_feature_view = pd.DataFrame(columns=['rho']) # 創建一個df去紀錄每個隨機view的資料
    new_column = pd.DataFrame(columns=['feature_' + str(i) for i in range(1, target_OED+1)])
    rho_feature_view = pd.concat([rho_feature_view, new_column], axis=1)
    k = 1
    while k <= kmax:
        random_pick_train = np.random.choice(train_f_num, target_OED, replace=False)
        # print(random_pick_train)

        train_f_ls = np.array(train_f_ls) # 變成 array 才可以一次選
        select_train_f = train_f_ls[random_pick_train] # 隨機選到的 train_feature
        formatted_random_columns = ' '.join(select_train_f) # 用成符合 EDM 的資料格式
        # print(formatted_random_columns)

        simp = Simplex(dataFrame=Embed_df, E=target_OED, # ver3: 測試近10 or 20個交易日
                       lib=f'1 {len(Embed_df)}', pred = f'{len(Embed_df)-21} {len(Embed_df)-1}', 
                       columns=formatted_random_columns, target=f'{target}(t-0)',
                       embedded = True, showPlot = False) # 原本是False現在改True
        # print(simp)

        sub_simp = simp[['Observations', 'Predictions']] # 計算rho
        rho = sub_simp['Observations'].corr(sub_simp['Predictions'])

        rho_feature_view.loc[len(rho_feature_view), 'rho'] = rho # 將 view 更新到 rho_feature_view 的 df 中
        rho_feature_view.loc[len(rho_feature_view)-1, rho_feature_view.columns[1:]] = select_train_f
        # print(rho)
        k += 1

    allscore = rho_feature_view.sort_values(by='rho', ascending=False).head(kn)
    allscore = allscore.reset_index(drop=True)

    return allscore

# allscore = make_random_simplex(Embed_df=Embed_df, target=TARGET, target_OED=target_OED, 
#                                kmax=10000, kn=5)
# allscore

# MDRSmap II

3.1. 計算每個時點的(view加權)距離

In [21]:
def compute_view_w_distance(Embed_df, allscore):

    ww = allscore['rho'] / allscore['rho'].sum() # 每個view的權重

    dmatrix_ls = []
    for j in range(allscore.shape[0]):

        view_feature = allscore.iloc[j, 1:] # 選取第j個view的所有feature
        view_feature = np.array(view_feature) # 把所有feature變成array才可以從完整Embed_df中找資料
        view_feature_value = Embed_df[view_feature]
        view_matrix = view_feature_value.to_numpy() # 從df形式變array
        view_matrix = np.vstack(view_matrix) # 這樣才能疊成matrix

        Dx_t2 = pdist(view_matrix, metric='euclidean') * ww[j] # 計算加權距離
        Dx_t2 = squareform(Dx_t2) # 將距離變成squareform
        dmatrix_ls.append(Dx_t2)

    v_w_dmatrix = np.sum(dmatrix_ls, axis=0) # 輸出每個時點的view加權距離

    return v_w_dmatrix

# v_w_dmatrix = compute_view_w_distance(Embed_df=Embed_df, allscore=allscore)
# v_w_dmatrix

3.2.尋找elastic-net最佳參數

In [22]:
def find_MDRSmap_param(target, ML_df_date, theta_seq, v_w_dmatrix, Tp):

    result_ls = pd.DataFrame(columns=['Theta', 'Score', 'Param']) # 創建紀錄回測結果的dataframe

    ### 將原始資料乘上空間位置權數 ###
    tp = len(ML_df_date) -1
    tp_distence = v_w_dmatrix[tp] # 第tp個時點離其他時點的距離
    mask = np.ones(len(tp_distence), dtype=bool) # 遮蔽該時點計算平均數
    mask[tp] = False
    dpar = np.mean(tp_distence[mask]) # 第tp個時點離其他時點的平均數

    for theta in theta_seq:
        w_tp = np.exp(-theta * tp_distence / dpar) # 計算每個時點資料的加權
        w_tp = np.sqrt(w_tp)

        ### 加入答案列 ###
        ML_df_date_new = ML_df_date.copy()
        ML_df_date_new[f'ans(t-0)'] = ML_df_date_new[f'{target}(t-0)'].shift(-Tp) # step.1: 先將target往前移Tp, 製作y
        ML_df_date_new = ML_df_date_new.multiply(w_tp, axis=0) # step.2: 再將data乘上距離加權
        ML_df_date_new['ans(t-0)'] = ML_df_date_new['ans(t-0)'].apply(lambda x: 1.0 if x != 0 else x) # step.3: 把ans非0的部分變成1
        ML_df_date_new = ML_df_date_new[:-(Tp+1)] # step.4: 拿掉最後Tp+1個, 因為最後面的data是硬拼上去的
        # ML_df_date_new = ML_df_date_new.drop(columns=[f'{target}(t-0)']) # step.5: 原paper有刪target啦, 這邊可選擇刪或不刪

        ### 分拆train, validation(以近60天為基準) ###
        X = ML_df_date_new.iloc[:, :-1]
        y = ML_df_date_new.iloc[:, -1]
        val_fold = [-1] * (len(X)-60) + [0] * 60 # 最後60筆當validation set
        ps = PredefinedSplit(test_fold=val_fold)

        logistic_elastic_net = LogisticRegression(penalty='elasticnet', 
                                                  solver='saga', # 只有saga支持elasticnet
                                                  random_state=87)

        ### grid search ###
        param_grid = {'l1_ratio': [0.9, 0.1, 0.01, 0.001, 0.0001],
                      'C': [0.001, 0.01, 0.1, 1, 10],
                      'tol': [1e-5, 1e-4, 1e-3, 1e-2, 1e-1],
                      'fit_intercept': [True], 
                      'intercept_scaling': [0.1],
                      'warm_start': [True]}
        grid_search = GridSearchCV(estimator=logistic_elastic_net, 
                                   param_grid=param_grid, 
                                   cv=ps, scoring='accuracy', 
                                   return_train_score=True)       

        grid_search.fit(X, y)

        ### 記錄結果 ###
        result_ls.loc[len(result_ls), 'Theta'] = theta
        result_ls.loc[len(result_ls)-1, 'Score'] = grid_search.best_score_
        result_ls.loc[len(result_ls)-1, 'Param'] = [grid_search.best_params_]

        theta = result_ls['Theta'][result_ls['Score'].idxmax()]
        param = result_ls['Param'][result_ls['Score'].idxmax()][0]

    return result_ls, theta, param

# result_ls, theta, param = find_MDRSmap_param(target=TARGET, 
#                                              ML_df_date=ML_df_date, 
#                                              theta_seq=[1,2,4,7,11,16,22], 
#                                              v_w_dmatrix=v_w_dmatrix,
#                                              Tp=TP)

3.3.用最佳參數訓練MDRSmap

In [23]:
def MDRSmap_model(target, ML_df_date, theta, v_w_dmatrix, param, Tp):

    ### 將原始資料乘上空間位置權數 ###
    tp = len(ML_df_date) -1
    tp_distence = v_w_dmatrix[tp] # 第tp個時點離其他時點的距離
    mask = np.ones(len(tp_distence), dtype=bool) # 遮蔽該時點計算平均數
    mask[tp] = False
    dpar = np.mean(tp_distence[mask]) # 第tp個時點離其他時點的平均數

    w_tp = np.exp(-theta * tp_distence / dpar) # 計算每個時點資料的加權
    w_tp = np.sqrt(w_tp)

    ### 加入答案列 ###
    ML_df_date_new = ML_df_date.copy()
    ML_df_date_new[f'ans(t-0)'] = ML_df_date_new[f'{target}(t-0)'].shift(-Tp) # step.1: 先將target往前移Tp, 製作y
    ML_df_date_new = ML_df_date_new.multiply(w_tp, axis=0) # step.2: 再將data乘上距離加權
    ML_df_date_new['ans(t-0)'] = ML_df_date_new['ans(t-0)'].apply(lambda x: 1.0 if x != 0 else x) # step.3: 把ans非0的部分變成1
    ML_df_date_new = ML_df_date_new[:-(Tp+1)] # step.4: 拿掉最後Tp+1個, 因為最後面的data是硬拼上去的
    # ML_df_date_new = ML_df_date_new.drop(columns=[f'{target}(t-0)']) # step.5: 原paper有刪target啦, 這邊可選擇刪或不刪

    ### 分拆train, validation(以近60天為基準) ###
    X = ML_df_date_new.iloc[:, :-1]
    y = ML_df_date_new.iloc[:, -1]

    logistic_elastic_net = LogisticRegression(penalty='elasticnet', 
                                              solver='saga', # 只有saga支持elasticnet
                                              random_state=87,
                                              **param)
                             

    logistic_elastic_net.fit(X, y)

    return logistic_elastic_net

# logistic_elastic_net = MDRSmap_model(target=TARGET, ML_df_date=ML_df_date, 
#                                      theta=theta, v_w_dmatrix=v_w_dmatrix, param=param, Tp=TP)

# Experience pipeline

In [24]:
Date = origi_data['Date'][(origi_data['Date']>='2023-07-01')&(origi_data['Date']<='2023-10-31')].reset_index(drop=True)
Observations = origi_data[TARGET][(origi_data['Date']>='2023-07-01')&(origi_data['Date']<='2023-10-31')].reset_index(drop=True)

MDRSmap_result = pd.DataFrame(Date)
MDRSmap_result['Observations'] = Observations
MDRSmap_result['Predictions'] = None

th=0
Lib_Pred_df = concate_Lib_Pred(Library=Library, Prediction=Prediction, th=th)
Lib_Pred_df = data_normalize(Lib_Pred_df)

formatted_columns, train_feature = find_train_target_feature(data=Lib_Pred_df, target=TARGET)
target_OED, target_OED_rho = find_target_OED(data=Lib_Pred_df, target=TARGET)
rho_sig_df = find_rho_sig_df(data=Lib_Pred_df, ticker=2330, target=TARGET, 
                                target_OED=target_OED, train_feature=train_feature, E_max=10)
Embed_df, ML_df_date = make_Embed_df(data=Lib_Pred_df, max_lag=10, target=TARGET, rho_sig_df=rho_sig_df)
allscore = make_random_simplex(Embed_df=Embed_df, target=TARGET, target_OED=target_OED, kmax=10000, kn=5)
v_w_dmatrix = compute_view_w_distance(Embed_df=Embed_df, allscore=allscore)

result_ls, theta, param = find_MDRSmap_param(target=TARGET, 
                                            ML_df_date=ML_df_date, 
                                            theta_seq=[1,2,4,7,11,16,22], 
                                            v_w_dmatrix=v_w_dmatrix,
                                            Tp=TP)
logistic_elastic_net = MDRSmap_model(target=TARGET, ML_df_date=ML_df_date, 
                                    theta=theta, v_w_dmatrix=v_w_dmatrix, param=param, Tp=TP)

X_pred = np.array(ML_df_date.iloc[-1]).reshape(1, -1)
y_pred = logistic_elastic_net.predict(X_pred)
y_pred = y_pred[0]

MDRSmap_result.loc[th, 'Predictions'] = y_pred
print(f"{MDRSmap_result['Date'][th]}: finished")

for th in range(1, len(Prediction)):

    Lib_Pred_df = concate_Lib_Pred(Library=Library, Prediction=Prediction, th=th)
    Lib_Pred_df = data_normalize(Lib_Pred_df)    

    Embed_df, ML_df_date = make_Embed_df(data=Lib_Pred_df, max_lag=10, target=TARGET, rho_sig_df=rho_sig_df)
    v_w_dmatrix = compute_view_w_distance(Embed_df=Embed_df, allscore=allscore)

    result_ls, theta, param = find_MDRSmap_param(target=TARGET, 
                                                    ML_df_date=ML_df_date, 
                                                    theta_seq=[1,2,4,7,11,16,22], 
                                                    v_w_dmatrix=v_w_dmatrix,
                                                    Tp=TP)
    logistic_elastic_net = MDRSmap_model(target=TARGET, ML_df_date=ML_df_date, 
                                            theta=theta, v_w_dmatrix=v_w_dmatrix, param=param, Tp=TP)
    
    X_pred = np.array(ML_df_date.iloc[-1]).reshape(1, -1)
    y_pred = logistic_elastic_net.predict(X_pred)
    y_pred = y_pred[0]

    MDRSmap_result.loc[th, 'Predictions'] = y_pred
    print(f"{MDRSmap_result['Date'][th]}: finished")

2023-07-03: finished
2023-07-04: finished
2023-07-05: finished
2023-07-06: finished
2023-07-07: finished
2023-07-10: finished
2023-07-11: finished
2023-07-12: finished
2023-07-13: finished
2023-07-14: finished
2023-07-17: finished
2023-07-18: finished
2023-07-19: finished
2023-07-20: finished
2023-07-21: finished
2023-07-24: finished
2023-07-25: finished
2023-07-26: finished
2023-07-27: finished
2023-07-28: finished
2023-07-31: finished
2023-08-01: finished
2023-08-02: finished
2023-08-04: finished
2023-08-07: finished
2023-08-08: finished
2023-08-09: finished
2023-08-10: finished
2023-08-11: finished
2023-08-14: finished
2023-08-15: finished
2023-08-16: finished
2023-08-17: finished
2023-08-18: finished
2023-08-21: finished
2023-08-22: finished
2023-08-23: finished
2023-08-24: finished
2023-08-25: finished
2023-08-28: finished
2023-08-29: finished
2023-08-30: finished
2023-08-31: finished
2023-09-01: finished
2023-09-04: finished
2023-09-05: finished
2023-09-06: finished
2023-09-07: f

In [25]:
ACC = len(MDRSmap_result[MDRSmap_result['Predictions'] == MDRSmap_result['Observations']]) / len(MDRSmap_result['Observations'])
print('ACC: ', ACC)

MDRSmap_result.head(60)

ACC:  0.5060240963855421


Unnamed: 0,Date,Observations,Predictions
0,2023-07-03,1,1.0
1,2023-07-04,0,0.0
2,2023-07-05,0,1.0
3,2023-07-06,0,1.0
4,2023-07-07,1,1.0
5,2023-07-10,1,0.0
6,2023-07-11,1,0.0
7,2023-07-12,1,0.0
8,2023-07-13,1,0.0
9,2023-07-14,0,0.0


# 0050 test

In [None]:
# constituent = [2330, 2454, 2317, 2308, 2382, 2303, 2891, 3711, 2881, 2412,
#                2886, 2882, 2884, 1216, 2885, 3231, 3034, 2357, 2002, 2892,
#                1303, 2379, 5880, 2301, 3037, 2345, 1301, 3008, 3661, 2890,
#                5871, 2880, 2327, 2883, 2887, 2207, 4938, 1101, 6669, 1326,
#                2395, 3045, 5876, 2603, 1590, 2912, 4904, 2801, 6505, 2408]

constituent = [2330, 2454, 2317]

In [None]:
result_df = pd.DataFrame()

for TICKER in constituent:

    try:
        ##### Import data #####

        #######################
        TP = 1
        TARGET=f'y_{TP}'
        #######################

        data = pd.read_csv('/Users/yitsung/Desktop/MasterThesis/data/TaiwanStockData_Top100_EMA')
        ticker_data = data[data['ticker']==TICKER].reset_index(drop=True)
        ticker_data = ticker_data.drop(columns=['ticker'])

        # (SMA-P/P, 2class) #
        ticker_data[f'y_{TP}'] = ticker_data['close'].rolling(window=TP).mean()
        ticker_data[f'y_{TP}'] = ticker_data[f'y_{TP}'].shift(-TP)
        ticker_data = ticker_data.dropna().reindex()
        ticker_data[f'y_{TP}'] = ((ticker_data[f'y_{TP}'] - ticker_data['close']) >= 0).astype(int)

        # 對答案用 #
        origi_data = ticker_data.copy()
        Date = origi_data['Date'][(origi_data['Date']>='2023-07-01')&(origi_data['Date']<='2023-10-31')].reset_index(drop=True)
        Observations = origi_data[TARGET][(origi_data['Date']>='2023-07-01')&(origi_data['Date']<='2023-10-31')].reset_index(drop=True)

        MDRSmap_result = pd.DataFrame(Date)
        MDRSmap_result['Observations'] = Observations
        MDRSmap_result['Predictions'] = None

        # 還原成不洩露資訊的df #
        ticker_data[f'y_{TP}'] = ticker_data[f'y_{TP}'].shift(TP)
        ticker_data = ticker_data.dropna().reindex()

        ##### Splite data into train(Library) and test(Prediction) #####

        Library = ticker_data[ticker_data['Date'] <= '2023-06-30'] # the last prediction from Library is 6/30
        # Prediction = ticker_data[(ticker_data['Date'] >= '2023-07-01')&(ticker_data['Date'] <= '2023-07-20')] # test
        Prediction = ticker_data[(ticker_data['Date'] >= '2023-07-01')&(ticker_data['Date'] <= '2023-10-31')] 

        ##### start #####
        th=0
        Lib_Pred_df = concate_Lib_Pred(Library=Library, Prediction=Prediction, th=th)
        Lib_Pred_df = data_normalize(Lib_Pred_df)

        formatted_columns, train_feature = find_train_target_feature(data=Lib_Pred_df, target=TARGET)
        target_OED, target_OED_rho = find_target_OED(data=Lib_Pred_df, target=TARGET)
        rho_sig_df = find_rho_sig_df(data=Lib_Pred_df, ticker=2330, target=TARGET, 
                                        target_OED=target_OED, train_feature=train_feature, E_max=10)
        Embed_df, ML_df_date = make_Embed_df(data=Lib_Pred_df, max_lag=10, target=TARGET, rho_sig_df=rho_sig_df)
        allscore = make_random_simplex(Embed_df=Embed_df, target=TARGET, target_OED=target_OED, kmax=10000, kn=5)
        v_w_dmatrix = compute_view_w_distance(Embed_df=Embed_df, allscore=allscore)

        result_ls, theta, param = find_MDRSmap_param(target=TARGET, 
                                                    ML_df_date=ML_df_date, 
                                                    theta_seq=[1,2,4,7,11,16,22], 
                                                    v_w_dmatrix=v_w_dmatrix,
                                                    Tp=TP)
        logistic_elastic_net = MDRSmap_model(target=TARGET, ML_df_date=ML_df_date, 
                                            theta=theta, v_w_dmatrix=v_w_dmatrix, param=param, Tp=TP)

        X_pred = np.array(ML_df_date.iloc[-1]).reshape(1, -1)
        y_pred = logistic_elastic_net.predict(X_pred)
        y_pred = y_pred[0]

        MDRSmap_result.loc[th, 'Predictions'] = y_pred
        # print(f"{MDRSmap_result['Date'][th]}: finished")

        for th in range(1, len(Prediction)):

            Lib_Pred_df = concate_Lib_Pred(Library=Library, Prediction=Prediction, th=th)
            Lib_Pred_df = data_normalize(Lib_Pred_df)    

            Embed_df, ML_df_date = make_Embed_df(data=Lib_Pred_df, max_lag=10, target=TARGET, rho_sig_df=rho_sig_df)
            v_w_dmatrix = compute_view_w_distance(Embed_df=Embed_df, allscore=allscore)

            result_ls, theta, param = find_MDRSmap_param(target=TARGET, 
                                                            ML_df_date=ML_df_date, 
                                                            theta_seq=[1,2,4,7,11,16,22], 
                                                            v_w_dmatrix=v_w_dmatrix,
                                                            Tp=TP)
            logistic_elastic_net = MDRSmap_model(target=TARGET, ML_df_date=ML_df_date, 
                                                    theta=theta, v_w_dmatrix=v_w_dmatrix, param=param, Tp=TP)
            
            X_pred = np.array(ML_df_date.iloc[-1]).reshape(1, -1)
            y_pred = logistic_elastic_net.predict(X_pred)
            y_pred = y_pred[0]

            MDRSmap_result.loc[th, 'Predictions'] = y_pred
            print(f"{MDRSmap_result['Date'][th]}: finished")

            ACC = len(MDRSmap_result[MDRSmap_result['Predictions'] == MDRSmap_result['Observations']]) / len(MDRSmap_result['Observations'])
            print('ACC: ', ACC)

        result_df = pd.concat([result_df, MDRSmap_result], axis=0, ignore_index=True)
    
    except:
        print(f'{TICKER} failed.')
        continue

In [None]:
result_df.to_csv('MDRSmap_Tp=1.csv', index=False)

ACC = len(result_df[result_df['Predictions'] == result_df['Observations']]) / len(result_df['Observations'])
print('ACC: ', ACC)