In [1]:
import pandas as pd
import numpy as np

import scipy.stats as stats
from scipy.stats import kendalltau
from scipy.spatial.distance import pdist, squareform

from sklearn.linear_model import ElasticNet, LogisticRegression
from sklearn.model_selection import GridSearchCV, PredefinedSplit
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.preprocessing import MinMaxScaler, StandardScaler

from imblearn.over_sampling import RandomOverSampler
from imblearn.under_sampling import RandomUnderSampler

from math import sqrt
import matplotlib.pyplot as plt
from pyEDM import *

import warnings
warnings.filterwarnings('ignore')

# Data preprocessing

1.1.Import data

In [2]:
#############
TICKER = 2330
TP = 5
TARGET = f'y_{TP}'
#############

### import data ###
data = pd.read_csv('/Users/yitsung/Desktop/MasterThesis/data/TaiwanStockData_Top100_EMA')
ticker_data = data[data['ticker']==TICKER].reset_index(drop=True)
ticker_data = ticker_data.drop(columns=['ticker'])

### generate y ###
# ver.2(SMA-P/P, 2class) #
ticker_data[f'y_{TP}'] = ticker_data['close'].rolling(window=TP).mean()
ticker_data[f'y_{TP}'] = ticker_data[f'y_{TP}'].shift(-TP)
ticker_data = ticker_data.dropna().reindex()
ticker_data[f'y_{TP}'] = ((ticker_data[f'y_{TP}'] - ticker_data['close']) >= 0).astype(int)

# 對答案用 #
origi_data = ticker_data.copy()

# 還原成不洩露資訊的df #
ticker_data[f'y_{TP}'] = ticker_data[f'y_{TP}'].shift(TP)
ticker_data = ticker_data.dropna().reindex()

ticker_data.tail()

Unnamed: 0,Date,open,high,low,close,volume,financing,fi,ii,di,rp,capital,EMA9,EMA12,EMA26,MACD,Signal,RSI14,y_5
699,2023-11-20,576.0,579.0,575.0,577.0,26606.0,176.0,3579.0,-125.0,270.0,-2193.0,5.4217,570.883694,567.18891,556.797391,10.391519,7.167611,94.971748,1.0
700,2023-11-21,582.0,585.0,581.0,585.0,39881.0,-334.0,18793.0,97.0,-772.0,10844.0,6.7572,573.706955,569.929078,558.886473,11.042605,7.94261,100.0,1.0
701,2023-11-22,576.0,579.0,574.0,577.0,23922.0,533.0,-2966.0,-478.0,-230.0,-7073.0,4.7807,574.365564,571.016912,560.228216,10.788696,8.511827,97.93853,0.0
702,2023-11-23,574.0,578.0,574.0,578.0,15144.0,173.0,3740.0,-253.0,-218.0,93.0,3.0366,575.092451,572.091233,561.544644,10.546589,8.918779,95.718908,0.0
703,2023-11-24,577.0,578.0,574.0,575.0,12503.0,243.0,-854.0,70.0,-118.0,-2263.0,2.8318,575.073961,572.538736,562.541337,9.997398,9.134503,90.744592,0.0


In [3]:
origi_data[origi_data['Date']>='2023-07-01']

Unnamed: 0,Date,open,high,low,close,volume,financing,fi,ii,di,rp,capital,EMA9,EMA12,EMA26,MACD,Signal,RSI14,y_5
603,2023-07-03,578.0,580.0,576.0,579.0,15118.0,97.0,2353.0,-305.0,1401.0,582.0,2.5477,576.701182,575.940219,566.756798,9.183421,11.833896,33.311646,0
604,2023-07-04,585.0,585.0,580.0,585.0,17777.0,84.0,4805.0,-1348.0,-13.0,1767.0,2.7068,578.360946,577.334031,568.108146,9.225886,11.312294,30.627646,0
605,2023-07-05,589.0,589.0,579.0,582.0,15554.0,-50.0,-890.0,-503.0,-1092.0,-790.0,2.6473,579.088757,578.051873,569.137172,8.914701,10.832775,17.707207,0
606,2023-07-06,573.0,574.0,565.0,565.0,32070.0,563.0,-16476.0,-574.0,-603.0,-14045.0,5.1447,576.271005,576.043892,568.830715,7.213177,10.108856,7.697299,1
607,2023-07-07,565.0,572.0,563.0,565.0,19859.0,32.0,-4486.0,-185.0,-100.0,-1681.0,3.6264,574.016804,574.344832,568.546958,5.797874,9.246659,1.662877,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
699,2023-11-20,576.0,579.0,575.0,577.0,26606.0,176.0,3579.0,-125.0,270.0,-2193.0,5.4217,570.883694,567.188910,556.797391,10.391519,7.167611,94.971748,0
700,2023-11-21,582.0,585.0,581.0,585.0,39881.0,-334.0,18793.0,97.0,-772.0,10844.0,6.7572,573.706955,569.929078,558.886473,11.042605,7.942610,100.000000,0
701,2023-11-22,576.0,579.0,574.0,577.0,23922.0,533.0,-2966.0,-478.0,-230.0,-7073.0,4.7807,574.365564,571.016912,560.228216,10.788696,8.511827,97.938530,0
702,2023-11-23,574.0,578.0,574.0,578.0,15144.0,173.0,3740.0,-253.0,-218.0,93.0,3.0366,575.092451,572.091233,561.544644,10.546589,8.918779,95.718908,0


1.2.Splite data into train(Library) and test(Prediction)

In [4]:
Library = ticker_data[ticker_data['Date'] <= '2023-06-30'] # windows=20, 最後預測到6/30
Prediction = ticker_data[(ticker_data['Date'] >= '2023-07-01')&(ticker_data['Date'] <= '2023-07-20')] # test
# Prediction = ticker_data[(ticker_data['Date'] >= '2023-07-01')&(ticker_data['Date'] <= '2023-10-31')] 
Prediction

Unnamed: 0,Date,open,high,low,close,volume,financing,fi,ii,di,rp,capital,EMA9,EMA12,EMA26,MACD,Signal,RSI14,y_5
603,2023-07-03,578.0,580.0,576.0,579.0,15118.0,97.0,2353.0,-305.0,1401.0,582.0,2.5477,576.701182,575.940219,566.756798,9.183421,11.833896,33.311646,1.0
604,2023-07-04,585.0,585.0,580.0,585.0,17777.0,84.0,4805.0,-1348.0,-13.0,1767.0,2.7068,578.360946,577.334031,568.108146,9.225886,11.312294,30.627646,1.0
605,2023-07-05,589.0,589.0,579.0,582.0,15554.0,-50.0,-890.0,-503.0,-1092.0,-790.0,2.6473,579.088757,578.051873,569.137172,8.914701,10.832775,17.707207,1.0
606,2023-07-06,573.0,574.0,565.0,565.0,32070.0,563.0,-16476.0,-574.0,-603.0,-14045.0,5.1447,576.271005,576.043892,568.830715,7.213177,10.108856,7.697299,1.0
607,2023-07-07,565.0,572.0,563.0,565.0,19859.0,32.0,-4486.0,-185.0,-100.0,-1681.0,3.6264,574.016804,574.344832,568.546958,5.797874,9.246659,1.662877,0.0
608,2023-07-10,567.0,573.0,565.0,565.0,18996.0,-17.0,-329.0,-67.0,-775.0,-1141.0,3.9634,572.213443,572.907166,568.284221,4.622945,8.321917,0.333989,0.0
609,2023-07-11,574.0,577.0,570.0,577.0,18567.0,-263.0,9680.0,173.0,-306.0,4659.0,3.3345,573.170755,573.536832,568.929834,4.606998,7.578933,0.34881,0.0
610,2023-07-12,574.0,578.0,572.0,578.0,16220.0,-107.0,5154.0,2.0,-1385.0,543.0,2.6385,574.136604,574.223474,569.601698,4.621776,6.987501,0.495048,0.0
611,2023-07-13,587.0,590.0,585.0,585.0,26878.0,-566.0,20423.0,-647.0,403.0,9753.0,3.3511,576.309283,575.881401,570.742313,5.139088,6.617819,5.108553,1.0
612,2023-07-14,589.0,591.0,587.0,591.0,24381.0,-276.0,14641.0,-173.0,204.0,9289.0,3.3484,579.247426,578.207339,572.242882,5.964457,6.487146,13.098743,1.0


In [5]:
Library

Unnamed: 0,Date,open,high,low,close,volume,financing,fi,ii,di,rp,capital,EMA9,EMA12,EMA26,MACD,Signal,RSI14,y_5
5,2021-01-11,577.0,584.0,574.0,584.0,52772.0,681.0,-7717.0,351.0,-433.0,-10385.0,9.2596,553.361596,546.650220,530.901193,15.749027,9.642037,95.791563,1.0
6,2021-01-12,583.0,597.0,582.0,591.0,52605.0,738.0,-9625.0,2164.0,766.0,-9002.0,8.2691,560.890277,553.481824,535.567763,17.914060,11.356774,96.364177,1.0
7,2021-01-13,595.0,605.0,593.0,605.0,75708.0,2306.0,-7204.0,-260.0,1951.0,-7441.0,11.9535,569.713160,561.416110,540.939857,20.476252,13.233494,98.400949,1.0
8,2021-01-14,587.0,597.0,587.0,592.0,90310.0,481.0,-30284.0,-1843.0,-45.0,-33594.0,13.3009,574.170907,566.125549,544.877485,21.248064,14.873334,99.443302,1.0
9,2021-01-15,621.0,625.0,601.0,601.0,100998.0,-513.0,8090.0,20.0,411.0,1316.0,13.9495,579.537091,571.494925,549.192378,22.302547,16.386434,100.000000,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
598,2023-06-26,576.0,578.0,574.0,574.0,29870.0,164.0,-4194.0,-3.0,314.0,-2859.0,5.6543,579.105659,576.716989,562.864922,13.852067,15.124979,50.802228,0.0
599,2023-06-27,570.0,575.0,569.0,572.0,22447.0,-71.0,-3850.0,-2006.0,501.0,-168.0,4.2795,577.684527,575.991298,563.541594,12.449704,14.589924,45.390598,0.0
600,2023-06-28,579.0,579.0,571.0,574.0,18685.0,-113.0,-3437.0,-93.0,378.0,-1260.0,4.0474,576.947622,575.684944,564.316291,11.368654,13.945670,41.019391,0.0
601,2023-06-29,578.0,580.0,570.0,573.0,18046.0,28.0,-1355.0,-121.0,1407.0,-434.0,3.7922,576.158098,575.271876,564.959529,10.312347,13.219005,34.887970,0.0


In [6]:
len(Library)

598

In [7]:
Library[f'y_{TP}'].sum()

278.0

1.3.Over-Sampling Library

In [8]:
def over_sampling(Library, target):

    oversample = RandomOverSampler(random_state=87)
    features = Library.drop(columns=['Date', target])
    df_target = Library[target]

    features_resampled, target_resampled = oversample.fit_resample(features, df_target)

    resampled_df = features_resampled.copy()
    resampled_df[target] = target_resampled

    resampled_df.insert(0, 'Date', range(1, len(resampled_df) + 1))

    # print("過取樣後的資料框架大小:", resampled_df.shape)
    return resampled_df

In [9]:
def under_sampling(Library, target):

    undersample = RandomUnderSampler(random_state=87)
    features = Library.drop(columns=['Date', target])
    df_target = Library[target]

    features_resampled, target_resampled = undersample.fit_resample(features, df_target)

    resampled_df = features_resampled.copy()
    resampled_df[target] = target_resampled

    resampled_df.insert(0, 'Date', range(1, len(resampled_df) + 1))

    # print("過取樣後的資料框架大小:", resampled_df.shape)
    return resampled_df

In [10]:
Library = over_sampling(Library=Library, target=TARGET)
Library.tail(50)

Unnamed: 0,Date,open,high,low,close,volume,financing,fi,ii,di,rp,capital,EMA9,EMA12,EMA26,MACD,Signal,RSI14,y_5
590,591,583.0,585.0,582.0,583.0,14883.0,24.0,-1270.0,112.0,-296.0,-1419.0,2.6661,579.534491,575.34427,558.648208,16.696062,15.35098,74.790087,1.0
591,592,579.0,585.0,579.0,583.0,17013.0,-29.0,1297.0,-764.0,-206.0,-128.0,3.0596,580.227593,576.522075,560.452045,16.07003,15.49479,66.824821,0.0
592,593,582.0,585.0,578.0,581.0,23719.0,56.0,-1047.0,-692.0,590.0,-196.0,4.1668,580.382074,577.210987,561.974115,15.236871,15.443206,62.575711,0.0
593,594,576.0,578.0,574.0,574.0,29870.0,164.0,-4194.0,-3.0,314.0,-2859.0,5.6543,579.105659,576.716989,562.864922,13.852067,15.124979,50.802228,0.0
594,595,570.0,575.0,569.0,572.0,22447.0,-71.0,-3850.0,-2006.0,501.0,-168.0,4.2795,577.684527,575.991298,563.541594,12.449704,14.589924,45.390598,0.0
595,596,579.0,579.0,571.0,574.0,18685.0,-113.0,-3437.0,-93.0,378.0,-1260.0,4.0474,576.947622,575.684944,564.316291,11.368654,13.94567,41.019391,0.0
596,597,578.0,580.0,570.0,573.0,18046.0,28.0,-1355.0,-121.0,1407.0,-434.0,3.7922,576.158098,575.271876,564.959529,10.312347,13.219005,34.88797,0.0
597,598,570.0,576.0,568.0,576.0,33831.0,117.0,-2573.0,38737.0,649.0,-1545.0,6.1965,576.126478,575.383895,565.777341,9.606554,12.496515,34.306232,0.0
598,599,520.0,524.0,519.0,523.0,22520.0,-534.0,3043.0,1843.0,148.0,5037.0,4.9417,511.297727,508.914075,502.089766,6.824309,4.801063,85.411674,1.0
599,600,577.0,586.0,577.0,583.0,23421.0,-766.0,-2830.0,346.0,-95.0,-1994.0,5.6852,579.524786,581.480751,594.510163,-13.029412,-14.865448,36.172456,1.0


In [11]:
Library = under_sampling(Library=Library, target=TARGET)
Library

Unnamed: 0,Date,open,high,low,close,volume,financing,fi,ii,di,rp,capital,EMA9,EMA12,EMA26,MACD,Signal,RSI14,y_5
531,591,524.0,533.0,524.0,533.0,45681.0,-1083.0,29658.0,184.0,333.0,26367.0,9.7803,518.22836,517.86263,516.457897,1.404733,1.262612,26.817071,1.0
212,592,612.0,612.0,602.0,603.0,22979.0,546.0,-3565.0,-29.0,671.0,-5512.0,4.6955,609.260082,608.20104,603.291849,4.90919,4.893658,83.814573,1.0
304,593,559.0,566.0,558.0,561.0,16579.0,140.0,-3629.0,1085.0,89.0,-2846.0,4.0193,568.444629,570.688854,579.623045,-8.934191,-8.210021,46.056944,1.0
487,594,459.0,459.5,455.0,458.5,23972.0,-751.0,6527.0,-17.0,-635.0,6003.0,6.2956,454.941117,456.655303,460.535638,-3.880335,-1.941887,3.693829,1.0
123,595,583.0,584.0,578.0,582.0,43249.0,376.0,-21917.0,-1417.0,-45.0,-19349.0,5.3219,595.242509,595.153342,593.899066,1.254276,1.648712,50.86061,1.0
190,596,597.0,597.0,590.0,593.0,17725.0,140.0,253.0,83.0,-60.0,-2344.0,3.8974,592.219933,590.899389,590.20461,0.694779,-1.878284,79.996244,1.0
613,597,604.0,610.0,604.0,606.0,16772.0,185.0,6546.0,-359.0,972.0,4983.0,4.2554,603.21856,603.115813,602.836999,0.278814,0.184382,35.545205,1.0
137,598,596.0,596.0,588.0,591.0,13994.0,-82.0,-610.0,-157.0,-774.0,288.0,2.0623,590.222583,589.749475,589.820244,-0.070769,-1.143056,41.339067,1.0
456,599,476.0,491.0,476.0,491.0,39329.0,-25.0,2308.0,-192.0,239.0,8134.0,9.9135,467.28163,458.116101,438.153895,19.962205,9.227963,100.0,1.0
363,600,458.0,458.0,449.5,449.5,24156.0,95.0,-932.0,-34.0,-534.0,-286.0,5.3938,460.225498,464.636229,483.37844,-18.742211,-17.875765,23.280525,1.0


1.3. Concate dataframe function

In [12]:
def concate_Lib_Pred(Library, Prediction, th): 
    """
    th=-1 為Library
    th=n 為第n個Pred concate進Lib
    """
    if th < 0:
        Lib_Pred_df = Library
    
    else:
        row_to_add = Prediction.iloc[th]
        Lib_Pred_df = pd.concat([Library, row_to_add.to_frame().T], ignore_index=True)
        
    # 這種concate方法會有非數值問題, 要這樣修正
    Lib_Pred_df[Library.columns.to_list()[1:]] = Lib_Pred_df[Library.columns.to_list()[1:]].apply(pd.to_numeric, errors='coerce')
    # Lib_Pred_df['Date'] = pd.to_datetime(Lib_Pred_df['Date'])
    
    Lib_Pred_df['Date'] = range(1, len(Lib_Pred_df) + 1)

    return Lib_Pred_df

In [13]:
# Lib_Pred_df = concate_Lib_Pred(Library=Library, Prediction=Prediction, th=0)
# Lib_Pred_df

1.3.Data normalize function

In [14]:
def data_normalize(Lib_Pred_df):

    feature_to_standardize = Lib_Pred_df.columns.to_list()
    feature_to_standardize.remove(Lib_Pred_df.columns[0]) # 排除Date
    feature_to_standardize.remove(Lib_Pred_df.columns[-1])

    scaler = MinMaxScaler() 
    Lib_Pred_df[feature_to_standardize] = scaler.fit_transform(Lib_Pred_df[feature_to_standardize])

    return Lib_Pred_df

In [15]:
# Lib_Pred_df = data_normalize(Lib_Pred_df)
# Lib_Pred_df.tail()

# MDRSmap I

2.1. 製作可餵入EDM格式的train_feature

In [16]:
def find_train_target_feature(data, target):

    df_columns = list(data.columns)
    train_feature = df_columns.copy()
    train_feature.remove('Date') # 先拿掉日期
    
    formatted_columns = ' '.join(df_columns[1:]) # 變成可以餵給 EDM function 參數 'columns' 的形式 
    train_feature.remove(target) # 再拿掉 target_feature

    return formatted_columns, train_feature

In [17]:
# formatted_columns, train_feature = find_train_target_feature(data=Library, target=TARGET)
# formatted_columns

2.2. 找出target_feature最佳嵌入維度

In [18]:
def find_target_OED(data, target):

    target_OED = EmbedDimension(dataFrame=data, lib=f'1 {len(data)}', pred=f'{len(data)-21} {len(data)-1}', columns=target, showPlot=False) # 4. lib訓練全部, pred看最後20筆

    target_OED_rho = target_OED['rho'].max()
    target_OED = int(target_OED['E'][target_OED['rho'] == target_OED['rho'].max()].iloc[0])

    return target_OED, target_OED_rho


In [19]:
# target_OED, target_OED_rho = find_target_OED(data=Library, target=TARGET)
# print(f'target_OED: {target_OED}, target_OED_rho: {target_OED_rho}')

2.3. 找出所有有因果關係的train_feature

In [20]:
def find_rho_sig_df(data, ticker, target, target_OED, train_feature, E_max):
    
    crirho = stats.t.ppf(0.95, len(data) - 1) / (len(data) - 2 + stats.t.ppf(0.95, len(data) - 1) ** 2)
    ccm_libSizes = f'{target_OED+10} {len(data)-10} 10'
    # ccm_libSizes = list(range(10, len(data) + 1, 10)) + [len(data)]  # sequence of library size # original

    rho_sig_df = pd.DataFrame(columns=train_feature)
    for train in train_feature:

        ### 找出該train_feature最好的ccm_OED ###
        ccm_E_termRHO = pd.DataFrame(columns=['E', 'term_rho'])
        """
        這裡假設用 term_rho 來選 ccm_OED
        """
        for e in range(1, E_max+1):
            ccm_result = CCM(dataFrame=data, E=e, columns=train, target=target,
                            libSizes=ccm_libSizes, random=False, showPlot=False)
            # print(e, ccm_result[f'{target}:open'].iloc[-1]) # 有時候會有warning, 測試用
            new_data = {'E': e, 'term_rho': ccm_result[f'{target}:{train}'].iloc[-1]}
            ccm_E_termRHO.loc[len(ccm_E_termRHO)] = new_data

        max_term_rho_index = ccm_E_termRHO['term_rho'].idxmax()
        ccm_OED = ccm_E_termRHO.at[max_term_rho_index, 'E']

        ### 用最好的ccm_OED來做該feature的因果檢定 ###
        ccm_result = CCM(dataFrame=data, E=ccm_OED, columns=train, target=target, 
                        libSizes=ccm_libSizes, random=False, showPlot=False)
        """
        這裡假設用 target:train 、 LibSize 來做 kendalltau 檢定
        """
        ccm_result = ccm_result[['LibSize', f'{target}:{train}']]
        ccm_result[f'{target}:{train}'][ccm_result[f'{target}:{train}'] < 0] = 0
        term_rho = ccm_result[f'{target}:{train}'].iloc[-1]

        tau, p_value = kendalltau(ccm_result['LibSize'], ccm_result[f'{target}:{train}']) # 進行 kendalltau 相關檢定

        alpha = 0.05
        if (p_value < alpha) and (term_rho > crirho): # 顯著相關
            rho_sig_df[train] = [term_rho]

        else: # "不" 顯著相關
            rho_sig_df[train] = [0]

    rho_sig_df.index = pd.Index([f'{ticker}_{target}']) 
    
    return rho_sig_df


In [21]:
# rho_sig_df = find_rho_sig_df(data=Library, ticker=TICKER, target=TARGET, 
#                              target_OED=target_OED, train_feature=train_feature, E_max=10)
# rho_sig_df

2.4. 用有因果關係的train_feature建立Embed_df

In [22]:
def make_Embed_df(data, max_lag, target, rho_sig_df, th):

    #用有因果關係的 train feature + target feature 製作 Embed_df #
    non_zero_columns = rho_sig_df.loc[:, (rho_sig_df != 0).any(axis = 0)] # 選取值非0的column
    train_feature_ls = list(non_zero_columns.columns)
    formatted_columns = ' '.join(train_feature_ls) # 轉成 EDM column 的 input
    columns_to_lag = formatted_columns + f' {target}' # 加入 target 本身

    Embed_df = Embed(dataFrame=data, E=max_lag, tau=-1, columns=columns_to_lag) # 製作 Embed_df
    Embed_df['Date'] = data['Date'] # 加入Date來看index, 才可以防simplex func的bug
    Embed_df.dropna(inplace=True) # 把包含NaN的資料拿掉
    Embed_df = Embed_df.reset_index(drop=True)
    Embed_df = Embed_df[['Date'] + [col for col in Embed_df.columns if col != 'Date']]

    if th == 0 :
        ML_df_date = Embed_df.copy()
        ML_df_date['Date'] = pd.to_datetime(ML_df_date['Date']) # 將index設為日期
        ML_df_date.set_index('Date', inplace=True)
        ML_df_date = ML_df_date.filter(like="(t-0)") # 只留下(t-0)的column
    
    if th > 0 :
        ML_df_date = Embed_df.copy()
        ML_df_date = ML_df_date.dropna().reindex()
        # ML_df_date['Date'] = pd.to_datetime(ML_df_date['Date']) # 將index設為日期
        # ML_df_date.set_index('Date', inplace=True)
        ML_df_date = ML_df_date.filter(like="(t-0)") # 只留下(t-0)的column

    return Embed_df, ML_df_date

In [23]:
# Embed_df, ML_df_date = make_Embed_df(data=Lib_Pred_df, max_lag=10, target=TARGET, rho_sig_df=rho_sig_df, th=1)
# Embed_df.tail()

In [24]:
# ML_df_date.tail()

2.5. 用simplex randomsearch找出最佳的view

In [25]:
def make_random_simplex(Embed_df, target, target_OED, kmax, kn):    

    Embed_for_train = Embed_df.drop(columns='Date') # 先把 Date 拿掉
    Embed_for_train = Embed_for_train.drop(columns=f'{target}(t-0)') # 先把 target 拿掉
    train_f_ls = list(Embed_for_train.columns) # train_feature
    train_f_num = len(Embed_for_train.columns) # train_feature 的個數

    rho_feature_view = pd.DataFrame(columns=['rho']) # 創建一個df去紀錄每個隨機view的資料
    new_column = pd.DataFrame(columns=['feature_' + str(i) for i in range(1, target_OED+1)])
    rho_feature_view = pd.concat([rho_feature_view, new_column], axis=1)
    k = 1
    while k <= kmax:
        random_pick_train = np.random.choice(train_f_num, target_OED, replace=False)
        # print(random_pick_train)

        train_f_ls = np.array(train_f_ls) # 變成 array 才可以一次選
        select_train_f = train_f_ls[random_pick_train] # 隨機選到的 train_feature
        formatted_random_columns = ' '.join(select_train_f) # 用成符合 EDM 的資料格式
        # print(formatted_random_columns)

        simp = Simplex(dataFrame=Embed_df, E=target_OED, # ver3: 測試近10 or 20個交易日
                       lib=f'1 {len(Embed_df)}', pred = f'{len(Embed_df)-21} {len(Embed_df)-1}', 
                       columns=formatted_random_columns, target=f'{target}(t-0)',
                       embedded = True, showPlot = False) # 原本是False現在改True
        # print(simp)

        sub_simp = simp[['Observations', 'Predictions']] # 計算rho
        rho = sub_simp['Observations'].corr(sub_simp['Predictions'])

        rho_feature_view.loc[len(rho_feature_view), 'rho'] = rho # 將 view 更新到 rho_feature_view 的 df 中
        rho_feature_view.loc[len(rho_feature_view)-1, rho_feature_view.columns[1:]] = select_train_f
        # print(rho)
        k += 1

    allscore = rho_feature_view.sort_values(by='rho', ascending=False).head(kn)
    allscore = allscore.reset_index(drop=True)

    return allscore

In [26]:
# allscore = make_random_simplex(Embed_df=Embed_df, target=TARGET, target_OED=target_OED, kmax=10000, kn=5)
# allscore.head()

# MDRSmap II

3.1. 計算每個時點的(view加權)距離

In [27]:
def compute_view_w_distance(Embed_df, allscore):

    ww = allscore['rho'] / allscore['rho'].sum() # 每個view的權重

    dmatrix_ls = []
    for j in range(allscore.shape[0]):

        view_feature = allscore.iloc[j, 1:] # 選取第j個view的所有feature
        view_feature = np.array(view_feature) # 把所有feature變成array才可以從完整Embed_df中找資料
        view_feature_value = Embed_df[view_feature]
        view_matrix = view_feature_value.to_numpy() # 從df形式變array
        view_matrix = np.vstack(view_matrix) # 這樣才能疊成matrix

        Dx_t2 = pdist(view_matrix, metric='euclidean') * ww[j] # 計算加權距離
        Dx_t2 = squareform(Dx_t2) # 將距離變成squareform
        dmatrix_ls.append(Dx_t2)

    v_w_dmatrix = np.sum(dmatrix_ls, axis=0) # 輸出每個時點的view加權距離

    return v_w_dmatrix

In [28]:
# v_w_dmatrix = compute_view_w_distance(Embed_df=Embed_df, allscore=allscore)
# v_w_dmatrix

3.2.尋找elastic-net最佳參數

In [29]:
# ### test ###
# target = 'bs'
# Tp=1

# ML_df_date_new = ML_df_date.copy()
# ML_df_date_new[f'ans(t-0)'] = ML_df_date_new[f'{target}(t-0)'].shift(-Tp) # step.1: 先將target往前移Tp, 製作y
# # ML_df_date_new = ML_df_date_new.multiply(w_tp, axis=0) # step.2: 再將data乘上距離加權
# ML_df_date_new = ML_df_date_new[:-(Tp+1)] # step.3: 拿掉最後Tp+1個, 因為最後面的data是硬拼上去的
# ML_df_date_new

In [30]:
def find_MDRSmap_param(target, ML_df_date, theta_seq, v_w_dmatrix, Tp):

    result_ls = pd.DataFrame(columns=['Theta', 'Score', 'Param']) # 創建紀錄回測結果的dataframe

    ### 將原始資料乘上空間位置權數 ###
    tp = len(ML_df_date) -1
    tp_distence = v_w_dmatrix[tp] # 第tp個時點離其他時點的距離
    mask = np.ones(len(tp_distence), dtype=bool) # 遮蔽該時點計算平均數
    mask[tp] = False
    dpar = np.mean(tp_distence[mask]) # 第tp個時點離其他時點的平均數

    for theta in theta_seq:
        w_tp = np.exp(-theta * tp_distence / dpar) # 計算每個時點資料的加權
        w_tp = np.sqrt(w_tp)

        ### 加入答案列 ###
        ML_df_date_new = ML_df_date.copy()
        ML_df_date_new[f'ans(t-0)'] = ML_df_date_new[f'{target}(t-0)'].shift(-Tp) # step.1: 先將target往前移Tp, 製作y
        ML_df_date_new = ML_df_date_new.multiply(w_tp, axis=0) # step.2: 再將data乘上距離加權
        ML_df_date_new['ans(t-0)'] = ML_df_date_new['ans(t-0)'].apply(lambda x: 1.0 if x != 0 else x) # step.3: 把ans非0的部分變成1
        ML_df_date_new = ML_df_date_new[:-(Tp+1)] # step.4: 拿掉最後Tp+1個, 因為最後面的data是硬拼上去的
        # ML_df_date_new = ML_df_date_new.drop(columns=[f'{target}(t-0)']) # step.5: 原paper有刪target啦, 這邊可選擇刪或不刪
        # ML_df_date_new = ML_df_date_new.dropna().reset_index(drop=True) # 不確定要不要用

        # ### 分拆train, validation(以近60天為基準) ###
        X = ML_df_date_new.iloc[:, :-1]
        y = ML_df_date_new.iloc[:, -1]
        # val_fold = [-1] * (len(X)-60) + [0] * 60 # 最後60筆當validation set
        # ps = PredefinedSplit(test_fold=val_fold)

        logistic_elastic_net = LogisticRegression(penalty='elasticnet', 
                                                  solver='saga', # 只有saga支持elasticnet
                                                  random_state=87)

        ### grid search ###
        param_grid = {'l1_ratio': [0.9, 0.1, 0.01, 0.001, 0.0001],
                      'C': [0.001, 0.01, 0.1, 1, 10],
                      'tol': [1e-5, 1e-4, 1e-3, 1e-2, 1e-1],
                      'fit_intercept': [True], 
                      'intercept_scaling': [0.1],
                      'warm_start': [True]}
        grid_search = GridSearchCV(estimator=logistic_elastic_net, 
                                   param_grid=param_grid, 
                                #    cv=ps,
                                   cv=5,
                                   scoring='accuracy', 
                                   return_train_score=True)       

        grid_search.fit(X, y)

        ### 記錄結果 ###
        result_ls.loc[len(result_ls), 'Theta'] = theta
        result_ls.loc[len(result_ls)-1, 'Score'] = grid_search.best_score_
        result_ls.loc[len(result_ls)-1, 'Param'] = [grid_search.best_params_]

        theta = result_ls['Theta'][result_ls['Score'].idxmax()]
        param = result_ls['Param'][result_ls['Score'].idxmax()][0]

    return result_ls, theta, param

In [31]:
# result_ls, theta, param = find_MDRSmap_param(target=TARGET, 
#                                              ML_df_date=ML_df_date, 
#                                              theta_seq=[1,2,4,7,11,16,22], 
#                                              v_w_dmatrix=v_w_dmatrix,
#                                              Tp=TP)

In [32]:
# result_ls['Param'][5]

3.3.用最佳參數訓練MDRSmap

In [33]:
def MDRSmap_model(target, ML_df_date, theta, v_w_dmatrix, param, Tp):

    ### 將原始資料乘上空間位置權數 ###
    tp = len(ML_df_date) -1
    tp_distence = v_w_dmatrix[tp] # 第tp個時點離其他時點的距離
    mask = np.ones(len(tp_distence), dtype=bool) # 遮蔽該時點計算平均數
    mask[tp] = False
    dpar = np.mean(tp_distence[mask]) # 第tp個時點離其他時點的平均數

    w_tp = np.exp(-theta * tp_distence / dpar) # 計算每個時點資料的加權
    w_tp = np.sqrt(w_tp)

    ### 加入答案列 ###
    ML_df_date_new = ML_df_date.copy()
    ML_df_date_new[f'ans(t-0)'] = ML_df_date_new[f'{target}(t-0)'].shift(-Tp) # step.1: 先將target往前移Tp, 製作y
    ML_df_date_new = ML_df_date_new.multiply(w_tp, axis=0) # step.2: 再將data乘上距離加權
    ML_df_date_new['ans(t-0)'] = ML_df_date_new['ans(t-0)'].apply(lambda x: 1.0 if x != 0 else x) # step.3: 把ans非0的部分變成1
    ML_df_date_new = ML_df_date_new[:-(Tp+1)] # step.4: 拿掉最後Tp+1個, 因為最後面的data是硬拼上去的
    # ML_df_date_new = ML_df_date_new.drop(columns=[f'{target}(t-0)']) # step.5: 原paper有刪target啦, 這邊可選擇刪或不刪
    # ML_df_date_new = ML_df_date_new.dropna().reset_index(drop=True) # 不確定要不要用

    ### 分拆train, validation(以近60天為基準) ###
    X = ML_df_date_new.iloc[:, :-1]
    y = ML_df_date_new.iloc[:, -1]

    logistic_elastic_net = LogisticRegression(penalty='elasticnet', 
                                              solver='saga', # 只有saga支持elasticnet
                                              random_state=87,
                                              **param)
                             

    logistic_elastic_net.fit(X, y)

    return logistic_elastic_net

In [34]:
# logistic_elastic_net = MDRSmap_model(target=TARGET, ML_df_date=ML_df_date, 
#                                      theta=theta, v_w_dmatrix=v_w_dmatrix, param=param, Tp=TP)

3.4.進行預測

In [35]:
# X_pred = np.array(ML_df_date.iloc[-1]).reshape(1, -1)
# y_pred = logistic_elastic_net.predict(X_pred)
# y_pred = y_pred[0]
# y_pred

3.5.製作評估dataframe

In [36]:
# Date = origi_data['Date'][(origi_data['Date']>='2023-07-01')&(origi_data['Date']<='2023-11-30')].reset_index(drop=True)
# Today = origi_data['bs'][(origi_data['Date']>='2023-07-01')&(origi_data['Date']<='2023-11-30')].reset_index(drop=True)
# Yesterday = origi_data['bs'][(origi_data['Date']>='2023-06-30')&(origi_data['Date']<='2023-11-29')].reset_index(drop=True)

# MDRSmap_result = pd.DataFrame(Date)
# MDRSmap_result['Observations'] = Today
# MDRSmap_result['Predictions'] = None
# MDRSmap_result['Yesterday'] = Yesterday
# MDRSmap_result

In [37]:
# th=0
# MDRSmap_result.loc[th, 'Predictions'] = y_pred
# MDRSmap_result

In [38]:
# MDRSmap_result['Date'][th]

### 四、完整預測流程

4.1.完整預測流程

In [39]:
### 從 6/30 預測 7/03 開始 ###

### 製作評估dataframe ###
Date = origi_data['Date'][(origi_data['Date']>='2023-07-01')&(origi_data['Date']<='2023-07-20')].reset_index(drop=True)
Observations = origi_data[TARGET][(origi_data['Date']>='2023-07-01')&(origi_data['Date']<='2023-07-20')].reset_index(drop=True)

MDRSmap_result = pd.DataFrame(Date)
MDRSmap_result['Observations'] = Observations
MDRSmap_result['Predictions'] = None

th=0

Lib_Pred_df = concate_Lib_Pred(Library=Library, Prediction=Prediction, th=th)
Lib_Pred_df = data_normalize(Lib_Pred_df)

formatted_columns, train_feature = find_train_target_feature(data=Lib_Pred_df, target=TARGET)
target_OED, target_OED_rho = find_target_OED(data=Lib_Pred_df, target=TARGET)
rho_sig_df = find_rho_sig_df(data=Lib_Pred_df, ticker=2330, target=TARGET, 
                                target_OED=target_OED, train_feature=train_feature, E_max=10)

Embed_df, ML_df_date = make_Embed_df(data=Lib_Pred_df, max_lag=10, target=TARGET, rho_sig_df=rho_sig_df, th=th)
allscore = make_random_simplex(Embed_df=Embed_df, target=TARGET, target_OED=target_OED, kmax=10000, kn=5)
v_w_dmatrix = compute_view_w_distance(Embed_df=Embed_df, allscore=allscore)

result_ls, theta, param = find_MDRSmap_param(target=TARGET, 
                                             ML_df_date=ML_df_date, 
                                             theta_seq=[1,2,4,7,11,16,22], 
                                             v_w_dmatrix=v_w_dmatrix,
                                             Tp=TP)
logistic_elastic_net = MDRSmap_model(target=TARGET, ML_df_date=ML_df_date, 
                                     theta=theta, v_w_dmatrix=v_w_dmatrix, param=param, Tp=TP)

X_pred = np.array(ML_df_date.iloc[-1]).reshape(1, -1)
y_pred = logistic_elastic_net.predict(X_pred)
y_pred = y_pred[0]

MDRSmap_result.loc[th, 'Predictions'] = y_pred
print(f"{MDRSmap_result['Date'][th]}: finished")

for th in range(1, len(Prediction)):

    Library = over_sampling(Library=Library, target=TARGET)
    # Library = under_sampling(Library=Library, target=TARGET)
    Lib_Pred_df = concate_Lib_Pred(Library=Library, Prediction=Prediction, th=th)
    Lib_Pred_df = data_normalize(Lib_Pred_df)
    Embed_df, ML_df_date = make_Embed_df(data=Lib_Pred_df, max_lag=10, target=TARGET, rho_sig_df=rho_sig_df, th=th)
    v_w_dmatrix = compute_view_w_distance(Embed_df=Embed_df, allscore=allscore)
    result_ls, theta, param = find_MDRSmap_param(target=TARGET, 
                                                    ML_df_date=ML_df_date, 
                                                    theta_seq=[1,2,4,7,11,16,22], 
                                                    v_w_dmatrix=v_w_dmatrix,
                                                    Tp=TP)
    logistic_elastic_net = MDRSmap_model(target=TARGET, ML_df_date=ML_df_date, 
                                            theta=theta, v_w_dmatrix=v_w_dmatrix, param=param, Tp=TP)
    
    X_pred = np.array(ML_df_date.iloc[-1]).reshape(1, -1)
    y_pred = logistic_elastic_net.predict(X_pred)
    y_pred = y_pred[0]

    MDRSmap_result.loc[th, 'Predictions'] = y_pred
    print(f"{MDRSmap_result['Date'][th]}: finished")

ValueError: 
All the 625 fits failed.
It is very likely that your model is misconfigured.
You can try to debug the error by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
625 fits failed with the following error:
Traceback (most recent call last):
  File "/Users/yitsung/miniconda3/envs/EDM_test/lib/python3.9/site-packages/sklearn/model_selection/_validation.py", line 686, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/Users/yitsung/miniconda3/envs/EDM_test/lib/python3.9/site-packages/sklearn/linear_model/_logistic.py", line 1196, in fit
    X, y = self._validate_data(
  File "/Users/yitsung/miniconda3/envs/EDM_test/lib/python3.9/site-packages/sklearn/base.py", line 584, in _validate_data
    X, y = check_X_y(X, y, **check_params)
  File "/Users/yitsung/miniconda3/envs/EDM_test/lib/python3.9/site-packages/sklearn/utils/validation.py", line 1106, in check_X_y
    X = check_array(
  File "/Users/yitsung/miniconda3/envs/EDM_test/lib/python3.9/site-packages/sklearn/utils/validation.py", line 921, in check_array
    _assert_all_finite(
  File "/Users/yitsung/miniconda3/envs/EDM_test/lib/python3.9/site-packages/sklearn/utils/validation.py", line 161, in _assert_all_finite
    raise ValueError(msg_err)
ValueError: Input X contains NaN.
LogisticRegression does not accept missing values encoded as NaN natively. For supervised learning, you might want to consider sklearn.ensemble.HistGradientBoostingClassifier and Regressor which accept missing values encoded as NaNs natively. Alternatively, it is possible to preprocess the data, for instance by using an imputer transformer in a pipeline or drop samples with missing values. See https://scikit-learn.org/stable/modules/impute.html You can find a list of all estimators that handle NaN values at the following page: https://scikit-learn.org/stable/modules/impute.html#estimators-that-handle-nan-values


4.3.評估結果

In [None]:
ACC = len(MDRSmap_result[MDRSmap_result['Predictions'] == MDRSmap_result['Observations']]) / len(MDRSmap_result['Observations'])
print('ACC: ', ACC)
MDRSmap_result.head(60)

ACC:  0.5


Unnamed: 0,Date,Observations,Predictions
0,2023-07-03,1,0.0
1,2023-07-04,0,0.0
2,2023-07-05,0,0.0
3,2023-07-06,1,1.0
4,2023-07-07,1,0.0
5,2023-07-10,1,0.0
6,2023-07-11,1,0.0
7,2023-07-12,1,0.0
8,2023-07-13,1,1.0
9,2023-07-14,1,0.0
