In [1]:
# Load in our libraries
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

import plotly.offline as py
py.init_notebook_mode(connected=True)
import plotly.graph_objs as go
import plotly.tools as tls

import warnings
warnings.filterwarnings('ignore')

import time
import datetime as dt

from sklearn import preprocessing
from sklearn import svm

import math

In [2]:
data = pd.read_csv("tsharep.csv",encoding=' big5-hkscs ').rename(columns={'代碼':'code','日期':'date','中文簡稱':'name','開盤價(元)':'open','最高價(元)':'high','最低價(元)':'low','收盤價(元)':'close','成交張數(張)':'volume'})
data.head()

Unnamed: 0,code,date,name,open,high,low,close,volume
0,1101,20130102,台泥,38.95,39.1,38.65,39.0,6374
1,1101,20130103,台泥,39.5,39.5,38.75,38.85,9710
2,1101,20130104,台泥,39.4,39.45,38.6,39.0,8682
3,1101,20130107,台泥,39.1,39.1,38.65,38.9,5067
4,1101,20130108,台泥,38.9,39.1,38.2,38.5,6454


In [3]:
def ETF_data_processing(filepath):
    ETFtable = pd.read_csv(filepath,encoding=' big5-hkscs ').rename(columns={'代碼':'code','日期':'date','中文簡稱':'name','開盤價(元)':'open','最高價(元)':'high','最低價(元)':'low','收盤價(元)':'close','成交張數(張)':'volume'})
   
    #processing data type 
    ETFtable['date'] = ETFtable['date'].map(lambda x:dt.datetime.strptime(str(x),'%Y%m%d'))
#     ETFtable['week'] = ETFtable['date'].map(lambda x:x.isoweekday())
    ETFtable['close'] = ETFtable['close'].map(lambda x:x if (type(x)==float) else float(x.replace(',','')) )
    ETFtable['open'] = ETFtable['open'].map(lambda x:x if (type(x)==float) else float(x.replace(',','')) )
    ETFtable['high'] = ETFtable['high'].map(lambda x:x if (type(x)==float) else float(x.replace(',','')) )
    ETFtable['low'] = ETFtable['low'].map(lambda x:x if (type(x)==float) else float(x.replace(',','')) )
    ETFtable['volume'] = ETFtable['volume'].map(lambda x:x if (type(x)==float) else float(x.replace(',','')) )
    
    return ETFtable

In [4]:
ETFtable = ETF_data_processing('tetfp.csv')
ETFtable.head()

Unnamed: 0,code,date,name,open,high,low,close,volume
0,50,2013-01-02,元大台灣50,54.0,54.65,53.9,54.4,16487.0
1,50,2013-01-03,元大台灣50,54.9,55.05,54.65,54.85,29020.0
2,50,2013-01-04,元大台灣50,54.85,54.85,54.4,54.5,9837.0
3,50,2013-01-07,元大台灣50,54.55,54.55,53.9,54.25,8910.0
4,50,2013-01-08,元大台灣50,54.0,54.2,53.65,53.9,12507.0


In [13]:
#target ETF
def extract_target_ETF(ETFtable,ETFcode):
    ETF_target = ETFtable[ETFtable['code']==ETFcode].drop(['code','name'],axis = 1).groupby(['date']).sum()
#     ETF_target = pd.concat([ETF_target,index_table],axis = 1)
    
    # move the target feature to the last column
    
    return ETF_target

def get_change_rate(stock,back_day = 1):
    stock_r = stock.replace(0,0.1) 
    for back in range(back_day):
        if(back == 0):
            stock_rate = (stock_r - stock_r.shift(back+1).fillna(0))/stock_r
        else:
            stock_temp = (stock_r - stock_r.shift(back+1).fillna(0))/stock_r
            
            #rename columns
            column_names = list(stock_temp.columns)
            dict_ = {}
            for column_name in column_names:
                dict_[column_name] = column_name+f'_{back}'
                
            stock_temp = stock_temp.rename(columns = dict_)
            
            stock_rate = pd.concat([stock_rate,stock_temp],axis = 1)
    
    return stock_rate[back_day:]

def rise_fall(rate,describe):
    if (rate >= describe[1]+describe[2]*0.25):
        return 1
    elif(rate <= describe[1]-describe[2]*0.25):
        return -1
    else :
        return 0

def sign(x):
    if (x > 0):
        return 1
    elif(x < 0):
        return -1
    else:
        return 0

def get_rise_fall(df):
    df_rise_fall_d = df['close'].describe()
#     df_i_rise_fall_d = df['i_close'].describe()
    
    trf = []
    rf = []
    irf = []
    for day in range(df.shape[0]):
        rf.append(rise_fall(df['close'][day],df_rise_fall_d))
#         irf.append(rise_fall(df['i_close'][day],df_i_rise_fall_d))
        trf.append(sign(df['close'][day]))
#     df['i_rise_fall'] = irf 
    df['rise_fall'] = rf
    df['t_rise_fall'] = trf
    
#data type
def normalize_data(df):
    min_max_scaler = preprocessing.MinMaxScaler()
    df_N = df.copy()
    
    for columnname in df_N.columns:
        df_N[columnname] = min_max_scaler.fit_transform(df_N[columnname].values.reshape(-1,1))
    
    return df_N 

def denormalize(stock, normalized_value,columnname): 
    stock_price = stock[columnname].values.reshape(-1,1)
    normalized_value = normalized_value.reshape(-1,1)

    #return df.shape, p.shape
    min_max_scaler = preprocessing.MinMaxScaler()
    stock_price_N = min_max_scaler.fit_transform(stock_price)
    new = min_max_scaler.inverse_transform(normalized_value)
    return new

#load data
def load_data(stock, seq_len):
    amount_of_features = len(stock.columns) 
    data = stock.as_matrix() 
    sequence_length = seq_len + 1 # index starting from 0
    result = []
    
    for index in range(len(data) - sequence_length): # maxmimum date = lastest date - sequence length
        result.append(data[index: index + sequence_length]) # index : index + 20days
    
    result = np.array(result)
    row = round(0.9 * result.shape[0]) # 90% split
    train = result[:int(row), :] # 90% date, all features 
    
    x_train = train[:, :-1] #make the last day of train data as y_train
    y_train = train[:, -1][:,-1] #the close price of the last day of every data 
    
    x_test = result[int(row):, :-1] 
    y_test = result[int(row):, -1][:,-1]

    x_train = x_train.reshape((x_train.shape[0],x_train.shape[1]*x_train.shape[2]))
    x_test =  x_test.reshape((x_test.shape[0],x_test.shape[1]*x_test.shape[2]))
     

    return [x_train, y_train, x_test, y_test]

#load data
def load_data_regression(stock, seq_len):
    t = stock['close'].values
    stock_r = stock.drop(['close'],axis=1)
    stock_r['close'] = t
    
    amount_of_features = len(stock.columns) 
    data = stock.as_matrix() 
    sequence_length = seq_len + 1 # index starting from 0
    result = []
    
    for index in range(len(data) - sequence_length): # maxmimum date = lastest date - sequence length
        result.append(data[index: index + sequence_length]) # index : index + 20days
    
    result = np.array(result)
    row = round(0.9 * result.shape[0]) # 90% split
    train = result[:int(row), :] # 90% date, all features 
    
    x_train = train[:, :-1] #not use close and rise_fall 
    y_train = train[:, -1][:,-1] #the close price of the last day of every data 
    
    x_test = result[int(row):, :-1] 
    y_test = result[int(row):, -1][:,-1]

    x_train = x_train.reshape((x_train.shape[0],x_train.shape[1]*x_train.shape[2]))
    x_test =  x_test.reshape((x_test.shape[0],x_test.shape[1]*x_test.shape[2]))
     

    return [x_train, y_train, x_test, y_test]

# Model
class LinearRegressionReg:

    def __init__(self):
        self._dimension = 0

    def fit(self, X, Y, lamb):  #calculate w
        self._dimension = X.shape[1]
        self._w = np.zeros((self._dimension,1))
        self._lamb = lamb
        self._w = np.linalg.inv(np.dot(X.T, X) + lamb*np.eye(self._dimension)).dot(X.T).dot(Y)

    def predict(self, X):
        result = np.dot(X, self._w)
        return result
    
    def fiveday_predict(self,X,Y):
        for i in range(5):
            Y_predict = self.predict(X)
    
    def error(self, X, Y):  #squared error
        Y_predict = self.predict(X)
        return sum((Y_predict-Y)**2)/(len(Y)*1.0)            

    def get_w(self):
        return self._w

    def print_val(self):
        print ("w: ", self._w)
    
    def score(self,X,Y,original_stock):
        Y_predict = self.predict(X)
        Y_predict_de = denormalize(original_stock,Y_predict,'close').reshape((Y.shape[0]))
        Y_test_de = denormalize(original_stock,Y,'close').reshape((Y.shape[0]))
        return sum(((Y_test_de-abs(Y_predict_de-Y_test_de))/Y_test_de)*0.5)/(len(Y_test_de)*1.0)


In [103]:
def one_day_prediction_regression(ETF_list,window,lamb):
    ETF_score = []
    for ETF_code in ETF_list:
        ETF_target = extract_target_ETF(ETFtable,ETF_code)
        
        #price data
        ETF_target_N = normalize_data(ETF_target)
        X_train_p, y_train_p, X_test_p, y_test_p = load_data(ETF_target_N, window)
        
        #rate data
        ETF_target_rate = get_change_rate(ETF_target)
        get_rise_fall(ETF_target_rate)
        X_train_r, y_train_r, X_test_r, y_test_r = load_data(ETF_target_rate, window)
        
        ### Model
        lr = LinearRegressionReg()
        lr.fit(X_train_p, y_train_p, lamb)

        ### score
        p = lr.predict(X_test_p)
        y_predict_de = denormalize(ETF_target,p,'close').reshape((y_test_p.shape[0]))
        
        y_predict_rise_fall = []
        for day,price in enumerate(y_predict_de):
                if (day != 0):
                    y_predict_rise_fall.append(sign(y_predict_de[day]-y_predict_de[day-1]))                
        y_predict_rise_fall = np.array(y_predict_rise_fall).astype(np.float64)
        
        if(y_test_r.shape[0] > y_predict_rise_fall.shape[0]):
            y_test_r_short = y_test_r[y_test_r.shape[0]-y_predict_rise_fall.shape[0]:] 
            score = (sum(y_test_r_short == y_predict_rise_fall)*0.5)/(y_test_r_short.shape[0]*1.0)
        else:
            score = (sum(y_test_r == y_predict_rise_fall)*0.5)/(y_test_r.shape[0]*1.0)

        print(f'{ETF_code} average one day score is',score)
        ETF_score.append(score)
        
    print('Total ETF one day rise_fall score is',sum(ETF_score))    
    print('Average ETF one day rise_fall score is',sum(ETF_score)/18.0)

In [104]:
# 0 <= score <= 0.5
ETF_list = [50,51,52,53,54,55,56,57,58,59,6201,6203,6204,6208,690,692,701,713]
window = 20
lamb = 1
one_day_prediction_regression(ETF_list,window,lamb)

50 average one day score is 0.2698412698412698
51 average one day score is 0.2896825396825397
52 average one day score is 0.18253968253968253
53 average one day score is 0.24603174603174602
54 average one day score is 0.21031746031746032
55 average one day score is 0.23412698412698413
56 average one day score is 0.2261904761904762
57 average one day score is 0.21031746031746032
58 average one day score is 0.20238095238095238
59 average one day score is 0.21031746031746032
6201 average one day score is 0.25
6203 average one day score is 0.21428571428571427
6204 average one day score is 0.18253968253968253
6208 average one day score is 0.17857142857142858
690 average one day score is 0.2727272727272727
692 average one day score is 0.34210526315789475
701 average one day score is 0.20833333333333334
713 average one day score is 0.2222222222222222
Total ETF one day rise_fall score is 4.15253094858358
Average ETF one day rise_fall score is 0.2306961638101989


In [101]:
def one_day_prediction_svm(ETF_list,window):
    ETF_score = []
    for ETF_code in ETF_list:
        ETF_target = extract_target_ETF(ETFtable,ETF_code)
        ETF_target_rate = get_change_rate(ETF_target)
        get_rise_fall(ETF_target_rate)
        X_train, y_train, X_test, y_test = load_data(ETF_target_rate, window)

        ### Model
#         clf = svm.SVC(kernel='linear')
#         clf.fit(X_train,y_train)
#         y_predict_test = clf.predict(X_test)
#         linear = sum(y_predict_test == y_test)*0.5/(y_test.shape[0]*1.0)
# #         y_predict_train = clf.predict(X_train)
# #         print('Average train one day score is ',sum(y_predict_train == y_train)*0.5/(y_train.shape[0]*1.0))
# #         print('Average test one day score is ',sum(y_predict_test == y_test)*0.5/(y_test.shape[0]*1.0))

#         clf = svm.SVC(kernel='poly',degree = 3)
#         clf.fit(X_train,y_train)
#         y_predict_test = clf.predict(X_test)
#         poly = sum(y_predict_test == y_test)*0.5/(y_test.shape[0]*1.0)
# #         y_predict_train = clf.predict(X_train)
# #         print('Average train one day score is ',sum(y_predict_train == y_train)*0.5/(y_train.shape[0]*1.0))
# #         print('Average test one day score is ',sum(y_predict_test == y_test)*0.5/(y_test.shape[0]*1.0))

        clf = svm.SVC(kernel='rbf',gamma = 1)
        clf.fit(X_train,y_train)
        y_predict_test = clf.predict(X_test)
        rbf = sum(y_predict_test == y_test)*0.5/(y_test.shape[0]*1.0)
#         y_predict_train = clf.predict(X_train)
#         print('Average train one day score is ',sum(y_predict_train == y_train)*0.5/(y_train.shape[0]*1.0))
#         print('Average test one day score is ',sum(y_predict_test == y_test)*0.5/(y_test.shape[0]*1.0))

        print(f'{ETF_code} average one day score is :',rbf)
        ETF_score.append(rbf)

    print('Total ETF one day rise_fall score is',sum(ETF_score))    
    print('Average ETF one day rise_fall score is',sum(ETF_score)/18.0)

In [102]:
# 0 <= score <= 0.5
ETF_list = [50,51,52,53,54,55,56,57,58,59,6201,6203,6204,6208,690,692,701,713]
window = 20
one_day_prediction_svm(ETF_list,window)

50 average one day score is : 0.2698412698412698
51 average one day score is : 0.25396825396825395
52 average one day score is : 0.25793650793650796
53 average one day score is : 0.25793650793650796
54 average one day score is : 0.2222222222222222
55 average one day score is : 0.2619047619047619
56 average one day score is : 0.2619047619047619
57 average one day score is : 0.2261904761904762
58 average one day score is : 0.19047619047619047
59 average one day score is : 0.23412698412698413
6201 average one day score is : 0.25396825396825395
6203 average one day score is : 0.20238095238095238
6204 average one day score is : 0.19047619047619047
6208 average one day score is : 0.1865079365079365
690 average one day score is : 0.21739130434782608
692 average one day score is : 0.25
701 average one day score is : 0.11538461538461539
713 average one day score is : 0.25
Total ETF one day rise_fall score is 4.102617189573712
Average ETF one day rise_fall score is 0.22792317719853955


# Five day prediction

In [94]:
# feature model
class LinearRegressionReg:

    def __init__(self):
        self._dimension = 0

    def fit(self, X, Y, lamb):  #calculate w
        self._dimension = X.shape[1]
        self._w = np.zeros((self._dimension,1))
        self._lamb = lamb
        self._w = np.linalg.inv(np.dot(X.T, X) + lamb*np.eye(self._dimension)).dot(X.T).dot(Y)

    def predict(self, X):
        result = np.dot(X, self._w)
        return result
    
    def fiveday_predict(self,X,Y):
        for i in range(5):
            Y_predict = self.predict(X)
    
    def error(self, X, Y):  #squared error
        Y_predict = self.predict(X)
        return sum((Y_predict-Y)**2)/(len(Y)*1.0)            

    def get_w(self):
        return self._w

    def print_val(self):
        print ("w: ", self._w)
    
    def score(self,X,Y,original_stock):
        Y_predict = self.predict(X)
        Y_predict_de = denormalize(original_stock,Y_predict,'close').reshape((Y.shape[0]))
        Y_test_de = denormalize(original_stock,Y,'close').reshape((Y.shape[0]))
        return sum(((Y_test_de-abs(Y_predict_de-Y_test_de))/Y_test_de)*0.5)/(len(Y_test_de)*1.0)
    
#load data
def load_data_feature(stock, seq_len,feature,day=0):
    amount_of_features = len(stock.columns) 
    data = stock.as_matrix() 
    sequence_length = seq_len + 1 # index starting from 0
    result = []
    
    for index in range(len(data) - sequence_length): # maxmimum date = lastest date - sequence length
        result.append(data[index: index + sequence_length]) # index : index + 22days
    
    result = np.array(result)
    row = round(0.9 * result.shape[0]) # 90% split
    train = result[:int(row), :] # 90% date, all features 
    
    x_train = train[:, :-1] 
    y_train = train[:,-1,feature] 

    x_test = result[int(row):, :-1] 
    y_test = result[int(row):, -1,feature]

    
    x_train = x_train.reshape((x_train.shape[0],x_train.shape[1]*x_train.shape[2]))
    x_test =  x_test.reshape((x_test.shape[0],x_test.shape[1]*x_test.shape[2])) #shape(127,20,6) --> (127,120)
     

    return [x_train, y_train, x_test, y_test]


#transform the shape of features array into the shape of X data
def shape_transfrom(features):
    features_=[]
    for i in range(features.shape[1]):
        temp = np.array([])
        for feature in features:
            temp = np.hstack((temp,feature[i]))
        features_ .append(temp)
    
    features_ = np.array(features_)
    return features_


def make_features_model_first(stock,seq_len,lamb):
    amount_of_features = len(stock.columns)
    models = []
    errors = []
    features = []
    columns = stock.columns
    
    for i in range(amount_of_features): #load data and train
        x_train ,y_train ,x_test,y_test= load_data_feature(stock,seq_len,i)
        lr = LinearRegressionReg()
        lr.fit(x_train,y_train,lamb)
        models.append(lr)
        errors.append(lr.error(x_test,y_test))
        features.append(lr.predict(x_test))
        
    features_ = shape_transfrom(np.array(features))
    
    return [features_,models,x_test,y_test]

def predict_features(x_test,model,amount_of_features):
    errors = []
    features = []
    
    for i in range(amount_of_features): #load data and train
        features.append(model[i].predict(x_test))
        
    features = np.array(features)
    features_ = shape_transfrom(features)
    
    return features_ 

def fiveday_predict(stock,seq_len,lamb):
    amount_of_features = len(stock.columns)
    features,model,x_test,y_test = make_features_model_first(stock,seq_len,lamb)
    y_predict = []
    y_predict.append(features[:,-1]) #shape = (127,1)
    length = x_test.shape[0]
    
    for day in range(4): 
        #add the prediction of features
        x_test_temp = []
        for i in range(length):
            x_test_temp.append(np.concatenate((x_test[i],features[i])))
        x_test = np.array(x_test_temp)[:,amount_of_features:] #still use the recent 20 days data

        #Predict features
        features_1 = predict_features(x_test,model,amount_of_features)
        y_predict.append(features_1[:,-1]) #take the close price
    return [np.array(y_predict),y_test]
    
def week_score(original_stock,stock,window,lamb):
    y_predict ,y_test= fiveday_predict(stock,window,lamb)
    length = y_test.shape[0]

    y_test = denormalize(original_stock,y_test,'close').reshape((length))
    y_test_r = []
    for day,price in enumerate(y_test):
        if (day != 0):
            y_test_r.append(sign(y_test[day]-y_test[day-1]))                
    y_test_r = np.array(y_test_r).astype(np.float64)
    
    week_score_ = []
    
    for day in range(5):
        y_test_temp = y_test_r[day:]
        y_predict_temp = denormalize(original_stock,y_predict[day],'close').reshape((length))[:length-day]
        
        y_predict_rise_fall = []
        for week,price in enumerate(y_predict_temp):
            if (week != 0):
                y_predict_rise_fall.append(sign(y_predict_temp[week]-y_predict_temp[week-1]))                
        y_predict_rise_fall = np.array(y_predict_rise_fall).astype(np.float64)
        
        temp = np.array(list(map(lambda x,y:0.5 if (x==y) else 0,y_predict_rise_fall,y_test_temp)))
        week_score_.append(temp[:length-4-1])
        
    week_score_ = np.array(week_score_)
    
    week_score = []
    for week in range(week_score_.shape[1]):
        score = 0
        for day in range(week_score_.shape[0]):
            if (day == 0):
                score += week_score_[day][week]*0.1
            elif (day == 1):
                score += week_score_[day][week]*0.15
            elif (day == 2):
                score += week_score_[day][week]*0.2
            elif (day == 3):
                score += week_score_[day][week]*0.25
            elif (day == 4):
                score += week_score_[day][week]*0.3
        week_score.append(score)
    
    return [week_score,sum(week_score)/(len(week_score)*1.0)]

In [105]:
def five_day_prediction(ETF_list,window,lamb):
    ETF_score = []
    for ETF_code in ETF_list:
        ETF_target = extract_target_ETF(ETFtable,ETF_code)
        ETF_target_N = normalize_data(ETF_target)
        
        fiveday_score_array,fiveday_score = week_score(ETF_target,ETF_target_N,window,lamb)
        print(f'{ETF_code} average Fiveday_score is ',fiveday_score)
        ETF_score.append(fiveday_score)
    print('18ETF total five day rise_fall score is',sum(ETF_score))
    print('18ETF average five day rise_fall score is',sum(ETF_score)/18.0)

In [106]:
#0 <= score <= 0.5 
ETF_list = [50,51,52,53,54,55,56,57,58,59,6201,6203,6204,6208,690,692,701,713]
window = 20
lamb = 1
five_day_prediction(ETF_list,window,lamb)

50 average Fiveday_score is  0.23381147540983602
51 average Fiveday_score is  0.2336065573770491
52 average Fiveday_score is  0.2331967213114754
53 average Fiveday_score is  0.27151639344262296
54 average Fiveday_score is  0.21536885245901635
55 average Fiveday_score is  0.2549180327868853
56 average Fiveday_score is  0.24364754098360644
57 average Fiveday_score is  0.21885245901639347
58 average Fiveday_score is  0.21844262295081968
59 average Fiveday_score is  0.22438524590163936
6201 average Fiveday_score is  0.24897540983606561
6203 average Fiveday_score is  0.17131147540983602
6204 average Fiveday_score is  0.2293032786885246
6208 average Fiveday_score is  0.2030737704918034
690 average Fiveday_score is  0.24444444444444446
692 average Fiveday_score is  0.3066666666666667
701 average Fiveday_score is  0.275
713 average Fiveday_score is  0.24499999999999997
18ETF total five day rise_fall score is 4.271520947176685
18ETF average five day rise_fall score is 0.23730671928759361
