In [7]:
# Load in our libraries
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

import plotly.offline as py
py.init_notebook_mode(connected=True)
import plotly.graph_objs as go
import plotly.tools as tls

import warnings
warnings.filterwarnings('ignore')

import time
import datetime as dt

from sklearn import preprocessing

# Data

In [8]:
data = pd.read_csv("tsharep.csv",encoding=' big5-hkscs ').rename(columns={'代碼':'code','日期':'date','中文簡稱':'name','開盤價(元)':'open','最高價(元)':'high','最低價(元)':'low','收盤價(元)':'close','成交張數(張)':'volume'})
data.head()

Unnamed: 0,code,date,name,open,high,low,close,volume
0,1101,20130102,台泥,38.95,39.1,38.65,39.0,6374
1,1101,20130103,台泥,39.5,39.5,38.75,38.85,9710
2,1101,20130104,台泥,39.4,39.45,38.6,39.0,8682
3,1101,20130107,台泥,39.1,39.1,38.65,38.9,5067
4,1101,20130108,台泥,38.9,39.1,38.2,38.5,6454


In [9]:
def ETF_data_processing(filepath):
    ETFtable = pd.read_csv(filepath,encoding=' big5-hkscs ').rename(columns={'代碼':'code','日期':'date','中文簡稱':'name','開盤價(元)':'open','最高價(元)':'high','最低價(元)':'low','收盤價(元)':'close','成交張數(張)':'volume'})
   
    #processing data type 
    ETFtable['date'] = ETFtable['date'].map(lambda x:dt.datetime.strptime(str(x),'%Y%m%d'))
#     ETFtable['week'] = ETFtable['date'].map(lambda x:x.isoweekday())
    ETFtable['close'] = ETFtable['close'].map(lambda x:x if (type(x)==float) else float(x.replace(',','')) )
    ETFtable['open'] = ETFtable['open'].map(lambda x:x if (type(x)==float) else float(x.replace(',','')) )
    ETFtable['high'] = ETFtable['high'].map(lambda x:x if (type(x)==float) else float(x.replace(',','')) )
    ETFtable['low'] = ETFtable['low'].map(lambda x:x if (type(x)==float) else float(x.replace(',','')) )
    ETFtable['volume'] = ETFtable['volume'].map(lambda x:x if (type(x)==float) else float(x.replace(',','')) )
    
    return ETFtable

In [10]:
ETFtable = ETF_data_processing('tetfp.csv')
ETFtable.head()

Unnamed: 0,code,date,name,open,high,low,close,volume
0,50,2013-01-02,元大台灣50,54.0,54.65,53.9,54.4,16487.0
1,50,2013-01-03,元大台灣50,54.9,55.05,54.65,54.85,29020.0
2,50,2013-01-04,元大台灣50,54.85,54.85,54.4,54.5,9837.0
3,50,2013-01-07,元大台灣50,54.55,54.55,53.9,54.25,8910.0
4,50,2013-01-08,元大台灣50,54.0,54.2,53.65,53.9,12507.0


# One day prediction function

In [11]:
#target ETF
def extract_target_ETF(ETFtable,ETFcode):
    ETF_target = ETFtable[ETFtable['code']==ETFcode].drop(['code','name'],axis = 1).groupby(['date']).sum()
#     ETF_target = pd.concat([ETF_target,index_table],axis = 1)
    
    # move the target feature to the last column
    t = ETF_target['close'].values
    ETF_target.drop(['close'],axis=1,inplace = True)
    ETF_target['close'] = t
    
    return ETF_target

#data type
def normalize_data(df):
    min_max_scaler = preprocessing.MinMaxScaler()
    df_N = df.copy()
    
    for columnname in df_N.columns:
        df_N[columnname] = min_max_scaler.fit_transform(df_N[columnname].values.reshape(-1,1))
    
    return df_N 

def denormalize(stock, normalized_value,columnname): 
    stock_price = stock[columnname].values.reshape(-1,1)
    normalized_value = normalized_value.reshape(-1,1)

    #return df.shape, p.shape
    min_max_scaler = preprocessing.MinMaxScaler()
    stock_price_N = min_max_scaler.fit_transform(stock_price)
    new = min_max_scaler.inverse_transform(normalized_value)
    return new

#load data
def load_data(stock, seq_len):
    amount_of_features = len(stock.columns) 
    data = stock.as_matrix() 
    sequence_length = seq_len + 1 # index starting from 0
    result = []
    
    for index in range(len(data) - sequence_length): # maxmimum date = lastest date - sequence length
        result.append(data[index: index + sequence_length]) # index : index + 20days
    
    result = np.array(result)
    row = round(0.9 * result.shape[0]) # 90% split
    train = result[:int(row), :] # 90% date, all features 
    
    x_train = train[:, :-1] #make the last day of train data as y_train
    y_train = train[:, -1][:,-1] #the close price of the last day of every data 
    
    x_test = result[int(row):, :-1] 
    y_test = result[int(row):, -1][:,-1]

    x_train = x_train.reshape((x_train.shape[0],x_train.shape[1]*x_train.shape[2]))
    x_test =  x_test.reshape((x_test.shape[0],x_test.shape[1]*x_test.shape[2]))
     

    return [x_train, y_train, x_test, y_test]

# Model
class LinearRegressionReg:

    def __init__(self):
        self._dimension = 0

    def fit(self, X, Y, lamb):  #calculate w
        self._dimension = X.shape[1]
        self._w = np.zeros((self._dimension,1))
        self._lamb = lamb
        self._w = np.linalg.inv(np.dot(X.T, X) + lamb*np.eye(self._dimension)).dot(X.T).dot(Y)

    def predict(self, X):
        result = np.dot(X, self._w)
        return result
    
    def fiveday_predict(self,X,Y):
        for i in range(5):
            Y_predict = self.predict(X)
    
    def error(self, X, Y):  #squared error
        Y_predict = self.predict(X)
        return sum((Y_predict-Y)**2)/(len(Y)*1.0)            

    def get_w(self):
        return self._w

    def print_val(self):
        print ("w: ", self._w)
    
    def score(self,X,Y,original_stock):
        Y_predict = self.predict(X)
        Y_predict_de = denormalize(original_stock,Y_predict,'close').reshape((Y.shape[0]))
        Y_test_de = denormalize(original_stock,Y,'close').reshape((Y.shape[0]))
        return sum(((Y_test_de-abs(Y_predict_de-Y_test_de))/Y_test_de)*0.5)/(len(Y_test_de)*1.0)

# Picture
def feature_importance(lr):
    plt.figure(figsize=(10,8))
    plt.xlabel('Feature Importance')
    plt.ylabel('Weight')

    plt.bar([_ for _ in range(len(lr.get_w()))],lr.get_w())
    
def feature_importance_scatter(lr,window,stock):
    picture = [[_] for _ in range(window)]
    picture_label = []
    for i in picture:
        picture_label += i*10
    
    xlabel = list(stock.columns)

    # Scatter plot 
    trace = go.Scatter(
        y = lr.get_w(),
        x = xlabel*20,
        mode='markers',
        name = 'Importance',
        marker=dict(
            sizemode = 'diameter',
            sizeref = 1,
            size = 25,
    #       size= feature_dataframe['AdaBoost feature importances'].values,
            #color = np.random.randn(500), #set color equal to a variable
            color = lr.get_w(),
            colorscale='Portland',
            showscale=True
        ),
        text = xlabel*20
    )
    data = [trace]

    layout= go.Layout(
        autosize= True,
        title= 'Linear Regression Feature Importance on weight',
        hovermode= 'closest',
        xaxis= dict(
            title= 'Feature name',
    #         ticklen= 5,
    #         zeroline= False,
    #         gridwidth= 2,
        ),
        yaxis=dict(
            title= 'Feature Importance',
            ticklen= 5,
            gridwidth= 2
        ),
        showlegend= False
    )
    fig = go.Figure(data=data, layout=layout)
    py.iplot(fig,filename='scatter1')

    # Scatter plot 
    trace = go.Scatter(
        y = lr.get_w(),
        x = xlabel*20,
        mode='markers',
        name = 'Day',
        marker=dict(
            sizemode = 'diameter',
            sizeref = 1,
            size = 25,
    #       size= feature_dataframe['AdaBoost feature importances'].values,
            #color = np.random.randn(500), #set color equal to a variable
            color = picture_label,
            colorscale='Portland',
            showscale=True
        ),
        text = xlabel*20
    )
    data = [trace]

    layout= go.Layout(
        autosize= True,
        title= 'Linear Regression Feature Importance on day',
        hovermode= 'closest',
        xaxis= dict(
            title= 'Feature name',
    #         ticklen= 5,
    #         zeroline= False,
    #         gridwidth= 2,
        ),
        yaxis=dict(
            title= 'Feature Importance',
            ticklen= 5,
            gridwidth= 2
        ),
        showlegend= False
    )
    fig = go.Figure(data=data, layout=layout)
    py.iplot(fig,filename='scatter2')

# One day prediction for 18 ETF

In [12]:
def one_day_prediction(ETF_list,window,lamb):
    ETF_score = []
    for ETF_code in ETF_list:
        ETF_target = extract_target_ETF(ETFtable,ETF_code)
        ETF_target_N = normalize_data(ETF_target)
        X_train, y_train, X_test, y_test = load_data(ETF_target_N, window)

        ### Model
        lr = LinearRegressionReg()
        lr.fit(X_train, y_train, lamb)

        ### score
        score = lr.score(X_test,y_test,ETF_target)
        print(f'{ETF_code} average one day score is',score)
        ETF_score.append(score)
        
    print('Total ETF one day price score is',sum(ETF_score))    
    print('Average ETF one day price score is',sum(ETF_score)/18.0)

In [13]:
# 0 <= score <= 0.5
ETF_list = [50,51,52,53,54,55,56,57,58,59,6201,6203,6204,6208,690,692,701,713]
window = 20
lamb = 1
one_day_prediction(ETF_list,window,lamb)

50 average one day score is 0.4962552440648926
51 average one day score is 0.495951881282426
52 average one day score is 0.49376978013907724
53 average one day score is 0.49543359935376463
54 average one day score is 0.49673861951048187
55 average one day score is 0.4971278209973724
56 average one day score is 0.49722225889527605
57 average one day score is 0.49628300750443083
58 average one day score is 0.496891681977097
59 average one day score is 0.4960521928040788
6201 average one day score is 0.49439857990179104
6203 average one day score is 0.496125098926445
6204 average one day score is 0.4970685035802757
6208 average one day score is 0.4960916651966915
690 average one day score is 0.4970759507184719
692 average one day score is 0.4968240960515666
701 average one day score is 0.4968848123659525
713 average one day score is 0.49664836998113193
Total ETF one day price score is 8.932843163251224
Average ETF one day price score is 0.49626906462506803


# Five day prediction

In [14]:
#load data
def load_data_feature(stock, seq_len,feature,day=0):
    amount_of_features = len(stock.columns) 
    data = stock.as_matrix() 
    sequence_length = seq_len + 1 # index starting from 0
    result = []
    
    for index in range(len(data) - sequence_length): # maxmimum date = lastest date - sequence length
        result.append(data[index: index + sequence_length]) # index : index + 22days
    
    result = np.array(result)
    row = round(0.9 * result.shape[0]) # 90% split
    train = result[:int(row), :] # 90% date, all features 
    
    x_train = train[:, :-1] 
    y_train = train[:,-1,feature] 

    x_test = result[int(row):, :-1] 
    y_test = result[int(row):, -1,feature]

    
    x_train = x_train.reshape((x_train.shape[0],x_train.shape[1]*x_train.shape[2]))
    x_test =  x_test.reshape((x_test.shape[0],x_test.shape[1]*x_test.shape[2])) #shape(127,20,6) --> (127,120)
     

    return [x_train, y_train, x_test, y_test]


#transform the shape of features array into the shape of X data
def shape_transfrom(features):
    features_=[]
    for i in range(features.shape[1]):
        temp = np.array([])
        for feature in features:
            temp = np.hstack((temp,feature[i]))
        features_ .append(temp)
    
    features_ = np.array(features_)
    return features_


def make_features_model_first(stock,seq_len,lamb):
    amount_of_features = len(stock.columns)
    models = []
    errors = []
    features = []
    columns = stock.columns
    
    for i in range(amount_of_features): #load data and train
        x_train ,y_train ,x_test,y_test= load_data_feature(stock,seq_len,i)
        lr = LinearRegressionReg()
        lr.fit(x_train,y_train,lamb)
        models.append(lr)
        errors.append(lr.error(x_test,y_test))
        features.append(lr.predict(x_test))
#         print("Eout of %s is %f"%(columns[i],errors[i]))
        
    features_ = shape_transfrom(np.array(features))
    
    return [features_,models,x_test,y_test]

def predict_features(x_test,model,amount_of_features):
    errors = []
    features = []
    
    for i in range(amount_of_features): #load data and train
#         errors.append(lr.score(x_test,y_test))
        features.append(model[i].predict(x_test))
    features = np.array(features)
    features_ = shape_transfrom(features)
    
    return features_ 

def fiveday_predict(stock,seq_len,lamb):
    amount_of_features = len(stock.columns)
    features,model,x_test,y_test = make_features_model_first(stock,seq_len,lamb)
    y_predict = []
    y_predict.append(features[:,-1]) #shape = (127,1)
    length = x_test.shape[0]
    
    for day in range(4): 
        #add the prediction of features
        x_test_temp = []
        for i in range(length):
            x_test_temp.append(np.concatenate((x_test[i],features[i])))
        x_test = np.array(x_test_temp)[:,amount_of_features:] #still use the recent 20 days data

        #Predict features
        features_1 = predict_features(x_test,model,amount_of_features)
        y_predict.append(features_1[:,-1]) #take the close price
    return [np.array(y_predict),y_test]
    
def week_score(original_stock,stock,window,lamb):
    y_predict ,y_test= fiveday_predict(stock,window,lamb)
    
    length = y_test.shape[0]
    y_test = denormalize(original_stock,y_test,'close').reshape((length))
    week_score_ = []
    
    for day in range(5):
        y_test_temp = y_test[day:]
        y_predict_temp = denormalize(original_stock,y_predict[day],'close').reshape((length))[:length-day]
        temp = np.array(list(map(lambda x,y:((y-abs(x-y))/y)*0.5,y_predict_temp,y_test_temp)))
        week_score_.append(temp[:length-4])
    week_score_ = np.array(week_score_)
    
    week_score = []
    for week in range(week_score_.shape[1]):
        score = 0
        for day in range(week_score_.shape[0]):
            if (day == 0):
                score += week_score_[day][week]*0.1
            elif (day == 1):
                score += week_score_[day][week]*0.15
            elif (day == 2):
                score += week_score_[day][week]*0.2
            elif (day == 3):
                score += week_score_[day][week]*0.25
            elif (day == 4):
                score += week_score_[day][week]*0.3
        week_score.append(score)
    
    return [week_score,sum(week_score)/(len(week_score)*1.0)]

In [15]:
def five_day_prediction(ETF_list,window,lamb):
    ETF_score = []
    for ETF_code in ETF_list:
        ETF_target = extract_target_ETF(ETFtable,ETF_code)
        ETF_target_N = normalize_data(ETF_target)
        
        fiveday_score_array,fiveday_score = week_score(ETF_target,ETF_target_N,window,lamb)
        print(f'{ETF_code} average Fiveday_score is ',fiveday_score)
        ETF_score.append(fiveday_score)
        
    print('Total ETF five day price score is',sum(ETF_score))    
    print('Average ETF five day price score is',sum(ETF_score)/18.0)

In [16]:
#0 <= score <= 0.5 
ETF_list = [50,51,52,53,54,55,56,57,58,59,6201,6203,6204,6208,690,692,701,713]
window = 20
lamb = 1
five_day_prediction(ETF_list,window,lamb)

50 average Fiveday_score is  0.4930098536508626
51 average Fiveday_score is  0.49309392074687086
52 average Fiveday_score is  0.4892714483033573
53 average Fiveday_score is  0.49182808540422324
54 average Fiveday_score is  0.4939981504727719
55 average Fiveday_score is  0.4951226829745353
56 average Fiveday_score is  0.494544467130304
57 average Fiveday_score is  0.4932807138845685
58 average Fiveday_score is  0.4949038788712295
59 average Fiveday_score is  0.49405025882091713
6201 average Fiveday_score is  0.49009156636214957
6203 average Fiveday_score is  0.4925345057025705
6204 average Fiveday_score is  0.49403797993481746
6208 average Fiveday_score is  0.492577556526498
690 average Fiveday_score is  0.495462016230729
692 average Fiveday_score is  0.49451744681047505
701 average Fiveday_score is  0.4957275157845913
713 average Fiveday_score is  0.4942928949193936
Total ETF five day price score is 8.882344942530864
Average ETF five day price score is 0.49346360791838134
