In [2]:
import numpy as np
import pandas as pd
from scipy.io import loadmat

from sklearn.linear_model import LinearRegression
from sklearn.svm import SVC, SVR
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout

import warnings
warnings.simplefilter(action='ignore', category=pd.errors.PerformanceWarning)

# Data Preparation

In [3]:
def load_mat_data(data_path, data_name = 'X'):
    """Read one variable out of a MATLAB ``.mat`` file.

    Parameters
    ----------
    data_path : str
        Location of the ``.mat`` file, passed straight to ``scipy.io.loadmat``.
    data_name : str, default 'X'
        Name of the variable to pull out of the loaded file.

    Returns
    -------
    The object stored under ``data_name``. The path and the object's
    ``.shape`` are printed as a quick sanity check.
    """
    # According to the original author's note the array is laid out as
    # (epochs, stocks) -- not verified here.
    matrix = loadmat(data_path)[data_name]
    print(data_path, matrix.shape)
    return matrix


# 1. Linear Regression
def train_lr(X_train, y_train):
    """Fit a default ``LinearRegression`` on the given data.

    Parameters
    ----------
    X_train : array-like
        Training inputs handed to ``LinearRegression.fit``.
    y_train : array-like
        Training targets handed to ``LinearRegression.fit``.

    Returns
    -------
    The fitted estimator (``fit`` returns the estimator itself).
    """
    return LinearRegression().fit(X_train, y_train)


# 2. Support Vector Regressor
def train_svr(X_train, y_train):
    """Fit a standardise-then-SVR pipeline on the given data.

    The inputs are passed through a ``StandardScaler`` before reaching an
    RBF-kernel ``SVR`` configured with ``C=1.0`` and ``epsilon=0.2``.

    Parameters
    ----------
    X_train : array-like
        Training inputs handed to the pipeline's ``fit``.
    y_train : array-like
        Training targets handed to the pipeline's ``fit``.

    Returns
    -------
    The fitted pipeline (``fit`` returns the pipeline itself).
    """
    scaler = StandardScaler()
    svr = SVR(kernel = 'rbf', C=1.0, epsilon=0.2)
    return make_pipeline(scaler, svr).fit(X_train, y_train)

In [5]:
# File locations of the three inputs.
data_path = '../input/dowjones30/dj30.mat'
alt_data_path = '../input/dowjones30/dj30_transformed.mat'
target_path = '../input/dowjones30/dj30_target.mat'

# Original dataset (variable 'X', the loader's default).
data = load_mat_data(data_path)

# Transformed dataset, stored under 'C' (discrete cosine transform).
alt_data = load_mat_data(alt_data_path, 'C')

# Target: flatten to 1-D, then drop the first entry so that position t holds
# the value originally at t + 1 (i.e. the targets are shifted by one step).
target = load_mat_data(target_path, 'X').flatten()[1:]

In [6]:
# make a multistep dataset
def make_multistep(df, n_steps, target):
    """Build a lagged (multi-step) feature table together with its aligned target.

    For every column ``c`` of ``df`` and every lag ``i`` in ``1 .. n_steps - 1``
    a column named ``'<c>(t-<i>)'`` containing ``df[c].shift(i)`` is appended.
    Columns are ordered lag-major: the original columns first, then all columns
    at lag 1, then all columns at lag 2, and so on. Rows containing any NaN are
    dropped -- this always includes the first ``n_steps - 1`` rows, whose lags
    are undefined -- and the same rows are removed from the target.

    The caller's ``df`` is NOT modified: the lagged table is assembled in a new
    DataFrame with a single ``pd.concat``. This keeps the function idempotent
    when a notebook cell is re-run and avoids the frame fragmentation caused by
    inserting the lag columns one at a time.

    Parameters
    ----------
    df : pandas.DataFrame
        One row per time step.
    n_steps : int
        Number of time steps per example (the current step plus
        ``n_steps - 1`` lags). Values <= 1 add no lag columns.
    target : array-like
        One target value per row of ``df``; it is attached as a temporary
        ``'target'`` column, so the usual pandas length/alignment rules for
        column assignment apply.

    Returns
    -------
    (features, target) : tuple
        ``features`` is the lagged DataFrame without the ``'target'`` column;
        ``target`` is a NumPy array holding the target values of the kept rows.
    """
    base_cols = list(df.columns)

    # One shifted copy of the whole frame per lag, relabelled '<col>(t-<lag>)'.
    frames = [df]
    for lag in range(1, n_steps):
        shifted = df.shift(lag)
        shifted.columns = [str(col) + '(t-' + str(lag) + ')' for col in base_cols]
        frames.append(shifted)

    # Single concat -> new frame; the input is left untouched.
    features = pd.concat(frames, axis = 1)

    # Attach the target temporarily so dropna() removes the same rows from both.
    features['target'] = target
    features = features.dropna()

    aligned_target = features['target'].values
    features = features.drop('target', axis = 1)

    return (features, aligned_target)

n_steps = 5 # time steps per example: the current step plus n_steps - 1 lags

# Lagged tables for the original and the transformed data. Both calls use the
# same target, and ``y`` ends up holding the array returned by the second call.
df, y = make_multistep(pd.DataFrame(data = data), n_steps, target)
df_trans, y = make_multistep(pd.DataFrame(data = alt_data), n_steps, target)

# Columns are laid out lag-major, so splitting each row into n_steps equal
# chunks yields tensors of shape (examples, n_steps, columns per step).
X = df.values.reshape(df.values.shape[0], n_steps, -1)
X_trans = df_trans.values.reshape(df_trans.values.shape[0], n_steps, -1)

print(X.shape, X_trans.shape, y.shape)


# to check whether the data is aligned correctly
# print('data\n')
# print(data[:5, 0])
# print('X\n')
# print(X[0, :, 0][::-1])


# Training

In [11]:
from sklearn.metrics import mean_absolute_error, mean_squared_error

# Walk-forward evaluation of an inverse-loss weighted ensemble.
#
# For every test date a window of `train_window` rows is used for fitting and
# the following `val_window` rows for validation; the row right after the
# validation window is the test example. One linear model is fitted per index
# j of the last axis of X (and of X_trans); the m test predictions are then
# combined with weights proportional to 1 / validation MSE.

train_window = 500
val_window = 20
m = 30 # number of base learners (one per index of the last axis of X)

test_start = train_window + val_window
test_end = X.shape[0] - 1
# dtype=float keeps the result columns numeric; without it an empty DataFrame
# is allocated with object dtype.
results = pd.DataFrame(index = range(test_start, test_end + 1),
                       columns = ['original', 'transformed', 'target'],
                       dtype = float)


def _fit_score_predict(X_fit, y_fit, X_holdout, y_holdout, X_query):
    """Fit one linear base learner; return (validation MSE, scalar test prediction)."""
    model = train_lr(X_fit, y_fit)
    # mean_absolute_error is a drop-in alternative for the validation loss.
    holdout_loss = mean_squared_error(y_holdout, model.predict(X_holdout))
    # predict() returns a length-1 array for the single query row; take the
    # scalar explicitly instead of relying on implicit array -> scalar
    # conversion (deprecated since NumPy 1.25).
    return holdout_loss, model.predict(X_query)[0]


def _inverse_loss_weights(losses):
    """Turn per-learner losses into weights proportional to 1/loss that sum to 1."""
    # Clamp at machine epsilon so a perfect validation fit (loss == 0) yields a
    # very large finite weight instead of inf / NaN.
    inverse = np.reciprocal(np.maximum(losses, np.finfo(float).eps))
    return inverse / inverse.sum()


for test_date in range(test_start, test_end + 1):

    i = test_date - train_window - val_window # train start date
    train_rows = slice(i, i + train_window)
    val_rows = slice(i + train_window, i + train_window + val_window)

    # The targets are shared by all base learners, so slice them once per date.
    y_train = y[train_rows]
    y_val_truth = y[val_rows]

    # Buffers are sized by m so they always match the number of learners.
    loss = np.zeros(m)
    pred = np.zeros(m)

    loss_trans = np.zeros(m)
    pred_trans = np.zeros(m)

    for j in range(m):

        loss[j], pred[j] = _fit_score_predict(
            X[train_rows, :, j], y_train,
            X[val_rows, :, j], y_val_truth,
            X[test_date, :, j].reshape(1, -1))

        loss_trans[j], pred_trans[j] = _fit_score_predict(
            X_trans[train_rows, :, j], y_train,
            X_trans[val_rows, :, j], y_val_truth,
            X_trans[test_date, :, j].reshape(1, -1))

    # convert losses to weights
    weights = _inverse_loss_weights(loss)
    weights_trans = _inverse_loss_weights(loss_trans)

    # The target does not depend on j, so it is stored once per test date.
    results.at[test_date, 'target'] = y[test_date]
    results.at[test_date, 'original'] = np.dot(weights, pred)
    results.at[test_date, 'transformed'] = np.dot(weights_trans, pred_trans)

    if test_date % 20 == 0:
        print(test_date)
       
        

In [12]:
# Plot the two ensemble forecasts against the target over the test range.
# astype(float) makes sure the columns are numeric even if `results` was
# allocated with object dtype.
ax = results.astype(float).plot(title = 'Weighted ensemble forecast vs. target')
ax.set_xlabel('test_date (row index into X)')
ax.set_ylabel('value');