# Run 1 : Data Split and Model Training

## Stock Data

In [None]:
# import packages

import os
os.environ["MKL_NUM_THREADS"] = '4'
os.environ["NUMEXPR_NUM_THREADS"] = '4'
os.environ["OMP_NUM_THREADS"] = '4'

## for data process
import numpy as np
import pandas as pd
from sktime.forecasting.model_selection import temporal_train_test_split
import math

## for plotting
from sktime.utils.plotting import plot_series
import seaborn as sns
import matplotlib.pyplot as plt
from IPython.display import Image

## for machine learning
from sklearn import model_selection
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import BaggingClassifier

## for deep learning
from tensorflow.keras.utils import to_categorical
from sklearn.preprocessing import StandardScaler
from keras.models import Sequential
from keras.layers import Dense, Dropout, Activation, Bidirectional
from keras.layers import LSTM
from tensorflow.keras.optimizers import SGD
from keras.layers import Embedding
from keras.layers import Conv1D, MaxPooling1D,Flatten

## for performance measurement
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix

## for warnings
from warnings import simplefilter
from sklearn.exceptions import ConvergenceWarning
simplefilter("ignore", category=ConvergenceWarning)
simplefilter("ignore", category=RuntimeWarning)
simplefilter("ignore", category=FutureWarning)

## 1. Load data

In [None]:
# Read the full dataset. Later cells read the 'ticker', 'PRC' and
# 'Return_group' columns plus every column named in exogenous_list.
df = pd.read_csv('data.csv')

In [None]:
# Preview the first rows of the loaded frame.
df.head()

## 2. Common functions

### (1) Define X and y

**Parameters**

* y (pd.Series) – Target series

* X (pd.DataFrame, optional (default=None)) – Exogenous data

In [None]:
def define_X_y(df, ticker, exogenous_list):
    """Select the feature matrix and the target series for one ticker.

    Parameters
    ----------
    df : pd.DataFrame
        Frame containing a 'ticker' column, a 'Return_group' column and
        every column named in ``exogenous_list``.
    ticker
        Value matched against the 'ticker' column.
    exogenous_list : list of str (or a single column name)
        Names of the exogenous (predictor) columns.

    Returns
    -------
    X : pd.DataFrame
        The selected predictor columns for the ticker's rows.
    y : pd.Series
        The 'Return_group' column for the same rows.
    """
    data = df[df['ticker'] == ticker]

    # A bare column name is wrapped in a list so that X is always 2-D.
    if isinstance(exogenous_list, str):
        columns = [exogenous_list]
    else:
        columns = list(exogenous_list)

    # Selecting with a list of labels already returns a DataFrame, even for
    # a single label; DataFrame has no .to_frame(), so no conversion is done.
    X = data[columns]
    y = data['Return_group']

    return X, y

### (2) For Machine Learning

In [None]:
def split_n(n, X, y):
    """Split X and y so that the n-th observation from the end is the test row.

    Parameters
    ----------
    n : int
        Position of the held-out observation counted from the end
        (1 = the very last observation).
    X : exogenous data (predicting variables)
    y : target series

    Returns
    -------
    y_train, y_test, X_train, X_test
        As unpacked from ``temporal_train_test_split`` with ``test_size=1``.
    """
    if n != 1:
        # Drop the last n-1 observations first; n == 1 is excluded because
        # a slice ending at 0 would be empty.
        trim = -n + 1
        y, X = y[:trim], X[:trim]

    y_train, y_test, X_train, X_test = temporal_train_test_split(y=y, X=X, test_size=1)
    return y_train, y_test, X_train, X_test

In [None]:
def model(y_train, y_test, X_train, X_test, model_name_and_param):
    """Fit an estimator on the training split and score it on the test split.

    Parameters
    ----------
    y_train, y_test : training / test targets
    X_train, X_test : training / test predictors
    model_name_and_param : estimator instance exposing ``fit`` and ``predict``
        (already configured with its parameters). It is fitted in place.

    Returns
    -------
    acc : accuracy_score of the predictions against ``y_test``
    y_pred : the predictions for ``X_test``
    """
    estimator = model_name_and_param
    estimator.fit(X_train, y_train)

    predictions = estimator.predict(X_test)
    score = accuracy_score(y_test, predictions)

    return score, predictions

In [None]:
def pred_n(n, X, y, model_name_and_param):
    """Produce n one-step-ahead forecasts, refitting the model before each.

    For each of the last n observations the model is trained on the data
    preceding it and asked to predict that single observation.

    Parameters
    ----------
    n : int
        Number of predictions.
    X : exogenous data (predicting variables)
    y : target series
    model_name_and_param : estimator instance with its parameters

    Returns
    -------
    y_test_all : the last n entries of ``y``
    y_pred_all : list
        One entry per step, oldest first; each entry is whatever the
        estimator's ``predict`` returned for the single held-out row.
    """
    y_test_all = y[-n:]
    y_pred_all = []

    # Counts down n, n-1, ..., 1: the held-out row moves from the oldest of
    # the last n observations to the most recent one.
    for remaining in range(n, 0, -1):
        # Split once and unpack; the split is deterministic, so calling it
        # again for each of the four pieces only repeats the same work.
        y_train, y_test, X_train, X_test = split_n(remaining, X, y)

        _, y_pred = model(y_train, y_test, X_train, X_test, model_name_and_param)
        y_pred_all.append(y_pred)

    return y_test_all, y_pred_all

In [None]:
def result(n, X, y, model_name_and_param):
    """Run the n-step walk-forward prediction and tabulate the outcome.

    Parameters
    ----------
    n : int
        Number of predictions.
    X : exogenous data (predicting variables)
    y : target series
    model_name_and_param : estimator instance with its parameters

    Returns
    -------
    acc : accuracy_score over the n predictions
    table : pd.DataFrame
        Columns 'y_pred' (predictions) and 'y_test' (actual values).
    """
    # Run the prediction loop once and keep both outputs. Calling pred_n
    # separately for each output would refit every model a second time.
    y_test_all, y_pred_all = pred_n(n, X, y, model_name_and_param)

    acc = accuracy_score(y_test_all, y_pred_all)

    table = pd.DataFrame(y_pred_all, columns=['y_pred'])
    table['y_test'] = y_test_all.tolist()

    return acc, table

### (3) For Deep Learning

In [None]:
def dl_split_n(n, X, y):
    """Split, one-hot encode and standardise the data for the deep-learning models.

    Parameters
    ----------
    n : int
        Position of the held-out observation counted from the end
        (1 = the very last observation).
    X : exogenous data (predicting variables); its ``.values`` are used
    y : target series; its ``.values`` are used

    Returns
    -------
    y_train, y_test : one-hot encoded targets with 3 classes
    X_train, X_test : predictors standardised with statistics from X_train
    """
    features, target = X.values, y.values

    if n != 1:
        # Drop the last n-1 observations; n == 1 keeps everything because a
        # slice ending at 0 would be empty.
        trim = -n + 1
        target, features = target[:trim], features[:trim]

    y_train, y_test, X_train, X_test = temporal_train_test_split(
        y=target, X=features, test_size=1)

    # NOTE(review): dl_result maps predicted class 2 back to -1, so labels
    # are presumably -1/0/1 with -1 landing in the last one-hot column --
    # confirm against the to_categorical version in use.
    y_train, y_test = to_categorical(y_train, 3), to_categorical(y_test, 3)

    # The scaler is fitted on the training rows only and then applied to both.
    scaler = StandardScaler().fit(X_train)
    X_train = scaler.transform(X_train)
    X_test = scaler.transform(X_test)

    return y_train, y_test, X_train, X_test

In [None]:
# CNN classification:
# Input --> Conv1D --> Dense --> MaxPooling1D --> Flatten --> Dense(softmax)

# RNN / LSTM classification:
# Input --> Bidirectional(LSTM) --> Dropout --> Dense --> Dense(softmax)

def dl_model(model_name, y_train, y_test, X_train, X_test):
    """Build, train and apply one of the two deep-learning classifiers.

    Parameters
    ----------
    model_name : str
        Either 'CNN' or 'RNN_LSTM'.
    y_train : one-hot encoded training targets
    y_test : test targets (accepted for a uniform signature; not used here)
    X_train, X_test : 2-D arrays of shape (samples, features)

    Returns
    -------
    y_pred : output of the fitted model called on ``X_test``
    pred_class : index of the largest output along the last axis

    Raises
    ------
    ValueError
        If ``model_name`` is not one of the supported names. Without this
        check an empty network would be compiled and fitted.
    """
    if model_name not in ('CNN', 'RNN_LSTM'):
        raise ValueError(
            "Unknown model_name {!r}; expected 'CNN' or 'RNN_LSTM'".format(model_name))

    # Add a trailing axis of length 1 so each feature becomes one step of a
    # single-channel sequence, matching input_shape=(features, 1) below.
    X_train = X_train.reshape(X_train.shape[0], X_train.shape[1], 1)
    X_test = X_test.reshape(X_test.shape[0], X_test.shape[1], 1)

    # initialize the constructor
    model = Sequential()

    # design model
    if model_name == 'CNN':
        model.add(Conv1D(filters=64, kernel_size=2, activation='relu', input_shape = (X_train.shape[1],1)))
        model.add(Dense(16, activation="relu"))
        model.add(MaxPooling1D(pool_size=1))
        model.add(Flatten())
        model.add(Dense(3, activation = 'softmax'))
    else:
        model.add(Bidirectional(LSTM(units=128,input_shape=(X_train.shape[1],1))))
        model.add(Dropout(rate=0.5))
        model.add(Dense(units=128, activation='relu'))
        model.add(Dense(y_train.shape[1], activation='softmax'))

    # compile model
    model.compile(loss='categorical_crossentropy',
                  optimizer='adam',
                  metrics=['accuracy'])

    # fit model
    model.fit(X_train, y_train, epochs=20, batch_size=1, verbose=0)

    # predict y
    y_pred = model(X_test)
    pred_class = np.argmax(y_pred, axis = -1)

    return y_pred, pred_class

In [None]:
def dl_result(n, X, y, model_name):
    """Run the n-step walk-forward prediction with a deep-learning model.

    Parameters
    ----------
    n : int
        Number of predictions.
    X : exogenous data (predicting variables)
    y : target series
    model_name : str
        Name passed on to ``dl_model``.

    Returns
    -------
    acc : accuracy_score over the n predictions
    table : pd.DataFrame
        Columns 'y_pred' (predicted labels) and 'y_test' (actual values).
    """
    y_pred_all = []

    # Counts down n, n-1, ..., 1 so predictions are collected oldest first.
    for remaining in range(n, 0, -1):
        # Split once and unpack; repeating the call for each of the four
        # pieces would redo the split and the scaling every time.
        y_train, y_test, X_train, X_test = dl_split_n(remaining, X, y)

        _, pred_class = dl_model(model_name, y_train, y_test, X_train, X_test)
        y_pred_all.append(pred_class[0])

    # Class index 2 is reported as label -1; other indices are kept as-is.
    y_pred_all = [-1 if label == 2 else label for label in y_pred_all]

    table = pd.DataFrame(y_pred_all, columns=['y_pred'])
    y_test_all = y[-n:].tolist()
    table['y_test'] = y_test_all

    # get accuracy_score
    acc = accuracy_score(y_test_all, y_pred_all)

    return acc, table

### (4) For portfolio simulation

In [None]:
def mock_dataset(n, pred, ticker, df):
    """Build the per-day table that drives the portfolio simulation.

    Parameters
    ----------
    n : int
        Number of prediction days.
    pred : sequence or pd.Series
        The n predicted labels, oldest first.
    ticker
        Value matched against the 'ticker' column of ``df``.
    df : pd.DataFrame
        Frame with 'ticker' and 'PRC' columns.

    Returns
    -------
    pd.DataFrame
        Columns 'price', 'y_pred' and 'action' ('sell' / 'hold' / 'buy').

    Raises
    ------
    ValueError
        If the ticker has fewer than n + 1 rows; the slice start would go
        negative and select the wrong window.
    """
    simu = df[df['ticker'] == ticker]
    total = len(simu)

    if total < n + 1:
        raise ValueError(
            "ticker {!r} has {} rows but {} are required".format(ticker, total, n + 1))

    # The n rows that end one row before the last: each price is the one
    # recorded on the row preceding the corresponding predicted observation.
    price = simu['PRC'].iloc[total - n - 1:total - 1].tolist()

    mock = pd.DataFrame(price, columns=['price'])
    mock['y_pred'] = pred

    # create a list of our conditions
    conditions = [
        (mock['y_pred'] == -1),
        (mock['y_pred'] == 0),
        (mock['y_pred'] == 1),
    ]

    # create a list of the values we want to assign for each condition
    values = ['sell', 'hold', 'buy']

    # A string default is required: recent NumPy cannot combine string
    # choices with the implicit integer default 0. 'hold' mirrors how
    # simulation() treats any label other than 1 and -1.
    mock['action'] = np.select(conditions, values, default='hold')

    return mock

In [None]:
def buy(price, balance, share):
    """Buy as many whole shares as the cash balance allows.

    Parameters
    ----------
    price : stock price at execution
    balance : account cash balance
    share : number of shares currently in the account

    Returns
    -------
    (balance, share) after the purchase.
    """
    affordable = math.floor(balance / price)
    cost = price * affordable

    return balance - cost, share + affordable

In [None]:
def sell(price, balance, share):
    """Sell every share held in the account.

    Parameters
    ----------
    price : stock price at execution
    balance : account cash balance
    share : number of shares currently in the account

    Returns
    -------
    (balance, share) after the sale; unchanged when no shares are held.
    """
    if not share > 0:
        return balance, share

    proceeds = share * price
    return balance + proceeds, 0

In [None]:
def hold(price, balance, share):
    """Take no action; the account is returned as it was.

    Parameters
    ----------
    price : stock price at execution (not used)
    balance : account cash balance
    share : number of shares currently in the account

    Returns
    -------
    (balance, share), both unchanged.
    """
    unchanged = (balance, share)
    return unchanged

In [None]:
def simulation(mock):
    """Replay the predicted actions day by day on a simulated account.

    The account starts with 1,000,000 in cash and no shares. A prediction
    of 1 buys, -1 sells, and anything else holds.

    Parameters
    ----------
    mock : pd.DataFrame
        The mock dataset, with 'price' and 'y_pred' columns.

    Returns
    -------
    balance : final cash balance
    share : final number of shares
    profit_or_loss : final account value minus the starting balance, with
        any remaining shares valued at the last price
    stock_return : profit_or_loss divided by the starting balance
    """
    beginning_balance = 1000000
    balance = beginning_balance
    share = 0
    price = 0

    # Walk the rows in order by position, so a frame whose index is not
    # 0..n-1 is handled the same way as one with the default index.
    for price, signal in zip(mock['price'], mock['y_pred']):
        if signal == 1:
            balance, share = buy(price, balance, share)
        elif signal == -1:
            balance, share = sell(price, balance, share)
        else:
            balance, share = hold(price, balance, share)

    # Shares still held are valued at the last price seen; with no shares
    # the account value is the cash balance alone.
    holdings = share * price if share > 0 else 0
    total_value = balance + holdings
    profit_or_loss = total_value - beginning_balance
    stock_return = profit_or_loss / beginning_balance

    return balance, share, profit_or_loss, stock_return

### (5) Model application

In [None]:
def func(x):
    """Format a number as a percentage with two decimals.

    Values that cannot take the percent format (for example strings or
    None) are returned unchanged.
    """
    try:
        return '{0:.2%}'.format(x)
    except (ValueError, TypeError):
        # Only formatting failures are expected here; anything else should
        # surface instead of being swallowed.
        return x

In [None]:
def measure(df, type):
    """Append summary rows to a per-ticker table and format it as percentages.

    Parameters
    ----------
    df : pd.DataFrame
        One row per ticker. Non-numeric columns (such as 'ticker') are left
        out of the statistics.
    type : str
        'portfolio' additionally appends a 'sharpe_ratio' row; any other
        value appends only 'mean' and 'std'.

    Returns
    -------
    pd.DataFrame
        A new frame with 'mean' and 'std' rows (and 'sharpe_ratio' for
        portfolios); numeric cells are rendered as percent strings.
    """
    # Work on a copy so the caller's frame does not gain the summary rows.
    stats = df.copy()

    # numeric_only=True keeps text columns out of the reductions; without it
    # pandas >= 2.0 raises on them instead of skipping them.
    stats.loc['mean'] = stats.mean(numeric_only=True)
    # The std is taken over the ticker rows only, excluding the 'mean' row.
    stats.loc['std'] = stats.iloc[:-1].std(numeric_only=True)

    mean = stats.iloc[-2].tolist()
    std = stats.iloc[-1].tolist()

    # DataFrame.map is the element-wise replacement for the deprecated
    # applymap; fall back to applymap on pandas versions without it.
    if hasattr(pd.DataFrame, 'map'):
        formatted = stats.map(func)
    else:
        formatted = stats.applymap(func)

    if type == 'portfolio':
        # NOTE(review): the period this 0.0093 rate refers to is not stated
        # here -- confirm it matches the horizon of the returns.
        risk_free_rate = 0.0093
        sharpe_ratio = [
            # A zero std leaves the ratio undefined; report NaN, not a crash.
            float('nan') if s == 0
            else float("{:.4f}".format((m - risk_free_rate) / s))
            for m, s in zip(mean, std)
        ]
        formatted.loc["sharpe_ratio"] = sharpe_ratio

    formatted = formatted.replace(['nan%'], 'NaN')

    return formatted

In [None]:
def apply(df, n, ticker, exogenous_list, classifier_list):
    """Evaluate every classifier on one ticker.

    Parameters
    ----------
    df : pd.DataFrame
        The full dataset.
    n : int
        Number of predictions.
    ticker
        Ticker to evaluate.
    exogenous_list : list of str
        Names of the predictor columns.
    classifier_list : list
        Estimator instances and/or deep-learning model names (strings),
        evaluated in the order given.

    Returns
    -------
    acc_list : list
        ``[ticker, acc_1, acc_2, ...]`` -- one accuracy per classifier.
    balance_list : list
        ``[ticker, return_1, return_2, ...]`` -- one simulated portfolio
        return per classifier.
    """
    # store accuracy score
    acc_list = [ticker]

    # store portfolio profit or loss
    balance_list = [ticker]

    X, y = define_X_y(df, ticker, exogenous_list)

    for classifier in classifier_list:
        # Deep-learning models are identified by name (a string) and go
        # through dl_result; everything else is an estimator for result().
        if isinstance(classifier, str):
            acc, table = dl_result(n, X, y, classifier)
        else:
            acc, table = result(n, X, y, classifier)
        acc_list.append(acc)

        mock = mock_dataset(n, table['y_pred'], ticker, df)
        stock_return = simulation(mock)[3]
        balance_list.append(stock_return)

    return acc_list, balance_list

In [None]:
def acc_combine(df, n, ticker_list, exogenous_list, classifier_list, model_list):
    """Evaluate all classifiers on all tickers and summarise the outcome.

    Parameters
    ----------
    df : pd.DataFrame
        The full dataset.
    n : int
        Number of predictions.
    ticker_list : list
        Tickers to evaluate.
    exogenous_list : list of str
        Names of the predictor columns.
    classifier_list : list
        Classifiers passed on to ``apply``.
    model_list : list of str
        Column labels for the two output tables.

    Returns
    -------
    (accuracy table, portfolio-return table), each processed by ``measure``.
    """
    per_ticker = [
        apply(df, n, ticker, exogenous_list, classifier_list)
        for ticker in ticker_list
    ]

    acc_all = [scores for scores, _ in per_ticker]
    balance_all = [returns for _, returns in per_ticker]

    accuracy_table = measure(pd.DataFrame(acc_all, columns=model_list), 'acc')
    portfolio_table = measure(pd.DataFrame(balance_all, columns=model_list), 'portfolio')

    return accuracy_table, portfolio_table
    

## 3. Classification Models

### `Part 1: Machine Learning Models`

(1) **Logistic Regression**

(2) **KNN**

(3) **SVM**

(4) **Naive Bayes (Bayes' Theorem)**

(5) **Decision Tree**

(6) **Ensemble Methods**

* 6.1 Bagging: 
    * Random Forest
    * Bagging Classifier

* 6.2 Boosting: 
    * Gradient Boosting
    * AdaBoost
    

### `Part 2: Deep Learning Models`

(7) **CNN (Convolutional Neural Network)**

(8) **RNN (Recurrent Neural Network) and LSTM (Long Short-Term Memory)**



## 4. Prediction Parameters

### (1) Days to predict

In [None]:
# Number of predictions: the last n observations of each ticker are
# forecast one at a time, with the model refitted before each one.
n = 21

### (2) Stock tickers

In [None]:
# Tickers to evaluate, read from the 'ticker' column of ticker.csv.
ticker_list = pd.read_csv('ticker.csv')['ticker'].tolist()
ticker_list

### (3) Classification Models

In [None]:
# Define X: names of the exogenous (predictor) columns selected from df.
# Each name must be a column of data.csv.
exogenous_list = ['Daily_return','5day_return','log_volume','ma_5','BMQ_rto', 'PEQ_rto', 'BEE_rto',
       'BAM_rto', 'BCA_rto', 'BER_rto', 'ANL_CHG_rto', 'NIP_rto', 'CSS_rto','MCQ_rto']

In [None]:
# Classifiers to evaluate. The order must match model_list (after its
# leading 'ticker' entry): scikit-learn estimators first, then the
# deep-learning model names understood by dl_model.
# NOTE(review): `multi_class` is deprecated in recent scikit-learn
# releases -- confirm against the installed version.
classifier_list = [LogisticRegression(multi_class='multinomial', solver='lbfgs'), 
                     KNeighborsClassifier(),
                     SVC(kernel = 'sigmoid'),
                     GaussianNB(),
                     DecisionTreeClassifier(max_depth = 2),
                     RandomForestClassifier(random_state=1),
                     BaggingClassifier(),
                     GradientBoostingClassifier(),
                     AdaBoostClassifier(),
                     "CNN",
                     "RNN_LSTM"]

In [None]:
# Column labels for the result tables: 'ticker' first, then one label per
# entry of classifier_list, in the same order.
model_list = ["ticker", "Logistic Regression", "KNN", "SVM", "Bayes Theorem", 
              "Decision Tree", "Random Forest", "Bagging", "Gradient Boost", "AdaBoost", "CNN", "RNN_LSTM"]

## 5. Model Results 

In [None]:
result = acc_combine(df, n, ticker_list, exogenous_list, classifier_list, model_list)

### (1) Classification Accruacy Score

In [None]:
acc = result[0]
acc

In [None]:
acc.to_csv('acc_run2.csv')

### (2) Stock Portfolio Simulation

In [None]:
stock_return = result[1]
stock_return

In [None]:
stock_return.to_csv('stock_return_run2.csv')