In [1]:
import os 
import numpy as np
import pandas as pd
import datetime as dt
from sklearn.linear_model.logistic import LogisticRegression

Fucntions of this notebook:
* Read indicator data ("ticker_indicator.csv" in directory ".../Data/indicator", generated by 2_tech_indicator_calculation)
* Split sample into training set and test set
* Fit two logistic regression models, one with all indicators and one with indicators selected by stepwise process (generated by 3_stepwise_selection)
* Output predicted direction and probability for test set as one .csv file (with name "ticker_signal.csv" and stored in directory ".../Output")

### Directory Setting

In [2]:
dir_working = "/Users/user/Desktop/E4733 AT/Project/Coding Environment/Codes"
os.chdir(dir_working)
dir_data = "../Data/indicator"
dir_output = "../Output"
#!# set ticker list 
ticker_list = ['aapl', 'amzn', 'nvda', 'amd', 'msft', 'fb', 'nflx', \
               'goog', 'intc', 'pypl']

### 1. Indicator Sets Selected by Stepwise Process

In [3]:
optimal_set = [['MA_CAT_15','MA_CAT_30','VMA_Cat_3','VMA_Cat_15', \
                'VMA_Cat_20','MACD_CAT_3_5','MACD_CAT_5_15','MACD_CAT_10_15',\
                'MACD_CAT_15_30','WilliamR','RSI_20','PSY_15','PSY_30','ADO',\
                'K_D_CAT_5_3','K_D_CAT_B_15_80_20'],\
               ['MA_CAT_5','MA_CAT_15','MA_CAT_20','MA_CAT_30','VMA_Cat_3',\
                'VMA_Cat_5','VMA_Cat_10','VMA_Cat_20','MACD_CAT_3_5',\
                'MACD_CAT_5_10','MACD_CAT_15_30','WilliamR','RSI_10','PSY_10',\
                'ADO','K_D_CAT_5_3','K_D_CAT_B_5_80_20'],\
               ['MA_CAT_20','VMA_Cat_3','VMA_Cat_10','VMA_Cat_15',\
                'VMA_Cat_20','MACD_CAT_3_5','MACD_CAT_5_15','WilliamR',\
                'RSI_10','RSI_20','RSI_30','PSY_10','ADO','K_D_CAT_5_3',\
                'K_D_CAT_B_5_80_20','K_D_CAT_B_15_80_20'],\
               ['MA_CAT_3','MA_CAT_5','VMA_Cat_15','MACD_CAT_3_5',\
                'MACD_CAT_5_30','MACD_CAT_10_15','WilliamR','PSY_10','PSY_15',\
                'PSY_30','ADO','K_D_CAT_15_3','K_D_CAT_5_3',\
                'K_D_CAT_B_15_80_20','CCI_5'],\
               ['MA_CAT_3','MA_CAT_10','VMA_Cat_20','MACD_CAT_5_15',\
                'WilliamR','PSY_10','ADO','K_D_CAT_15_3'],\
               ['MA_CAT_3','VMA_Cat_3','VMA_Cat_5','VMA_Cat_15',\
                'MACD_CAT_3_5','MACD_CAT_5_10','MACD_CAT_10_20','WilliamR',\
                'RSI_20','PSY_10','ADO','K_D_CAT_15_3','K_D_CAT_B_15_80_20',\
                'CCI_5'],\
               ['MA_CAT_3','MA_CAT_5','MA_CAT_15','MA_CAT_20','VMA_Cat_15',\
                'MACD_CAT_3_5','MACD_CAT_10_15','WilliamR','RSI_10','PSY_10',\
                'ADO'],\
               ['VMA_Cat_3','VMA_Cat_5','VMA_Cat_30','MACD_CAT_3_5',\
                'MACD_CAT_5_10','MACD_CAT_5_30','MACD_CAT_10_15','WilliamR',\
                'RSI_10','PSY_10','ADO','K_D_CAT_5_3','K_D_CAT_B_5_80_20',\
                'K_D_CAT_B_15_80_20','CCI_5'],\
               ['MA_CAT_3','MA_CAT_30','VMA_Cat_10','MACD_CAT_3_5',\
                'MACD_CAT_5_15','MACD_CAT_5_30','WilliamR','RSI_10','PSY_10',\
                'ADO','K_D_CAT_5_3','K_D_CAT_B_5_80_20','K_D_CAT_B_15_80_20'],\
               ['MA_CAT_15','VMA_Cat_5','MACD_CAT_3_5','MACD_CAT_5_15',\
                'MACD_CAT_5_30','MACD_CAT_10_15','WilliamR','RSI_10','PSY_10',\
                'ADO','K_D_CAT_5_3','K_D_CAT_B_15_80_20']]

### 2. Model Function 

In [4]:
def model_fit_optimal(ticker = 'aapl',indicator_set = optimal_set[0]):
    ## read & clean data
    data = pd.read_csv(dir_data + '/' + ticker + '_indicator.csv')
    data.time = pd.to_datetime(data.time)
    data = data.set_index('time', drop = True)
    data.direction = data.direction.shift(-1)
    data['return'] = data['return'].shift(-1)
    data = data.dropna()
    ## training & test sets 
    data['date'] = data.index.date
    # training set 
    data_train = data.where(data.date < dt.date(2018,12,17))
    data_train = data_train.dropna()
    data_train = data_train.drop('date', axis = 1)
    # test set 
    data_test = data.where(data.date >= dt.date(2018,12,17))
    data_test = data_test.dropna()
    data_test = data_test.drop('date', axis = 1)
    
    ## Logistic Full Model
    # values 
    X_train = data_train.loc[:,list(data_train.columns)[11:]].values
    Y_train = data_train.loc[:,'direction'].values
    X_test = data_test.loc[:,list(data_train.columns)[11:]].values
    Y_test = data_test.loc[:,'direction'].values
    # model
    logistic_model = LogisticRegression(solver = 'lbfgs', max_iter = 1000) 
    logistic_model.fit(X_train, Y_train) 
    acc_in_logistic_full = np.mean(logistic_model.predict(X_train) == Y_train)
    Y_pred_logistic_full = logistic_model.predict(X_test)
    acc_out_logistic_full = np.mean(Y_pred_logistic_full == Y_test)
    logistic_proba_full = logistic_model.predict_proba(X_test)
    
    ## Logistic Stepwise Optimized 
    # values 
    X_train = data_train.loc[:,indicator_set].values
    Y_train = data_train.loc[:,'direction'].values
    X_test = data_test.loc[:,indicator_set].values
    Y_test = data_test.loc[:,'direction'].values
    # model
    logistic_model = LogisticRegression(solver = 'lbfgs', max_iter = 1000) 
    logistic_model.fit(X_train, Y_train) 
    acc_in_logistic = np.mean(logistic_model.predict(X_train) == Y_train)
    Y_pred_logistic = logistic_model.predict(X_test)
    acc_out_logistic = np.mean(Y_pred_logistic == Y_test)
    logistic_proba = logistic_model.predict_proba(X_test)
    
    ## Output & Return 
    data_test['signal_logistic_full'] = Y_pred_logistic_full
    data_test['logistic_prob_full'] = np.max(logistic_proba_full, axis = 1)
    data_test['signal_logistic'] = Y_pred_logistic
    data_test['logistic_prob'] = np.max(logistic_proba, axis = 1)
    data_output = data_test.drop(list(data_train.columns)[11:], axis = 1)
    data_output.to_csv(dir_output + '/' + ticker + '_signal.csv')
    return [acc_in_logistic_full,acc_out_logistic_full,\
            acc_in_logistic,acc_out_logistic]

### 3. Execution 

In [5]:
for i in range(len(ticker_list)):
    summary_list = model_fit_optimal(ticker_list[i])
    print('\n'+ticker_list[i])
    print('Full Model:')
    print('in-sample: ', summary_list[0])
    print('out-of-sample: ', summary_list[1])
    print('Stepwise Optimized:')
    print('in-sample: ', summary_list[2])
    print('out-of-sample: ', summary_list[3], '\n')


aapl
Full Model:
in-sample:  0.7315003394433129
out-of-sample:  0.7295694766692447
Stepwise Optimized:
in-sample:  0.7316458151488702
out-of-sample:  0.7308584686774942 


amzn
Full Model:
in-sample:  0.7222141225640278
out-of-sample:  0.7246451612903225
Stepwise Optimized:
in-sample:  0.7226029061573601
out-of-sample:  0.7241290322580645 


nvda
Full Model:
in-sample:  0.7277991338620992
out-of-sample:  0.7272252675541634
Stepwise Optimized:
in-sample:  0.7275558366989441
out-of-sample:  0.7280083529104673 


amd
Full Model:
in-sample:  0.7309619969831151
out-of-sample:  0.7346193952033369
Stepwise Optimized:
in-sample:  0.7306700403873291
out-of-sample:  0.7361835245046924 


msft
Full Model:
in-sample:  0.7376218323586745
out-of-sample:  0.7404619776797301
Stepwise Optimized:
in-sample:  0.7371345029239766
out-of-sample:  0.7386452115234882 


fb
Full Model:
in-sample:  0.7298204641658153
out-of-sample:  0.7358294331773271
Stepwise Optimized:
in-sample:  0.7297718094682042
out-of-s