In [1]:
import numpy as np
import pandas as pd
from preprocessing.wrangling import get_indi_df, get_labels, slide_and_flatten
from preprocessing.extract_features import get_all_ta_features, get_wavelet_coeffs
from evaluation.eval import sliding_window_cv_regression, batch_test_swcv_regression
from sklearn.metrics import mean_squared_error, mean_absolute_percentage_error, r2_score
from sklearn.preprocessing import StandardScaler, RobustScaler, MinMaxScaler
from sklearn.feature_selection import SelectFromModel
from sklearn.linear_model import Lasso
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from numpy.lib.stride_tricks import sliding_window_view
from xgboost import XGBRegressor
from sklearn.pipeline import make_pipeline
import os

In [2]:
from sklearn.svm import SVR

In [3]:
def add_closing_price(y, cls_price):
    return y + cls_price

In [4]:
class PersistanceModel:
    def __init__(self, persist_colname='Close'):
        self.persist_colname = persist_colname

    def __repr__(self):
        return "PersistanceModel(persist_colname={})".format(self.persist_colname)

    def fit(self, Xtr, ytr):
        pass

    def predict(self, Xts):
        return Xts.loc[:, self.persist_colname]


In [5]:
list_dir = 'data_collection/stocks_list'
list_prefix = "ind_nifty"
list_suffix = "list.csv"
save_dir = 'data_collection/ohlcv_data'
save_prefix = "ohlcv_"
save_suffix = ".csv"
cap_n_stocks = 10

for f in os.listdir(list_dir):
    if f.startswith(list_prefix) and f.endswith(list_suffix):
            savefile = os.path.join(save_dir, save_prefix+f[9:-8]+save_suffix)
            listfile = os.path.join(list_dir, f)
            p = pd.read_csv(listfile)
            symbols = list(p['Symbol'].values + '.NS')
            if cap_n_stocks <= 0:
                break
            for symbol in symbols:
                cap_n_stocks -= 1
                if cap_n_stocks <= 0:
                    break
                df = get_indi_df(symbol, ohlcvfile=savefile, start_date="2017-01-01")
                # df = get_all_ta_features(df)
                drop_columns = ['Date', 'Adj Close']
                df.drop(drop_columns, axis=1, inplace=True)
                move_dir_target, cls_target = get_labels(df['Close'])
                df = df.iloc[:-1]
                cls_target = cls_target.iloc[:-1]

                df10 = slide_and_flatten(df, window_len=10)
                df10 = pd.DataFrame(df10, index=df.index[9:])
                # df30 = slide_and_flatten(df, window_len=30)
                # df30 = pd.DataFrame(df30, index=df.index[29:])
                # df60 = slide_and_flatten(df, window_len=60)
                # df60 = pd.DataFrame(df60, index=df.index[59:])

                df10_wavelet = get_wavelet_coeffs(df['Close'], len_window=10, decomp_level=2)
                # df10_wavelet = pd.DataFrame.from_records(df10_wavelet, index=df10.index)
                # df30_wavelet = get_wavelet_coeffs(df['Close'], len_window=30, decomp_level=2)
                # df30_wavelet = pd.DataFrame.from_records(df30_wavelet, index=df30.index)
                # df60_wavelet = get_wavelet_coeffs(df['Close'], len_window=60, decomp_level=2)
                # df60_wavelet = pd.DataFrame.from_records(df60_wavelet, index=df60.index)

                # df10 = df10.merge(df10_wavelet, left_index=True, right_index=True)
                # df30 = df30.merge(df30_wavelet, left_index=True, right_index=True)
                # df60 = df60.merge(df60_wavelet, left_index=True, right_index=True)

                y = cls_target - df['Close']
                # y10 = cls_target[9:] - df['Close'].iloc[9:]
                # y30 = cls_target[29:] - df['Close'].iloc[29:]
                # y60 = cls_target[59:] - df['Close'].iloc[59:]

                pipe1 = make_pipeline(
                    PersistanceModel(persist_colname="Close")
                )

                pipe2 = make_pipeline(
                    (MinMaxScaler()),
                    (GradientBoostingRegressor())
                )


                batch_test_swcv_regression(
                    list_X = [df],
                    list_y = [cls_target],
                    list_pipe = [pipe1],
                    list_n_tr = [30],
                    list_n_ts = [1],
                    scorers = [mean_squared_error,mean_absolute_percentage_error, r2_score],
                    savefile='results/baseline_test_gbr_{}.csv'.format(symbol),
                    comment_X=["persistance"],
                    list_post_processors=[None]
                )

                batch_test_swcv_regression(
                    list_X = [df],
                    list_y = [cls_target],
                    list_pipe = [pipe2],
                    list_n_tr = [30],
                    list_n_ts = [1],
                    scorers = [mean_squared_error,mean_absolute_percentage_error, r2_score],
                    savefile='results/baseline_test_gbr_{}.csv'.format(symbol),
                    comment_X=["baseline_gbr"],
                    list_post_processors=[None]
                )
                # cls_price should have length len(X)-(n_tr+n_ts)

A test completed. (Comment : ['persistance', 30, 1])
A test completed. (Comment : ['baseline_gbr', 30, 1])
A test completed. (Comment : ['persistance', 30, 1])
A test completed. (Comment : ['baseline_gbr', 30, 1])
A test completed. (Comment : ['persistance', 30, 1])
A test completed. (Comment : ['baseline_gbr', 30, 1])
A test completed. (Comment : ['persistance', 30, 1])
A test completed. (Comment : ['baseline_gbr', 30, 1])
A test completed. (Comment : ['persistance', 30, 1])
A test completed. (Comment : ['baseline_gbr', 30, 1])
A test completed. (Comment : ['persistance', 30, 1])
A test completed. (Comment : ['baseline_gbr', 30, 1])
A test completed. (Comment : ['persistance', 30, 1])
A test completed. (Comment : ['baseline_gbr', 30, 1])
A test completed. (Comment : ['persistance', 30, 1])
A test completed. (Comment : ['baseline_gbr', 30, 1])
A test completed. (Comment : ['persistance', 30, 1])
A test completed. (Comment : ['baseline_gbr', 30, 1])
