In [1]:
# %pip show pip
# %pip install -U scikit-learn
# python3 -m pip3 install jupyter notebook -U

In [10]:
## Load Corpus
import pandas as pd
import numpy as np
from google.colab import drive 
import sys
import os
drive.mount('/content/drive', force_remount=True) 

os.chdir("/content/drive/MyDrive/Colab Notebooks/CFT")
!ls

Mounted at /content/drive
changepoint_detection.py  LICENSE	   README.md	     train.py
datasets		  metafeatures.py  requirements.txt  ts2vec.ipynb
datautils.py		  models	   scripts	     ts2vec.py
dl_meta_features.ipynb	  __pycache__	   tasks	     utils.py


In [3]:
# %%bash
# ## For when you're ready to save your changes...
# git commit -a -m "message"
# git push -u origin main

In [18]:
%load_ext autoreload
%autoreload 2

from sklearn.model_selection import train_test_split
from sklearn.feature_selection import SelectFromModel
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.linear_model import RidgeClassifier
from matplotlib import pyplot as plt
from sklearn import linear_model
from utils import standardize
from datetime import datetime
from models.losses import *
from ts2vec import TS2Vec
from pathlib import Path
from tqdm import tqdm
from os import walk
import pandas as pd
import numpy as np
import datautils
import torch
import json





The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [12]:
# TODO: Challenges to solve with stock data
    # Have to deal with weekends - time series not continuous (maybe include time features like hour and daily)
    # Have to deal with hours between stock opened and stock closed

In [13]:

base_path = "/content/drive/MyDrive/Colab Notebooks/"  #"/Users/abuj/Documents/GitHub/
timeframe = "hour"
mypath      = base_path + "CFT/datasets/STOCKS/" + timeframe + "/"
ffd_path    = base_path + "CFT/datasets/META_FEATURES/FFD/" + timeframe + "/"
cpd_path    = base_path + "CFT/datasets/META_FEATURES/CPD/" + timeframe + "/"

filenames   = next(walk( mypath), (None, None, []))[2]  

cols_to_perform_ffd = ['open', 'high', 'low', 'close', 'vwap']


In [14]:
short_cpd_lookback_window_length    = 12
long_cpd_lookback_window_length     = 126

In [15]:

data_dict = {}

LOOKBACK_WINDOW                 = 7      # Hyper parameter
STANDARIZE_LOOKBACK_WINDOW      = 7 * 3  # Hyper parameter

num_assets = 0
for file in tqdm(filenames[330:]):
    df                  = pd.read_csv(mypath + file).set_index('timestamp')[cols_to_perform_ffd]
    df_ffd              = pd.read_csv(ffd_path + file).set_index('timestamp')

    if not Path(cpd_path + file[:-4] + "_short.csv").is_file() or not Path(cpd_path + file[:-4] + "_long.csv").is_file():  continue 
    df_cpd_short        = pd.read_csv(cpd_path + file[:-4] + "_short.csv").set_index('date').drop(['t', 'cp_location'], axis=1).fillna(method='ffill')
    df_cpd_long         = pd.read_csv(cpd_path + file[:-4] + "_long.csv").set_index('date').drop(['t', 'cp_location'], axis=1).fillna(method='ffill')

    df_cpd_long.index     = df[-df_cpd_long.shape[0]:].index 
    df_cpd_short.index    = df[-df_cpd_short.shape[0]:].index 

    df_cpd_long.columns   = [x + "_long" for x in df_cpd_long.columns]
    df_cpd_short.columns  = [x + "_short" for x in df_cpd_short.columns]

    meta_features         = pd.concat([df_ffd, df_cpd_short, df_cpd_long], axis=1).sort_index().dropna()



    vol                                   = df.close.ewm(halflife=1).std()
    
    tasks                                 = pd.DataFrame(index=df.index)
    tasks['one_day_price_pred']           = ( df.close - df.close.shift(-1) )
    tasks['two_day_price_pred']           = ( df.close - df.close.shift(-2) )
    tasks['three_day_price_pred']         = ( df.close - df.close.shift(-3) )
    tasks['four_day_price_pred']          = ( df.close - df.close.shift(-4) )
    tasks['five_day_price_pred']          = ( df.close - df.close.shift(-5) )

    ### Check if volatilty is being created correctly
    tasks['one_week_vol_pred']            = ( df.close.rolling(5).std() - df.close.rolling(5).std().shift(-1) )
    tasks['one_month_vol_pred']           = ( df.close.rolling(21).std() - df.close.rolling(21).std().shift(-1) )

    tasks['one_week_skew_pred']            = ( df.close.rolling(5).skew() - df.close.rolling(5).skew().shift(-1) )
    tasks['one_month_skew_pred']           = ( df.close.rolling(21).skew() - df.close.rolling(21).skew().shift(-1) )

    # tasks.dropna(inplace=True)

    # Experiment between this and using raw values 
    df                  = standardize(df,     look_back=STANDARIZE_LOOKBACK_WINDOW)
    meta_features       = standardize(meta_features, look_back=STANDARIZE_LOOKBACK_WINDOW)

    idx                 = meta_features.index.intersection(df.index).intersection(tasks.index)
    df, meta_features,tasks    = df.loc[idx], meta_features.loc[idx], tasks.loc[idx]


    X_data_array, X_data_dict                = [], {}       
    EXP_FEAT_data_array                      = []


    for i in range(LOOKBACK_WINDOW, len(df)+1):
        X_data_array.append( df.iloc[i - LOOKBACK_WINDOW:i].values )
        EXP_FEAT_data_array.append( meta_features.iloc[i-1].values ) # Without minus one, the exp features lead X_DATA by one tiemstamp
        X_data_dict[df.index[i-1]]  = df.iloc[i - LOOKBACK_WINDOW:i].values
        # EXP_FEAT_data_array[meta_features.index[i-1]] = meta_features.iloc[i-1].values
    

        assert meta_features.iloc[i-1].name == df.iloc[i - LOOKBACK_WINDOW:i].iloc[-1].name 

    # X_data_array        = np.array( X_data_array )
    # EXP_FEAT_data_array = np.array( EXP_FEAT_data_array )

    data_dict[file[:-4]]                  = {}
    data_dict[file[:-4]]['X_DATA']        = X_data_array
    data_dict[file[:-4]]['EXP_FEAT_DATA'] = EXP_FEAT_data_array
    data_dict[file[:-4]]['Y_DATA']        = tasks 
    data_dict[file[:-4]]['X_DATA_DICT']   = X_data_dict 
    # print(file, X_data_array.shape, EXP_FEAT_data_array.shape)

    num_assets+=1
num_assets

100%|██████████| 7/7 [00:38<00:00,  5.44s/it]


5

In [30]:
pct_train          = 0.7
train_data         = np.concatenate( [data_dict[x]['X_DATA'][:int(pct_train*len(data_dict[x]['X_DATA']))] for x in data_dict.keys()])
exp_train_data     = np.concatenate( [data_dict[x]['EXP_FEAT_DATA'][:int(pct_train*len(data_dict[x]['X_DATA']))] for x in data_dict.keys()])
# train_labels       = np.concatenate( [data_dict[x]['Y_DATA'][:int(pct_train*len(data_dict[x]['X_DATA']))] for x in data_dict.keys()])
assert train_data.shape[0] == exp_train_data.shape[0] 
print(train_data.shape, exp_train_data.shape)


test_data         = np.concatenate( [data_dict[x]['X_DATA'][int(pct_train*len(data_dict[x]['X_DATA'])):] for x in data_dict.keys()])
exp_test_data     = np.concatenate( [data_dict[x]['EXP_FEAT_DATA'][int(pct_train*len(data_dict[x]['X_DATA'])):] for x in data_dict.keys()])
# test_labels       = np.concatenate( [data_dict[x]['Y_DATA'][:int(pct_train*len(data_dict[x]['X_DATA']))] for x in data_dict.keys()])
assert test_data.shape[0] == exp_test_data.shape[0] 
print(test_data.shape, exp_test_data.shape)
# # (Both train_data and test_data have a shape of n_instances x n_timestamps x n_features)



(48444, 7, 5) (48444, 14)
(20764, 7, 5) (20764, 14)


In [28]:

# Train a TS2Vec model
model = TS2Vec(
    input_dims=train_data.shape[-1],
    device=0,
    output_dims=100,
    batch_size=64, 
   
)
loss_log = model.fit(
    train_data,
    expert_features=exp_train_data, # train_data.reshape(100, -1)[:,40:],
    verbose=True,
    use_expclr_loss=True,
    n_epochs=10
)

# # Compute timestamp-level representations for test set
# test_repr_tl = model.encode(test_data)  # n_instances x n_timestamps x output_dims

# Compute instance-level representations for test set
# test_repr_il = model.encode(test_data, encoding_window='full_series')  # n_instances x output_dims

# # Sliding inference for test set
# test_repr_si = model.encode(
#     test_data,
#     casual=True,
#     sliding_length=1,
#     sliding_padding=50
# )  # n_instances x n_timestamps x output_dims
# # (The timestamp t's representation vector is computed using the observations located in [t-50, t])

KeyboardInterrupt: ignored

In [22]:
data_dict.keys()

dict_keys(['XEL', 'XRAY', 'XRX', 'XYL', 'ZION'])

In [31]:
# data_dict[file[:-4]]['X_DATA_DICT']
TEST_ASSET = "XEL"

num_test_intances = int(pct_train*len(data_dict[TEST_ASSET]['Y_DATA']))

test_y = data_dict[TEST_ASSET]['Y_DATA'][num_test_intances:].dropna().iloc[:1000]

test_x = { idx: data_dict[TEST_ASSET]['X_DATA_DICT'][idx]  for idx in test_y.index if idx in data_dict[TEST_ASSET]['X_DATA_DICT'].keys() }

print( len(test_x) )
print( len(test_y) )

arrar_for_df = []
for idx in test_y.index[:]:
    arrar_for_df.append( model.encode(data_dict[TEST_ASSET]['X_DATA_DICT'][idx][np.newaxis, ...], encoding_window='full_series').reshape(-1) )

test_repr_il = pd.DataFrame(arrar_for_df, index = test_y.index)

0
1000


KeyError: ignored

In [24]:
X_train, X_test, y_train, y_test = train_test_split(test_repr_il, test_y, test_size=0.33, random_state=42)
reg = linear_model.LinearRegression()
reg = RidgeClassifier()


reg.fit( X_train, np.sign( y_train['one_day_price_pred'] ) )

print( reg.score(X_train, np.sign( y_train['one_day_price_pred'] ) ) )
print( reg.score(X_test,  np.sign( y_test['one_day_price_pred'] ) ) )

0.573134328358209
0.41818181818181815


In [25]:
# data_dict[file[:-4]]['X_DATA_DICT']  
#  

results = {}

mean = 0

for TEST_ASSET in  list( data_dict.keys() )[:]:
    print("\n", TEST_ASSET)
    results[TEST_ASSET] = {}
    
    num_test_intances = int(pct_train*len(data_dict[TEST_ASSET]['Y_DATA']))
    x_data_index = pd.DatetimeIndex( data_dict[TEST_ASSET]['X_DATA_DICT'].keys() )
    data_dict[TEST_ASSET]['Y_DATA'].index  = pd.to_datetime( data_dict[TEST_ASSET]['Y_DATA'].index )

    idx = data_dict[TEST_ASSET]['Y_DATA'].dropna().index.intersection(x_data_index)
    test_y = data_dict[TEST_ASSET]['Y_DATA'][:].loc[idx].iloc[-2000:] #.dropna()

    test_x = { pd.to_datetime( idx ): val for idx, val in data_dict[TEST_ASSET]['X_DATA_DICT'].items() }

    # print( len(test_x) )
    # print( len(test_y) )

    encoding_window_method = ['full_series', 'net_compression']
    test_name = 'one_day_price_pred'
    for encoding_window in encoding_window_method:
        print(encoding_window)
        arrar_for_df = [ model.encode(test_x[idx][np.newaxis, ...], encoding_window=encoding_window).reshape(-1) for idx in test_y.index ]

        test_repr_il = pd.DataFrame(arrar_for_df, index = test_y.index)

        X_train, X_test, y_train, y_test = train_test_split(test_repr_il, test_y, test_size=0.33, random_state=42)
        # reg = linear_model.LinearRegression()
        reg = RidgeClassifier()
        # reg = linear_model.Lasso(alpha=0.1)

        clf = ExtraTreesClassifier(n_estimators=50)
        clf = clf.fit(X_train, np.sign( y_train[test_name] ))
        feat_selec_model = SelectFromModel(clf, prefit=True)
        X_train = feat_selec_model.transform(X_train)
        X_test = feat_selec_model.transform(X_test)

        reg.fit( X_train, np.sign( y_train[test_name] ) )

        results[TEST_ASSET][encoding_window] = reg.score(X_test,  np.sign( y_test[test_name] ) )
        
        print( reg.score(X_train, np.sign( y_train[test_name] ) ) )
        print(results[TEST_ASSET][encoding_window])
        # print( reg.score(X_test,  np.sign( y_test[test_name] ) ) )

    # print(results[TEST_ASSET]['full_series'], results[TEST_ASSET]['net_compression'] )
        
    mean += results[TEST_ASSET]['full_series'] < results[TEST_ASSET]['net_compression']

mean /= len(results.keys())


print( "\n net_compression acheives better results ", mean, "% of the times " )


 XEL
full_series
0.5373134328358209
0.509090909090909
net_compression


  result = _VF.lstm(input, hx, self._flat_weights, self.bias, self.num_layers,


0.491044776119403
0.4303030303030303

 XRAY
full_series
0.5686567164179105
0.4954545454545455
net_compression


  result = _VF.lstm(input, hx, self._flat_weights, self.bias, self.num_layers,


0.5313432835820896
0.47878787878787876

 XRX
full_series
0.5462686567164179
0.5060606060606061
net_compression


  result = _VF.lstm(input, hx, self._flat_weights, self.bias, self.num_layers,


0.5156716417910447
0.4666666666666667

 XYL
full_series
0.5658986175115207
0.4803738317757009
net_compression


  result = _VF.lstm(input, hx, self._flat_weights, self.bias, self.num_layers,


0.5225806451612903
0.45981308411214955

 ZION
full_series
0.5417910447761194
0.45606060606060606
net_compression


  result = _VF.lstm(input, hx, self._flat_weights, self.bias, self.num_layers,


0.5171641791044777
0.46060606060606063

 net_compression acheives better results  0.2 % of the times 


In [None]:
idx = np.sign( y_test['one_day_price_pred'] )[ np.sign( y_test['one_day_price_pred'] ) != 0].index
for i in range(100):
    acc = ( np.sign( X_test )[i].loc[idx] == np.sign( y_test['one_day_price_pred'].loc[idx] ) ).mean()
    if acc > .52:
        print(i, acc)

In [None]:
test_repr_il[2].plot()

In [None]:
results[TEST_ASSET][encoding_window]

# TODO: Result has same values for all keys
mean = 0
for key in results.keys():
    print(key, results[TEST_ASSET]['full_series'], results[TEST_ASSET]['net_compression'])
    mean += results[TEST_ASSET]['full_series'] < results[TEST_ASSET]['net_compression']

mean /= len(results.keys())
mean

In [None]:
test_repr_il[0].plot()

In [None]:
### TEST LOSS FUNCTIONS
BATCH_SIZE              = 100
rnn                     = nn.LSTM(5, 20, batch_first=True).float()
loss_test_values        = torch.from_numpy( train_data[:BATCH_SIZE] ).float()
output, (hn, cn)        = rnn(loss_test_values)
hn                      = hn.squeeze(0)
loss_test_exp_values    = torch.from_numpy( exp_train_data[:BATCH_SIZE]).float()

print( "quadratic_contrastive_loss: ", quadratic_contrastive_loss(hn, loss_test_exp_values, delta=1) )
print( "expclr_loss: ", expclr_loss(hn, loss_test_exp_values, delta=1) )

# max_diff = get_max_norm(batch_f)
# similarity_measure(batch_f[0], batch_f[1], max_diff)

In [None]:
# out = model._eval_with_pooling(torch.from_numpy( test_data[:64] ).to(torch.float), None, encoding_window='')

In [None]:


# # # Load the ECG200 dataset from UCR archive
# train_data, train_labels, test_data, test_labels = datautils.load_ECG()
# # # (Both train_data and test_data have a shape of n_instances x n_timestamps x n_features)

# exp_feat = pd.DataFrame(  train_data.reshape(100, -1)[:,40:] )
# exp_feat = pd.DataFrame(  exp_feat.mean(axis=1) ).values


# # Train a TS2Vec model
# model = TS2Vec(
#     input_dims=1,
#     device=0,
#     output_dims=320
# )
# loss_log = model.fit(
#     train_data,
#     expert_features=exp_feat, #train_data.reshape(100, -1)[:,40:],
#     verbose=True
# )

# # Compute timestamp-level representations for test set
# test_repr_tl = model.encode(test_data)  # n_instances x n_timestamps x output_dims

# # Compute instance-level representations for test set
# test_repr_il = model.encode(test_data, encoding_window='full_series')  # n_instances x output_dims

# # Sliding inference for test set
# test_repr_si = model.encode(
#     test_data,
#     casual=True,
#     sliding_length=1,
#     sliding_padding=50
# )  # n_instances x n_timestamps x output_dims
# # (The timestamp t's representation vector is computed using the observations located in [t-50, t])

In [None]:
# data, train_slice, valid_slice, test_slice, scaler, pred_lens, n_covariate_cols = datautils.load_forecast_csv("ETTh2")
# train_data = data[:, train_slice]
        
# # Train a TS2Vec model
# model = TS2Vec(
#     input_dims=1,
#     device=0,
#     output_dims=320
# )
# loss_log = model.fit(
#     train_data,
#     verbose=True
# )

# # Compute timestamp-level representations for test set
# test_repr = model.encode(test_data)  # n_instances x n_timestamps x output_dims

# # Compute instance-level representations for test set
# test_repr = model.encode(test_data, encoding_window='full_series')  # n_instances x output_dims

# # Sliding inference for test set
# test_repr = model.encode(
#     test_data,
#     casual=True,
#     sliding_length=1,
#     sliding_padding=50
# )  # n_instances x n_timestamps x output_dims
# # (The timestamp t's representation vector is computed using the observations located in [t-50, t])