In [1]:
from random import seed
from copy import copy
import warnings
from sklearn.model_selection import train_test_split
from os import path

warnings.filterwarnings("ignore")

import pickle
import pandas as pd
import itertools
import numpy as np
from tqdm.auto import tqdm
import pathlib

import datetime

import torch
from torch import nn

from recommender_module.price_crawler import build_ta_features, get_period_and_frequency
from recommender_module.rnn_utils import get_model, pd_to_tensor
from recommender_module.rnn_trend import to_csv_lines

### Enable CUDA if available

In [13]:
# Run on the first CUDA GPU when PyTorch reports one; otherwise stay on the CPU.
device = 'cuda:0' if torch.cuda.is_available() else 'cpu'

### Declare global variables and the config generation function

In [None]:
# Prediction targets, written as "<interval>-<column>" (build_features splits them on '-').
TARGET = ['1D-close', '7D-close', '30D-close']

# Number of trials
TRIAL = 10

# Technical-analysis indicators handed to build_ta_features.
TA_FEATURES = [
    'ROC', 'MOM', 'EMA', 'SMA',
    'VAR', 'MACD', 'ADX', 'RSI',
]

# ---- Hyper-parameter search space (generate_config takes the cartesian product) ----

# Hidden-layer sizes.
SIZE_RANGE = [32, 64, 96]

# Sampling period of each data row.
DELTA_RANGE = ['24H']

# Recurrent architectures to try (use ['LSTM'] for a quick single-architecture run).
MODEL_RANGE = ['LSTM', 'GRU', 'RNN']

# Feature normalisation schemes understood by load_data.
NORMALIZE = ['Normal', 'MinMax']

# Learning rates.
LR_RANGE = [0.001]

# Lag sets for the rolled ("-r<lag>") features: every lag 1..29, and every odd lag 1..59.
# An earlier hand-picked set was [1, 2, ..., 15, 20, 25, 30].
ROLL_RANGE = [
    list(range(1, 30)),
    list(range(1, 60, 2)),
]

# Values stored under the 'n_blocks' config key.
NUMBER_BLOCKS = [3, 5]

def generate_config(device='cpu'):
    """
    Enumerate every hyper-parameter combination of the search space.

    Parameters
    ----------
    device : str
        Stored verbatim under the 'device' key of every config.

    Returns
    -------
    configs : list of dict
        One dict per combination, with keys 'roll', 'delta', 'model', 'norm',
        'hidden', 'lr', 'n_blocks', 'targets' (always 1) and 'device'.
    target : list of list
        An independent copy of TARGET for every config.
    features : list of list
        For every config, the base column names followed by their rolled
        variants ("<name>-r<lag>" for each lag in the config's 'roll').
    data_iter : int
        Number of consecutive configs that differ only in model
        hyper-parameters (model, hidden size, learning rate, n_blocks).
    """
    # Re-seed Python's `random` module (no argument: seeded from the OS / clock).
    seed()

    # The model-only axes vary fastest in the product below, so their combined
    # size is the length of each run of configs sharing norm/delta/roll.
    data_iter = len(MODEL_RANGE) * len(SIZE_RANGE) * len(LR_RANGE) * len(NUMBER_BLOCKS)

    grid = itertools.product(
        NORMALIZE, DELTA_RANGE, ROLL_RANGE,
        MODEL_RANGE, SIZE_RANGE, LR_RANGE, NUMBER_BLOCKS,
    )
    configs = []
    for norm, delta, roll, model, hidden, lr, n_blocks in grid:
        configs.append({
            'roll': roll,
            'delta': delta,
            'model': model,
            'norm': norm,
            'hidden': hidden,
            'lr': lr,
            'n_blocks': n_blocks,
            'targets': 1,
            'device': device,
        })

    target = [list(TARGET) for _ in configs]

    base_names = ['open', 'close', 'high', 'low', 'volume'] + [s.lower() for s in TA_FEATURES]
    features = []
    for cfg in configs:
        rolled_names = [name + '-r' + str(lag) for name in base_names for lag in cfg['roll']]
        features.append(base_names + rolled_names)

    return configs, target, features, data_iter

### Functions modified for model training

In [None]:
def build_features(address, freq='1D', ta_list=None, ys=['1D-close'], roll=[1]):

    '''
    Build the feature matrix (and, optionally, the labels) for one address.

    Reads the crawled ``(DataFrame, update_time)`` pair pickled at
    ``price_data/<address>.pkl``, derives technical-analysis columns with
    ``build_ta_features`` and appends lagged copies of every column.

    Parameters
    ----------
    address : str
        File stem of the pickle under ``price_data/``.
    freq : str
        Forwarded to ``build_ta_features``.
    ta_list : list or None
        Forwarded to ``build_ta_features``.
    ys : list of str or None
        Targets written as "<interval>-<column>". A label is True when the
        column's value one interval ahead is greater than its current value.
        ``None`` skips label creation AND the NaN/inf clean-up of the features.
        The default list is never mutated here.
    roll : iterable of int
        Lags; for every lag ``i`` and column ``c`` a column "c-r<i>" holding
        ``c`` shifted by ``i`` rows is added. The default list is never mutated.

    Returns
    -------
    (x_df, y_df, columns)
        ``x_df`` is the feature frame; ``y_df`` is a boolean frame (a Series
        when there is a single target, ``None`` when ``ys`` is None);
        ``columns`` is ``x_df.columns``.
    '''

    file_dir = "price_data/" + address + '.pkl'
    # NOTE(review): pickle.load can execute arbitrary code -- only read trusted files.
    with open(file_dir, 'rb') as f:
        df, _update_time = pickle.load(f)  # the stored update time is not needed here

    x_df = build_ta_features(df, freq=freq, ta_list=ta_list)

    # Adding rolling features (snapshot the names first: the frame grows inside the loop)
    base_columns = list(x_df.columns)
    for i in roll:
        for c in base_columns:
            x_df[c + '-r' + str(i)] = x_df[c].shift(i)

    if ys is None:
        y_df = None
    else:
        # Adding labels
        y_df = pd.DataFrame(index=x_df.index)
        for target in ys:
            interval, feature = target.split('-')
            p, f = get_period_and_frequency(interval)
            change = x_df[feature].shift(periods=-p, freq=f) - x_df[feature]
            # `NaN > 0` evaluates to False, so a plain comparison would label every
            # row whose future value is unknown as "not rising". Keep those rows as
            # NaN instead, so the dropna below removes them.
            y_df[target] = (change > 0).astype(float).where(change.notna())

        # Dropping all the +/-np.inf and np.nan values in dataframes
        non_finite = [np.inf, -np.inf]
        x_df = x_df.replace(non_finite, np.nan).dropna(how='all', axis=1).dropna(how='any', axis=0)
        y_df = y_df.replace(non_finite, np.nan).dropna(how='all', axis=1).dropna(how='any', axis=0)
        y_df = y_df.astype(bool)  # back to the boolean labels callers expect

        # Keep only the rows present in both frames. Features lose leading rows
        # (lags) while labels lose trailing rows (unknown future), so neither
        # index is guaranteed to be a subset of the other.
        x_df = x_df.loc[x_df.index.isin(y_df.index)]
        y_df = y_df.loc[y_df.index.isin(x_df.index)]

        # Converting it to pd.Series if there is only one prediction target column
        if len(y_df.columns) == 1:
            y_df = y_df.iloc[:, 0] # Convert it to pd.Series

    return x_df, y_df, x_df.columns

In [None]:
def load_data(address, target, cfg, holdout=False):
    """
    Build, normalise and tensorise the data set for one address.

    Parameters
    ----------
    address : str
        Forwarded to ``build_features``.
    target : list of str
        Label names forwarded to ``build_features`` as ``ys``.
    cfg : dict
        Reads 'delta' (sampling frequency), 'roll' (lags) and 'norm'
        ('MinMax', 'Normal'; anything else leaves the features unscaled).
    holdout : bool, default False
        False keeps the historical behaviour: the training tensors are the
        FULL data set, so the validation rows are also trained on.
        True returns a training split that excludes the validation rows.

    Returns
    -------
    (train_x, train_y, val_x, val_y, columns, inp)
        Tensors placed on the notebook-level ``device``, the feature column
        index, and the number of feature columns.
    """
    X, y, columns = build_features(address, freq=cfg['delta'], ta_list=TA_FEATURES, ys=target, roll=cfg['roll'])

    inp = len(X.columns)

    # A constant column has zero range / zero std; dividing by it would fill the
    # column with NaN or inf. Dividing by 1 instead maps such columns to 0.
    if cfg['norm'] == 'MinMax':
        span = (X.max() - X.min()).replace(0, 1)
        X = (X - X.min()) / span
    elif cfg['norm'] == 'Normal':
        scale = X.std().replace(0, 1)
        X = (X - X.mean()) / scale

    X, y = pd_to_tensor(X, y, device)
    y = y.unsqueeze(-1)
    split_x, val_x, split_y, val_y = train_test_split(X, y, test_size=0.2, random_state=42)

    if holdout:
        train_x, train_y = split_x, split_y
    else:
        # NOTE(review): validation rows are a subset of the training rows here, so
        # the validation accuracy used for early stopping and model selection is
        # optimistic. Pass holdout=True for an honest estimate.
        train_x, train_y = X, y

    return train_x, train_y, val_x, val_y, columns, inp

In [None]:
def _column_accuracy(pred, truth):
    """Share of predictions thresholded at 0.5 that equal `truth`, averaged over dim 0, as nested lists."""
    hits = ((pred > 0.5) == truth).float()
    return torch.mean(hits, dim=0).tolist()


def train(data, cfg, target, device='cpu', finetune_model=None):

    '''
    Train (or fine-tune) one model with full-batch Adam steps and early stopping.

    Parameters
    ----------
    data : tuple
        ``(train_x, train_y, val_x, val_y, _, _)`` as returned by ``load_data``.
    cfg : dict
        Reads 'lr'; the whole dict is passed to ``get_model`` (when no
        ``finetune_model`` is given) and to ``to_csv_lines``.
    target : list
        Forwarded to ``to_csv_lines``.
    device : str
        Not used inside this function; kept for call compatibility.
    finetune_model : torch.nn.Module or None
        Existing model to keep training instead of building a new one.

    Returns
    -------
    (model, lines, val_acc, in_long_run)
        ``lines`` come from ``to_csv_lines``; ``val_acc`` is a list of final
        validation accuracies; ``in_long_run`` is True once 1000 steps were
        reached. ``(None, None, None, None)`` is returned when a
        ``RuntimeError`` is raised during a training step.

    Training stops after more than 3 consecutive steps with a lower validation
    accuracy than the previous step, or after 4000 steps.
    '''
    # assumes predictions/labels are 3-D, as implied by the [0][0] indexing below -- TODO confirm
    train_x, train_y, val_x, val_y, _, _ = data
    # Snapshot of the inputs used for the final training-accuracy pass.
    original_x = train_x.detach().clone()

    rnn = finetune_model if finetune_model is not None else get_model(cfg)

    optim = torch.optim.Adam(rnn.parameters(), lr=cfg['lr'])
    criterion = nn.BCELoss()

    max_steps = 4000       # hard cap on optimisation steps
    report_every = 1000    # progress message interval
    patience = 3           # tolerated consecutive accuracy drops

    i = 1
    overfit = 0
    in_long_run = False
    previous_acc = 0
    while True:
        try:
            optim.zero_grad()
            train_pred = rnn(train_x)
            loss = criterion(train_pred, train_y)
            loss.backward()
            optim.step()

            # Evaluation only: no autograd graph is needed.
            with torch.no_grad():
                val_pred = rnn(val_x)
            val_acc = _column_accuracy(val_pred, val_y)[0][0]

            overfit = overfit + 1 if val_acc < previous_acc else 0
            if overfit > patience:
                break

            previous_acc = val_acc
        except RuntimeError:
            # Callers treat a None model as "skip this configuration".
            return None, None, None, None

        i += 1
        if i > max_steps:
            break
        if i % report_every == 0:
            # val_acc is the accuracy of the step that just finished.
            print('long run', i // 1000, 'best val:', val_acc)
            in_long_run = True

    with torch.no_grad():
        train_acc = _column_accuracy(rnn(original_x), train_y)[0]
        val_acc = _column_accuracy(rnn(val_x), val_y)[0]
    lines = to_csv_lines(cfg, target, train_acc, val_acc)
    return rnn, lines, val_acc, in_long_run

### List all available data and already existing models

In [None]:
# Names ("<address>-<target>") of the models already saved under models/.
# Only *.pkl entries count: later cells open exactly "models/<name>.pkl", so
# hidden files, sub-directories or other artefacts must not be mistaken for models.
existing_models = [
    f.stem.split('.')[0]
    for f in pathlib.Path('models/').iterdir()
    if f.suffix == '.pkl'
]

# Price pickles modified within the last 7 days (compared by calendar date).
# The same *.pkl restriction applies: build_features reads "price_data/<stem>.pkl".
recent_cutoff = datetime.date.today() - datetime.timedelta(days=7)
new_data_files = [
    f
    for f in pathlib.Path('price_data/').iterdir()
    if f.suffix == '.pkl'
    and datetime.date.fromtimestamp(f.stat().st_mtime) >= recent_cutoff
]

### Train new models and fine-tune existing models

In [None]:
# Sampling periods ("delta") tried for each prediction target.
deltas = {
    '1D-close': ['6H', '12H', '24H'],
    '7D-close': ['12H', '24H', '1D'],
    '30D-close': ['24H', '2D', '3D'],
}

for f in new_data_files:
    address = f.stem

    file_name = address + '_rnn.csv'
    log_path = 'tuning_logs/' + file_name
    is_new_log = not path.exists(log_path)

    # The context manager closes the log even when training raises.
    with open(log_path, 'a') as log:
        if is_new_log:
            log.write('TARGET, MODEL, DELTA, NORM, HIDDEN, LR, TRAIN, VAL\n')

        cfgs, targets, featureses, data_iter = generate_config(device)
        for t in TARGET:
            model_name = address + '-' + t
            model_path = f"models/{model_name}.pkl"

            if model_name in existing_models:
                # Fine tune model
                print("Fine tuning", model_name)
                # NOTE(review): pickle.load can execute arbitrary code -- trusted files only.
                with open(model_path, "rb") as model_file:
                    model, cfg, columns = pickle.load(model_file)
                model = model.to(device)
                data = load_data(address, [t], cfg)
                model, lines, val_acc, long_run = train(data, cfg, [t], device, finetune_model=model)
                if model is None:
                    # train() reports a failed run with None; keep the saved model untouched.
                    print("Fine tuning failed for", model_name, "- keeping the saved model")
                    continue
                with open(model_path, "wb") as model_file:
                    pickle.dump([model.to('cpu'), cfg, columns], model_file)
                continue

            best_model = None
            best_config = None
            best_val = 0
            best_columns = [None]

            for d in deltas[t]:
                pbar = tqdm(range(len(cfgs)), desc=f"Training ({address +'-'+ t}) [{d}] - Best: {best_val:.3f} - # Col: {len(best_columns)}")
                for i in pbar:
                    # Work on a private copy: the shared config dicts are reused for
                    # every delta and target, so mutating them would silently rewrite
                    # an already-selected best_config.
                    cfg = dict(cfgs[i], delta=d)

                    # Data depends only on norm/delta/roll, which change every data_iter configs.
                    if i % data_iter == 0:
                        try:
                            data = load_data(address, [t], cfg)
                        except ValueError as e:
                            print(e)
                            break

                    cfg['input'] = data[-1]
                    columns = data[-2]
                    model, lines, val_acc, long_run = train(data, cfg, [t], device)
                    if model is None:
                        continue
                    val_acc = np.mean(val_acc)

                    log.writelines(lines)
                    if val_acc > best_val:
                        best_val = val_acc
                        best_model = model
                        best_config = cfg

                        best_columns = columns
                        pbar.set_description(f"Training ({address +'-'+ t}) [{d}] - Best: {best_val:.3f} - # Col: {len(best_columns)} - Roll: {cfg['roll']}")
                        if val_acc >= 0.999:
                            break

            if best_model is not None:
                with open(model_path, "wb") as model_file:
                    pickle.dump([best_model.to('cpu'), best_config, best_columns], model_file)