# Guidance for training a model with your own data

## 1. Import the necessary packages

In [1]:
import argparse
import os
import random

import numpy as np
import pandas as pd
import torch

from softs.exp.exp_custom import Exp_Custom

## 2. Define the hyperparameters

In [2]:
# fix seed for reproducibility
fix_seed = 2021
random.seed(fix_seed)
torch.manual_seed(fix_seed)
np.random.seed(fix_seed)
torch.set_num_threads(6)

# basic config
config = {
    # dataset settings
    'root_path': './dataset/ETT-small/',
    'data_path': 'ETTm1.csv',
    'data': 'ETTm1',
    'features': 'MS',
    'freq': 'B',
    'seq_len': 200,
    'pred_len': 20,
    # model settings
    'model': 'SOFTS',
    'checkpoints': './checkpoints/',
    'd_model': 128,
    'd_core': 64,
    'd_ff': 128,
    'e_layers': 2,
    'learning_rate': 0.0003,
    'lradj': 'cosine',
    'train_epochs': 50,
    'patience': 7,
    'batch_size': 50,
    'dropout': 0.0,
    'activation': 'gelu',
    'use_norm': True,
    'loss_func': 'huber',
    # system settings
    'num_workers': 0,
    'use_gpu': True,
    'gpu': '0',
    'save_model': True,
    'predict_all': True,
}

parser = argparse.ArgumentParser(description='SOFTS')
args = parser.parse_args([])
args.__dict__.update(config)
args.use_gpu = True if torch.cuda.is_available() and args.use_gpu else False

print('Args in experiment:')
print(args)

Args in experiment:
Namespace(root_path='./dataset/ETT-small/', data_path='ETTm1.csv', data='ETTm1', features='MS', freq='B', seq_len=200, pred_len=20, model='SOFTS', checkpoints='./checkpoints/', d_model=128, d_core=64, d_ff=128, e_layers=2, learning_rate=0.0003, lradj='cosine', train_epochs=50, patience=7, batch_size=50, dropout=0.0, activation='gelu', use_norm=True, loss_func='huber', num_workers=0, use_gpu=False, gpu='0', save_model=True, predict_all=True)


In [3]:
from dotenv import load_dotenv
from sqlalchemy import create_engine

load_dotenv()  # take environment variables from .env.

DB_USER = os.getenv("DB_USER")
DB_PASSWORD = os.getenv("DB_PASSWORD")
DB_HOST = os.getenv("DB_HOST")
DB_PORT = os.getenv("DB_PORT")
DB_NAME = os.getenv("DB_NAME")
db_url = f"postgresql+psycopg2://{DB_USER}:{DB_PASSWORD}@{DB_HOST}:{DB_PORT}/{DB_NAME}"
print("db_url: ", db_url)

alchemyEngine = create_engine(
    db_url,
    pool_recycle=3600,
)

db_url:  postgresql+psycopg2://postgres:qHmqsRro760Ji0ZPwPVH@10.112.1.162:5432/china_stock_market_db


In [4]:
query = """
SELECT "date", "open", "close", high, low, volume, amount, open_preclose_rate, high_preclose_rate, low_preclose_rate, vol_change_rate, amt_change_rate, change_rate
FROM public.index_daily_em_view 
where symbol = '000922' 
order by date
"""
df = pd.read_sql(query, alchemyEngine, parse_dates=["date"])

In [5]:
df

Unnamed: 0,date,open,close,high,low,volume,amount,open_preclose_rate,high_preclose_rate,low_preclose_rate,vol_change_rate,amt_change_rate,change_rate
0,2008-08-04,2759.80,2687.55,2759.80,2683.91,5972906.0,5.436500e+09,,,,,,
1,2008-08-05,2683.84,2582.16,2695.04,2575.85,8046384.0,7.688379e+09,-0.13804,0.27869,-4.15620,34.71473,41.42147,-3.92142
2,2008-08-06,2597.73,2590.76,2614.61,2550.19,6838590.0,5.979064e+09,0.60298,1.25670,-1.23811,-15.01039,-22.23245,0.33305
3,2008-08-07,2589.28,2592.37,2612.05,2562.19,5549979.0,5.082008e+09,-0.05713,0.82177,-1.10277,-18.84323,-15.00328,0.06214
4,2008-08-08,2591.63,2455.95,2591.63,2450.83,6933512.0,6.239304e+09,-0.02855,-0.02855,-5.45987,24.92862,22.77241,-5.26237
...,...,...,...,...,...,...,...,...,...,...,...,...,...
3859,2024-06-24,5400.87,5347.62,5401.38,5322.85,39129798.0,3.317571e+10,-0.31249,-0.30308,-1.75256,11.20192,10.89010,-1.29536
3860,2024-06-25,5342.56,5360.90,5389.79,5340.14,38294062.0,3.132784e+10,-0.09462,0.78858,-0.13988,-2.13580,-5.56996,0.24833
3861,2024-06-26,5340.99,5368.96,5372.08,5313.30,33556708.0,2.752461e+10,-0.37139,0.20855,-0.88791,-12.37099,-12.14010,0.15035
3862,2024-06-27,5354.17,5334.25,5371.80,5321.77,40649668.0,3.273590e+10,-0.27547,0.05290,-0.87894,21.13723,18.93320,-0.64649


In [5]:
df = df.dropna()
df

Unnamed: 0,date,open,close,high,low,volume,amount,open_preclose_rate,high_preclose_rate,low_preclose_rate,vol_change_rate,amt_change_rate,change_rate
1,2008-08-05,2683.84,2582.16,2695.04,2575.85,8046384.0,7.688379e+09,-0.13804,0.27869,-4.15620,34.71473,41.42147,-3.92142
2,2008-08-06,2597.73,2590.76,2614.61,2550.19,6838590.0,5.979064e+09,0.60298,1.25670,-1.23811,-15.01039,-22.23245,0.33305
3,2008-08-07,2589.28,2592.37,2612.05,2562.19,5549979.0,5.082008e+09,-0.05713,0.82177,-1.10277,-18.84323,-15.00328,0.06214
4,2008-08-08,2591.63,2455.95,2591.63,2450.83,6933512.0,6.239304e+09,-0.02855,-0.02855,-5.45987,24.92862,22.77241,-5.26237
5,2008-08-11,2449.03,2326.84,2451.26,2310.30,7688190.0,6.428605e+09,-0.28176,-0.19096,-5.93050,10.88450,3.03401,-5.25703
...,...,...,...,...,...,...,...,...,...,...,...,...,...
3859,2024-06-24,5400.87,5347.62,5401.38,5322.85,39129798.0,3.317571e+10,-0.31249,-0.30308,-1.75256,11.20192,10.89010,-1.29536
3860,2024-06-25,5342.56,5360.90,5389.79,5340.14,38294062.0,3.132784e+10,-0.09462,0.78858,-0.13988,-2.13580,-5.56996,0.24833
3861,2024-06-26,5340.99,5368.96,5372.08,5313.30,33556708.0,2.752461e+10,-0.37139,0.20855,-0.88791,-12.37099,-12.14010,0.15035
3862,2024-06-27,5354.17,5334.25,5371.80,5321.77,40649668.0,3.273590e+10,-0.27547,0.05290,-0.87894,21.13723,18.93320,-0.64649


In [None]:
# df = df[["date", "change_rate"]]
# df

## 3. Prepare the dataset
Organize your data in the following format:
- The dataset should be a csv file.
- If there is a time feature, the first column contains timestamps in the format 'YYYY-MM-DD HH:MM:SS'. If there's no time feature, the dataset starts directly with the features.
- If the parameter `features` is 'M', the following columns are both the features and the targets. If `features` is 'MS', the following columns are the features, and the last column is the target.

In [6]:
# load data
# data = pd.read_csv(os.path.join(args.root_path, args.data_path))
# print(data.head())

# split data
train_data = df.iloc[: 3658]
vali_data = df.iloc[3658 - args.seq_len: 3758]
test_data = df.iloc[3758 - args.seq_len: ]

# optional: scale data
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
if 'date' in train_data.columns:
    scaler.fit(train_data.iloc[:, 1:])
    train_data.iloc[:, 1:] = scaler.transform(train_data.iloc[:, 1:])
    vali_data.iloc[:, 1:] = scaler.transform(vali_data.iloc[:, 1:])
    test_data.iloc[:, 1:] = scaler.transform(test_data.iloc[:, 1:])
else:
    scaler.fit(train_data.iloc[:, :])
    train_data.iloc[:, :] = scaler.transform(train_data.iloc[:, :])
    vali_data.iloc[:, :] = scaler.transform(vali_data.iloc[:, :])
    test_data.iloc[:, :] = scaler.transform(test_data.iloc[:, :])

## 4. Train and Evaluate the model


In [7]:
Exp = Exp_Custom(args)
setting = f'{args.data}_{args.model}_{args.seq_len}_{args.pred_len}'
print('>>>>>>>start training : {}>>>>>>>>>>>>>>>>>>>>>>>>>>'.format(setting))
Exp.train(setting=setting, train_data=train_data, vali_data=vali_data, test_data=test_data)
print('>>>>>>>testing : {}<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<'.format(setting))
Exp.test(setting=setting, test_data=test_data)

Use CPU
>>>>>>>start training : ETTm1_SOFTS_200_20>>>>>>>>>>>>>>>>>>>>>>>>>>
Epoch: 1 cost time: 2.434340715408325
Validation loss decreased (inf --> 0.037627).  Saving model ...
Epoch: 1, Steps: 69 | Train Loss: nan Vali Loss: 0.0376273 Test Loss: 0.1728429
Updating learning rate to 0.0002997040092642407
Epoch: 2 cost time: 2.4519269466400146
Validation loss decreased (0.037627 --> 0.036367).  Saving model ...
Epoch: 2, Steps: 69 | Train Loss: nan Vali Loss: 0.0363674 Test Loss: 0.1702635
Updating learning rate to 0.0002988172051971717
Epoch: 3 cost time: 2.4767849445343018
Validation loss decreased (0.036367 --> 0.036286).  Saving model ...
Epoch: 3, Steps: 69 | Train Loss: nan Vali Loss: 0.0362865 Test Loss: 0.1691395
Updating learning rate to 0.0002973430876093033
Epoch: 4 cost time: 2.468445301055908
EarlyStopping counter: 1 out of 7
Epoch: 4, Steps: 69 | Train Loss: nan Vali Loss: 0.0365019 Test Loss: 0.1692091
Updating learning rate to 0.00029528747416929463
Epoch: 5 cost time: 

(0.5921891487828863, 0.43507387956907584, 0.16913954463115957)

## 5. Get predictions by the model

In [11]:
test_data

Unnamed: 0,date,open,close,high,low,volume,amount,open_preclose_rate,high_preclose_rate,low_preclose_rate,vol_change_rate,amt_change_rate,change_rate
3559,2023-03-27,5178.28,5139.55,5178.28,5112.82,44526388.0,3.641135e+10,-0.06427,-0.06427,-1.32758,21.63227,26.25801,-0.81172
3560,2023-03-28,5151.67,5141.40,5161.19,5124.96,33658311.0,2.857319e+10,0.23582,0.42105,-0.28388,-24.40817,-21.52671,0.03600
3561,2023-03-29,5155.21,5126.52,5158.82,5123.06,35792171.0,2.815999e+10,0.26860,0.33882,-0.35671,6.33977,-1.44608,-0.28942
3562,2023-03-30,5130.26,5158.66,5161.25,5096.42,36860505.0,3.167253e+10,0.07295,0.67746,-0.58714,2.98483,12.47351,0.62694
3563,2023-03-31,5163.25,5175.89,5196.64,5157.27,34841760.0,2.846308e+10,0.08898,0.73624,-0.02694,-5.47672,-10.13324,0.33400
...,...,...,...,...,...,...,...,...,...,...,...,...,...
3859,2024-06-24,5400.87,5347.62,5401.38,5322.85,39129798.0,3.317571e+10,-0.31249,-0.30308,-1.75256,11.20192,10.89010,-1.29536
3860,2024-06-25,5342.56,5360.90,5389.79,5340.14,38294062.0,3.132784e+10,-0.09462,0.78858,-0.13988,-2.13580,-5.56996,0.24833
3861,2024-06-26,5340.99,5368.96,5372.08,5313.30,33556708.0,2.752461e+10,-0.37139,0.20855,-0.88791,-12.37099,-12.14010,0.15035
3862,2024-06-27,5354.17,5334.25,5371.80,5321.77,40649668.0,3.273590e+10,-0.27547,0.05290,-0.87894,21.13723,18.93320,-0.64649


In [8]:
# get predictions
predictions = Exp.predict(setting=setting, pred_data=test_data)
print(predictions.shape)

loading model from ./checkpoints/ETTm1_SOFTS_200_20/checkpoint.pth
(106, 20, 12)


In [9]:
predictions

array([[[-3.53568721e+00, -3.53794146e+00, -3.54941225e+00, ...,
         -1.14707135e-01, -1.09096773e-01, -4.93070781e-02],
        [-3.53651643e+00, -3.53875399e+00, -3.55023813e+00, ...,
         -1.13048047e-01, -1.07222505e-01, -1.50860865e-02],
        [-3.53613353e+00, -3.53838181e+00, -3.54986119e+00, ...,
         -1.23390384e-01, -1.19686268e-01, -3.34360041e-02],
        ...,
        [-3.53634620e+00, -3.53858614e+00, -3.55006814e+00, ...,
         -1.19637080e-01, -1.14562415e-01, -5.23053110e-04],
        [-3.53598213e+00, -3.53824091e+00, -3.54971123e+00, ...,
         -1.10682271e-01, -1.04743317e-01, -8.93943757e-02],
        [-3.53615379e+00, -3.53840208e+00, -3.54987907e+00, ...,
         -1.12765744e-01, -1.06766924e-01, -6.12040982e-02]],

       [[-3.41268945e+00, -3.41547275e+00, -3.42694521e+00, ...,
         -6.13327138e-02, -6.29292876e-02,  1.36018544e-03],
        [-3.58141708e+00, -3.58347178e+00, -3.59495115e+00, ...,
         -1.40827462e-01, -1.35317892e

In [10]:
first_pred = predictions[0, :, :]

In [15]:
for e in first_pred:
    print(e[-1])

-0.25168043
-0.23849231
-0.43918929
-0.5195542
-0.474421
-0.45291921
-0.63255125
-0.5926477
-0.3999927
-0.38951707
-0.3837889
-0.6110861
-0.4160754
-0.15328659
-0.38954854
-0.46345094
-0.5812362
-0.3449399
-0.43831828
-0.40704703


In [10]:
last_pred = predictions[-1, :, -1]

In [11]:
last_pred.shape

(20,)

In [12]:
last_pred

array([ 0.12212987,  0.00228432, -0.00976881,  0.04324216,  0.16158794,
        0.00745515, -0.05529791,  0.03284356, -0.00620433, -0.03796517,
        0.02267565,  0.00676192,  0.06219453,  0.07042953, -0.1283086 ,
       -0.08461678, -0.02100138, -0.01114248, -0.1238388 ,  0.22665255],
      dtype=float32)

In [19]:
for e in last_pred:
    print(e[0])

IndexError: invalid index to scalar variable.

In [13]:
print(predictions.shape)

(106, 20, 12)


In [14]:
last_pred2 = predictions[-1, :, :]

In [15]:
last_pred2.shape

(20, 12)

In [16]:
last_pred2[:, -1]

array([ 0.12212987,  0.00228432, -0.00976881,  0.04324216,  0.16158794,
        0.00745515, -0.05529791,  0.03284356, -0.00620433, -0.03796517,
        0.02267565,  0.00676192,  0.06219453,  0.07042953, -0.1283086 ,
       -0.08461678, -0.02100138, -0.01114248, -0.1238388 ,  0.22665255],
      dtype=float32)

In [17]:
last_pred2

array([[-0.35760903, -0.34167805, -0.34945157, -0.3474265 , -0.03013594,
        -0.1926657 , -0.1430737 , -0.36352468,  0.568133  , -0.08901974,
        -0.05155299,  0.12212987],
       [-0.43095037, -0.42511907, -0.43208   , -0.4167831 , -0.1450961 ,
        -0.2799972 ,  0.17221417, -0.49208504,  0.6781353 , -0.11811571,
        -0.08895723,  0.00228432],
       [-0.29500216, -0.29460555, -0.29819947, -0.28917456, -0.08926313,
        -0.22758208,  0.03462054, -0.43282095,  0.41548863,  0.00636836,
         0.00648158, -0.00976881],
       [-0.43865493, -0.4360272 , -0.43676636, -0.43078753, -0.17196186,
        -0.2972327 , -0.05306545, -0.42743242,  0.49369532, -0.25508988,
        -0.2456123 ,  0.04324216],
       [ 0.11655807,  0.12783968,  0.11718738,  0.12578028,  0.03482726,
        -0.12778388,  0.019248  , -0.37291604,  0.58827174, -0.00442881,
         0.03278171,  0.16158794],
       [-0.08888495, -0.08641553, -0.0917604 , -0.07866722, -0.06558107,
        -0.21008402, -

In [18]:
it_padded_pred = scaler.inverse_transform(last_pred2)

In [19]:
it_padded_pred

array([[ 3.33945996e+03,  3.35963281e+03,  3.37692822e+03,
         3.32183789e+03,  3.32632820e+07,  2.78783222e+10,
        -1.75146565e-01,  3.98692697e-01, -2.45355934e-01,
         6.92340493e-01,  1.53869653e+00,  2.14897335e-01],
       [ 3.26239771e+03,  3.27193042e+03,  3.28973364e+03,
         3.24938745e+03,  2.93874140e+07,  2.48529367e+10,
         3.27476822e-02,  2.63720304e-01, -1.14778727e-01,
        -1.00225799e-01,  5.49598753e-01,  3.24344225e-02],
       [ 3.40524292e+03,  3.40910913e+03,  3.43101245e+03,
         3.38268823e+03,  3.12698160e+07,  2.66687304e+10,
        -5.79786897e-02,  3.25940222e-01, -4.26551044e-01,
         3.29068613e+00,  3.07333183e+00,  1.40837273e-02],
       [ 3.25430225e+03,  3.26046533e+03,  3.28478833e+03,
         3.23475830e+03,  2.84816360e+07,  2.42558566e+10,
        -1.15797006e-01,  3.31597507e-01, -3.33716482e-01,
        -3.83136415e+00, -3.59290433e+00,  9.47920755e-02],
       [ 3.83768311e+03,  3.85312842e+03,  3.8693535

In [20]:
it_padded_pred.shape

(20, 12)

In [21]:
it_padded_pred[:, -1]

array([ 0.21489733,  0.03243442,  0.01408373,  0.09479208,  0.27497157,
        0.04030692, -0.05523358,  0.07896037,  0.0195106 , -0.02884476,
        0.06347989,  0.0392515 ,  0.12364674,  0.13618441, -0.16639115,
       -0.09987108, -0.00301768,  0.01199233, -0.15958595,  0.3740314 ],
      dtype=float32)

In [15]:
train_data.shape

(3658, 13)

In [23]:
# Assuming last_pred is the single column prediction with shape (20, 1)
# and train_data has 13 columns

# Determine the number of columns the scaler was fitted on
num_columns = (
    train_data.shape[1] - 1 if "date" in train_data.columns else train_data.shape[1]
)

# Create an array of zeros with the same number of rows and columns as the scaler's fitted data
padded_pred = np.zeros((last_pred.shape[0], num_columns))

print(padded_pred.shape)

# Insert the single column prediction into the corresponding column (e.g., the first column)
padded_pred[:, 0] = last_pred[:, 0]

# Apply inverse_transform
it_padded_pred = scaler.inverse_transform(padded_pred)

# Extract the inverse transformed single column
it_last_pred = it_padded_pred[:, 0]

print(it_last_pred)

(20, 12)
[3885.40793131 3696.25140262 3774.6123283  3731.39867308 3809.44623464
 3699.65109014 3648.76137504 3721.66227388 3780.73833635 3730.91085114
 3769.92774081 3719.35071022 3732.21961064 3754.28062473 3555.72240312
 3795.2652735  3655.648892   3658.06218308 3602.10752726 3869.83301425]


In [13]:
Exp.metrics

{'epoch': 5,
 'MAE_val': 0.8422023778737977,
 'RMSE_val': 1.1137292115524855,
 'Loss_val': 0.5000086828719738,
 'MAE': 0.3929128,
 'RMSE': 0.5931921,
 'Loss': 0.17228595167398453}