# Guidance for training a model with your own data

## 1. Import the necessary packages

In [1]:
import argparse
import os
import random

import numpy as np
import pandas as pd
import torch

from softs.exp.exp_custom import Exp_Custom

## 2. Define the hyperparameters

In [2]:
# fix seed for reproducibility
fix_seed = 7
random.seed(fix_seed)
torch.manual_seed(fix_seed)
np.random.seed(fix_seed)
torch.set_num_threads(6)

# basic config
config = {
    # dataset settings
    'root_path': './dataset/ETT-small/',
    'data_path': 'ETTm1.csv',
    'data': 'ETTm1',
    'features': 'MS',
    'freq': 'B',
    'seq_len': 60,
    'pred_len': 20,
    # model settings
    'model': 'SOFTS',
    'checkpoints': './checkpoints/',
    'd_model': 32,
    'd_core': 16,
    'd_ff': 32,
    'e_layers': 2,
    'learning_rate': 0.0001,
    'lradj': 'cosine',
    'train_epochs': 50,
    'patience': 3,
    'batch_size': 16,
    'dropout': 0.0,
    'activation': 'gelu',
    'use_norm': True,
    'loss_func': 'huber',
    # system settings
    'num_workers': 0,
    'use_gpu': True,
    'gpu': '0',
    'save_model': True,
    'predict_all': True,
    'mixed_precision': True,
    'optimizer': "AdamW", # SGD / Adam / AdamW
}

parser = argparse.ArgumentParser(description='SOFTS')
args = parser.parse_args([])
args.__dict__.update(config)
args.use_gpu = True if torch.cuda.is_available() and args.use_gpu else False

print('Args in experiment:')
print(args)

Args in experiment:
Namespace(root_path='./dataset/ETT-small/', data_path='ETTm1.csv', data='ETTm1', features='MS', freq='B', seq_len=60, pred_len=20, model='SOFTS', checkpoints='./checkpoints/', d_model=32, d_core=16, d_ff=32, e_layers=2, learning_rate=0.0001, lradj='cosine', train_epochs=50, patience=3, batch_size=16, dropout=0.0, activation='gelu', use_norm=True, loss_func='huber', num_workers=0, use_gpu=False, gpu='0', save_model=True, predict_all=True, mixed_precision=True, optimizer='AdamW')


In [3]:
from dotenv import load_dotenv
from sqlalchemy import create_engine

load_dotenv()  # take environment variables from .env.

DB_USER = os.getenv("DB_USER")
DB_PASSWORD = os.getenv("DB_PASSWORD")
DB_HOST = os.getenv("DB_HOST")
DB_PORT = os.getenv("DB_PORT")
DB_NAME = os.getenv("DB_NAME")
db_url = f"postgresql+psycopg2://{DB_USER}:{DB_PASSWORD}@{DB_HOST}:{DB_PORT}/{DB_NAME}"

alchemyEngine = create_engine(
    db_url,
    pool_recycle=3600,
)

In [4]:
# query = """
# with cte as (
# SELECT "date", "open", "close", high, low, volume, amount, open_preclose_rate, high_preclose_rate, low_preclose_rate, vol_change_rate, amt_change_rate, change_rate
# FROM index_daily_em_view 
# where symbol = '399673' 
# order by date desc
# limit 1200
# ) select * from cte order by date
# """
query = """
with cte as (
SELECT "date", "open", "close", high, low, volume, amount, open_preclose_rate, high_preclose_rate, low_preclose_rate, vol_change_rate, amt_change_rate, change_rate
FROM index_daily_em_view 
where symbol = '399673'
) select * from cte order by date
"""

df = pd.read_sql(query, alchemyEngine, parse_dates=["date"])

In [5]:
df

Unnamed: 0,date,open,close,high,low,volume,amount,open_preclose_rate,high_preclose_rate,low_preclose_rate,vol_change_rate,amt_change_rate,change_rate
0,2014-06-18,1344.05,1328.84,1351.16,1328.81,3938637.0,8.436718e+09,,,,,,
1,2014-06-19,1327.54,1281.12,1335.44,1269.53,4532795.0,9.384841e+09,-0.09783,0.49667,-4.46329,15.08537,11.23805,-3.59110
2,2014-06-20,1281.14,1294.98,1297.99,1273.18,3111206.0,6.570250e+09,0.00156,1.31682,-0.61977,-31.36231,-29.99082,1.08187
3,2014-06-23,1295.94,1324.14,1330.24,1295.94,3615497.0,7.837101e+09,0.07413,2.72282,0.07413,16.20886,19.28164,2.25177
4,2014-06-24,1322.55,1331.38,1335.14,1320.99,3265263.0,6.941363e+09,-0.12008,0.83073,-0.23789,-9.68702,-11.42946,0.54677
...,...,...,...,...,...,...,...,...,...,...,...,...,...
2441,2024-07-01,1612.26,1613.83,1621.57,1581.79,11104284.0,3.785994e+10,-0.28327,0.29255,-2.16781,-17.55357,-1.67860,-0.18617
2442,2024-07-02,1607.48,1600.24,1618.31,1595.66,10769478.0,3.413663e+10,-0.39347,0.27760,-1.12589,-3.01511,-9.83444,-0.84210
2443,2024-07-03,1601.05,1599.10,1614.89,1585.49,10246141.0,3.069951e+10,0.05062,0.91549,-0.92174,-4.85945,-10.06871,-0.07124
2444,2024-07-04,1606.52,1588.34,1616.41,1588.34,9225758.0,2.946956e+10,0.46401,1.08248,-0.67288,-9.95871,-4.00640,-0.67288


In [6]:
df = df.dropna()
df

Unnamed: 0,date,open,close,high,low,volume,amount,open_preclose_rate,high_preclose_rate,low_preclose_rate,vol_change_rate,amt_change_rate,change_rate
1,2014-06-19,1327.54,1281.12,1335.44,1269.53,4532795.0,9.384841e+09,-0.09783,0.49667,-4.46329,15.08537,11.23805,-3.59110
2,2014-06-20,1281.14,1294.98,1297.99,1273.18,3111206.0,6.570250e+09,0.00156,1.31682,-0.61977,-31.36231,-29.99082,1.08187
3,2014-06-23,1295.94,1324.14,1330.24,1295.94,3615497.0,7.837101e+09,0.07413,2.72282,0.07413,16.20886,19.28164,2.25177
4,2014-06-24,1322.55,1331.38,1335.14,1320.99,3265263.0,6.941363e+09,-0.12008,0.83073,-0.23789,-9.68702,-11.42946,0.54677
5,2014-06-25,1330.12,1322.78,1330.70,1310.50,3214760.0,6.779457e+09,-0.09464,-0.05107,-1.56830,-1.54667,-2.33248,-0.64595
...,...,...,...,...,...,...,...,...,...,...,...,...,...
2441,2024-07-01,1612.26,1613.83,1621.57,1581.79,11104284.0,3.785994e+10,-0.28327,0.29255,-2.16781,-17.55357,-1.67860,-0.18617
2442,2024-07-02,1607.48,1600.24,1618.31,1595.66,10769478.0,3.413663e+10,-0.39347,0.27760,-1.12589,-3.01511,-9.83444,-0.84210
2443,2024-07-03,1601.05,1599.10,1614.89,1585.49,10246141.0,3.069951e+10,0.05062,0.91549,-0.92174,-4.85945,-10.06871,-0.07124
2444,2024-07-04,1606.52,1588.34,1616.41,1588.34,9225758.0,2.946956e+10,0.46401,1.08248,-0.67288,-9.95871,-4.00640,-0.67288


In [7]:
df = df[["date", "change_rate"]]
df

Unnamed: 0,date,change_rate
1,2014-06-19,-3.59110
2,2014-06-20,1.08187
3,2014-06-23,2.25177
4,2014-06-24,0.54677
5,2014-06-25,-0.64595
...,...,...
2441,2024-07-01,-0.18617
2442,2024-07-02,-0.84210
2443,2024-07-03,-0.07124
2444,2024-07-04,-0.67288


In [7]:
df.describe()

Unnamed: 0,date,open,close,high,low,volume,amount,open_preclose_rate,high_preclose_rate,low_preclose_rate,vol_change_rate,amt_change_rate,change_rate
count,2445,2445.0,2445.0,2445.0,2445.0,2445.0,2445.0,2445.0,2445.0,2445.0,2445.0,2445.0,2445.0
mean,2019-06-23 11:24:57.423312896,2020.078687,2020.881059,2045.388863,1995.243493,10783330.0,32825700000.0,-0.040179,1.18917,-1.198846,2.279401,2.162237,0.027744
min,2014-06-19 00:00:00,960.23,970.75,993.16,959.5,2167637.0,4935305000.0,-7.98102,-4.76083,-9.70982,-71.87131,-73.84411,-9.67561
25%,2016-12-16 00:00:00,1547.37,1548.46,1563.76,1529.89,7733564.0,15177560000.0,-0.31202,0.29477,-1.71312,-12.29389,-12.00506,-1.02686
50%,2019-06-24 00:00:00,1902.31,1902.08,1925.58,1881.9,10064840.0,30305080000.0,-0.0367,0.86593,-0.89438,-1.11539,-1.60092,-0.03246
75%,2021-12-24 00:00:00,2437.5,2445.45,2481.21,2398.78,12988260.0,46288300000.0,0.29948,1.75785,-0.3331,12.1334,12.57596,1.10409
max,2024-07-05 00:00:00,3895.61,3871.4,3929.05,3720.17,33951360.0,103538200000.0,8.11195,8.11195,3.00758,306.09311,297.62987,7.46427
std,,633.343884,632.53414,643.07892,620.742851,4550688.0,19682760000.0,0.851786,1.396648,1.475941,23.116774,22.24416,2.012874


## 3. Prepare the dataset
Organize your data in the following format:
- The dataset should be a csv file.
- If there is a time feature, the first column contains timestamps in the format 'YYYY-MM-DD HH:MM:SS'. If there's no time feature, the dataset starts directly with the features.
- If the parameter `features` is 'M', the following columns are both the features and the targets. If `features` is 'MS', the following columns are the features, and the last column is the target.

In [8]:
# load data
# data = pd.read_csv(os.path.join(args.root_path, args.data_path))
# print(data.head())

# split data
end = int(len(df) * 0.9)
train_data = df.iloc[: end]
vali_data = df.iloc[end - args.seq_len: ]
# test_data = df.iloc[1100 - args.seq_len: ]

# optional: scale data
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
if 'date' in train_data.columns:
    scaler.fit(train_data.iloc[:, 1:])
    train_data.iloc[:, 1:] = scaler.transform(train_data.iloc[:, 1:])
    vali_data.iloc[:, 1:] = scaler.transform(vali_data.iloc[:, 1:])
    # test_data.iloc[:, 1:] = scaler.transform(test_data.iloc[:, 1:])
else:
    scaler.fit(train_data.iloc[:, :])
    train_data.iloc[:, :] = scaler.transform(train_data.iloc[:, :])
    vali_data.iloc[:, :] = scaler.transform(vali_data.iloc[:, :])
    # test_data.iloc[:, :] = scaler.transform(test_data.iloc[:, :])

In [9]:
train_data

Unnamed: 0,date,change_rate
1,2014-06-19,-1.763082
2,2014-06-20,0.504059
3,2014-06-23,1.071648
4,2014-06-24,0.244450
5,2014-06-25,-0.334211
...,...,...
2196,2023-06-27,-0.005462
2197,2023-06-28,-0.123362
2198,2023-06-29,-0.095717
2199,2023-06-30,0.362520


In [10]:
vali_data

Unnamed: 0,date,change_rate
2141,2023-04-03,0.351140
2142,2023-04-04,-0.335278
2143,2023-04-06,-0.049792
2144,2023-04-07,0.183249
2145,2023-04-10,0.091058
...,...,...
2441,2024-07-01,-0.111144
2442,2024-07-02,-0.429375
2443,2024-07-03,-0.055384
2444,2024-07-04,-0.347276


## 4. Train and Evaluate the model


In [11]:
Exp = Exp_Custom(args)
setting = f'{args.data}_{args.model}_{args.seq_len}_{args.pred_len}'
print('>>>>>>>start training : {}>>>>>>>>>>>>>>>>>>>>>>>>>>'.format(setting))
Exp.train(setting=setting, train_data=train_data, vali_data=vali_data, test_data=None)
print('>>>>>>>testing : {}<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<'.format(setting))
Exp.test(setting=setting, test_data=vali_data)

Use CPU


>>>>>>>start training : ETTm1_SOFTS_60_20>>>>>>>>>>>>>>>>>>>>>>>>>>
>>>>>>>testing : ETTm1_SOFTS_60_20<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<


(0.7318887484402247, 0.5751916130559634, 0.24959460243714593)

## 5. Get predictions by the model

In [24]:
vali_data

Unnamed: 0,date,change_rate
2140,2023-03-31,0.035238
2141,2023-04-03,0.351201
2142,2023-04-04,-0.334935
2143,2023-04-06,-0.049566
2144,2023-04-07,0.183379
...,...,...
2440,2024-06-28,-0.643194
2441,2024-07-01,-0.110980
2442,2024-07-02,-0.429146
2443,2024-07-03,-0.055232


In [41]:
# get predictions
predictions = Exp.predict(setting=setting, pred_data=vali_data)
print(predictions.shape)

(246, 20, 12)


In [42]:
last_pred2 = predictions[-1, :, :]

In [43]:
it_padded_pred = scaler.inverse_transform(last_pred2)

In [44]:
it_padded_pred

array([[ 1.72926062e+03,  1.72686865e+03,  1.74489355e+03,
         1.71462952e+03,  1.05938800e+07,  3.98001193e+10,
        -5.89669421e-02,  8.62988114e-01, -9.07311678e-01,
         2.63056707e+00,  2.58245730e+00, -1.86177105e-01],
       [ 1.72287305e+03,  1.72105542e+03,  1.73923853e+03,
         1.70885339e+03,  1.04333480e+07,  3.88074783e+10,
        -1.32070065e-01,  8.33381534e-01, -9.16531861e-01,
         1.28995299e+00,  4.22393382e-01, -2.49974847e-01],
       [ 1.72874341e+03,  1.72724268e+03,  1.74481958e+03,
         1.71453772e+03,  1.05205700e+07,  3.92841380e+10,
        -1.84778079e-01,  7.83455849e-01, -9.80892599e-01,
         1.20063233e+00,  5.72999656e-01, -2.28112161e-01],
       [ 1.72846277e+03,  1.72585388e+03,  1.74338196e+03,
         1.71407666e+03,  1.04830710e+07,  3.90761841e+10,
        -1.04344524e-01,  7.14486778e-01, -9.46922243e-01,
         1.34852231e-01, -5.79224765e-01, -2.79890388e-01],
       [ 1.72550977e+03,  1.72416235e+03,  1.7412616

In [29]:
it_padded_pred.shape

(20, 1)

In [45]:
it_padded_pred[:, -1]

array([-0.1861771 , -0.24997485, -0.22811216, -0.2798904 , -0.15068327,
       -0.2187123 , -0.3109654 , -0.22825038,  0.13667627, -0.45120478,
       -0.06703919, -0.27190477, -0.16351332, -0.3224053 , -0.25958896,
       -0.03928372, -0.23652971, -0.26447386, -0.20533572,  0.1468694 ],
      dtype=float32)

In [46]:
it_padded_pred[:, -1].mean()

-0.19252497

In [47]:
it_padded_pred[:, -1].sum()

-3.8504994

In [32]:
train_data.shape

(2199, 2)

In [23]:
# Assuming last_pred is the single column prediction with shape (20, 1)
# and train_data has 13 columns

# Determine the number of columns the scaler was fitted on
num_columns = (
    train_data.shape[1] - 1 if "date" in train_data.columns else train_data.shape[1]
)

# Create an array of zeros with the same number of rows and columns as the scaler's fitted data
padded_pred = np.zeros((last_pred.shape[0], num_columns))

print(padded_pred.shape)

# Insert the single column prediction into the corresponding column (e.g., the first column)
padded_pred[:, 0] = last_pred[:, 0]

# Apply inverse_transform
it_padded_pred = scaler.inverse_transform(padded_pred)

# Extract the inverse transformed single column
it_last_pred = it_padded_pred[:, 0]

print(it_last_pred)

(20, 12)
[3885.40793131 3696.25140262 3774.6123283  3731.39867308 3809.44623464
 3699.65109014 3648.76137504 3721.66227388 3780.73833635 3730.91085114
 3769.92774081 3719.35071022 3732.21961064 3754.28062473 3555.72240312
 3795.2652735  3655.648892   3658.06218308 3602.10752726 3869.83301425]


In [13]:
Exp.metrics

{'epoch': 5,
 'MAE_val': 0.8422023778737977,
 'RMSE_val': 1.1137292115524855,
 'Loss_val': 0.5000086828719738,
 'MAE': 0.3929128,
 'RMSE': 0.5931921,
 'Loss': 0.17228595167398453}