In [1]:
import sys
path_append = "../"
sys.path.append(path_append)  # Go up one directory from where you are.

import sklearn
from sklearn.preprocessing import StandardScaler, RobustScaler, MinMaxScaler, LabelEncoder
from sklearn.model_selection import train_test_split

import os
import matplotlib.pyplot as plt
import pandas as pd
import torch
from torch import nn
import numpy as np
import glob
import tqdm

from torch.utils.tensorboard import SummaryWriter

pd.options.mode.chained_assignment = None




# Feature Engineering

In [2]:
'''
    TRD_DD : Date
    ISU_CD : Stock Code
    ISU_NM : Stock Name
    TDD_CLSPRC : Closing Price
    TDD_OPNPRC : Opening Price
    TDD_HGPRC : High Price
    TDD_LWPRC : Low Price
    MKTCAP : Market Capitalization
    ACC_TRDVOL : Trading Volume
    EPS : Earnings Per Share
    PER : Price-Earnings Ratio
    BPS : Book Value Per Share
    PBR : Price-Book Ratio
    DPS : Dividends Per Share
    DVD_YLD : Dividend Yield

'''

'\n    TRD_DD : Date\n    ISU_CD : Stock Code\n    ISU_NM : Stock Name\n    TDD_CLSPRC : Closing Price\n    TDD_OPNPRC : Opening Price\n    TDD_HGPRC : High Price\n    TDD_LWPRC : Low Price\n    MKTCAP : Market Capitalization\n    ACC_TRDVOL : Trading Volume\n    EPS : Earnings Per Share\n    PER : Price-Earnings Ratio\n    BPS : Book Value Per Share\n    PBR : Price-Book Ratio\n    DPS : Dividends Per Share\n    DVD_YLD : Dividend Yield\n\n'

In [3]:

def load_and_merge_csv_files(data_directory, preprocessed_directory, file_limit=None):
    """Merge every raw CSV with its ``*_preprocessed.csv`` companion and stack the results.

    Args:
        data_directory: Directory containing the raw ``<name>.csv`` files.
        preprocessed_directory: Directory containing the matching
            ``<name>_preprocessed.csv`` files.
        file_limit: Optional cap on the number of raw files considered
            (``None`` means no cap). Files are taken in sorted path order.

    Returns:
        One DataFrame: each raw/preprocessed pair merged on ``TRD_DD`` (overlapping
        column names get the ``_data`` / ``_preprocessed`` suffixes), all pairs
        concatenated under a fresh integer index.

    Raises:
        ValueError: If no raw file has a matching preprocessed file.
    """
    # glob returns paths in filesystem order; sort so file_limit selects a
    # reproducible subset.
    data_files = sorted(glob.glob(os.path.join(data_directory, "*.csv")))[:file_limit]

    # Keep every preprocessed path. Truncating this list independently could drop
    # the companion of a raw file that did make the cut. A set keeps lookups cheap.
    preprocessed_files = set(glob.glob(os.path.join(preprocessed_directory, "*.csv")))

    merged_dfs = []
    for data_file in data_files:
        # Only the extension is swapped, even if ".csv" also occurs inside the name.
        stem, _ = os.path.splitext(os.path.basename(data_file))
        preprocessed_file_path = os.path.join(preprocessed_directory, stem + "_preprocessed.csv")

        if preprocessed_file_path not in preprocessed_files:
            continue

        df_data = pd.read_csv(data_file)
        df_preprocessed = pd.read_csv(preprocessed_file_path)
        merged_dfs.append(
            pd.merge(df_data, df_preprocessed, on='TRD_DD', suffixes=('_data', '_preprocessed'))
        )

    if not merged_dfs:
        # pd.concat([]) would raise a ValueError that does not say what is missing.
        raise ValueError(
            f"No raw CSV in {data_directory!r} has a matching '*_preprocessed.csv' "
            f"in {preprocessed_directory!r}"
        )

    return pd.concat(merged_dfs, ignore_index=True)

# Example usage: both directories are resolved relative to path_append
data_directory = path_append + "../data/KR_Data/data"
preprocessed_directory = path_append + "../data/KR_Data/preprocessed"
total_df = load_and_merge_csv_files(data_directory, preprocessed_directory)


total_df.head()


Unnamed: 0,TRD_DD,ISU_CD,ISU_NM,TDD_CLSPRC,TDD_OPNPRC,TDD_HGPRC,TDD_LWPRC,MKTCAP,ACC_TRDVOL,EPS,...,BPS,PBR,DPS,DVD_YLD,GDC_sig,RSI_sig,ROC_sig,MAP_sig,STC_sig,TREND
0,2022/01/11,900110,이스트아시아홀딩스,145,150,151,143,24719670905,4147345,,...,,,,,0,0,0,0,0,
1,2022/01/10,900110,이스트아시아홀딩스,150,152,155,149,25572073350,2628028,,...,,,,,0,0,0,0,0,
2,2022/01/07,900110,이스트아시아홀딩스,151,158,162,148,25742553839,7561654,,...,,,,,0,0,0,0,0,
3,2022/01/06,900110,이스트아시아홀딩스,158,172,186,156,26935917262,22931278,,...,,,,,0,0,0,0,0,
4,2022/01/05,900110,이스트아시아홀딩스,171,164,187,161,29152163619,29419967,,...,,,,,0,0,-1,0,0,


In [4]:
# Assuming total_df is already defined and merged from previous steps

# Reverse the row order of the whole frame and renumber the index from 0.
# NOTE(review): this gives past-to-present order only if every source file lists
# its rows newest-first (as the head() preview above suggests) -- confirm.
total_df = total_df[::-1].reset_index(drop=True)

# Split the "TRD_DD" column on "/" into year, month, and day columns
total_df[["Y", "M", "D"]] = total_df["TRD_DD"].str.split("/", expand=True)

# Drop the original "TRD_DD" column
total_df = total_df.drop("TRD_DD", axis=1)

# Rearrange columns to have year, month, and day first; columns[:-3] is every
# column except the three that were just appended at the end (Y, M, D)
total_df = total_df[["Y", "M", "D"] + total_df.columns[:-3].to_list()]


In [5]:
# Create a new 'Date' column by combining 'Y', 'M', 'D' columns
# (pd.to_datetime assembles dates from columns named year/month/day, hence the rename)
total_df['Date'] = pd.to_datetime(total_df[['Y', 'M', 'D']].rename(columns={'Y': 'year', 'M': 'month', 'D': 'day'}))

# Set 'Date' as the index
total_df.set_index('Date', inplace=True)

# Create a 'count_day' column: days elapsed since the earliest date in the whole frame
total_df['count_day'] = (total_df.index - total_df.index.min()).days

# Drop the 'Y', 'M', 'D' columns as they're no longer needed
total_df.drop(columns=['Y', 'M', 'D'], inplace=True)

# Reorder the columns to make 'count_day' first
cols = ['count_day'] + [col for col in total_df.columns if col != 'count_day']
total_df = total_df[cols]

total_df.head()

Unnamed: 0_level_0,count_day,ISU_CD,ISU_NM,TDD_CLSPRC,TDD_OPNPRC,TDD_HGPRC,TDD_LWPRC,MKTCAP,ACC_TRDVOL,EPS,...,BPS,PBR,DPS,DVD_YLD,GDC_sig,RSI_sig,ROC_sig,MAP_sig,STC_sig,TREND
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2010-04-21,5468,900100,뉴프라이드(Reg.S),12750,15000,17250,12750,89250000000,2186517,,...,,,,,0,0,-1,1,0,
2010-04-22,5469,900100,뉴프라이드(Reg.S),10850,10850,10850,10850,75950000000,47396,,...,,,,,0,0,-1,1,0,
2010-04-23,5470,900100,뉴프라이드(Reg.S),9280,9470,10800,9250,64960000000,5960215,,...,,,,,0,0,-1,1,0,
2010-04-26,5473,900100,뉴프라이드(Reg.S),10650,9560,10650,9510,74550000000,2521236,,...,,,,,0,0,-1,1,0,1.0
2010-04-27,5474,900100,뉴프라이드(Reg.S),9510,11300,11600,9430,66570000000,3643891,,...,,,,,0,0,-1,1,0,0.981481


In [6]:
# Remove the stock-code column in place
total_df.drop(['ISU_CD'], axis=1, inplace=True)

In [7]:
# Discard the current index (drop=True) and go back to a 0..n-1 positional index
total_df.reset_index(drop=True, inplace=True)

In [8]:
total_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9384645 entries, 0 to 9384644
Data columns (total 20 columns):
 #   Column      Dtype  
---  ------      -----  
 0   count_day   int64  
 1   ISU_NM      object 
 2   TDD_CLSPRC  object 
 3   TDD_OPNPRC  object 
 4   TDD_HGPRC   object 
 5   TDD_LWPRC   object 
 6   MKTCAP      object 
 7   ACC_TRDVOL  object 
 8   EPS         object 
 9   PER         object 
 10  BPS         object 
 11  PBR         object 
 12  DPS         object 
 13  DVD_YLD     object 
 14  GDC_sig     int64  
 15  RSI_sig     int64  
 16  ROC_sig     int64  
 17  MAP_sig     int64  
 18  STC_sig     int64  
 19  TREND       float64
dtypes: float64(1), int64(6), object(13)
memory usage: 1.4+ GB


In [9]:
# Show the populated entries of each column that is about to be removed (for verification)
unusable_columns = ["EPS", "PER", "BPS", "PBR", "DPS", "DVD_YLD"]
for name in unusable_columns:
    print(f"{name} non-NaN values:\n", total_df[name].dropna())

# Remove all of them in a single call
total_df = total_df.drop(columns=unusable_columns)


EPS non-NaN values:
 23765          -
23766          -
23767          -
23768          -
23769          -
           ...  
9374643    1,033
9374644    1,033
9374645    1,033
9374646    1,033
9374647    1,033
Name: EPS, Length: 9125271, dtype: object
PER non-NaN values:
 23765          -
23766          -
23767          -
23768          -
23769          -
           ...  
9374643    14.09
9374644    13.79
9374645    14.13
9374646    14.13
9374647    13.89
Name: PER, Length: 9125271, dtype: object
BPS non-NaN values:
 23765           -
23766           -
23767           -
23768           -
23769           -
            ...  
9374643    11,860
9374644    11,860
9374645    11,860
9374646    11,860
9374647    11,860
Name: BPS, Length: 9125271, dtype: object
PBR non-NaN values:
 23765         -
23766         -
23767         -
23768         -
23769         -
           ... 
9374643    1.23
9374644    1.20
9374645    1.23
9374646    1.23
9374647    1.21
Name: PBR, Length: 9125271, dtype: object


In [10]:
import pandas as pd

# Collapse TREND to the three discrete labels -1, 0 and 1.
#
# Every value that is not exactly -1, 0 or 1 -- NaN and fractional values such as
# 0.981481 included -- is reset to 0. No sign-based or +/-0.5 threshold step is
# applied afterwards: once this line has run, only -1, 0 and 1 remain, so such a
# step could never change a value.
# NOTE(review): if thresholding at +/-0.5 is the labelling that is actually wanted,
# it has to replace the line below rather than follow it -- confirm the intended rule.
total_df.loc[~total_df["TREND"].isin([-1, 0, 1]), "TREND"] = 0

# Check the unique values in the TREND column and their counts
unique_trends = set(total_df["TREND"])
trend_counts = total_df["TREND"].value_counts()

# Print the unique values and their counts
print("Unique TREND values:", unique_trends)
print("TREND value counts:\n", trend_counts)



Unique TREND values: {0.0, 1.0, -1.0}
TREND value counts:
  0.0    8933593
 1.0     225720
-1.0     225332
Name: TREND, dtype: int64


In [11]:
# Shift every TREND value up by one (labels -1, 0, 1 become 0, 1, 2).
# NOTE: not idempotent -- running this cell twice shifts the labels twice.
total_df["TREND"] += 1

In [12]:
# Cast the whole-number float labels to pandas' nullable integer dtype.
# convert_dtypes() takes boolean switches (infer_objects, convert_string, ...), not a
# target dtype, so convert_dtypes(int) only worked because `int` is truthy; astype
# states the intended dtype explicitly and yields the same Int64 column.
total_df["TREND"] = total_df["TREND"].astype("Int64")
total_df["TREND"]

0          1
1          1
2          1
3          2
4          1
          ..
9384640    1
9384641    1
9384642    1
9384643    1
9384644    1
Name: TREND, Length: 9384645, dtype: Int64

In [13]:
import pandas as pd

# Columns that arrive as text (e.g. "1,033") and must become numeric
columns_to_convert = ["TDD_CLSPRC", "TDD_OPNPRC", "TDD_HGPRC", "TDD_LWPRC", "MKTCAP", "ACC_TRDVOL"]

for col in columns_to_convert:
    # astype(str) first: the .str accessor returns NaN for any element that is not
    # already a string, which would silently blank out numeric entries in a
    # mixed-type object column.
    # Only the thousands separator is removed, so decimal points and signs survive.
    as_text = total_df[col].astype(str).str.replace(",", "", regex=False).str.strip()
    # One vectorised conversion per column; placeholders such as "-" become NaN.
    total_df[col] = pd.to_numeric(as_text, errors="coerce")



In [14]:
import pandas as pd
from sklearn.preprocessing import MinMaxScaler, RobustScaler

# One scaler instance per scaling strategy
mm = MinMaxScaler()
sc = RobustScaler()

# Columns handled by each strategy
minmax_cols = ["count_day", "TDD_CLSPRC", "TDD_OPNPRC", "TDD_HGPRC", "TDD_LWPRC"]
robust_cols = ["MKTCAP", "ACC_TRDVOL"]

# Every column is fitted and transformed on its own, MinMax group first, so each
# scaler ends up holding the statistics of the last column it processed.
for scaler, column_group in ((mm, minmax_cols), (sc, robust_cols)):
    for col in column_group:
        # The scalers expect a 2-D input, hence the single-column reshape
        column_2d = total_df[col].to_numpy().reshape(-1, 1)
        total_df[col] = scaler.fit_transform(column_2d)


In [15]:
total_df.head()

Unnamed: 0,count_day,ISU_NM,TDD_CLSPRC,TDD_OPNPRC,TDD_HGPRC,TDD_LWPRC,MKTCAP,ACC_TRDVOL,GDC_sig,RSI_sig,ROC_sig,MAP_sig,STC_sig,TREND
0,0.560763,뉴프라이드(Reg.S),0.002651,0.003,0.003402,0.002713,0.103185,9.104305,0,0,-1,1,0,1
1,0.560866,뉴프라이드(Reg.S),0.002256,0.00217,0.00214,0.002309,0.021247,-0.054263,0,0,-1,1,0,1
2,0.560968,뉴프라이드(Reg.S),0.001929,0.001894,0.00213,0.001968,-0.046459,25.261255,0,0,-1,1,0,1
3,0.561276,뉴프라이드(Reg.S),0.002214,0.001912,0.002101,0.002023,0.012622,10.537392,0,0,-1,1,0,2
4,0.561378,뉴프라이드(Reg.S),0.001977,0.00226,0.002288,0.002006,-0.03654,15.343998,0,0,-1,1,0,1


In [16]:
total_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9384645 entries, 0 to 9384644
Data columns (total 14 columns):
 #   Column      Dtype  
---  ------      -----  
 0   count_day   float64
 1   ISU_NM      object 
 2   TDD_CLSPRC  float64
 3   TDD_OPNPRC  float64
 4   TDD_HGPRC   float64
 5   TDD_LWPRC   float64
 6   MKTCAP      float64
 7   ACC_TRDVOL  float64
 8   GDC_sig     int64  
 9   RSI_sig     int64  
 10  ROC_sig     int64  
 11  MAP_sig     int64  
 12  STC_sig     int64  
 13  TREND       Int64  
dtypes: Int64(1), float64(7), int64(5), object(1)
memory usage: 1011.3+ MB


In [17]:
# Ensure 'ISU_NM' is of string type (although it should already be)
total_df["ISU_NM"] = total_df["ISU_NM"].astype(str)

# A row starts a new run when its name differs from the previous row's name.
# Row 0 always qualifies because shift() leaves a missing value in front of it.
isu_nm_changes = total_df['ISU_NM'].shift() != total_df['ISU_NM']
change_indices = isu_nm_changes[isu_nm_changes].index.tolist()

# Distance between consecutive change points. The final run (last change point to
# the end of the frame) has no closing change point and is therefore not measured.
lengths_between_changes = [
    later - earlier for earlier, later in zip(change_indices, change_indices[1:])
]

# Shortest measured run; None when fewer than two change points exist
min_cycle_length = min(lengths_between_changes) if lengths_between_changes else None

# Print a short preview only: the full lists have one entry per run and would
# flood the cell output.
preview_count = 10
print(f"Number of 'ISU_NM' label changes: {len(change_indices)}")
print(f"First {preview_count} change indices: {change_indices[:preview_count]}")
print(f"First {preview_count} lengths between changes: {lengths_between_changes[:preview_count]}")
print(f"Minimum cycle length: {min_cycle_length}")

Indices where the 'ISU_NM' label changes: [0, 250, 2741, 2894, 4723, 5635, 5847, 8609, 10073, 10977, 12982, 14270, 14518, 14642, 14849, 14947, 15093, 15880, 16125, 17171, 17417, 19291, 19305, 19522, 20188, 21116, 22246, 23346, 23693, 23715, 23735, 23765, 23796, 23812, 23850, 23861, 23877, 23905, 23917, 23944, 24024, 24098, 24161, 24223, 24292, 24338, 24419, 24506, 24573, 24650, 24757, 24914, 25071, 25227, 25388, 25455, 25507, 25565, 25629, 25790, 25932, 26083, 26291, 26452, 26487, 26539, 26708, 26757, 26830, 26922, 27036, 27139, 27183, 27249, 27395, 27444, 27684, 27924, 28114, 28332, 28432, 28475, 28689, 28936, 28949, 29191, 29384, 29642, 29849, 30119, 30221, 30453, 30715, 30975, 31092, 31414, 31736, 31964, 32122, 32283, 32451, 32487, 32688, 32992, 33009, 33053, 33407, 33543, 33803, 34158, 34323, 34661, 34871, 35138, 35462, 35731, 36129, 36468, 36877, 37286, 37562, 37676, 38036, 38301, 38569, 38693, 38816, 39002, 39275, 39512, 39676, 40046, 40227, 40591, 40614, 40969, 41379, 41732, 420

In [18]:
# Remove the name column when present; otherwise just report that it is missing
if "ISU_NM" not in total_df.columns:
    print("Column 'ISU_NM' not found in DataFrame")
else:
    total_df = total_df.drop(columns=["ISU_NM"])

In [19]:
segment_pairs = []

# change_indices only marks where each run starts, so the last run has to be closed
# explicitly at the end of the frame; otherwise it would never be segmented.
run_boundaries = list(change_indices) + [len(total_df)]

for start, end in zip(run_boundaries[:-1], run_boundaries[1:]):
    segment_length = end - start
    if segment_length <= 0:
        continue

    # With a single run no minimum could be measured (min_cycle_length is None);
    # keep that run whole instead of failing on the arithmetic below.
    cycle_length = min_cycle_length if min_cycle_length else segment_length

    # Cut the run into consecutive pieces of cycle_length rows. The last piece
    # absorbs any remainder, so it may be longer than cycle_length; a run shorter
    # than cycle_length produces no piece at all.
    full_cycles = segment_length // cycle_length
    for i in range(full_cycles):
        sub_start = start + i * cycle_length
        sub_end = end if i == full_cycles - 1 else sub_start + cycle_length
        segment_pairs.append((sub_start, sub_end))

In [20]:
# Coerce every column to a numeric dtype (entries that cannot be parsed become NaN)
# and inspect the resulting dtypes
df_numeric = total_df.apply(pd.to_numeric, errors='coerce')
df_numeric.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9384645 entries, 0 to 9384644
Data columns (total 13 columns):
 #   Column      Dtype  
---  ------      -----  
 0   count_day   float64
 1   TDD_CLSPRC  float64
 2   TDD_OPNPRC  float64
 3   TDD_HGPRC   float64
 4   TDD_LWPRC   float64
 5   MKTCAP      float64
 6   ACC_TRDVOL  float64
 7   GDC_sig     int64  
 8   RSI_sig     int64  
 9   ROC_sig     int64  
 10  MAP_sig     int64  
 11  STC_sig     int64  
 12  TREND       Int64  
dtypes: Int64(1), float64(7), int64(5)
memory usage: 939.7 MB


In [23]:
import pandas as pd
import numpy as np
import torch
from sklearn.preprocessing import StandardScaler

def convert_nullable_int_columns(df):
    """Cast the columns matched by the 'Int64' dtype selector to plain ``int64``.

    The frame is modified in place and also returned, so the call can be chained.

    Args:
        df: DataFrame whose matching columns are re-assigned with ``int64`` data.

    Returns:
        The same DataFrame object that was passed in.
    """
    for name in df.select_dtypes(include=['Int64']).columns:
        df[name] = df[name].astype('int64')
    return df

def process_dataframe(df, segment_pairs, use_scale=False, include_diff=False,
                      label_columns=("TREND",)):
    """Cut a frame into segments, optionally scale/difference them, and re-stack them.

    Args:
        df: Input frame. Every column is coerced to float64; entries that cannot be
            parsed become NaN.
        segment_pairs: Iterable of ``(start, end)`` positional row ranges (end
            exclusive) referring to ``df`` as passed in.
        use_scale: When True, each segment's feature columns are standardized with
            a ``StandardScaler`` fitted on that segment alone.
        include_diff: When True, each segment's feature columns are replaced by
            row-to-row differences, which makes every segment one row shorter.
        label_columns: Column names that are never scaled or differenced. With
            ``include_diff`` they keep the value of the later row of each
            difference. Names not present in ``df`` are ignored.

    Returns:
        ``(processed_df, new_segment_pairs)``: the processed segments concatenated
        in the order given, as an all-float64 frame, and one ``(start, end)`` pair
        per input pair giving that segment's position inside ``processed_df``.
    """
    numeric = df.apply(pd.to_numeric, errors='coerce').astype('float64')
    columns = numeric.columns
    values = numeric.to_numpy()

    # Labels must stay intact: standardizing or differencing class ids would
    # destroy them.
    feature_mask = ~columns.isin(list(label_columns))

    processed_segments = []
    new_segment_pairs = []
    cursor = 0

    for start, end in segment_pairs:
        # Slice by the original positions FIRST and only then drop incomplete rows.
        # Dropping rows from the whole frame beforehand would shift every later
        # row, and the positional pairs would no longer line up.
        segment = values[start:end]
        segment = segment[~np.isnan(segment).any(axis=1)]  # boolean indexing copies

        if use_scale and len(segment) > 0 and feature_mask.any():
            segment[:, feature_mask] = StandardScaler().fit_transform(segment[:, feature_mask])

        if include_diff:
            differenced = segment[1:].copy()
            differenced[:, feature_mask] = segment[1:, feature_mask] - segment[:-1, feature_mask]
            segment = differenced

        processed_segments.append(segment)
        # Offsets are cumulative, so they index processed_df directly.
        new_segment_pairs.append((cursor, cursor + len(segment)))
        cursor += len(segment)

    if processed_segments:
        stacked = np.concatenate(processed_segments, axis=0)
    else:
        stacked = np.empty((0, len(columns)), dtype=np.float64)

    processed_df = pd.DataFrame(stacked, columns=columns)

    return processed_df, new_segment_pairs

# Rebinds both names to the processed results, so re-running this cell feeds the
# already-processed frame and pairs back into process_dataframe.
total_df, segment_pairs = process_dataframe(total_df, segment_pairs, use_scale=True, include_diff=False)


In [24]:
import pandas as pd
import plotly.graph_objects as go

# TREND was shifted by +1 earlier in this notebook (total_df["TREND"] += 1), so the
# former -1 ("buy") is now 0 and the former +1 ("sell") is now 2. Comparing against
# 0 with < and > would leave "buy" empty and mix the neutral class 1 into "sell".
buy_points = total_df[total_df["TREND"] == 0]
sell_points = total_df[total_df["TREND"] == 2]

# Creating the figure
fig = go.Figure()

# Adding the line plot for MKTCAP
fig.add_trace(go.Scatter(x=total_df.index, y=total_df["MKTCAP"], mode='lines', name="stay"))

# Adding scatter plots for buy and sell points
fig.add_trace(go.Scatter(x=buy_points.index, y=buy_points["MKTCAP"], mode="markers", name="buy", marker=dict(color='green')))
fig.add_trace(go.Scatter(x=sell_points.index, y=sell_points["MKTCAP"], mode="markers", name="sell", marker=dict(color='red')))

# Updating layout
fig.update_layout(
    title='Stock Label',
    xaxis=dict(title='Time'),
    yaxis=dict(title='Market Capitalization')
)

# Display the plot
fig.show()


In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# TREND was shifted by +1 earlier in this notebook (total_df["TREND"] += 1), so the
# former -1 ("buy") is now 0 and the former +1 ("sell") is now 2. Comparing against
# 0 with < and > would leave "buy" empty and mix the neutral class 1 into "sell".
buy_points = total_df[total_df["TREND"] == 0]
sell_points = total_df[total_df["TREND"] == 2]

# Set the figure size
plt.figure(figsize=(25, 8))

# Plot the MKTCAP line
sns.lineplot(x=total_df.index, y=total_df["MKTCAP"], label="stay", color="gray")

# Plot the buy points (TREND == 0)
sns.scatterplot(x=buy_points.index, y=buy_points["MKTCAP"], label="buy", color="blue")

# Plot the sell points (TREND == 2)
sns.scatterplot(x=sell_points.index, y=sell_points["MKTCAP"], label="sell", color="red")

# Add titles and labels
plt.title('Stock Market Capitalization with Buy/Sell Signals')
plt.xlabel('Time')
plt.ylabel('Market Capitalization')

# Show the plot
plt.legend()
plt.show()


# Data Overview and Usage Guide

In [None]:
# Data Overview and Usage Guide

"""
- Data Overview
    Preprocessed Data: total_df
    Categorical Columns: GDC_sig, RSI_sig, ROC_sig, MAP_sig, STC_sig
        (Y, M, D and ISU_CD are dropped during preprocessing; the date is kept as count_day.)
    Numerical Columns: count_day, TDD_CLSPRC, TDD_OPNPRC, TDD_HGPRC, TDD_LWPRC, MKTCAP, ACC_TRDVOL
    Label: TREND

- Considerations:
    1) It is recommended to use embedding techniques for categorical data.
    2) Labels:
        NaN values have been replaced with 0.

        2-1) Label Processing:
            How to handle -1, 0, 1 depends on the definition.
            ● Classification of -1, 0, 1:
                Commonly, the label being discrete is an issue.
                (1) Set to -1 for values less than 0, and 1 for values greater than 0.
                    # Ratio of -1, 0, 1 = 1397:1440:55
                    : This results in very frequent trading.

                (2) Use only -1, 0, 1.
                    # Ratio of -1, 0, 1 = 76:2740:76
                    : This might cause the model to miss buying opportunities when it should, making it difficult for the model to make accurate predictions.

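                (3) Set to -1 for values less than -0.5, and 1 for values greater than 0.5, otherwise 0.
                    # Ratio of -1, 0, 1 = 752:1409:731
                    : This provides a somewhat balanced ratio.
                    (Described as the current preprocessing state, but the TREND cell in this notebook
                    first resets every value other than exactly -1, 0, 1 to 0, so the labels it actually
                    produces follow option (2).)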

            ● Regression:
                Keep the label as it is.
                (1) The model performs regression and decides whether to buy or sell based on the predicted increase or decrease.

    3) The utility of GDC, RSI, ROC, MAP, STC indicators for learning is uncertain.
"""


In [None]:
import torch
import random
import torch.nn.functional as F
from torch.utils.data import Dataset

class SequentialDataset(Dataset):
    """Windows of consecutive rows, each served as (features, one-hot labels).

    Every item starts at one of the given row positions and spans
    ``max_window_size`` rows, clipped at the end of the frame.
    """

    def __init__(self, df, indices, max_window_size):
        """Store the frame, the window start positions and the window size.

        Args:
            df: Frame holding the feature columns plus a 'TREND' column.
            indices: Positional start row of every window.
            max_window_size: Number of rows per window.
        """
        self.df, self.indices = df, indices
        self.max_window_size = max_window_size
        self.min_window_size = max_window_size // 2

    def __len__(self):
        """Return the number of windows."""
        return len(self.indices)

    def __getitem__(self, idx):
        """Return the ``idx``-th window as ``(X, label)``.

        ``X`` holds every column except 'TREND' as float32; ``label`` is the
        'TREND' column cast to integers and one-hot encoded over 3 classes.
        """
        first_row = self.indices[idx]
        # Never read past the last row of the frame
        last_row = min(first_row + self.max_window_size, len(self.df))
        window = self.df.iloc[first_row:last_row]

        feature_values = window.drop(columns=['TREND']).values
        class_ids = window['TREND'].values.astype(int)

        X = torch.tensor(feature_values, dtype=torch.float32)
        label = F.one_hot(torch.tensor(class_ids, dtype=torch.long), num_classes=3)

        return X, label

In [None]:
from random import shuffle

# Assume 'df' is your DataFrame and 'event' is the column containing labels

def generate_indices(input_df, input_pairs, max_window_size, test_size=0.2):
    """Collect the window start positions whose whole window carries a single label.

    The first ``int(len(input_pairs) * (1 - test_size))`` segments feed the training
    list, the remaining segments feed the testing list.

    Args:
        input_df: Frame with a 'TREND' column; positions in ``input_pairs`` refer
            to its rows.
        input_pairs: Sequence of ``(start, end)`` positional row ranges (end exclusive).
        max_window_size: Number of rows per window; must be at least 1.
        test_size: Fraction of the segments assigned to the testing list.

    Returns:
        ``(training_indices, testing_indices)`` as lists of integer start positions.

    Raises:
        ValueError: If ``max_window_size`` is smaller than 1.
    """
    if max_window_size < 1:
        raise ValueError("max_window_size must be at least 1")

    labels = input_df['TREND'].to_numpy()
    train_length = int(len(input_pairs) * (1 - test_size))
    training_indices = []
    testing_indices = []
    skipped = 0

    for position, (start, end) in enumerate(input_pairs):
        target = training_indices if position < train_length else testing_indices
        segment_labels = labels[start:end]

        # A window starting at end - max_window_size still fits inside the segment,
        # so that start position is included (hence the + 1).
        window_count = len(segment_labels) - max_window_size + 1
        if window_count <= 0:
            continue

        # changes_before[k] counts the label changes in segment rows 1..k. A window
        # starting at k holds one label exactly when that count is unchanged at its
        # last row, k + max_window_size - 1. This replaces one unique() call per start.
        changed = segment_labels[1:] != segment_labels[:-1]
        changes_before = np.concatenate(([0], np.cumsum(changed)))
        uniform = changes_before[max_window_size - 1:] == changes_before[:window_count]

        target.extend((start + np.flatnonzero(uniform)).tolist())
        skipped += int(window_count - uniform.sum())

    if skipped:
        # One summary line instead of one line per skipped start position.
        print(f"Skipped {skipped} window start(s) due to multiple labels in window.")
    return training_indices, testing_indices


# Uses total_df and segment_pairs produced by the preprocessing cells above
max_window_size = 64

# Randomise which segments land in the train and test parts. A dedicated, seeded
# generator is used because the notebook's global seed is only set in a later cell,
# which would leave this split different on every run. Sorting first makes the
# result independent of how often this cell has already been executed.
segment_pairs.sort()
random.Random(0).shuffle(segment_pairs)
train_indices, test_indices = generate_indices(total_df, segment_pairs, max_window_size)

trainset = SequentialDataset(df=total_df, indices=train_indices, max_window_size=max_window_size)
testset = SequentialDataset(df=total_df, indices=test_indices, max_window_size=max_window_size)


print('Train indices: ', len(train_indices))
print('Test indices: ', len(test_indices))

print(trainset[0][0].shape)
print(len(trainset))

In [None]:
from tools.setting.ml_params import MLParameters
from tools.setting.data_config import DataConfig
from nn.utils.init import set_random_seed
set_random_seed(0)

In [None]:
# Describe the dataset for the trainer.
# NOTE(review): obs_shape=[12] presumably is the number of feature columns left
# after removing TREND, and label_size=3 the three TREND classes -- confirm
# against DataConfig.
data_config = DataConfig(dataset_name = 'stock_price', task_type='multi_class_classification', obs_shape=[12], label_size=3)

# Build the training parameters via the project's MLParameters class
# (what it returns is not visible from this notebook).
ml_params = MLParameters(core_model = 'gpt', encoder_model = 'none')

# Pull one sample to sanity-check the tensor shapes the dataset yields
first_data = trainset[0]
X, y = first_data

print(f"Input shape: {X.shape}")
print(f"Label shape: {y.shape}")

print(f"Total number of samples in trainset: {len(trainset)}")

In [None]:
from trainer_hub import TrainerHub

# Set the device to GPU if available, else CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu") 

# Initialize the TrainerHub class with the training configuration, data configuration, and device;
# console printing is switched on (use_print=True) and wandb is switched off (use_wandb=False)
trainer_hub = TrainerHub(ml_params, data_config, device, use_print=True, use_wandb=False)

In [None]:
trainer_hub.train(trainset, testset)