In [None]:
import sys
# Relative path that is added to the module search path below.
# Presumably the project packages imported later in this notebook
# (tools, nn, trainer_hub) live one directory up — verify against the repo layout.
path_append = "../"
sys.path.append(path_append)  # Go up one directory from where you are.

import sklearn
from sklearn.preprocessing import StandardScaler, RobustScaler, MinMaxScaler, LabelEncoder
from sklearn.model_selection import train_test_split

import os
import matplotlib.pyplot as plt
import pandas as pd
import torch
from torch import nn
import numpy as np
import glob
import tqdm

from torch.utils.tensorboard import SummaryWriter

# Turn off pandas' chained-assignment (SettingWithCopy) warning for the whole
# notebook; several cells below assign into frames produced by column selection.
pd.options.mode.chained_assignment = None


# Feature Engineering

In [None]:
'''
    TRD_DD : Date
    ISU_CD : Stock Code
    ISU_NM : Stock Name
    TDD_CLSPRC : Closing Price
    TDD_OPNPRC : Opening Price
    TDD_HGPRC : High Price
    TDD_LWPRC : Low Price
    MKTCAP : Market Capitalization
    ACC_TRDVOL : Trading Volume
    EPS : Earnings Per Share
    PER : Price-Earnings Ratio
    BPS : Book Value Per Share
    PBR : Price-Book Ratio
    DPS : Dividends Per Share
    DVD_YLD : Dividend Yield

'''

In [None]:

def load_and_merge_csv_files(data_directory, preprocessed_directory, file_limit=None):
    """Load raw price CSVs and merge each one with its preprocessed counterpart.

    For every ``<name>.csv`` in ``data_directory`` the function looks for
    ``<name>_preprocessed.csv`` in ``preprocessed_directory``. When it exists,
    the two files are inner-joined on the ``TRD_DD`` column and the result is
    appended to the output; raw files without a counterpart are skipped.

    Args:
        data_directory: Directory containing the raw ``*.csv`` files.
        preprocessed_directory: Directory containing the
            ``*_preprocessed.csv`` files.
        file_limit: Optional maximum number of raw files to consider. Files
            are taken in sorted filename order. ``None`` means no limit.

    Returns:
        One DataFrame holding the merged rows of all matched file pairs,
        concatenated with a fresh integer index. Column names present in both
        files get the suffixes ``_data`` and ``_preprocessed``.

    Raises:
        ValueError: If no raw file has a matching preprocessed file.
    """
    # glob returns files in filesystem order; sort so that `file_limit` always
    # selects the same files and the concatenation order is reproducible.
    data_files = sorted(glob.glob(os.path.join(data_directory, "*.csv")))
    if file_limit is not None:
        data_files = data_files[:file_limit]

    merged_dfs = []

    for data_file in data_files:
        stem, extension = os.path.splitext(os.path.basename(data_file))
        preprocessed_file_path = os.path.join(
            preprocessed_directory, f"{stem}_preprocessed{extension}"
        )

        # Look the counterpart up directly instead of searching a (separately
        # truncated) list of preprocessed files, which could miss valid pairs.
        if not os.path.isfile(preprocessed_file_path):
            continue

        df_data = pd.read_csv(data_file)
        df_preprocessed = pd.read_csv(preprocessed_file_path)

        merged_df = pd.merge(df_data, df_preprocessed, on='TRD_DD', suffixes=('_data', '_preprocessed'))
        merged_dfs.append(merged_df)

    if not merged_dfs:
        raise ValueError(
            f"No raw CSV in {data_directory!r} has a matching '*_preprocessed.csv' "
            f"in {preprocessed_directory!r}"
        )

    total_df = pd.concat(merged_dfs, ignore_index=True)

    return total_df

# Locations of the raw and preprocessed CSV folders, relative to `path_append`.
data_directory = f"{path_append}../data/KR_Data/data"
preprocessed_directory = f"{path_append}../data/KR_Data/preprocessed"

# Merge every raw file with its preprocessed counterpart into one frame.
total_df = load_and_merge_csv_files(
    data_directory=data_directory,
    preprocessed_directory=preprocessed_directory,
)

# Peek at the first rows of the merged frame.
total_df.head()


In [None]:
# total_df is the merged frame produced by the loading cell.

# Flip the row order so dates run from past to present
# (assumes the CSVs are stored newest-first).
total_df = total_df.iloc[::-1].reset_index(drop=True)

# Break the slash-separated "TRD_DD" strings into year / month / day columns.
date_parts = total_df["TRD_DD"].str.split("/", expand=True)
total_df[["Y", "M", "D"]] = date_parts

# The raw date column is no longer needed.
total_df = total_df.drop(columns="TRD_DD")

# Move the three freshly appended date columns to the front.
date_cols = ["Y", "M", "D"]
other_cols = list(total_df.columns[:-3])
total_df = total_df[date_cols + other_cols]


In [None]:
# Assemble a datetime column from the year / month / day parts.
ymd_parts = total_df[["Y", "M", "D"]].rename(columns={"Y": "year", "M": "month", "D": "day"})
total_df["Date"] = pd.to_datetime(ymd_parts)

# Index the frame by that date.
total_df = total_df.set_index("Date")

# Number of days elapsed since the earliest date found in the whole frame.
earliest_date = total_df.index.min()
total_df["count_day"] = (total_df.index - earliest_date).days

# The separate date-part columns are redundant now.
total_df = total_df.drop(columns=["Y", "M", "D"])

# Put 'count_day' first and keep the remaining columns in their current order.
remaining_cols = [name for name in total_df.columns if name != "count_day"]
total_df = total_df[["count_day", *remaining_cols]]

total_df.head()

In [None]:
# The stock code is not used as a feature; remove it.
total_df = total_df.drop(columns=["ISU_CD"])

In [None]:
# Discard the Date index and go back to a plain 0..n-1 row index.
total_df = total_df.reset_index(drop=True)

In [None]:
# Summarize column dtypes and non-null counts before cleaning.
total_df.info()

In [None]:
# Fundamental-ratio columns that are about to be discarded as unusable.
unusable_cols = ["EPS", "PER", "BPS", "PBR", "DPS", "DVD_YLD"]

# Show whatever non-NaN entries each of them still holds (for verification).
for fundamental_col in unusable_cols:
    print(f"{fundamental_col} non-NaN values:\n", total_df[fundamental_col].dropna())

# Remove them from the frame.
total_df = total_df.drop(columns=unusable_cols)


In [None]:
import pandas as pd

# Assuming total_df is already defined and filled with NaN values replaced by 0
# total_df = ...

# Discretize TREND into three classes with a +/-0.5 threshold:
#   TREND <= -0.5 -> -1,   TREND >= 0.5 -> 1,   anything else (including NaN) -> 0.
#
# The earlier version of this cell first zeroed every value outside {-1, 0, 1}.
# That wiped out all fractional values before the sign / threshold rules ran,
# so those later rules never changed anything. The thresholds are therefore
# evaluated on the raw values here.
TREND_THRESHOLD = 0.5
raw_trend = total_df["TREND"].fillna(0)

total_df["TREND"] = 0.0
total_df.loc[raw_trend <= -TREND_THRESHOLD, "TREND"] = -1.0
total_df.loc[raw_trend >= TREND_THRESHOLD, "TREND"] = 1.0

# Check the unique values in the TREND column and their counts
unique_trends = set(total_df["TREND"])
trend_counts = total_df["TREND"].value_counts()

# Print the unique values and their counts
print("Unique TREND values:", unique_trends)
print("TREND value counts:\n", trend_counts)



In [None]:
# Shift the classes from {-1, 0, 1} to {0, 1, 2} so they can serve as class ids.
# NOTE: running this cell a second time shifts the labels again.
total_df["TREND"] = total_df["TREND"] + 1

In [None]:
# Store the class ids as plain integers. `convert_dtypes` only takes boolean
# flags (its first parameter is `infer_objects`), so passing `int` did not
# request an integer cast; use an explicit astype instead.
total_df["TREND"] = total_df["TREND"].astype("int64")
total_df["TREND"]

In [None]:
import pandas as pd

# Price / volume columns that may have been read as formatted strings
# (presumably with thousands separators — verify against the CSVs).
columns_to_convert = ["TDD_CLSPRC", "TDD_OPNPRC", "TDD_HGPRC", "TDD_LWPRC", "MKTCAP", "ACC_TRDVOL"]

for col in columns_to_convert:
    if pd.api.types.is_numeric_dtype(total_df[col]):
        # read_csv already parsed this column as numbers; the `.str` accessor
        # would raise on it, and there is nothing to strip.
        continue

    # Keep digits only, then convert the whole column in one vectorized call.
    digits_only = total_df[col].str.replace(pat=r'[^0-9]', repl=r'', regex=True)
    total_df[col] = pd.to_numeric(digits_only)



In [None]:
import pandas as pd
from sklearn.preprocessing import MinMaxScaler, RobustScaler

# Assuming total_df is already defined and filled with NaN values replaced by 0
# total_df = ...

# One scaler instance per scaling family; each is re-fitted for every column.
# NOTE(review): the scalers are fitted on the full frame, i.e. before the
# train/test split performed later in the notebook.
mm = MinMaxScaler()
sc = RobustScaler()

# Columns rescaled to the [0, 1] range.
minmax_cols = ["count_day", "TDD_CLSPRC", "TDD_OPNPRC", "TDD_HGPRC", "TDD_LWPRC"]
# Columns scaled with the median / IQR based RobustScaler.
robust_cols = ["MKTCAP", "ACC_TRDVOL"]

for scaler, scaled_cols in ((mm, minmax_cols), (sc, robust_cols)):
    for col in scaled_cols:
        # sklearn expects a 2-D (n_samples, 1) array for a single feature.
        column_2d = total_df[col].values.reshape(-1, 1)
        total_df[col] = scaler.fit_transform(column_2d)


In [None]:
# Preview the frame after numeric conversion and scaling.
total_df.head()

In [None]:
# Re-check dtypes and non-null counts after the conversions above.
total_df.info()

In [None]:
# Ensure 'ISU_NM' is of string type
total_df["ISU_NM"] = total_df["ISU_NM"].astype(str)

# A row whose stock name differs from the previous row starts a new segment.
# The first row always counts as a change (shift() yields NaN there), so the
# list of change positions already begins with 0; prepending another 0 would
# create an empty (0, 0) segment.
isu_nm_changes = total_df['ISU_NM'].shift() != total_df['ISU_NM']
change_indices = np.flatnonzero(isu_nm_changes.to_numpy()).tolist() + [len(total_df)]

# Consecutive change positions form half-open (start, end) row ranges.
segment_pairs = list(zip(change_indices[:-1], change_indices[1:]))

print("Pairs of (start, end) indices:", segment_pairs)

In [None]:
# Remove the stock-name column; report instead of failing when it is absent.
if "ISU_NM" not in total_df.columns:
    print("Column 'ISU_NM' not found in DataFrame")
else:
    total_df = total_df.drop(columns="ISU_NM")

In [None]:
# Coerce every column to a numeric dtype (unparseable entries become NaN)
# and inspect the resulting dtypes / null counts.
df_numeric = total_df.apply(lambda column: pd.to_numeric(column, errors="coerce"))
df_numeric.info()

In [None]:
import pandas as pd
import numpy as np
import torch
from sklearn.preprocessing import StandardScaler

def convert_nullable_int_columns(df):
    """Cast pandas nullable ``Int64`` columns of ``df`` to NumPy ``int64``.

    The frame is modified in place and also returned. The affected columns must
    not contain missing values, otherwise the cast raises.
    """
    int_columns = df.select_dtypes(include=['Int64']).columns
    for col in int_columns:
        df[col] = df[col].astype('int64')
    return df


def gpu_standard_scaler(tensor, dim = 1):
    """Standardize ``tensor`` along ``dim`` to zero mean and unit (sample) std.

    Args:
        tensor: Floating point tensor.
        dim: Dimension over which mean and standard deviation are computed.

    Returns:
        A tensor of the same shape as ``tensor``.
    """
    # keepdim=True keeps the reduced axis so the statistics broadcast against
    # `tensor` for any `dim`; without it only dim=0 on a 2-D tensor lines up.
    mean = tensor.mean(dim=dim, keepdim=True)
    std = tensor.std(dim=dim, keepdim=True)
    # The small epsilon avoids division by zero for constant slices.
    return (tensor - mean) / (std + 1e-8)


def process_dataframe(df, segments, use_scale=False, include_diff=False, label_columns=("TREND",)):
    """Convert ``df`` to numeric form and optionally scale / difference it per segment.

    Args:
        df: Input frame. Every column is coerced with ``pd.to_numeric``; rows
            that contain a NaN after coercion are dropped.
        segments: Iterable of half-open ``(start, end)`` row ranges into ``df``.
        use_scale: If True, standardize the feature columns of each segment
            independently (segments with fewer than two rows are left as is).
        include_diff: If True, replace the feature columns of each segment by
            their first differences; a segment of n rows then yields n - 1 rows
            and the label columns are taken from the later row of each pair.
        label_columns: Column name(s) that are neither scaled nor differenced.

    Returns:
        ``(processed_df, new_segment_pairs)`` where ``processed_df`` holds the
        processed segments concatenated in the given order (all columns as
        float64) and ``new_segment_pairs`` are the half-open row ranges of
        those segments inside ``processed_df``.
    """
    if isinstance(label_columns, str):
        label_columns = (label_columns,)

    df_numeric = df.apply(pd.to_numeric, errors='coerce')

    # Dropping rows shifts every later position, so translate the segment
    # bounds into positions of the cleaned frame: kept_before[i] is the number
    # of surviving rows among the first i original rows.
    keep_mask = df_numeric.notna().all(axis=1).to_numpy(dtype=bool)
    kept_before = np.concatenate(([0], np.cumsum(keep_mask)))
    n_rows = len(keep_mask)
    segments = [
        (int(kept_before[min(start, n_rows)]), int(kept_before[min(end, n_rows)]))
        for start, end in segments
    ]

    # Convert nullable ints only after the NaN rows are gone (the cast cannot
    # represent missing values).
    df_numeric = convert_nullable_int_columns(df_numeric.loc[keep_mask].copy())

    # Use the GPU when one is present, otherwise fall back to the CPU.
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    df_tensor = torch.tensor(df_numeric.to_numpy(dtype="float64"), dtype=torch.float64, device=device)

    feature_idx = torch.tensor(
        [i for i, name in enumerate(df_numeric.columns) if name not in label_columns],
        dtype=torch.long,
        device=device,
    )
    has_features = feature_idx.numel() > 0

    pieces = []
    new_segment_pairs = []
    offset = 0

    for start, end in segments:
        piece = df_tensor[start:end].clone()

        # The sample std of a single row is undefined, so skip such segments.
        if use_scale and has_features and piece.shape[0] > 1:
            piece[:, feature_idx] = gpu_standard_scaler(piece[:, feature_idx], dim=0)

        if include_diff:
            stepped = piece[1:].clone()
            if has_features:
                stepped[:, feature_idx] = piece[1:, feature_idx] - piece[:-1, feature_idx]
            piece = stepped

        pieces.append(piece)
        # Track where this segment lands in the concatenated output.
        new_segment_pairs.append((offset, offset + piece.shape[0]))
        offset += piece.shape[0]

    if pieces:
        processed_tensor = torch.cat(pieces, dim=0)
    else:
        processed_tensor = df_tensor[:0]

    processed_df = pd.DataFrame(processed_tensor.cpu().numpy(), columns=df_numeric.columns)

    return processed_df, new_segment_pairs

# Run the per-segment processing; the returned frame and segment list replace
# the notebook-level `total_df` and `segment_pairs`.
total_df, segment_pairs = process_dataframe(
    df=total_df,
    segments=segment_pairs,
    use_scale=True,
    include_diff=False,
)


In [None]:
# Display the (start, end) row ranges returned by process_dataframe.
segment_pairs

In [None]:
# import pandas as pd
# import plotly.graph_objects as go

# # Assuming total_df is already defined and filled with NaN values replaced by 0
# # total_df = ...

# # Creating the figure
# fig = go.Figure()

# # Adding the line plot for MKTCAP
# fig.add_trace(go.Scatter(x=total_df.index, y=total_df["MKTCAP"], mode='lines', name="stay"))

# # Adding scatter plots for buy and sell points
# fig.add_trace(go.Scatter(x=total_df[total_df["TREND"] < 0].index, y=total_df[total_df["TREND"] < 0]["MKTCAP"], mode="markers", name="buy", marker=dict(color='green')))
# fig.add_trace(go.Scatter(x=total_df[total_df["TREND"] > 0].index, y=total_df[total_df["TREND"] > 0]["MKTCAP"], mode="markers", name="sell", marker=dict(color='red')))

# # Updating layout
# fig.update_layout(
#     title='Stock Label',
#     xaxis=dict(title='Time'),
#     yaxis=dict(title='Market Capitalization')
# )

# # Display the plot
# fig.show()


In [None]:
# import matplotlib.pyplot as plt
# import seaborn as sns

# # Assuming total_df is already defined
# # total_df = ...

# # Set the figure size
# plt.figure(figsize=(25, 8))

# # Plot the MKTCAP line
# sns.lineplot(x=total_df.index, y=total_df["MKTCAP"], label="stay", color="gray")

# # Plot the buy points (TREND < 0)
# sns.scatterplot(x=total_df[total_df["TREND"] < 0].index, y=total_df[total_df["TREND"] < 0]["MKTCAP"], label="buy", color="blue")

# # Plot the sell points (TREND > 0)
# sns.scatterplot(x=total_df[total_df["TREND"] > 0].index, y=total_df[total_df["TREND"] > 0]["MKTCAP"], label="sell", color="red")

# # Add titles and labels
# plt.title('Stock Market Capitalization with Buy/Sell Signals')
# plt.xlabel('Time')
# plt.ylabel('Market Capitalization')

# # Show the plot
# plt.legend()
# plt.show()


# Data Overview and Usage Guide

In [None]:
# Data Overview and Usage Guide

"""
- Data Overview
    Preprocessed Data: total_df
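    Categorical Columns: Y, M, D, ISU_CD, GDC_sig, RSI_sig, ROC_sig, MAP_sig, STC_sig
        (Y, M, D and ISU_CD are dropped during feature engineering; the date survives only as count_day.)
    Numerical Columns: count_day, TDD_CLSPRC, TDD_OPNPRC, TDD_HGPRC, TDD_LWPRC, MKTCAP, ACC_TRDVOL
    Label: TREND (stored as 0, 1, 2 after the +1 shift of the -1, 0, 1 classes)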

- Considerations:
    1) It is recommended to use embedding techniques for categorical data.
    2) Labels:
        NaN values have been replaced with 0.

        2-1) Label Processing:
            How to handle -1, 0, 1 depends on the definition.
            ● Classification of -1, 0, 1:
                Commonly, the label being discrete is an issue.
                (1) Set to -1 for values less than 0, and 1 for values greater than 0.
                    # Ratio of -1, 0, 1 = 1397:1440:55
                    : This results in very frequent trading.

                (2) Use only -1, 0, 1.
                    # Ratio of -1, 0, 1 = 76:2740:76
                    : This might cause the model to miss buying opportunities when it should, making it difficult for the model to make accurate predictions.

                (3) Set to -1 for values less than -0.5, and 1 for values greater than 0.5, otherwise 0.
                    # Ratio of -1, 0, 1 = 752:1409:731
                    : (Current preprocessing state) This provides a somewhat balanced ratio.

            ● Regression:
                Keep the label as it is.
                (1) The model performs regression and decides whether to buy or sell based on the predicted increase or decrease.

    3) The utility of GDC, RSI, ROC, MAP, STC indicators for learning is uncertain.
"""


In [None]:
import torch
import random
import torch.nn.functional as F
from torch.utils.data import Dataset

class SequentialDataset(Dataset):
    """Fixed-length window dataset over a frame that has a ``TREND`` label column.

    Each item is the window of ``max_window_size`` consecutive rows that starts
    at ``indices[idx]`` (shorter only if it would run past the end of the
    frame).

    Args:
        df: Numeric DataFrame; ``TREND`` holds integer class ids in
            ``[0, num_classes)`` and every other column is a feature.
        indices: Sequence of window start positions (row positions in ``df``).
        max_window_size: Number of rows per window.
        num_classes: Number of label classes used for the one-hot encoding.
    """

    def __init__(self, df, indices, max_window_size, num_classes=3):
        self.df = df
        self.indices = indices
        self.max_window_size = max_window_size
        self.min_window_size = max_window_size // 2
        self.num_classes = num_classes

        # Convert the frame to tensors once so that item access is a cheap
        # slice instead of a DataFrame slice plus conversion on every call.
        self._features = torch.tensor(df.drop(['TREND'], axis=1).values, dtype=torch.float32)
        self._labels = torch.tensor(df['TREND'].values.astype(int), dtype=torch.long)

    def __len__(self):
        """Return the number of windows (one per start index)."""
        return len(self.indices)

    def __getitem__(self, idx):
        """Return ``(X, label)`` for the window starting at ``indices[idx]``.

        ``X`` is a float32 tensor of shape ``(window, n_features)`` and
        ``label`` is a one-hot long tensor of shape ``(window, num_classes)``.
        """
        start_idx = self.indices[idx]
        window_size = self.max_window_size
        end_idx = min(start_idx + window_size, len(self._labels))

        # clone() hands out an independent tensor, as the per-call conversion did.
        X = self._features[start_idx:end_idx].clone()
        label = F.one_hot(self._labels[start_idx:end_idx], num_classes=self.num_classes)

        return X, label

In [None]:
from random import shuffle

# Assume 'df' is your DataFrame and 'event' is the column containing labels

def generate_indices(input_df, input_pairs, max_window_size, test_size=0.2):
    """Collect window start positions and split them into train / test by segment.

    The first ``int(len(input_pairs) * (1 - test_size))`` segments feed the
    training list and the remaining segments feed the test list, so no segment
    contributes to both.

    Args:
        input_df: Unused; kept so existing callers keep working.
        input_pairs: Sequence of half-open ``(start, end)`` row ranges.
        max_window_size: Window length in rows. Only start positions whose full
            window fits inside the segment are emitted.
        test_size: Fraction of segments assigned to the test list.

    Returns:
        ``(training_indices, testing_indices)`` as two lists of row positions.
    """
    train_length = int(len(input_pairs) * (1 - test_size))
    training_indices = []
    testing_indices = []

    for position, (start, end) in enumerate(input_pairs):
        target = training_indices if position < train_length else testing_indices

        # Last start whose window [i, i + max_window_size) still lies inside
        # [start, end). It is a valid start itself, hence the "+ 1" below.
        last_start = end - max_window_size
        target.extend(range(start, last_start + 1))

    return training_indices, testing_indices


# Window length (in rows) of every sample.
max_window_size = 64

# Shuffle a copy of the segments with a fixed seed: the train/test split is
# then reproducible, and re-running this cell does not reshuffle
# `segment_pairs` itself.
SPLIT_SEED = 0
shuffled_pairs = list(segment_pairs)
random.Random(SPLIT_SEED).shuffle(shuffled_pairs)
train_indices, test_indices = generate_indices(total_df, shuffled_pairs, max_window_size)

trainset = SequentialDataset(df=total_df, indices=train_indices, max_window_size=max_window_size)
testset = SequentialDataset(df=total_df, indices=test_indices, max_window_size=max_window_size)


print('Train indices: ', len(train_indices))
print('Test indices: ', len(test_indices))

# Sanity check: shape of the first training window and the dataset size.
print(trainset[0][0].shape)
print(len(trainset))

In [None]:
from tools.setting.ml_params import MLParameters
from tools.setting.data_config import DataConfig
from nn.utils.init import set_random_seed
# Seed with a fixed value before the configuration / trainer objects are built.
# NOTE(review): set_random_seed is project-local; which RNGs it seeds is not
# visible from this notebook — confirm.
set_random_seed(0)

In [None]:
# Describe the dataset for the project trainer.
# NOTE(review): obs_shape=[12] and label_size=3 are hard-coded; presumably they
# must match the feature count of X (printed below) and the number of TREND
# classes — confirm against DataConfig.
data_config = DataConfig(dataset_name = 'stock_price', task_type='multi_class_classification', obs_shape=[12], label_size=3)

# Build the training / model parameters via the project's MLParameters class.
ml_params = MLParameters(core_model = 'gpt', encoder_model = 'none')

# Fetch one sample to check what the dataset actually yields.
first_data = trainset[0]
X, y = first_data

print(f"Input shape: {X.shape}")
print(f"Label shape: {y.shape}")

print(f"Total number of samples in trainset: {len(trainset)}")

In [None]:
from trainer_hub import TrainerHub

# Set the device to GPU if available, else CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu") 

# Initialize the TrainerHub class with the training configuration, data configuration, device, and use_print and use_wandb flags
# (console printing enabled, Weights & Biases logging disabled — going by the flag names; TrainerHub itself is project-local).
trainer_hub = TrainerHub(ml_params, data_config, device, use_print=True, use_wandb=False)

In [None]:
# Start training with the window datasets built above (train set first, test set second).
trainer_hub.train(trainset, testset)