In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import os

from sklearn.metrics import mean_absolute_error, mean_squared_error, mean_squared_log_error, r2_score
from sklearn.preprocessing import MinMaxScaler 

from keras.callbacks import EarlyStopping
from keras.layers import Dense, GRU
from keras.metrics import MeanSquaredError
from keras.models import Sequential 
from keras.optimizers.legacy import SGD

In [2]:
# Read the three per-exchange price files.
exchange_files = [
    './dataset/amex_data.csv',
    './dataset/nasdaq_data.csv',
    './dataset/nyse_data.csv',
]
data_amex, data_nsdq, data_nyse = (pd.read_csv(path) for path in exchange_files)

# Stack the exchanges, drop the 'Unnamed: 0' column (presumably a saved row
# index -- confirm against the CSVs), parse the dates, index by Date and order
# the rows by (Date, symbol).
data = (
    pd.concat([data_amex, data_nsdq, data_nyse])
    .drop(columns=['Unnamed: 0'])
    .assign(Date=lambda frame: pd.to_datetime(frame['Date']))
    .set_index('Date')
    .sort_values(by=['Date', 'symbol'])
)

# Keep only these columns, in this order: later cells read 'Adj Close' by its
# position (second column) after the per-symbol CSV round trip.
kept_columns = ['symbol', 'Adj Close', 'Open', 'High', 'Low', "Close", "Volume"]
data = data[kept_columns]
data.head()

Unnamed: 0_level_0,symbol,Adj Close,Open,High,Low,Close,Volume
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1962-01-02,AEP,0.98161,0.0,35.125,34.3125,34.3125,5800.0
1962-01-02,GT,1.951758,0.0,11.1875,11.0,11.125,32000.0
1962-01-02,HON,1.086282,0.0,8.328744,8.272595,8.310028,40740.0
1962-01-02,XRX,0.856244,0.0,4.713805,4.655248,4.684526,51233.0
1962-01-03,AEP,0.979822,0.0,34.75,34.0625,34.25,10200.0


In [3]:
# Rows whose Date index falls in calendar year 2021.
is_2021 = data.index.year == 2021
df_21 = data.loc[is_2021]
df_21.head()

Unnamed: 0_level_0,symbol,Adj Close,Open,High,Low,Close,Volume
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2021-01-04,AACG,1.18,1.17,1.22,1.17,1.18,18000.0
2021-01-04,AADI,15.75,14.925,16.049999,14.58,15.75,373920.0
2021-01-04,AAL,15.13,15.85,15.88,15.03,15.13,69732500.0
2021-01-04,AAME,2.093662,2.14,2.18,2.06,2.14,298100.0
2021-01-04,AAOI,8.76,8.58,8.97,8.49,8.76,782800.0


In [4]:
# Rows whose Date index falls in calendar year 2022.
is_2022 = data.index.year == 2022
df_22 = data.loc[is_2022]
df_22.head()

Unnamed: 0_level_0,symbol,Adj Close,Open,High,Low,Close,Volume
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2022-01-03,AACG,1.19,1.17,1.31,1.15,1.19,127700.0
2022-01-03,AACI,9.91,9.81,9.91,9.81,9.91,112300.0
2022-01-03,AACIU,10.15,10.1,10.15,10.1,10.15,1200.0
2022-01-03,AADI,24.969999,24.459999,25.0,23.67,24.969999,62000.0
2022-01-03,AAL,18.75,18.23,19.18,18.200001,18.75,42781000.0


In [None]:
symbol_21 = df_21['symbol'].unique()
symbol_22 = df_22['symbol'].unique()

# Count the rows of every symbol once per year instead of re-filtering both
# frames for each symbol (value_counts skips missing symbols by default).
rows_21 = df_21['symbol'].value_counts().to_dict()
rows_22 = df_22['symbol'].value_counts().to_dict()

# Keep the symbols present in both years whose 2021 row count is exactly one
# more than their 2022 row count. The 2021 order of first appearance is kept.
symbol_total = [
    s for s in symbol_21
    if s in rows_22 and rows_21[s] - 1 == rows_22[s]
]

# Split the symbol list into exactly four contiguous parts. When the length is
# not a multiple of four the leftover symbols are spread over the first parts,
# so no symbol ends up in an unprocessed fifth chunk, and a list shorter than
# four symbols no longer produces a zero step.
n, extra = divmod(len(symbol_total), 4)
symbol_divided = []
start = 0
for part_idx in range(4):
    stop = start + n + (1 if part_idx < extra else 0)
    symbol_divided.append(symbol_total[start:stop])
    start = stop

In [11]:
# Create the data_21 and data_22 folders if they do not exist yet.
def create_data_folders():
    """Ensure the ``data_21`` and ``data_22`` folders exist in the working directory.

    Safe to call repeatedly: an already existing folder is left untouched.
    ``exist_ok=True`` avoids the check-then-create race and matches how
    ``train_and_save_models`` creates its own output folders.
    """
    for folder in ("data_21", "data_22"):
        os.makedirs(folder, exist_ok=True)
    
    
def save_symbol_data_to_csv(symbols, df_21, df_22):
    """Write one CSV per symbol and per year.

    For every symbol, the matching rows of ``df_21`` go to
    ``data_21/df_21_<symbol>.csv`` and the matching rows of ``df_22`` go to
    ``data_22/df_22_<symbol>.csv``. The frame index is not written.

    Args:
        symbols: iterable of symbol values to export.
        df_21: frame with a ``symbol`` column (2021 rows).
        df_22: frame with a ``symbol`` column (2022 rows).
    """
    yearly_sources = (
        (df_21, "data_21", "df_21"),
        (df_22, "data_22", "df_22"),
    )
    for symbol in symbols:
        for frame, folder, prefix in yearly_sources:
            symbol_rows = frame[frame['symbol'] == symbol]
            target_path = os.path.join(folder, f"{prefix}_{symbol}.csv")
            # index=False keeps the column positions that later cells rely on.
            symbol_rows.to_csv(target_path, index=False)


# Make sure the output folders exist, then export the per-symbol CSV files for
# every partition in symbol_divided. Looping over the list (instead of four
# hard-coded indices) also covers a partition count other than four.
create_data_folders()
for symbols_part in symbol_divided:
    save_symbol_data_to_csv(symbols_part, df_21, df_22)

In [12]:
def ts_train_test_normalize(train_data, test_data, time_steps, for_periods):
    """
    Turn a training and a test frame into scaled sliding-window arrays.

    The modelled series is the second column (position 1) of each frame.

    Args:
        train_data: training DataFrame.
        test_data: test DataFrame with the same column layout.
        time_steps: number of consecutive past values in each input window.
        for_periods: number of consecutive future values in each target.

    Returns:
        X_train: array of shape (n_train_samples, time_steps, 1).
        y_train: array of shape (n_train_samples, for_periods).
        X_test: array of shape (len(test_data) - for_periods, time_steps, 1).
            Row k holds the time_steps values that precede test row k.
        sc: MinMaxScaler fitted on the training series only.

    Raises:
        ValueError: if time_steps or for_periods is smaller than 1, or if a
            frame is too short to yield at least one window.
    """
    if time_steps < 1 or for_periods < 1:
        raise ValueError("time_steps and for_periods must both be >= 1")

    # Extract the series as (n, 1) arrays.
    ts_train = train_data.iloc[:, 1:2].values
    ts_test = test_data.iloc[:, 1:2].values
    ts_train_len = len(ts_train)
    ts_test_len = len(ts_test)

    # Fail early with a clear message instead of an IndexError at the reshape.
    if ts_train_len < time_steps + for_periods:
        raise ValueError(
            f"train_data has {ts_train_len} rows; at least "
            f"{time_steps + for_periods} are needed for one training window"
        )
    if ts_test_len <= for_periods:
        raise ValueError(
            f"test_data has {ts_test_len} rows; more than {for_periods} are needed"
        )

    # Fit the scaler on the training series only, so nothing from the test
    # series influences the scaling.
    sc = MinMaxScaler(feature_range=(0, 1))
    ts_train_scaled = sc.fit_transform(ts_train)

    # Every window start that still leaves for_periods target values. The
    # previous bound (ts_train_len - 1) was only right for for_periods == 2.
    train_stops = range(time_steps, ts_train_len - for_periods + 1)
    X_train = np.array([ts_train_scaled[i - time_steps:i, 0] for i in train_stops])
    y_train = np.array([ts_train_scaled[i:i + for_periods, 0] for i in train_stops])

    # Add the trailing feature axis expected by the recurrent layers.
    X_train = np.reshape(X_train, (X_train.shape[0], X_train.shape[1], 1))

    # Prepend the last time_steps training values to the test series so that
    # the first test window has a full history, then scale with the fitted scaler.
    inputs = np.concatenate((ts_train[-time_steps:], ts_test), axis=0)
    inputs = sc.transform(inputs)

    # Number of test windows is kept at len(test_data) - for_periods, as before.
    test_stops = range(time_steps, ts_test_len + time_steps - for_periods)
    X_test = np.array([inputs[i - time_steps:i, 0] for i in test_stops])
    X_test = np.reshape(X_test, (X_test.shape[0], X_test.shape[1], 1))

    return X_train, y_train, X_test, sc

In [13]:
def GRU_model(X_train, y_train, X_test, sc):
    """Build, train and apply a two-layer GRU forecaster.

    Architecture: GRU(50, tanh, return_sequences=True) -> GRU(50, tanh) ->
    Dense. The Dense width follows the number of target columns in
    ``y_train`` (previously fixed at 2), so the model matches whatever
    forecast horizon the targets were built with.

    Training: SGD (learning rate 0.01, momentum 0.9, decay 1e-7, no Nesterov),
    mean squared error loss, 50 epochs, batch size 150, silent.

    Args:
        X_train: training windows; ``X_train.shape[1]`` is used as the
            sequence length and one feature per step is assumed.
        y_train: training targets, one column per forecast step.
        X_test: windows to predict on.
        sc: fitted scaler whose ``inverse_transform`` maps predictions back
            to the original scale.

    Returns:
        (model, predictions): the fitted model and the inverse-scaled
        predictions for ``X_test``.
    """
    # One output unit per forecast step; a 1-D target means a single step.
    n_outputs = y_train.shape[1] if y_train.ndim > 1 else 1

    # The GRU architecture
    my_GRU_model = Sequential()
    my_GRU_model.add(GRU(units=50,
                         return_sequences=True,
                         input_shape=(X_train.shape[1], 1),
                         activation='tanh'))
    my_GRU_model.add(GRU(units=50,
                         activation='tanh'))
    my_GRU_model.add(Dense(units=n_outputs))

    # Compiling the RNN
    my_GRU_model.compile(optimizer=SGD(learning_rate=0.01,
                                       decay=1e-7,
                                       momentum=0.9,
                                       nesterov=False),
                         loss='mean_squared_error')

    # Fitting to the training set
    my_GRU_model.fit(X_train,
                     y_train,
                     epochs=50,
                     batch_size=150,
                     verbose=0)

    # Predict in scaled space, then map back to the original scale.
    GRU_prediction = my_GRU_model.predict(X_test)
    GRU_prediction = sc.inverse_transform(GRU_prediction)

    return my_GRU_model, GRU_prediction

In [18]:
def train_and_save_models(i, symbol_divided, df_21, df_22, time_steps=5, for_periods=2):
    """Train one GRU model per symbol and save the model and its predictions.

    For each symbol, the per-symbol CSV files ``data_21/df_21_<symbol>.csv``
    and ``data_22/df_22_<symbol>.csv`` are read, windowed with
    ``ts_train_test_normalize`` and fed to ``GRU_model``. The fitted model is
    written to ``models_21/model_21_<symbol>.keras`` and the predictions to
    ``prdcts_22/prdct_22_<symbol>.csv``. A symbol with a missing CSV is
    reported and skipped.

    Args:
        i: zero-based index of the partition, used only in the progress line.
        symbol_divided: iterable of symbols to process (one partition).
        df_21: unused; data is read from the per-symbol CSV files.
        df_22: unused; data is read from the per-symbol CSV files.
        time_steps: input window length (default 5, the previous fixed value).
        for_periods: forecast horizon (default 2, the previous fixed value).
    """
    # Local import so this cell does not depend on an edit to the import cell.
    from keras import backend as keras_backend

    # Create the output folders for models and predictions.
    os.makedirs("models_21", exist_ok=True)
    os.makedirs("prdcts_22", exist_ok=True)

    symbol_cnt = len(symbol_divided)

    # Preprocess, train and save, one symbol at a time.
    for idx, symbol in enumerate(symbol_divided):
        print(f"progress:{i+1}/4, processing for symbol: {symbol} ({idx + 1}/{symbol_cnt})")

        # Locate the per-symbol CSV files.
        X_file_path = os.path.join("data_21", f"df_21_{symbol}.csv")
        y_file_path = os.path.join("data_22", f"df_22_{symbol}.csv")

        if not (os.path.exists(X_file_path) and os.path.exists(y_file_path)):
            print(f"{symbol}에 해당하는 csv 파일을 찾을 수 없습니다.")
            continue

        train_frame = pd.read_csv(X_file_path)
        test_frame = pd.read_csv(y_file_path)

        # Preprocessing
        X_train, y_train, X_test, sc = ts_train_test_normalize(
            train_frame, test_frame, time_steps, for_periods
        )

        # Training
        model, prdct = GRU_model(X_train, y_train, X_test, sc)

        # Save the model in the .keras format.
        model.save(os.path.join("models_21", f"model_21_{symbol}.keras"))

        # Save the predictions as CSV.
        pd.DataFrame(prdct).to_csv(os.path.join("prdcts_22", f"prdct_22_{symbol}.csv"), index=False)

        # Many models are built in this loop; release Keras' global state
        # after each one so memory does not keep growing.
        keras_backend.clear_session()


# Train on every partition. The previous range(3) skipped the last partition
# even though its CSV files are exported and the progress line counts "/4".
for i, symbols_part in enumerate(symbol_divided):
    train_and_save_models(i, symbols_part, df_21, df_22)


progress:1/4, processing for symbol: AACG (1/742)
progress:1/4, processing for symbol: AADI (2/742)
progress:1/4, processing for symbol: AAL (3/742)
progress:1/4, processing for symbol: AAME (4/742)
progress:1/4, processing for symbol: AAOI (5/742)
progress:1/4, processing for symbol: AAON (6/742)
progress:1/4, processing for symbol: AAPL (7/742)
progress:1/4, processing for symbol: AAU (8/742)
progress:1/4, processing for symbol: ABAT (9/742)
progress:1/4, processing for symbol: ABCB (10/742)
progress:1/4, processing for symbol: ABCL (11/742)
progress:1/4, processing for symbol: ABCM (12/742)
progress:1/4, processing for symbol: ABEO (13/742)
progress:1/4, processing for symbol: ABIO (14/742)
progress:1/4, processing for symbol: ABL (15/742)
progress:1/4, processing for symbol: ABNB (16/742)
progress:1/4, processing for symbol: ABUS (17/742)
progress:1/4, processing for symbol: ABVC (18/742)
progress:1/4, processing for symbol: ACAD (19/742)
progress:1/4, processing for symbol: ACB (2

  data_min = np.nanmin(X, axis=0)
  data_max = np.nanmax(X, axis=0)


progress:3/4, processing for symbol: PCG-PD (564/742)
progress:3/4, processing for symbol: PCG-PE (565/742)
progress:3/4, processing for symbol: PCG-PG (566/742)
progress:3/4, processing for symbol: PCG-PH (567/742)
progress:3/4, processing for symbol: PCG-PI (568/742)


  data_min = np.nanmin(X, axis=0)
  data_max = np.nanmax(X, axis=0)


progress:3/4, processing for symbol: PCH (569/742)
progress:3/4, processing for symbol: PCRX (570/742)
progress:3/4, processing for symbol: PCSA (571/742)
progress:3/4, processing for symbol: PCT (572/742)
progress:3/4, processing for symbol: PCTI (573/742)
progress:3/4, processing for symbol: PCTY (574/742)
progress:3/4, processing for symbol: PCVX (575/742)
progress:3/4, processing for symbol: PCYG (576/742)
progress:3/4, processing for symbol: PCYO (577/742)
progress:3/4, processing for symbol: PDCO (578/742)
progress:3/4, processing for symbol: PDD (579/742)
progress:3/4, processing for symbol: PDEX (580/742)
progress:3/4, processing for symbol: PDFS (581/742)
progress:3/4, processing for symbol: PDLB (582/742)
progress:3/4, processing for symbol: PDSB (583/742)
progress:3/4, processing for symbol: PEBK (584/742)
progress:3/4, processing for symbol: PEBO (585/742)
progress:3/4, processing for symbol: PED (586/742)
progress:3/4, processing for symbol: PEGA (587/742)
progress:3/4, pr

: 

In [None]:
def actual_pred_plot(y_test, y_pred):
    """
    Plot actual values against predicted values.

    :param y_test: actual values
    :param y_pred: predicted values
    :return: the mean squared error between the two series; the figure is
             shown as a side effect
    """
    # Put both series side by side in one frame.
    comparison = pd.DataFrame(columns=['Adj Close', 'prediction'])
    comparison['Adj Close'] = y_test
    comparison['prediction'] = y_pred

    # Accumulate the squared error with the keras metric object.
    mse_metric = MeanSquaredError()
    mse_metric.update_state(np.array(y_test), np.array(y_pred))

    # Draw the actual series first, then the prediction, on one figure.
    plt.figure(figsize=(12, 6))
    for column, legend_label in (('Adj Close', 'Actual'), ('prediction', 'Prediction')):
        plt.plot(comparison[column], label=legend_label)
    plt.legend()
    plt.show()

    return mse_metric.result().numpy()

In [None]:
def confirm_result(y_test, y_pred):
    """Summarise prediction quality as a one-column table.

    Computes MAE, RMSE, RMSLE and R2 between ``y_test`` and ``y_pred``.

    Side effect: sets the global pandas float display format to five decimals.

    :param y_test: actual values
    :param y_pred: predicted values
    :return: DataFrame indexed by metric name with a single 'Results' column
    """
    # Evaluate the metrics in a fixed order; dict order defines the row order.
    scores = {
        'MAE': mean_absolute_error(y_test, y_pred),
        'RMSE': np.sqrt(mean_squared_error(y_test, y_pred)),
        'RMSLE': np.sqrt(mean_squared_log_error(y_test, y_pred)),
        'R2': r2_score(y_test, y_pred),
    }

    pd.options.display.float_format = '{:.5f}'.format
    return pd.DataFrame({'Results': list(scores.values())}, index=list(scores))

In [None]:
sy = 'AMZN'

# The saved prediction file has one column per forecast step and fewer rows
# than the 2022 series (the last for_periods test rows get no window).
# Column 0 of row k is the forecast for test row k, so compare that column
# with the equally long head of the actual series. Passing the whole
# two-column frame made the plot and the metrics fail on a shape mismatch.
pred_frame = pd.read_csv(f'prdcts_22/prdct_22_{sy}.csv')
y_pred = pred_frame.iloc[:, 0].values
y_test = df_22[df_22['symbol'] == sy]['Adj Close'].values[:len(y_pred)]

# Only the last expression of a cell is displayed, so print the MSE explicitly.
mse = actual_pred_plot(y_test, y_pred)
print(f"MSE: {mse:.5f}")
confirm_result(y_test, y_pred)