In [None]:
import pandas as pd
import numpy as np
import pandas_ta as pta
import matplotlib.pyplot as plt
import os
from time import time

# 딥러닝 관련
import tensorflow as tf
from tensorflow.keras.layers import Input, Conv2D, Conv2DTranspose, Dense, Flatten, Reshape
from tensorflow.keras.models import Model
from tensorflow.keras.callbacks import CSVLogger, ModelCheckpoint

# 클러스터링 관련
from sklearn.cluster import AgglomerativeClustering

# 노트북에 그래프를 바로 표시하기 위한 설정
%matplotlib inline

print("모든 라이브러리를 성공적으로 임포트했습니다.")

In [30]:
def load_data(filepath='/Users/youngjaekim/fundamentals.csv'):
    """
    생성된 fundamentals.csv 파일을 읽어 MultiIndex DataFrame으로 변환하는 새 함수
    """
    # CSV 파일의 첫 두 줄을 헤더로 사용하여 MultiIndex를 바로 생성합니다.
    df = pd.read_csv(filepath, header=[0, 1], index_col=0)
    
    # 날짜 인덱스를 datetime 객체로 변환합니다.
    df.index = pd.to_datetime(df.index, format='%d-%m-%Y')
    df.index.name = 'Dates'
    
    # 컬럼 레벨 이름 지정
    df.columns.names = ['Ticker', 'Feature']
    
    return df

print("데이터 로딩 함수가 준비되었습니다.")

데이터 로딩 함수가 준비되었습니다.


In [31]:
# 1. 새로운 load_data 함수로 데이터를 불러옵니다.
data = load_data('/Users/youngjaekim/fundamentals.csv')

# 2. 데이터를 슬라이싱합니다.
data = data.iloc[-680:-1].copy()

# 3. Volume 컬럼은 정수(Int64) 타입으로, 나머지는 실수(float)로 변환
for ticker in data.columns.levels[0]:
    for feature in data.columns.levels[1]:
        col_tuple = (ticker, feature)
        if col_tuple in data.columns:
            if 'VOLUME' in feature.upper():
                data[col_tuple] = pd.to_numeric(data[col_tuple], errors='coerce').astype(pd.Int64Dtype())
            else:
                data[col_tuple] = pd.to_numeric(data[col_tuple], errors='coerce').astype(float)

print("데이터 로딩 및 타입 변환 완료!")
data.head()

데이터 로딩 및 타입 변환 완료!


Ticker,SPX Index,SPX Index,SPX Index,SPX Index,SPX Index,SPX Index,SPX Index,SPX Index,SPX Index,SPX Index,...,TWSE Index,TWSE Index,TWSE Index,TWSE Index,TWSE Index,TWSE Index,TWSE Index,TWSE Index,TWSE Index,TWSE Index
Feature,PX_OPEN,PX_HIGH,PX_LOW,PX_LAST,PX_VOLUME,PX_TO_BOOK_RATIO,PE_RATIO,DIVIDEND_YIELD,MARKET_CAP,EBITDA,...,CURRENT_RATIO,QUICK_RATIO,INTEREST_COVERAGE,SALES_GROWTH,EPS_GROWTH,BETA,ALPHA,SHARPE_RATIO,SORTINO_RATIO,RSI_14D
Dates,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
2023-01-13,3960.600098,4003.949951,3947.669922,3999.090088,3939700000.0,69.7526,7.686065,102.856062,44.425227,41.662965,...,10.743941,30.73437,5.095635,56.44466,62.808286,42.889623,42.498472,15.758017,21.692059,18.299982
2023-01-16,,,,,,71.841254,7.591438,88.941628,43.979476,38.912932,...,10.196516,35.098865,5.306596,58.39212,60.918187,44.628309,38.910724,19.562545,21.105558,16.90378
2023-01-17,3999.280029,4015.389893,3984.570068,3990.969971,4235560000.0,72.819142,7.865362,93.714961,43.215063,36.104349,...,9.793899,31.482791,5.881072,54.808302,59.570959,45.745476,41.203013,19.137737,18.850843,18.145993
2023-01-18,4002.25,4014.159912,3926.590088,3928.860107,4298710000.0,75.710731,7.619952,94.31246,47.824735,35.410077,...,9.544762,32.724067,4.946221,58.746332,66.780203,46.212986,42.372712,18.94404,20.268045,18.645984
2023-01-19,3911.840088,3922.939941,3885.540039,3898.850098,3991500000.0,75.850893,7.823756,102.836715,43.040862,36.792623,...,9.668249,33.563641,5.943591,59.585356,61.145674,45.400121,36.8599,19.241337,20.50154,19.457527


In [32]:
indices = list(data.columns.get_level_values(0).unique())
index_dict = {}

print("기술적 지표 계산을 시작합니다...")
for index in indices:
    df = data[index].copy()
    
    rename_dict = {
        'PX_OPEN': 'open', 'PX_HIGH': 'high', 'PX_LOW': 'low',
        'PX_LAST': 'close', 'PX_VOLUME': 'volume'
    }
    df.rename(columns=rename_dict, inplace=True)

    df.dropna(subset=['open', 'high', 'low', 'close', 'volume'], inplace=True)
    if df.empty: continue

    df['volume'] = df['volume'].astype('int64')

    # pandas-ta 기술적 지표 계산
    df.ta.rsi(append=True)
    df.ta.mfi(append=True)
    df.ta.adx(append=True)
    df.ta.obv(append=True)
    df.ta.atr(append=True)
    df.ta.ema(length=14, append=True)
    
    bbands = df.ta.bbands(length=20, append=False)
    if bbands is not None:
        df['Boll_upper'] = bbands[f'BBU_20_2.0']
        df['Boll_mid'] = bbands[f'BBM_20_2.0']
        df['Boll_lower'] = bbands[f'BBL_20_2.0']

    macd = df.ta.macd(fast=14, slow=30, append=False)
    if macd is not None:
        df['MACD'] = macd[f'MACD_14_30_9']
    
    # 로그 수익률 계산
    if 'close' in df.columns:
        for i in range(1, 49):
            df[f'LR_{i}'] = np.log(df['close']) - np.log(df['close'].shift(i))
    
    df.dropna(inplace=True, axis=0)
    
    # 0-1 정규화
    for col in df.columns:
        min_val, max_val = df[col].min(), df[col].max()
        if max_val - min_val > 0:
            df[col] = (df[col] - min_val) / (max_val - min_val)
        else:
            df[col] = 0
            
    index_dict[index] = df

print("기술적 지표 계산 및 정규화 완료!")

기술적 지표 계산을 시작합니다...


 1.59091031e+13 1.89889924e+13 1.99220909e+13 2.34668539e+13
 1.80462262e+13 1.42184992e+13 1.62287877e+13 1.68469627e+13
 1.58246862e+13 1.53143419e+13 1.68385987e+13 1.64602363e+13
 1.62322231e+13 2.21539482e+13 2.23822715e+13 2.10767862e+13
 1.96533085e+13 1.68662118e+13 1.66610450e+13 1.59079025e+13
 1.85171530e+13 1.74342034e+13 1.42747180e+13 1.40268765e+13
 1.50736603e+13 1.48609659e+13 1.47962413e+13 1.49577726e+13
 1.46943973e+13 1.35995026e+13 1.54383644e+13 1.69871530e+13
 1.38599775e+13 1.72585958e+13 1.50572467e+13 1.67572066e+13
 1.47091496e+13 1.67447602e+13 1.66567340e+13 1.69558922e+13
 1.72087106e+13 1.55733264e+13 1.78003361e+13 1.84825431e+13
 1.90254469e+13 1.63207409e+13 1.63950422e+13 1.62986286e+13
 1.70761103e+13 1.86602540e+13 1.85707499e+13 1.84159174e+13
 3.02833678e+13 1.53500451e+13 1.56016986e+13 1.63625978e+13
 1.62243947e+13 1.74348961e+13 9.05548882e+12 1.60185736e+13
 1.60569374e+13 1.75416381e+13 1.72993605e+13 1.64533305e+13
 1.59934817e+13 1.858482

기술적 지표 계산 및 정규화 완료!


In [33]:
# 빈 데이터프레임 제거
index_keys = list(index_dict.keys())
for index in index_keys:
    if index_dict[index].shape[0] == 0:
        del index_dict[index]

indices = list(index_dict.keys())

# 모든 데이터프레임 길이를 통일
min_len = min(len(df) for df in index_dict.values())
print(f"모든 데이터의 길이를 가장 짧은 길이인 {min_len}으로 통일합니다.")
trimmed_index_dict = {index: df.tail(min_len) for index, df in index_dict.items()}

# 모델 입력을 위한 4차원 데이터(N, T, F, C) 생성
n_timesteps = min_len
n_features = trimmed_index_dict[indices[0]].shape[1]
n_indices = len(indices)

x = np.array([df.values for df in trimmed_index_dict.values()])
x = x.reshape(n_indices, n_timesteps, n_features, 1).astype('float32')

print(f"모델 입력 데이터(x) 형태: {x.shape}")

모든 데이터의 길이를 가장 짧은 길이인 580으로 통일합니다.
모델 입력 데이터(x) 형태: (10, 580, 83, 1)


In [34]:
args = {
    'n_clusters': 5,
    'batch_size': 16,
    'epochs': 50, # 테스트를 위해 50으로 설정, 나중에 늘려서 성능 향상 가능
    'save_dir': 'results'
}
if not os.path.exists(args['save_dir']):
    os.makedirs(args['save_dir'])

input_shape = x.shape[1:]
filters = [32, 64, 128, 10]
pad3 = 'valid'

# --- 인코더 (Functional API) ---
encoder_input = Input(shape=input_shape, name='encoder_input')
x_enc = Conv2D(filters[0], 5, padding='same', activation='relu', name='conv1')(encoder_input)
x_enc = Conv2D(filters[1], 5, padding='same', activation='relu', name='conv2')(x_enc)
conv3_output = Conv2D(filters[2], 3, padding=pad3, activation='relu', name='conv3')(x_enc)
flatten_output = Flatten(name='flatten')(conv3_output)
embedding = Dense(units=filters[3], name='embedding')(flatten_output)
encoder = Model(encoder_input, embedding, name='encoder')

# --- 디코더 (Functional API) ---
decoder_input = Input(shape=(filters[3],), name='decoder_input')
x_dec = Dense(units=flatten_output.shape[1], activation='relu')(decoder_input)
x_dec = Reshape(conv3_output.shape[1:])(x_dec)
x_dec = Conv2DTranspose(filters[1], 3, padding=pad3, activation='relu', name='deconv3')(x_dec)
x_dec = Conv2DTranspose(filters[0], 5, padding='same', activation='relu', name='deconv2')(x_dec)
decoder_output = Conv2DTranspose(input_shape[2], 5, padding='same', name='deconv1')(x_dec)
decoder = Model(decoder_input, decoder_output, name='decoder')

# --- 오토인코더 모델 결합 ---
autoencoder = Model(encoder.input, decoder(encoder.output), name='autoencoder')

print("Functional API를 사용하여 모델을 성공적으로 구축했습니다.")
autoencoder.summary()

Functional API를 사용하여 모델을 성공적으로 구축했습니다.


In [35]:
autoencoder.compile(optimizer='adam', loss='mse')
csv_logger = CSVLogger(os.path.join(args['save_dir'], 'pretrain-log.csv'))
model_checkpoint = ModelCheckpoint(
    filepath=os.path.join(args['save_dir'], 'best_model.h5'), 
    monitor='loss', 
    save_best_only=True
)

print("\n모델 학습을 시작합니다...")
autoencoder.fit(x, x, 
                batch_size=args['batch_size'], 
                epochs=args['epochs'], 
                verbose=1, 
                callbacks=[csv_logger, model_checkpoint])
print('학습 완료!')


모델 학습을 시작합니다...
Epoch 1/50
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 17s/step - loss: 0.3184



[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m34s[0m 34s/step - loss: 0.3184
Epoch 2/50
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 29s/step - loss: 0.2866



[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m40s[0m 40s/step - loss: 0.2866
Epoch 3/50
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m17s[0m 17s/step - loss: 9.6948
Epoch 4/50
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 18s/step - loss: 0.2321



[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m27s[0m 27s/step - loss: 0.2321
Epoch 5/50
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m21s[0m 21s/step - loss: 0.3578
Epoch 6/50
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m23s[0m 23s/step - loss: 0.3358
Epoch 7/50
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m47s[0m 47s/step - loss: 0.3052
Epoch 8/50
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m35s[0m 35s/step - loss: 0.2838
Epoch 9/50
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m27s[0m 27s/step - loss: 0.2462
Epoch 10/50
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 16s/step - loss: 0.1716



[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m23s[0m 23s/step - loss: 0.1716
Epoch 11/50
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 18s/step - loss: 0.0691



[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m25s[0m 25s/step - loss: 0.0691
Epoch 12/50
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m25s[0m 25s/step - loss: 0.2921
Epoch 13/50
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 18s/step - loss: 0.0283



[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m26s[0m 26s/step - loss: 0.0283
Epoch 14/50
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m30s[0m 30s/step - loss: 0.0889
Epoch 15/50
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m24s[0m 24s/step - loss: 0.1454
Epoch 16/50
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m21s[0m 21s/step - loss: 0.1644
Epoch 17/50
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m22s[0m 22s/step - loss: 0.1557
Epoch 18/50
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m43s[0m 43s/step - loss: 0.1228
Epoch 19/50
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m21s[0m 21s/step - loss: 0.0711
Epoch 20/50
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m19s[0m 19s/step - loss: 0.0382
Epoch 21/50
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m27s[0m 27s/step - loss: 0.0896
Epoch 22/50
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m19s[0m 19s/step - loss: 0.0726
Epoch 23/50
[1m1/1[0



[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m28s[0m 28s/step - loss: 0.0219
Epoch 24/50
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m23s[0m 23s/step - loss: 0.0322
Epoch 25/50
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m23s[0m 23s/step - loss: 0.0525
Epoch 26/50
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m54s[0m 54s/step - loss: 0.0549
Epoch 27/50
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m22s[0m 22s/step - loss: 0.0400
Epoch 28/50
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m15s[0m 15s/step - loss: 0.0221
Epoch 29/50
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m27s[0m 27s/step - loss: 0.0261
Epoch 30/50
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m37s[0m 37s/step - loss: 0.0437
Epoch 31/50
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m30s[0m 30s/step - loss: 0.0312
Epoch 32/50
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 34s/step - loss: 0.0175



[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m47s[0m 47s/step - loss: 0.0175
Epoch 33/50
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m59s[0m 59s/step - loss: 0.0230
Epoch 34/50
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m33s[0m 33s/step - loss: 0.0305
Epoch 35/50
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m29s[0m 29s/step - loss: 0.0285
Epoch 36/50
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m24s[0m 24s/step - loss: 0.0197
Epoch 37/50
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 20s/step - loss: 0.0160



[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m33s[0m 33s/step - loss: 0.0160
Epoch 38/50
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m19s[0m 19s/step - loss: 0.0229
Epoch 39/50
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m23s[0m 23s/step - loss: 0.0248
Epoch 40/50
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m46s[0m 46s/step - loss: 0.0173
Epoch 41/50
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 26s/step - loss: 0.0158



[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m36s[0m 36s/step - loss: 0.0158
Epoch 42/50
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m33s[0m 33s/step - loss: 0.0198
Epoch 43/50
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m21s[0m 21s/step - loss: 0.0211
Epoch 44/50
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m19s[0m 19s/step - loss: 0.0177
Epoch 45/50
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 23s/step - loss: 0.0147



[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m33s[0m 33s/step - loss: 0.0147
Epoch 46/50
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m28s[0m 28s/step - loss: 0.0167
Epoch 47/50
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m26s[0m 26s/step - loss: 0.0189
Epoch 48/50
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m21s[0m 21s/step - loss: 0.0164
Epoch 49/50
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 17s/step - loss: 0.0145



[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m27s[0m 27s/step - loss: 0.0145
Epoch 50/50
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m18s[0m 18s/step - loss: 0.0159
학습 완료!


In [36]:
# 학습된 인코더로 핵심 특징(embedding) 추출
features = encoder.predict(x)

# 계층적 군집화 수행
agg_clustering = AgglomerativeClustering(n_clusters=args['n_clusters'])
pred = agg_clustering.fit_predict(features)

# 결과 저장
cluster_df = pd.DataFrame({'Index': indices, 'Cluster': pred})
cluster_df.to_csv('cluster_results.csv', index=False)

print("\n클러스터링 완료! 'cluster_results.csv' 파일로 저장되었습니다.")
cluster_df.head()

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 2s/step

클러스터링 완료! 'cluster_results.csv' 파일로 저장되었습니다.


Unnamed: 0,Index,Cluster
0,SPX Index,0
1,INDU Index,2
2,NDX Index,0
3,DAX Index,1
4,SXXP Index,4


In [37]:
for i in sorted(cluster_df['Cluster'].unique()):
    cluster_indices = cluster_df[cluster_df['Cluster'] == i]['Index'].values
    print(f'--- Cluster {i} ---\n{list(cluster_indices)}\n')

--- Cluster 0 ---
['SPX Index', 'NDX Index', 'CAC Index']

--- Cluster 1 ---
['DAX Index', 'UKX Index', 'ASX Index']

--- Cluster 2 ---
['INDU Index', 'SMI Index']

--- Cluster 3 ---
['TWSE Index']

--- Cluster 4 ---
['SXXP Index']

