In [1]:
import pandas as pd
import numpy as np
from scipy.cluster.hierarchy import linkage, dendrogram
from scipy.spatial.distance import squareform
import matplotlib.pyplot as plt


In [2]:
import sys

from pathlib import Path

# Path configuration.
# Every project directory is resolved relative to the parent of the current
# working directory, so the notebook must be started from inside its own folder.

CWD = Path.cwd()
WORKSPACE_PATH = CWD.parent
COMMON_PATH = WORKSPACE_PATH / 'common'
DATA_PATH = WORKSPACE_PATH / 'data'
ALPHAS_PATH = WORKSPACE_PATH / 'alphas'
FONT_PATH = COMMON_PATH / 'fonts' / 'NanumGothic.ttf'

# Put COMMON_PATH on the import search path. The membership check keeps this
# cell idempotent: re-running it does not append a duplicate entry.
if str(COMMON_PATH) not in sys.path:
    sys.path.append(str(COMMON_PATH))

In [3]:
import fn_loader as fn
import fn_config as fncfg
import gaps_config as gcfg

from validator import *
from backtest import Backtest

In [4]:
# Create the loader (loads the data and preprocesses it into a multi-index).
# With use_alias=True, shortened aliases are used instead of the long ETF names.
pricevolume1 = fn.FnDataLoader(
    'DBGAPS2024_pricevolume_240722.xlsx',
    use_alias=True,
)

In [5]:
# Fetch the loader's 'return' data and divide it by 100.
# NOTE(review): presumably this converts percentage returns into decimal
# fractions — confirm the units delivered by the loader.
returns_df = pricevolume1.get_data('return') / 100


In [6]:
# Append the "risk-free rate" as the last column of the returns frame.
# NOTE(review): the value used here is the cross-sectional mean of all asset
# returns on each date, not an observed risk-free yield — confirm that this
# proxy is intended.
risk_free_rate = returns_df.mean(axis=1)
returns_df = returns_df.assign(Risk_Free_Rate=risk_free_rate)
returns_df

Unnamed: 0_level_0,shortterm,usd,usdinv,kodexinv,oil,gold,hybond,midbond,10y,csi300,nikkei,euro50,sp500,kosdaq150,kodex200,Risk_Free_Rate
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
2016-03-04,0.0000,-0.0096,0.0092,-0.0012,0.0127,0.0183,0.0037,-0.0020,-0.0019,0.0042,0.0019,-0.0046,0.0043,0.0028,0.0002,0.002533
2016-03-07,0.0000,-0.0036,0.0019,-0.0024,0.0458,-0.0021,0.0037,0.0017,0.0000,0.0048,-0.0024,-0.0005,0.0006,-0.0051,0.0021,0.002967
2016-03-08,-0.0001,0.0045,-0.0043,0.0079,0.0093,0.0067,0.0005,0.0000,0.0029,-0.0089,-0.0048,-0.0005,-0.0011,0.0009,-0.0070,0.000400
2016-03-09,0.0002,0.0097,-0.0082,-0.0042,-0.0145,-0.0107,-0.0005,-0.0014,-0.0005,-0.0035,-0.0100,-0.0057,-0.0021,0.0060,0.0043,-0.002740
2016-03-10,-0.0001,-0.0108,0.0112,-0.0073,0.0174,-0.0046,-0.0010,0.0019,-0.0010,0.0021,0.0116,0.0010,0.0017,0.0055,0.0092,0.002453
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2024-07-16,0.0002,0.0017,-0.0029,0.0000,-0.0106,0.0108,-0.0023,0.0013,0.0042,0.0080,0.0003,-0.0089,0.0019,-0.0162,0.0006,-0.000793
2024-07-17,0.0000,-0.0026,0.0047,0.0090,-0.0096,0.0130,0.0062,-0.0004,-0.0006,-0.0017,-0.0039,-0.0096,0.0000,-0.0164,-0.0083,-0.001347
2024-07-18,0.0002,-0.0007,-0.0012,0.0063,0.0280,0.0036,-0.0012,-0.0003,-0.0003,0.0051,-0.0222,-0.0054,-0.0063,-0.0050,-0.0074,-0.000453
2024-07-19,0.0000,0.0046,-0.0035,0.0126,-0.0094,-0.0216,0.0004,-0.0003,-0.0010,0.0108,-0.0028,-0.0054,-0.0082,0.0089,-0.0118,-0.001780


In [None]:
# Excess return per asset and date: the asset's return minus that date's
# risk_free_rate value. The helper column is removed so only assets remain.
excess_returns_df = (
    returns_df
    .drop(columns='Risk_Free_Rate')
    .sub(risk_free_rate, axis=0)
)
excess_returns_df

In [None]:

# Mean-to-standard-deviation ratio of the excess returns, one value per date.
# NOTE(review): axis=1 aggregates across assets within a date, and the excess
# returns were built by subtracting each date's cross-asset mean, so the
# numerator is ~0 by construction. A per-asset Sharpe ratio would aggregate over
# time (axis=0) — confirm which one is intended.
excess_mean_by_date = excess_returns_df.mean(axis=1)
excess_std_by_date = excess_returns_df.std(axis=1)
sharpe_ratios_df = (excess_mean_by_date / excess_std_by_date).to_frame('Sharp_ratio')


In [None]:
window_size = 5  # number of consecutive rows (trading days) per window

# One correlation matrix per window. `end` is the exclusive end row of the
# window, so it has to run up to and including len(excess_returns_df);
# stopping one short would silently drop the most recent window and leave this
# array one entry shorter than the rolling covariance array built the same way.
rolling_correlations = [
    excess_returns_df.iloc[end - window_size:end].corr().values
    for end in range(window_size, len(excess_returns_df) + 1)
]

# Stack into a 3D array: (window, asset, asset).
rolling_correlations_3d = np.array(rolling_correlations)

# Show the shape only; the full array is too large to be a useful cell output.
rolling_correlations_3d.shape


In [None]:
# Window length for the rolling calculation.
window_size = 5  # for example, a 5-day rolling window

# One covariance matrix per window; `end` is the exclusive end row, so the
# range includes len(excess_returns_df) to cover the most recent window.
rolling_covariances = [
    excess_returns_df.iloc[end - window_size:end].cov().values
    for end in range(window_size, len(excess_returns_df) + 1)
]

# Stack into a 3D array: (window, asset, asset).
rolling_covariances_3d = np.array(rolling_covariances)

# Display the shape of the 3D array to ensure correctness (not the array itself).
rolling_covariances_3d.shape


In [None]:
import numpy as np
import pandas as pd
from scipy.cluster.hierarchy import linkage, dendrogram
from scipy.spatial.distance import squareform

# Default weight bounds for every asset (minimum 0, maximum 1).
labels = excess_returns_df.columns.tolist()
bounds = {label: (0, 1) for label in labels}

# Example per-asset bounds (edit as needed).
individual_bounds = {
    'A138230': (0.00, 0.20),
    'A114800': (0.00, 0.20),
    'A132030': (0.00, 0.15),
    'A130680': (0.00, 0.15),
    'A192090': (0.00, 0.20),
    'A136340': (0.00, 0.40)
}

# Overlay the per-asset bounds on the defaults.
bounds.update(individual_bounds)

# `bounds` is looked up by asset label, so an override whose key is not one of
# the columns of excess_returns_df is never read. Report such keys instead of
# letting the override be ignored silently.
unmatched_bound_keys = sorted(set(individual_bounds) - set(labels))
if unmatched_bound_keys:
    print(
        "Warning: individual_bounds keys not present in labels "
        f"(these bounds are never applied): {unmatched_bound_keys}"
    )

# Group the assets into clusters with hierarchical clustering.
def hierarchical_clustering(corr):
    """Build a single-linkage tree from a correlation matrix.

    The pairwise distance is ``1 - corr`` with the diagonal forced to zero.

    Parameters
    ----------
    corr : pandas.DataFrame or array-like, shape (n, n)
        Symmetric matrix of pairwise correlations. It is only read: the
        computation runs on a private array, so the caller's matrix keeps its
        diagonal (earlier the diagonal was overwritten with zeros in place).

    Returns
    -------
    numpy.ndarray
        The linkage matrix produced by ``scipy.cluster.hierarchy.linkage``.

    Raises
    ------
    ValueError
        Propagated from ``squareform``/``linkage`` when the distance matrix is
        not symmetric or contains non-finite values.
    """
    # The subtraction allocates a new array, so nothing below touches `corr`.
    dist = 1.0 - np.asarray(corr, dtype=float)

    # squareform requires an exactly-zero diagonal.
    np.fill_diagonal(dist, 0.0)

    return linkage(squareform(dist), method='single')

# Quasi-diagonalization
def get_quasi_diag(linkage_matrix):
    """Return the leaf order obtained by unfolding the linkage tree.

    Starts from the two children of the last merge and keeps replacing every
    cluster id (an id >= the number of leaves) with that cluster's two
    children, in place, until only leaf ids remain.

    Parameters
    ----------
    linkage_matrix : numpy.ndarray, shape (n - 1, 4)
        Linkage matrix; column 3 of the last row is read as the leaf count.

    Returns
    -------
    list of int
        Leaf indices in tree order.
    """
    n_leaves = int(linkage_matrix[-1, 3])
    order = [int(linkage_matrix[-1, 0]), int(linkage_matrix[-1, 1])]

    while max(order) >= n_leaves:
        expanded = []
        for node in order:
            if node < n_leaves:
                # Already a leaf: keep it where it is.
                expanded.append(node)
            else:
                # Cluster id: row (node - n_leaves) holds its two children.
                left, right = linkage_matrix[node - n_leaves, :2]
                expanded += [int(left), int(right)]
        order = expanded

    return order

# Cluster variance (getClusterVar)
def get_cluster_var(cov, cItems):
    """Variance of a cluster under inverse-variance weights.

    Parameters
    ----------
    cov : pandas.DataFrame
        Covariance matrix indexed by asset label on both axes.
    cItems : list
        Labels of the assets that form the cluster.

    Returns
    -------
    float
        ``w' C w`` where ``C`` is the cluster's covariance sub-matrix and
        ``w`` are the normalised inverse-variance weights.
    """
    sub_cov = cov.loc[cItems, cItems]

    # Work on a copy of the diagonal so it can be patched.
    variances = np.diag(sub_cov).copy()
    # Exact zeros are replaced with a tiny value to avoid dividing by zero.
    variances[variances == 0] = 1e-10

    inv_var_weights = 1. / variances
    inv_var_weights = inv_var_weights / inv_var_weights.sum()

    return np.dot(np.dot(inv_var_weights, sub_cov), inv_var_weights)


# HRP recursive bisection
def get_rec_bipart(cov, sortIx, bounds):
    """Allocate weights by recursively bisecting the ordered asset list.

    Parameters
    ----------
    cov : pandas.DataFrame
        Covariance matrix whose index order defines the bisection order.
    sortIx : list
        Accepted for signature compatibility; not read by this function.
    bounds : dict
        Maps each asset label to a ``(lower, upper)`` pair used to clip that
        asset's weight after every split it takes part in.

    Returns
    -------
    pandas.Series
        Weights indexed like ``cov.index``, clipped to [0, 1] and then divided
        by their sum.
    """
    # Start from 1.0 so the Series has a float dtype.
    weights = pd.Series(1.0, index=cov.index)
    # Begin with a single cluster holding every asset.
    pending = [cov.index.tolist()]

    while pending:
        next_level = []
        for cluster in pending:
            if len(cluster) <= 1:
                continue  # a single asset cannot be split further

            # Cut the cluster into two halves by position.
            mid = len(cluster) // 2
            left, right = cluster[:mid], cluster[mid:]

            var_left = get_cluster_var(cov, left)
            var_right = get_cluster_var(cov, right)
            total_var = var_left + var_right

            # With a zero denominator both halves receive an equal share.
            alpha = 0.5 if total_var == 0 else 1 - var_left / total_var

            weights[left] *= alpha
            weights[right] *= 1 - alpha

            # Apply the per-asset bounds to everything touched by this split.
            for asset in left + right:
                lower, upper = bounds[asset]
                weights[asset] = np.clip(weights[asset], lower, upper)

            next_level.extend([left, right])
        pending = next_level

    # Keep the weights inside [0, 1] ...
    weights = np.clip(weights, 0, 1)
    # ... and rescale them so they sum to 1.
    weights /= weights.sum()
    return weights



hrp_weights_list = []

for window_idx in range(rolling_covariances_3d.shape[0]):
    # Covariance matrix of this window, labelled by asset.
    cov_values = np.array(rolling_covariances_3d[window_idx], dtype=float)
    cov_matrix = pd.DataFrame(cov_values, index=labels, columns=labels)

    # The clustering distance is defined on correlations, so derive them from
    # the covariance: corr_ij = cov_ij / (std_i * std_j). Entries that are
    # undefined because an asset has zero variance are treated as uncorrelated.
    std = np.sqrt(np.diag(cov_values))
    with np.errstate(divide='ignore', invalid='ignore'):
        corr_values = cov_values / np.outer(std, std)
    corr_values = np.nan_to_num(corr_values, nan=0.0, posinf=0.0, neginf=0.0)
    corr_values = np.clip(corr_values, -1.0, 1.0)
    # A separate frame is handed to the clustering step so that the covariance
    # matrix used for the weights stays untouched whatever that step does
    # with its input.
    corr_matrix = pd.DataFrame(corr_values, index=labels, columns=labels)

    # Hierarchical clustering and the resulting asset order.
    linkage_matrix = hierarchical_clustering(corr_matrix)
    sorted_indices = get_quasi_diag(linkage_matrix)
    sorted_labels = [labels[pos] for pos in sorted_indices]

    # HRP weights from the covariance matrix reordered by the tree.
    sorted_cov_matrix = cov_matrix.loc[sorted_labels, sorted_labels]
    hrp_weights = get_rec_bipart(sorted_cov_matrix, sorted_labels, bounds)

    # Store the result.
    hrp_weights_list.append(hrp_weights)

# Final result: show only the last few weight vectors instead of the full list.
hrp_weights_list[-3:]


In [None]:
# 60-day rolling covariance: rows are (date, asset) pairs, columns are assets,
# i.e. one covariance matrix per date. It gets its own name so the 3D array
# `rolling_covariances_3d` is not replaced by an object of a different kind.
rolling_cov_60d = excess_returns_df.rolling(window=60).cov()

hrp_weights_list = []
hrp_weight_dates = []

for date, cov_block in rolling_cov_60d.groupby(level=0):
    cov_matrix = cov_block.droplevel(0).loc[labels, labels]

    # Skip dates whose matrix is incomplete (e.g. the window is not full yet).
    # Whole dates are skipped, so a matrix is never stitched from two dates.
    if cov_matrix.isna().to_numpy().any():
        continue

    # The clustering distance is defined on correlations, so derive them from
    # the covariance: corr_ij = cov_ij / (std_i * std_j). Entries that are
    # undefined because an asset has zero variance are treated as uncorrelated.
    cov_values = cov_matrix.to_numpy(dtype=float)
    std = np.sqrt(np.diag(cov_values))
    with np.errstate(divide='ignore', invalid='ignore'):
        corr_values = cov_values / np.outer(std, std)
    corr_values = np.nan_to_num(corr_values, nan=0.0, posinf=0.0, neginf=0.0)
    corr_values = np.clip(corr_values, -1.0, 1.0)
    # A separate frame is handed to the clustering step so that the covariance
    # matrix used for the weights stays untouched whatever that step does
    # with its input.
    corr_matrix = pd.DataFrame(corr_values, index=labels, columns=labels)

    linkage_matrix = hierarchical_clustering(corr_matrix)
    sorted_indices = get_quasi_diag(linkage_matrix)
    sorted_labels = [labels[pos] for pos in sorted_indices]

    sorted_cov_matrix = cov_matrix.loc[sorted_labels, sorted_labels]
    hrp_weights = get_rec_bipart(sorted_cov_matrix, sorted_labels, bounds)

    hrp_weights_list.append(hrp_weights)
    hrp_weight_dates.append(date)

# Weights are indexed by the date of the window they were estimated on.
hrp_weights_df = pd.DataFrame(hrp_weights_list, index=hrp_weight_dates)

# Portfolio return and cumulative return.
# The window ending on date t already contains date t's return, so weights
# estimated on t can only be held from the next row onwards. Lagging them by
# one row removes the look-ahead; the first row has no prior weights and is
# dropped.
# NOTE(review): returns here are the excess returns (asset return minus the
# cross-asset mean of that date), so the curve is relative performance —
# confirm that this, rather than raw returns, is what should be cumulated.
lagged_weights = hrp_weights_df.shift(1).iloc[1:]
portfolio_returns = (excess_returns_df.loc[lagged_weights.index] * lagged_weights).sum(axis=1)
cumulative_returns = (1 + portfolio_returns).cumprod()

# Plot the result.
fig, ax = plt.subplots(figsize=(14, 7))
ax.plot(cumulative_returns, label='HRP Portfolio')
ax.set_title('HRP Portfolio Cumulative Returns')
ax.set_xlabel('Date')
ax.set_ylabel('Cumulative Return')
ax.legend()
ax.grid(True)
plt.show()

In [None]:
import pickle

# Persist the cumulative-return series as a pickle under ALPHAS_PATH.
output_path = ALPHAS_PATH / 'garam_hrp_cumreturn.pkl'
with output_path.open('wb') as handle:
    pickle.dump(cumulative_returns, handle)