In [1]:
import sys
import os
sys.path.append('../dags')

from database import create_database_engine


# Export the Postgres connection settings (presumably read by
# create_database_engine -- TODO confirm) before building the engine.
# NOTE(review): these credentials are hard-coded and overwrite whatever is
# already set in the environment; consider sourcing them from the shell or a
# .env file instead.
os.environ.update(
    {
        "POSTGRES_USER": "airflow",
        "POSTGRES_PASSWORD": "airflow",
        "POSTGRES_DB": "events",
        "POSTGRES_PORT": "5433",
    }
)

engine = create_database_engine(host="127.0.0.1")
print("Successfully connected to the database")

Successfully connected to the database


In [2]:
import pandas as pd
from datetime import datetime

# Table names shared by every query in this notebook.
label_table_1, label_table_2 = "kind", "label_finbert"
return_table = "abnormal_return_kind"

# abn_ret_* column names, ordered -10m ... -1m and then +1m ... +10m.
abn_return_cols = (
    [f"abn_ret_minus_{minute}m" for minute in range(10, 0, -1)]
    + [f"abn_ret_{minute}m" for minute in range(1, 11)]
)

# Horizons 1..10 and the column names belonging to each (+n, -n) pair.
delta_minutes = list(range(1, 11))
delta_ar_cols = [f"delta_ar_{minute}m" for minute in delta_minutes]
abn_ret_pos = [f"abn_ret_{minute}m" for minute in delta_minutes]
abn_ret_neg = [f"abn_ret_minus_{minute}m" for minute in delta_minutes]

# One SELECT expression per horizon:
#   (ar.abn_ret_{n}m - ar.abn_ret_minus_{n}m) AS delta_ar_{n}m
delta_exprs = [
    f"(ar.abn_ret_{minute}m - ar.abn_ret_minus_{minute}m) AS delta_ar_{minute}m"
    for minute in delta_minutes
]

# Full SELECT list: event timestamp, the deltas, then both label columns.
sql_columns = [
    "ar.event_ts",
    *delta_exprs,
    "k.label AS label_gpt",
    "f.label AS label_finbert",
]
sql_columns_str = ",\n    ".join(sql_columns)

# NOTE(review): the rendered result contains identical consecutive rows, so one
# of the joins may fan out (or the source table holds duplicates) -- confirm
# key uniqueness on kind.id / label_finbert.event_id.
query = f"""
SELECT {sql_columns_str}
FROM {return_table} ar
JOIN {label_table_1} k ON ar.event_id = k.id
LEFT JOIN {label_table_2} f ON ar.event_id = f.event_id
ORDER BY ar.event_ts ASC
"""

df_total = pd.read_sql(query, engine)
df_total

DETAIL:  The database was created using collation version 2.36, but the operating system provides version 2.41.
HINT:  Rebuild all objects in this database that use the default collation and run ALTER DATABASE events REFRESH COLLATION VERSION, or build PostgreSQL with the right library version.


Unnamed: 0,event_ts,delta_ar_1m,delta_ar_2m,delta_ar_3m,delta_ar_4m,delta_ar_5m,delta_ar_6m,delta_ar_7m,delta_ar_8m,delta_ar_9m,delta_ar_10m,label_gpt,label_finbert
0,2021-01-04 10:08:00+00:00,2.71,1.41,2.27,2.24,2.18,1.70,1.08,0.53,1.80,1.69,0,1
1,2021-01-04 10:08:00+00:00,2.46,1.64,0.03,0.00,-0.03,0.20,0.02,0.08,,,1,1
2,2021-01-04 10:08:00+00:00,2.71,1.41,2.27,2.24,2.18,1.70,1.08,0.53,1.80,1.69,0,1
3,2021-01-04 10:14:00+00:00,7.87,7.97,5.76,4.42,5.01,4.68,4.68,4.15,3.89,3.50,1,1
4,2021-01-04 10:16:00+00:00,0.64,0.78,0.24,0.04,-0.17,-0.47,-0.27,-0.47,-1.00,-0.19,-1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...
8651,2023-12-28 14:46:00+00:00,5.28,11.02,10.94,15.47,8.83,9.45,9.78,9.11,9.72,9.18,1,1
8652,2023-12-28 14:53:00+00:00,0.72,-0.63,0.24,0.29,0.19,0.22,-0.39,-0.41,-0.37,0.32,1,1
8653,2023-12-28 14:56:00+00:00,-0.49,-0.03,-0.03,-0.08,-0.08,-0.10,-0.13,-0.14,0.01,0.21,0,0
8654,2023-12-28 14:56:00+00:00,-0.49,-0.03,-0.03,-0.08,-0.08,-0.10,-0.13,-0.14,0.01,0.21,0,0


# delta_ar_1m~10m: 중립 제거

In [3]:
import numpy as np

# Hit ratio with neutral labels excluded.
def _directional_hit_ratio(frame, delta_col, label_col):
    """Fraction of rows whose delta sign equals the label sign.

    A row is used only when both ``delta_col`` and ``label_col`` are non-null
    and the label is not neutral (0). A delta of exactly 0 has sign 0 and
    therefore never matches a non-neutral label.

    Returns:
        (hit_ratio, n): the ratio of sign matches and the number of rows used,
        or ``(np.nan, 0)`` when no row qualifies.
    """
    usable = (
        frame[delta_col].notna()
        & frame[label_col].notna()
        & (frame[label_col] != 0)
    )
    n_used = usable.sum()
    if not n_used > 0:
        return np.nan, 0
    sign_match = (
        np.sign(frame.loc[usable, delta_col]) == np.sign(frame.loc[usable, label_col])
    )
    return sign_match.sum() / n_used, n_used


results = []

for m in delta_minutes:
    delta_col = f"delta_ar_{m}m"

    hit_ratio_gpt, total_gpt = _directional_hit_ratio(df_total, delta_col, "label_gpt")
    hit_ratio_finbert, total_finbert = _directional_hit_ratio(
        df_total, delta_col, "label_finbert"
    )

    results.append({
        "delta_ar": f"{m}m",
        "hit_ratio_gpt": hit_ratio_gpt,
        "n_gpt": total_gpt,
        "hit_ratio_finbert": hit_ratio_finbert,
        "n_finbert": total_finbert
    })

# One row per horizon; ratios rounded to four decimals for display.
result_df = pd.DataFrame(results).round({"hit_ratio_gpt": 4, "hit_ratio_finbert": 4})
result_df


Unnamed: 0,delta_ar,hit_ratio_gpt,n_gpt,hit_ratio_finbert,n_finbert
0,1m,0.7746,5640,0.7695,7341
1,2m,0.7754,5611,0.7746,7308
2,3m,0.7614,5575,0.7633,7267
3,4m,0.7505,5548,0.7527,7227
4,5m,0.7468,5521,0.7481,7189
5,6m,0.7385,5484,0.7407,7145
6,7m,0.7266,5458,0.732,7105
7,8m,0.7253,5409,0.7277,7056
8,9m,0.724,5369,0.7265,7006
9,10m,0.717,5335,0.7216,6960


# delta_ar_1m~10m: 중립 포함

In [4]:
import numpy as np
from scipy.stats import norm

# SE 계산용 유틸 함수
def _safe_std_1d(a):
    """NaN 무시, 길이<=1이면 0 반환"""
    a = np.asarray(a, dtype=float)
    a = a[~np.isnan(a)]
    n = a.size
    if n <= 1:
        return 0.0
    return float(np.nanstd(a, ddof=1))

def _eventwise_se_delta(df_subset, t):
    """
    이벤트별 ΔCAR의 SE (근사):
      - pre:  abn_ret_minus_1m ... abn_ret_minus_tm
      - post: abn_ret_1m ... abn_ret_tm
    SE_pre  ~= SD(pre_window)  * sqrt(t)
    SE_post ~= SD(post_window) * sqrt(t)
    SE_delta_i = sqrt(SE_pre^2 + SE_post^2)
    """
    pre_cols  = [f"abn_ret_minus_{k}m" for k in range(1, t+1)]
    post_cols = [f"abn_ret_{k}m"       for k in range(1, t+1)]
    
    pre_mat  = df_subset[pre_cols].to_numpy(dtype=float)
    post_mat = df_subset[post_cols].to_numpy(dtype=float)
    
    # 이벤트별 표준편차
    sd_pre  = np.apply_along_axis(_safe_std_1d, 1, pre_mat)
    sd_post = np.apply_along_axis(_safe_std_1d, 1, post_mat)
    
    se_pre  = sd_pre  * np.sqrt(t)
    se_post = sd_post * np.sqrt(t)
    se_delta = np.sqrt(se_pre**2 + se_post**2)
    
    # ΔCAR
    delta = post_mat.sum(axis=1) - pre_mat.sum(axis=1)
    return se_delta, delta

# The SE helper needs the raw abn_ret_* columns, which df_total (deltas only)
# does not carry, so run the same join again with those columns added to the
# SELECT list.
sql_columns_with_abn = [
    "ar.event_ts",
    *delta_exprs,
    *(f"ar.{col}" for col in abn_return_cols),
    "k.label AS label_gpt",
    "f.label AS label_finbert",
]
sql_columns_str_with_abn = ",\n    ".join(sql_columns_with_abn)

query_with_abn = f"""
SELECT {sql_columns_str_with_abn}
FROM {return_table} ar
JOIN {label_table_1} k ON ar.event_id = k.id
LEFT JOIN {label_table_2} f ON ar.event_id = f.event_id
ORDER BY ar.event_ts ASC
"""

df_total_with_abn = pd.read_sql(query_with_abn, engine)
df_total_with_abn


Unnamed: 0,event_ts,delta_ar_1m,delta_ar_2m,delta_ar_3m,delta_ar_4m,delta_ar_5m,delta_ar_6m,delta_ar_7m,delta_ar_8m,delta_ar_9m,...,abn_ret_3m,abn_ret_4m,abn_ret_5m,abn_ret_6m,abn_ret_7m,abn_ret_8m,abn_ret_9m,abn_ret_10m,label_gpt,label_finbert
0,2021-01-04 10:08:00+00:00,2.71,1.41,2.27,2.24,2.18,1.70,1.08,0.53,1.80,...,-0.18,0.19,-0.20,-0.23,-0.24,-0.67,-0.34,-0.45,0,1
1,2021-01-04 10:08:00+00:00,2.46,1.64,0.03,0.00,-0.03,0.20,0.02,0.08,,...,-2.66,-2.49,-2.27,-1.24,-1.62,-1.37,-1.04,-1.91,1,1
2,2021-01-04 10:08:00+00:00,2.71,1.41,2.27,2.24,2.18,1.70,1.08,0.53,1.80,...,-0.18,0.19,-0.20,-0.23,-0.24,-0.67,-0.34,-0.45,0,1
3,2021-01-04 10:14:00+00:00,7.87,7.97,5.76,4.42,5.01,4.68,4.68,4.15,3.89,...,-2.60,-4.00,-3.64,-3.51,-3.64,-4.17,-4.70,-4.90,1,1
4,2021-01-04 10:16:00+00:00,0.64,0.78,0.24,0.04,-0.17,-0.47,-0.27,-0.47,-1.00,...,0.25,0.19,0.06,-0.24,-0.06,-0.24,-0.50,0.03,-1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8651,2023-12-28 14:46:00+00:00,5.28,11.02,10.94,15.47,8.83,9.45,9.78,9.11,9.72,...,9.00,13.65,7.01,7.60,7.93,7.26,7.78,7.36,1,1
8652,2023-12-28 14:53:00+00:00,0.72,-0.63,0.24,0.29,0.19,0.22,-0.39,-0.41,-0.37,...,0.24,0.29,0.21,0.21,0.23,0.27,0.27,0.24,1,1
8653,2023-12-28 14:56:00+00:00,-0.49,-0.03,-0.03,-0.08,-0.08,-0.10,-0.13,-0.14,0.01,...,0.42,0.44,0.48,0.48,0.45,0.45,0.42,0.42,0,0
8654,2023-12-28 14:56:00+00:00,-0.49,-0.03,-0.03,-0.08,-0.08,-0.10,-0.13,-0.14,0.01,...,0.42,0.44,0.48,0.48,0.45,0.45,0.42,0.42,0,0


In [5]:
# Hit ratio with neutral labels included.
# A neutral label (0) counts as a hit when |delta| lies inside a per-event band
# of z * SE_delta; a non-neutral label counts as a hit when its sign matches
# the sign of delta.
alpha = 0.05
z = norm.ppf(1 - alpha/2.0)  # two-sided critical value of the standard normal


def _hit_ratio_eventwise_band(frame, delta_col, label_col, horizon, z_crit):
    """Hit ratio for one label column, using a per-event tolerance band.

    Args:
        frame: DataFrame with ``delta_col``, ``label_col`` and the abn_ret_*
            columns required by ``_eventwise_se_delta`` for ``horizon``.
        delta_col: Name of the delta column to score.
        label_col: Name of the label column (0 = neutral).
        horizon: Window length passed to ``_eventwise_se_delta``.
        z_crit: Multiplier applied to the per-event SE to get the band width.

    Returns:
        (hit_ratio, n): ratio of hits and number of rows with both delta and
        label present, or ``(np.nan, 0)`` when there are none.
    """
    usable = frame[delta_col].notna() & frame[label_col].notna()
    n_used = int(usable.sum())
    if n_used == 0:
        return np.nan, 0

    subset = frame.loc[usable]
    se_delta, _ = _eventwise_se_delta(subset, horizon)
    band = z_crit * se_delta

    labels = subset[label_col].to_numpy()
    deltas = subset[delta_col].to_numpy()

    # Labels go through np.sign so the directional rule matches the
    # neutral-excluded hit-ratio computation.
    sign_match = np.sign(deltas) == np.sign(labels)
    inside_band = np.abs(deltas) <= band
    hits = np.where(labels == 0, inside_band, sign_match)
    return hits.sum() / n_used, n_used


# NOTE(review): at the 1-minute horizon each window holds a single value, so
# _safe_std_1d returns 0 and the band collapses to |delta| <= 0. Neutral labels
# then score as hits only when delta is exactly 0 -- confirm this is intended.
results_with_neutral = []

for m in delta_minutes:
    delta_col = f"delta_ar_{m}m"

    hit_ratio_gpt, total_gpt = _hit_ratio_eventwise_band(
        df_total_with_abn, delta_col, "label_gpt", m, z
    )
    hit_ratio_finbert, total_finbert = _hit_ratio_eventwise_band(
        df_total_with_abn, delta_col, "label_finbert", m, z
    )

    results_with_neutral.append({
        "delta_ar": f"{m}m",
        "hit_ratio_gpt": hit_ratio_gpt,
        "n_gpt": total_gpt,
        "hit_ratio_finbert": hit_ratio_finbert,
        "n_finbert": total_finbert
    })

result_df_with_neutral = pd.DataFrame(results_with_neutral)
result_df_with_neutral = result_df_with_neutral.round({"hit_ratio_gpt": 4, "hit_ratio_finbert": 4})
result_df_with_neutral


Unnamed: 0,delta_ar,hit_ratio_gpt,n_gpt,hit_ratio_finbert,n_finbert
0,1m,0.5199,8606,0.6633,8606
1,2m,0.6932,8563,0.7471,8563
2,3m,0.7417,8508,0.7626,8508
3,4m,0.7665,8457,0.7658,8457
4,5m,0.7896,8411,0.7705,8411
5,6m,0.7976,8360,0.7695,8360
6,7m,0.7986,8311,0.765,8311
7,8m,0.8031,8248,0.763,8248
8,9m,0.8059,8192,0.7629,8192
9,10m,0.8037,8135,0.7593,8135


In [6]:
# Longer horizons: delta_ar for 10m..60m (these columns exist only at
# 10-minute steps).
post_horizons = list(range(10, 61, 10))
post_abn_cols = [f"abn_ret_{horizon}m" for horizon in post_horizons]
post_abn_minus_cols = [f"abn_ret_minus_{horizon}m" for horizon in post_horizons]

# One SELECT expression per horizon:
#   (ar.abn_ret_{h}m - ar.abn_ret_minus_{h}m) AS delta_ar_{h}m
delta_exprs_post = [
    f"(ar.abn_ret_{horizon}m - ar.abn_ret_minus_{horizon}m) AS delta_ar_{horizon}m"
    for horizon in post_horizons
]

# SELECT list: timestamp, deltas, the raw post and pre columns, both labels.
sql_columns_post = [
    "ar.event_ts",
    *delta_exprs_post,
    *(f"ar.{col}" for col in post_abn_cols),
    *(f"ar.{col}" for col in post_abn_minus_cols),
    "k.label AS label_gpt",
    "f.label AS label_finbert",
]
sql_columns_str_post = ",\n    ".join(sql_columns_post)

query_post = f"""
SELECT {sql_columns_str_post}
FROM {return_table} ar
JOIN {label_table_1} k ON ar.event_id = k.id
LEFT JOIN {label_table_2} f ON ar.event_id = f.event_id
ORDER BY ar.event_ts ASC
"""

df_total_post = pd.read_sql(query_post, engine)
df_total_post


Unnamed: 0,event_ts,delta_ar_10m,delta_ar_20m,delta_ar_30m,delta_ar_40m,delta_ar_50m,delta_ar_60m,abn_ret_10m,abn_ret_20m,abn_ret_30m,...,abn_ret_50m,abn_ret_60m,abn_ret_minus_10m,abn_ret_minus_20m,abn_ret_minus_30m,abn_ret_minus_40m,abn_ret_minus_50m,abn_ret_minus_60m,label_gpt,label_finbert
0,2021-01-04 10:08:00+00:00,1.69,2.31,2.19,,,,-0.45,0.17,0.05,...,-0.15,0.18,-2.14,-2.14,-2.14,,,,0,1
1,2021-01-04 10:08:00+00:00,,,,,,,-1.91,-0.66,-1.47,...,-0.60,-0.95,,,,,,,1,1
2,2021-01-04 10:08:00+00:00,1.69,2.31,2.19,,,,-0.45,0.17,0.05,...,-0.15,0.18,-2.14,-2.14,-2.14,,,,0,1
3,2021-01-04 10:14:00+00:00,3.50,,,,,,-4.90,-4.20,-3.46,...,-5.16,-6.00,-8.40,,,,,,1,1
4,2021-01-04 10:16:00+00:00,-0.19,-2.38,-2.09,-2.51,,,0.03,-0.83,-0.54,...,-0.57,-1.83,0.22,1.55,1.55,1.55,,,-1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8651,2023-12-28 14:46:00+00:00,9.18,6.88,7.22,,,,7.36,4.96,5.17,...,,,-1.82,-1.92,-2.05,-1.96,-2.06,-1.89,1,1
8652,2023-12-28 14:53:00+00:00,0.32,-0.56,,,,,0.24,0.24,,...,,,-0.08,0.80,0.02,0.26,0.25,0.86,1,1
8653,2023-12-28 14:56:00+00:00,0.21,-0.80,,,,,0.42,-0.14,,...,,,0.21,0.66,0.07,1.30,1.76,1.52,0,0
8654,2023-12-28 14:56:00+00:00,0.21,-0.80,,,,,0.42,-0.14,,...,,,0.21,0.66,0.07,1.30,1.76,1.52,0,0


# delta_ar_10m~60m: 중립 제거


In [7]:
# delta_ar_10m~60m: hit ratio with neutral labels excluded.
results_post = []

for h in post_horizons:
    delta_col = f"delta_ar_{h}m"
    has_delta = df_total_post[delta_col].notna()

    # Score each label column on the rows where delta and label are both
    # present and the label is not neutral (0).
    stats = {}
    for label_col in ("label_gpt", "label_finbert"):
        labels = df_total_post[label_col]
        keep = has_delta & labels.notna() & (labels != 0)
        n_kept = keep.sum()
        if n_kept > 0:
            agree = np.sign(df_total_post.loc[keep, delta_col]) == np.sign(labels.loc[keep])
            stats[label_col] = (agree.sum() / n_kept, n_kept)
        else:
            stats[label_col] = (np.nan, 0)

    hit_ratio_gpt, total_gpt = stats["label_gpt"]
    hit_ratio_finbert, total_finbert = stats["label_finbert"]

    results_post.append({
        "delta_ar": f"{h}m",
        "hit_ratio_gpt": hit_ratio_gpt,
        "n_gpt": total_gpt,
        "hit_ratio_finbert": hit_ratio_finbert,
        "n_finbert": total_finbert
    })

# One row per horizon; ratios rounded to four decimals for display.
result_df_post = pd.DataFrame(results_post).round({"hit_ratio_gpt": 4, "hit_ratio_finbert": 4})
result_df_post


Unnamed: 0,delta_ar,hit_ratio_gpt,n_gpt,hit_ratio_finbert,n_finbert
0,10m,0.717,5335,0.7216,6960
1,20m,0.6945,5008,0.701,6532
2,30m,0.6848,4667,0.6877,6077
3,40m,0.6891,4339,0.6881,5634
4,50m,0.6756,3986,0.6718,5176
5,60m,0.6713,3633,0.665,4702


# delta_ar_10m~60m: 중립 포함


In [8]:
# delta_ar_10m~60m: hit ratio with neutral labels included.
# Each 10-minute horizon has a single value per side, so no per-event SE can be
# formed; the tolerance band uses the standard deviation across events instead.
alpha = 0.05
z = norm.ppf(1 - alpha/2.0)  # two-sided critical value of the standard normal


def _hit_ratio_global_band(frame, delta_col, pre_col, post_col, label_col, z_crit):
    """Hit ratio for one label column, using one tolerance band for all events.

    The band is ``z_crit * sqrt(SD(pre_col)**2 + SD(post_col)**2)``, where each
    SD is the sample standard deviation (ddof=1) over the rows being scored and
    is taken as 0.0 when fewer than two values are available. A neutral label
    (0) is a hit when ``|delta| <= band``; any other label is a hit when its
    sign equals the sign of delta.

    Args:
        frame: DataFrame holding ``delta_col``, ``pre_col``, ``post_col`` and
            ``label_col``.
        delta_col: Name of the delta column to score.
        pre_col: Name of the pre-event column.
        post_col: Name of the post-event column.
        label_col: Name of the label column (0 = neutral).
        z_crit: Multiplier applied to the combined SD to get the band width.

    Returns:
        (hit_ratio, n): ratio of hits and number of rows with both delta and
        label present, or ``(np.nan, 0)`` when there are none.
    """
    usable = frame[delta_col].notna() & frame[label_col].notna()
    n_used = int(usable.sum())
    if n_used == 0:
        return np.nan, 0

    subset = frame.loc[usable]

    pre_vals = subset[pre_col].dropna()
    post_vals = subset[post_col].dropna()
    sd_pre = pre_vals.std(ddof=1) if len(pre_vals) > 1 else 0.0
    sd_post = post_vals.std(ddof=1) if len(post_vals) > 1 else 0.0
    # sqrt(0) is already 0, so no special case is needed for zero spread.
    band = z_crit * np.sqrt(sd_pre**2 + sd_post**2)

    labels = subset[label_col].to_numpy()
    deltas = subset[delta_col].to_numpy()

    # Labels go through np.sign so the directional rule matches the
    # neutral-excluded hit-ratio computation.
    sign_match = np.sign(deltas) == np.sign(labels)
    inside_band = np.abs(deltas) <= band
    hits = np.where(labels == 0, inside_band, sign_match)
    return hits.sum() / n_used, n_used


results_post_with_neutral = []

for h in post_horizons:
    delta_col = f"delta_ar_{h}m"
    pre_col = f"abn_ret_minus_{h}m"
    post_col = f"abn_ret_{h}m"

    hit_ratio_gpt, total_gpt = _hit_ratio_global_band(
        df_total_post, delta_col, pre_col, post_col, "label_gpt", z
    )
    hit_ratio_finbert, total_finbert = _hit_ratio_global_band(
        df_total_post, delta_col, pre_col, post_col, "label_finbert", z
    )

    results_post_with_neutral.append({
        "delta_ar": f"{h}m",
        "hit_ratio_gpt": hit_ratio_gpt,
        "n_gpt": total_gpt,
        "hit_ratio_finbert": hit_ratio_finbert,
        "n_finbert": total_finbert
    })

result_df_post_with_neutral = pd.DataFrame(results_post_with_neutral)
result_df_post_with_neutral = result_df_post_with_neutral.round({"hit_ratio_gpt": 4, "hit_ratio_finbert": 4})
result_df_post_with_neutral


Unnamed: 0,delta_ar,hit_ratio_gpt,n_gpt,hit_ratio_finbert,n_finbert
0,10m,0.8052,8135,0.7555,8135
1,20m,0.7904,7629,0.738,7629
2,30m,0.7828,7078,0.7241,7078
3,40m,0.7828,6552,0.7233,6552
4,50m,0.7747,6004,0.7085,6004
5,60m,0.771,5446,0.7024,5446
