In [1]:
import sys
import os
sys.path.append('../dags')

from database import create_database_engine


# 환경 변수 설정
os.environ["POSTGRES_USER"] = "airflow"
os.environ["POSTGRES_PASSWORD"] = "airflow"
os.environ["POSTGRES_DB"] = "events"
os.environ["POSTGRES_PORT"] = "5433"

engine = create_database_engine(host="127.0.0.1")
print("Successfully connected to the database")

Successfully connected to the database


In [2]:
import pandas as pd
from datetime import datetime


# 나머지는 변수로 관리
label_table = "kind"
return_table = "abnormal_return_kind"
label_col = "label"

# 포함할 abnormal return 컬럼
abn_return_cols = [
    "abn_ret_minus_10m", "abn_ret_minus_9m", "abn_ret_minus_8m", "abn_ret_minus_7m", "abn_ret_minus_6m",
    "abn_ret_minus_5m", "abn_ret_minus_4m", "abn_ret_minus_3m", "abn_ret_minus_2m", "abn_ret_minus_1m",
    "abn_ret_1m", "abn_ret_2m", "abn_ret_3m", "abn_ret_4m", "abn_ret_5m", "abn_ret_6m",
    "abn_ret_7m", "abn_ret_8m", "abn_ret_9m", "abn_ret_10m"
]

# SQL 컬럼 문자열을 만듬 (event_id, event_ts는 하드코딩)
sql_columns = (
    [f"ar.event_ts"] +
    [f"ar.{col}" for col in abn_return_cols] +
    [f"k.{label_col}"]
)
sql_columns_str = ",\n    ".join(sql_columns)

query = f"""
SELECT {sql_columns_str}
FROM {return_table} ar
JOIN "{label_table}" k ON ar.event_id = k.id
ORDER BY ar.event_ts ASC
"""

df_total = pd.read_sql(query, engine)
df_total["period_dummy"] = (
    pd.to_datetime(df_total["event_ts"], utc=True) > pd.Timestamp("2022-06-30", tz="UTC")
).astype(int)
df_total

DETAIL:  The database was created using collation version 2.36, but the operating system provides version 2.41.
HINT:  Rebuild all objects in this database that use the default collation and run ALTER DATABASE events REFRESH COLLATION VERSION, or build PostgreSQL with the right library version.


Unnamed: 0,event_ts,abn_ret_minus_10m,abn_ret_minus_9m,abn_ret_minus_8m,abn_ret_minus_7m,abn_ret_minus_6m,abn_ret_minus_5m,abn_ret_minus_4m,abn_ret_minus_3m,abn_ret_minus_2m,...,abn_ret_3m,abn_ret_4m,abn_ret_5m,abn_ret_6m,abn_ret_7m,abn_ret_8m,abn_ret_9m,abn_ret_10m,label,period_dummy
0,2021-01-04 10:08:00+00:00,-2.14,-2.14,-1.20,-1.32,-1.93,-2.38,-2.05,-2.45,-1.76,...,-0.18,0.19,-0.20,-0.23,-0.24,-0.67,-0.34,-0.45,0,0
1,2021-01-04 10:08:00+00:00,-2.14,-2.14,-1.20,-1.32,-1.93,-2.38,-2.05,-2.45,-1.76,...,-0.18,0.19,-0.20,-0.23,-0.24,-0.67,-0.34,-0.45,0,0
2,2021-01-04 10:08:00+00:00,,,-1.45,-1.64,-1.44,-2.24,-2.49,-2.69,-1.96,...,-2.66,-2.49,-2.27,-1.24,-1.62,-1.37,-1.04,-1.91,1,0
3,2021-01-04 10:14:00+00:00,-8.40,-8.59,-8.32,-8.32,-8.19,-8.65,-8.42,-8.36,-8.10,...,-2.60,-4.00,-3.64,-3.51,-3.64,-4.17,-4.70,-4.90,1,0
4,2021-01-04 10:16:00+00:00,0.22,0.50,0.23,0.21,0.23,0.23,0.15,0.01,-0.31,...,0.25,0.19,0.06,-0.24,-0.06,-0.24,-0.50,0.03,-1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8651,2023-12-28 14:46:00+00:00,-1.82,-1.94,-1.85,-1.85,-1.85,-1.82,-1.82,-1.94,-1.98,...,9.00,13.65,7.01,7.60,7.93,7.26,7.78,7.36,1,1
8652,2023-12-28 14:53:00+00:00,-0.08,0.64,0.68,0.62,-0.01,0.02,0.00,0.00,-0.01,...,0.24,0.29,0.21,0.21,0.23,0.27,0.27,0.24,1,1
8653,2023-12-28 14:56:00+00:00,0.21,0.41,0.59,0.58,0.58,0.56,0.52,0.45,0.45,...,0.42,0.44,0.48,0.48,0.45,0.45,0.42,0.42,0,1
8654,2023-12-28 14:56:00+00:00,0.21,0.41,0.59,0.58,0.58,0.56,0.52,0.45,0.45,...,0.42,0.44,0.48,0.48,0.45,0.45,0.42,0.42,0,1


In [3]:
df_total[df_total["period_dummy"] == 1].head()

Unnamed: 0,event_ts,abn_ret_minus_10m,abn_ret_minus_9m,abn_ret_minus_8m,abn_ret_minus_7m,abn_ret_minus_6m,abn_ret_minus_5m,abn_ret_minus_4m,abn_ret_minus_3m,abn_ret_minus_2m,...,abn_ret_3m,abn_ret_4m,abn_ret_5m,abn_ret_6m,abn_ret_7m,abn_ret_8m,abn_ret_9m,abn_ret_10m,label,period_dummy
5072,2022-07-01 10:03:00+00:00,-1.8,-0.66,0.14,0.19,-0.19,-0.19,0.28,-0.14,-0.14,...,0.33,0.1,0.0,-0.28,-0.66,-0.33,-0.8,-1.27,1,1
5073,2022-07-01 10:24:00+00:00,-1.57,-1.43,-1.43,-1.43,-1.28,-1.23,-1.19,-1.66,-1.66,...,0.62,0.67,0.86,0.48,0.1,0.48,0.1,0.53,1,1
5074,2022-07-01 10:26:00+00:00,0.1,-0.05,0.1,0.0,0.05,-0.14,-0.14,-0.14,-0.05,...,0.38,0.14,0.05,-0.14,-0.24,-0.24,-0.14,0.14,0,1
5075,2022-07-01 10:43:00+00:00,-0.89,-0.99,-1.11,-1.13,-1.03,-1.13,-1.18,-1.26,-1.34,...,-0.03,0.19,-0.2,0.74,0.09,0.75,-0.04,0.08,0,1
5076,2022-07-01 11:03:00+00:00,-0.21,-0.3,-0.35,-0.28,-0.04,-0.04,-0.14,0.04,-0.06,...,0.04,0.0,-0.08,-0.02,-0.11,-0.26,-0.16,-0.23,1,1


# 회귀분석

### 📘 모형식

$$
\left| CAR_{post,i} - CAR_{pre,i} \right| = \alpha + \beta \cdot Telegram_i + \epsilon_i
$$

---

### 📊 변수 설명

**종속변수 (Dependent Variable)**  
이벤트 전후 CAR 변화폭의 절댓값:

$$
\left| CAR_{post,i} - CAR_{pre,i} \right|
$$


**설명변수 (Key Independent Variable)**  
텔레그램 도입 여부 변수:

$$
Telegram_i =
\begin{cases}
1, & \text{텔레그램 도입 이후 (After introduction)} \\
0, & \text{도입 이전 (Before introduction)}
\end{cases}
$$

---

### 📑 가설 설정

$$
\begin{aligned}
H_0 &: \beta = 0 \quad &(\text{텔레그램 도입 이후와 이전 간 CAR 변화폭의 차이가 없다.}) \\
H_1 &: \beta < 0 \quad &(\text{텔레그램 도입 이후 CAR 변화폭이 감소했다.})
\end{aligned}
$$
---

### 📈 해석

이 회귀모형은 **텔레그램 도입 여부**가 이벤트 전후 누적초과수익률(CAR) 변화폭의 크기에 미치는 영향을 분석하기 위한 것입니다.  
β < 0 이고 통계적으로 유의하다면 귀무가설이 기각되어 공시 정보가 도입 이후 시장에 보다 신속하게 반영되어 정보 비효율성이 감소했음을 시사하며, 텔레그램 공시 알림 서비스가 시장 효율성 향상에 실질적으로 기여했음을 보여줍니다.


In [4]:
import numpy as np
import pandas as pd
import statsmodels.api as sm

# df_total이 이미 존재한다고 가정
horizons = list[int](range(1, 11))
rows = []

for h in horizons:
    post_col = f"abn_ret_{h}m"
    pre_col  = f"abn_ret_minus_{h}m"
    if post_col not in df_total.columns or pre_col not in df_total.columns:
        raise KeyError(f"'{post_col}' 또는 '{pre_col}' 컬럼을 찾을 수 없습니다.")

    y = (df_total[post_col] - df_total[pre_col]).abs()
    X = sm.add_constant(df_total["period_dummy"], has_constant="add")

    valid = ~(y.isna() | X.isna().any(axis=1))
    y_ = y[valid]
    X_ = X[valid]

    model = sm.OLS(y_, X_).fit(cov_type="HC1")

    beta = model.params.get("period_dummy", np.nan)
    se   = model.bse.get("period_dummy", np.nan)
    tval = model.tvalues.get("period_dummy", np.nan)
    pval = model.pvalues.get("period_dummy", np.nan)

    rows.append({
        "horizon_min": h,
        "beta": beta,
        "std_err": se,
        "t_value": tval,
        "p_value": pval,
        "sig_0.05": "*" if (not pd.isna(pval) and pval < 0.05) else "",
        "n_obs": int(model.nobs),
        "r_squared": model.rsquared
    })

result_table = pd.DataFrame(rows).sort_values("horizon_min").reset_index(drop=True)

# ✅ p-value 포맷 조정 (지수표기 없이 소수점 8자리, 0은 "<1e-8"로 표시)
result_table["p_value"] = result_table["p_value"].apply(
    lambda x: f"{x:.12f}" if (x > 0 and not pd.isna(x)) else "<1e-8"
)

# 보기 좋게 출력
display_cols = ["horizon_min", "beta", "std_err", "t_value", "p_value", "sig_0.05", "n_obs", "r_squared"]
print(result_table[display_cols].to_string(index=False))

 horizon_min     beta  std_err  t_value        p_value sig_0.05  n_obs  r_squared
           1 0.206663 0.043081 4.797013 0.000001610492        *   8606   0.002849
           2 0.180292 0.046854 3.847966 0.000119102338        *   8563   0.001836
           3 0.191986 0.049038 3.915014 0.000090399026        *   8508   0.001940
           4 0.191368 0.050178 3.813763 0.000136866809        *   8457   0.001873
           5 0.162647 0.049603 3.279003 0.001041745461        *   8411   0.001400
           6 0.151854 0.049614 3.060701 0.002208197210        *   8360   0.001219
           7 0.127205 0.048732 2.610276 0.009046910950        *   8311   0.000884
           8 0.100489 0.048493 2.072243 0.038242759840        *   8248   0.000555
           9 0.121149 0.048981 2.473382 0.013384096256        *   8192   0.000803
          10 0.127670 0.049418 2.583460 0.009781479642        *   8135   0.000883


# 방향 정합성

### 📘 모형식 정의

$$
\text{logit}^{-1}(x) = \frac{1}{1 + e^{-x}}
$$

$$
Pr(hit_{i,t} = 1) = \text{logit}^{-1}(\alpha + \beta \cdot period\_dummy_i)
$$
---

### 📊 변수 설명


이벤트 *i*, 창 *t* 에 대해 Delta CAR, post CAR 2개의 지표를 사용

$$
t \in \{10,\,20,\,\dots,\,60\}
$$

$$
\Delta CAR_{i,t} = CAR_{post,i,t} - CAR_{pre,i,t}  
$$

$$
CAR_{post,i,t}
$$

$$
label\_sign_i =
\begin{cases}
+1, & \text{긍정 (1)} \\
0, & \text{중립 (0)} \\
-1, & \text{부정 (-1)} 
\end{cases}
$$



$$
hit_{i,t} =
\begin{cases}
1, & \text{if } \operatorname{sign}(\Delta CAR_{i,t}) = label\_sign_i \\
0, & \text{otherwise}
\end{cases}
$$

---

### 📑 가설 설정

$$
\begin{aligned}
H_0 &: \beta = 0 \quad &(\text{도입 전후 차이 없음}) \\
H_1 &: \beta > 0 \quad &(\text{도입 후 정합률 상승})
\end{aligned}
$$

---

### 📈 해석

- delta CAR: β > 0 이고 **통계적으로 유의**하다면  → “텔레그램 서비스 출시 이후 과잉반응 감소 / 즉시 반영 후 빠른 수렴 → 정보의 선반영 및 사전 확산이 강화 → 정보 효율성 향상”을 의미한다고 볼 수 있다.
- Post CAR: β > 0 이고 **통계적으로 유의**하다면  → “텔레그램 서비스 출시 이후 이벤트 발표 이후의 정합률(정확도) 증가 → 시장 반응의 즉시 반영 강화 → 정보 효율성 향상”을 의미한다고 해석할 수 있다.


In [6]:
import numpy as np
import pandas as pd
import statsmodels.api as sm

def print_sample_summary(df, label_col="label_sign"):
    """
    중립 이벤트 제거 후 샘플 요약 통계 출력
    """
    n_total = len(df)
    n_neutral = (df[label_col] == 0).sum()
    df_nn = df[df[label_col] != 0]
    n_used = len(df_nn)
    pos_cnt = (df_nn[label_col] == 1).sum()
    neg_cnt = (df_nn[label_col] == -1).sum()
    
    print("=== Sample summary (neutral removed) ===")
    print(f"Total events:        {n_total:,}")
    print(f"Removed neutrals:    {n_neutral:,}")
    print(f"Used (non-neutral):  {n_used:,}")
    print(f"  - Positive (1):    {pos_cnt:,}")
    print(f"  - Negative (-1):   {neg_cnt:,}")
    print()
    return df_nn

def logistic_hit_delta(df_subset, t):
    """
    ΔCAR 기준: ΔCAR_{i,t} = CAR_{post,i,t} - CAR_{pre,i,t}
    hit_{i,t} = 1 if sign(ΔCAR_{i,t}) == label_sign_i else 0
    logistic regression on period_dummy
    """
    delta = df_subset[f"abn_ret_{t}m"] - df_subset[f"abn_ret_minus_{t}m"]
    realized_sign = np.sign(delta)
    hit = (realized_sign == df_subset["label_sign"]).astype(int)
    X = sm.add_constant(df_subset["period_dummy"])
    model = sm.Logit(hit, X).fit(disp=0)
    beta = float(model.params["period_dummy"])
    pval = float(model.pvalues["period_dummy"])
    odds_ratio = float(np.exp(beta))
    X0 = X.copy(); X0["period_dummy"] = 0
    X1 = X.copy(); X1["period_dummy"] = 1
    p0 = float(model.predict(X0).mean())
    p1 = float(model.predict(X1).mean())
    diff_pp = (p1 - p0) * 100.0
    return {
        "window": t,
        "beta": beta,
        "p_value": pval,
        "odds_ratio": odds_ratio,
        "p_before": p0,
        "p_after": p1,
        "diff_pp": diff_pp,
        "n_obs": int(model.nobs)
    }

def logistic_hit_postCAR(df_subset, h):
    """
    post CAR 기준: 0→+h 누적초과수익의 부호 부합 여부
    hit = 1 if sign(abn_ret_{h}m) == label_sign_i else 0
    logistic regression on period_dummy
    """
    realized_sign = np.sign(df_subset[f"abn_ret_{h}m"])
    hit = (realized_sign == df_subset["label_sign"]).astype(int)
    X = sm.add_constant(df_subset["period_dummy"])
    model = sm.Logit(hit, X).fit(disp=0)
    beta = float(model.params["period_dummy"])
    pval = float(model.pvalues["period_dummy"])
    odds_ratio = float(np.exp(beta))
    X0 = X.copy(); X0["period_dummy"] = 0
    X1 = X.copy(); X1["period_dummy"] = 1
    p0 = float(model.predict(X0).mean())
    p1 = float(model.predict(X1).mean())
    diff_pp = (p1 - p0) * 100.0
    return {
        "window": h,
        "beta": beta,
        "p_value": pval,
        "odds_ratio": odds_ratio,
        "p_before": p0,
        "p_after": p1,
        "diff_pp": diff_pp,
        "n_obs": int(model.nobs)
    }

def run_logistic_table(logistic_fn, df_nn, windows, label="ΔCAR"):
    """
    창(t)별 로지스틱 회귀 수행 및 요약 테이블 리턴
    """
    results = []
    for w in windows:
        results.append(logistic_fn(df_nn, w))
    results_df = pd.DataFrame(results)
    results_df["sig_0.05"] = np.where(results_df["p_value"] < 0.05, "*", "")
    print(f"\n--- {label} 기준 결과 ---")
    print(
        results_df[
            ["window", "beta", "p_value", "sig_0.05", "odds_ratio", "p_before", "p_after", "diff_pp", "n_obs"]
        ]
        .rename(columns={"window": "t"})
        .round({
            "beta": 4, "p_value": 12, "odds_ratio": 3, "p_before": 3, "p_after": 3, "diff_pp": 2
        })
        .to_string(index=False)
    )
    return results_df

# === 전체 파이프라인 ===

df = df_total.copy()
df["label_sign"] = df["label"]
df_nn = print_sample_summary(df, "label_sign")

# ΔCAR 기준 (pre/post window 차이 사용)
windows = list(range(1, 11))
results_delta = run_logistic_table(logistic_hit_delta, df_nn, windows, label="ΔCAR")

# post CAR 기준 (공시 이후만 사용, 기존 코드 - file_context_0)
results_post = run_logistic_table(logistic_hit_postCAR, df_nn, windows, label="post CAR")


=== Sample summary (neutral removed) ===
Total events:        8,656
Removed neutrals:    2,980
Used (non-neutral):  5,676
  - Positive (1):    4,906
  - Negative (-1):   770


--- ΔCAR 기준 결과 ---
 t    beta  p_value sig_0.05  odds_ratio  p_before  p_after  diff_pp  n_obs
 1 -0.0338 0.593081                0.967     0.772    0.766    -0.60   5676
 2 -0.0492 0.435317                0.952     0.771    0.762    -0.88   5676
 3  0.0118 0.847610                1.012     0.747    0.749     0.22   5676
 4 -0.0058 0.923831                0.994     0.734    0.733    -0.11   5676
 5  0.0675 0.260747                1.070     0.720    0.734     1.34   5676
 6  0.0191 0.745786                1.019     0.712    0.716     0.39   5676
 7 -0.0165 0.776751                0.984     0.700    0.697    -0.35   5676
 8 -0.0187 0.745908                0.981     0.693    0.689    -0.40   5676
 9 -0.0412 0.473389                0.960     0.689    0.680    -0.89   5676
10  0.0659 0.247844                1.068     

정보의 선반영(anticipation)·사전확산 강화
- 텔레그램 도입 이후, 공시 직전에 감성 방향으로 미리 움직이는 비율/크기가 커졌을 가능성
- 그래서 공시 순간의 post CAR 방향 정합은 높아지지만(β_post>0), “직전 대비 추가로 얼마나 같은 방향으로 더 가느냐”를 보는 ΔCAR 기준에서는 추가분이 줄어 **정합성↓**로 관측(β_Δ<0)

과잉반응 감소 / 즉시 반영 후 빠른 수렴
- 도입 전에는 공시 직후 같은 방향으로 과도하게 더 밀던 패턴이, 도입 후에는 이미 반영되어 추가 밀림이 줄어듦 → Δ 기준 정합성 감소(β_Δ<0).
- 하지만 “초기 방향” 자체는 더 정확해짐 → post 기준 정합성 증가(β_post>0).


도입 후 시장은 감성 방향으로 “빠르게 맞게 움직이지만”, 직전에 이미 반영된 탓에 공시 순간의 “추가” 같은 방향 밀림은 줄어든다. 이는 “정보의 더 빠른 확산·반영”이라는 효율성 개선 스토리와 일관됩니다.