In [1]:
import sys
import os
sys.path.append('../dags')

from database import create_database_engine


# 환경 변수 설정
os.environ["POSTGRES_USER"] = "airflow"
os.environ["POSTGRES_PASSWORD"] = "airflow"
os.environ["POSTGRES_DB"] = "events"
os.environ["POSTGRES_PORT"] = "5433"

engine = create_database_engine(host="127.0.0.1")
print("Successfully connected to the database")

Successfully connected to the database


In [2]:
import pandas as pd
from datetime import datetime


# 나머지는 변수로 관리
label_table = "kind"
return_table = "abnormal_return_kind"
label_col = "label"

# 포함할 abnormal return 컬럼
abn_return_cols = [
    "abn_ret_minus_10m", "abn_ret_minus_20m", "abn_ret_minus_30m", 
    "abn_ret_minus_40m", "abn_ret_minus_50m", "abn_ret_minus_60m",
    "abn_ret_10m", "abn_ret_20m", "abn_ret_30m", 
    "abn_ret_40m", "abn_ret_50m", "abn_ret_60m"
]

# SQL 컬럼 문자열을 만듬 (event_id, event_ts는 하드코딩)
sql_columns = (
    [f"ar.event_ts"] +
    [f"ar.{col}" for col in abn_return_cols] +
    [f"k.{label_col}"]
)
sql_columns_str = ",\n    ".join(sql_columns)

query = f"""
SELECT {sql_columns_str}
FROM {return_table} ar
JOIN "{label_table}" k ON ar.event_id = k.id
ORDER BY ar.event_ts ASC
"""

df_before_telegram = pd.read_sql(query, engine)
df_before_telegram["period_dummy"] = 0
df_before_telegram

DETAIL:  The database was created using collation version 2.36, but the operating system provides version 2.41.
HINT:  Rebuild all objects in this database that use the default collation and run ALTER DATABASE events REFRESH COLLATION VERSION, or build PostgreSQL with the right library version.


Unnamed: 0,event_ts,abn_ret_minus_10m,abn_ret_minus_20m,abn_ret_minus_30m,abn_ret_minus_40m,abn_ret_minus_50m,abn_ret_minus_60m,abn_ret_10m,abn_ret_20m,abn_ret_30m,abn_ret_40m,abn_ret_50m,abn_ret_60m,label,period_dummy
0,2021-01-04 10:08:00+00:00,-2.14,-2.14,-2.14,,,,-0.45,0.17,0.05,0.15,-0.15,0.18,0,0
1,2021-01-04 10:08:00+00:00,-2.14,-2.14,-2.14,,,,-0.45,0.17,0.05,0.15,-0.15,0.18,0,0
2,2021-01-04 10:08:00+00:00,,,,,,,-1.91,-0.66,-1.47,-1.37,-0.60,-0.95,1,0
3,2021-01-04 10:14:00+00:00,-8.40,,,,,,-4.90,-4.20,-3.46,-5.53,-5.16,-6.00,1,0
4,2021-01-04 10:16:00+00:00,0.22,1.55,1.55,1.55,,,0.03,-0.83,-0.54,-0.96,-0.57,-1.83,-1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5067,2022-06-29 14:04:00+00:00,2.30,2.19,2.24,2.43,2.88,2.47,0.55,0.43,-0.82,-0.84,-0.85,-1.63,1,0
5068,2022-06-29 14:20:00+00:00,-0.16,0.05,0.02,0.06,0.08,0.17,-0.03,0.34,0.11,0.32,0.54,0.24,1,0
5069,2022-06-29 14:21:00+00:00,0.00,0.12,0.06,-0.14,-0.51,-0.28,0.06,-0.04,-0.11,-0.32,0.03,,1,0
5070,2022-06-29 14:35:00+00:00,0.14,0.27,0.27,0.19,0.18,0.06,-0.22,0.03,0.03,-0.08,,,1,0


In [3]:
label_table = "label"
return_table = "abnormal_return"
label_col = "label"


all_sql_columns = (
    [f"ar.event_ts"] +
    [f"ar.{col}" for col in abn_return_cols] +
    [f"k.{label_col}"]
)
all_sql_columns_str = ",\n    ".join(all_sql_columns)

query = f"""
SELECT {all_sql_columns_str}
FROM {return_table} ar
JOIN "{label_table}" k ON ar.event_id = k.id
JOIN disclosure_events de ON ar.event_id = de.id
WHERE k.{label_col} IS NOT NULL
ORDER BY ar.event_ts ASC
"""

df_after_telegram = pd.read_sql(query, engine)
df_after_telegram["period_dummy"] = 1
label_map = {0: 1, 1: 0, 2: -1}
df_after_telegram['label'] = df_after_telegram['label'].map(label_map)
df_after_telegram

Unnamed: 0,event_ts,abn_ret_minus_10m,abn_ret_minus_20m,abn_ret_minus_30m,abn_ret_minus_40m,abn_ret_minus_50m,abn_ret_minus_60m,abn_ret_10m,abn_ret_20m,abn_ret_30m,abn_ret_40m,abn_ret_50m,abn_ret_60m,label,period_dummy
0,2022-07-01 01:03:17+00:00,-1.80,-1.61,-2.03,-2.74,-2.32,-1.18,-1.27,-2.41,-2.36,-1.79,-2.98,-1.98,1,1
1,2022-07-01 01:24:45+00:00,-1.57,-1.62,-1.91,-1.33,-2.52,-1.48,0.53,2.43,1.19,1.57,1.38,1.72,1,1
2,2022-07-01 01:27:02+00:00,-0.05,0.39,0.10,-1.01,-0.14,-0.14,0.29,0.24,1.25,1.39,2.02,1.63,1,1
3,2022-07-01 01:43:15+00:00,-0.89,-0.92,-0.82,-0.81,-0.59,0.16,0.08,0.36,0.31,0.12,-0.18,0.23,1,1
4,2022-07-01 02:30:09+00:00,-1.91,-0.73,0.47,0.44,0.08,1.81,-0.40,-1.02,-1.80,-1.24,-1.82,-1.60,-1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8076,2023-12-28 05:53:08+00:00,-0.08,0.80,0.02,0.26,0.25,0.86,0.24,0.24,,,,,1,1
8077,2023-12-28 05:55:03+00:00,-0.31,0.84,0.80,1.16,0.76,0.93,-2.10,-3.25,,,,,0,1
8078,2023-12-28 05:56:27+00:00,0.05,0.08,-0.02,-0.16,-0.06,0.12,-0.28,-0.54,,,,,1,1
8079,2023-12-28 05:56:32+00:00,0.05,0.08,-0.02,-0.16,-0.06,0.12,-0.28,-0.54,,,,,1,1


In [4]:
df_total = pd.concat([df_before_telegram, df_after_telegram])



In [5]:
df_total

Unnamed: 0,event_ts,abn_ret_minus_10m,abn_ret_minus_20m,abn_ret_minus_30m,abn_ret_minus_40m,abn_ret_minus_50m,abn_ret_minus_60m,abn_ret_10m,abn_ret_20m,abn_ret_30m,abn_ret_40m,abn_ret_50m,abn_ret_60m,label,period_dummy
0,2021-01-04 10:08:00+00:00,-2.14,-2.14,-2.14,,,,-0.45,0.17,0.05,0.15,-0.15,0.18,0,0
1,2021-01-04 10:08:00+00:00,-2.14,-2.14,-2.14,,,,-0.45,0.17,0.05,0.15,-0.15,0.18,0,0
2,2021-01-04 10:08:00+00:00,,,,,,,-1.91,-0.66,-1.47,-1.37,-0.60,-0.95,1,0
3,2021-01-04 10:14:00+00:00,-8.40,,,,,,-4.90,-4.20,-3.46,-5.53,-5.16,-6.00,1,0
4,2021-01-04 10:16:00+00:00,0.22,1.55,1.55,1.55,,,0.03,-0.83,-0.54,-0.96,-0.57,-1.83,-1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8076,2023-12-28 05:53:08+00:00,-0.08,0.80,0.02,0.26,0.25,0.86,0.24,0.24,,,,,1,1
8077,2023-12-28 05:55:03+00:00,-0.31,0.84,0.80,1.16,0.76,0.93,-2.10,-3.25,,,,,0,1
8078,2023-12-28 05:56:27+00:00,0.05,0.08,-0.02,-0.16,-0.06,0.12,-0.28,-0.54,,,,,1,1
8079,2023-12-28 05:56:32+00:00,0.05,0.08,-0.02,-0.16,-0.06,0.12,-0.28,-0.54,,,,,1,1


# 회귀분석

### 📘 모형식

$$
\left| CAR_{post,i} - CAR_{pre,i} \right| = \alpha + \beta \cdot Telegram_i + \epsilon_i
$$

---

### 📊 변수 설명

**종속변수 (Dependent Variable)**  
이벤트 전후 CAR 변화폭의 절댓값:

$$
\left| CAR_{post,i} - CAR_{pre,i} \right|
$$


**설명변수 (Key Independent Variable)**  
텔레그램 도입 여부 변수:

$$
Telegram_i =
\begin{cases}
1, & \text{텔레그램 도입 이후 (After introduction)} \\
0, & \text{도입 이전 (Before introduction)}
\end{cases}
$$

---

### 📑 가설 설정

$$
\begin{aligned}
H_0 &: \beta = 0 \quad &(\text{텔레그램 도입 이후와 이전 간 CAR 변화폭의 차이가 없다.}) \\
H_1 &: \beta < 0 \quad &(\text{텔레그램 도입 이후 CAR 변화폭이 감소했다.})
\end{aligned}
$$
---

### 📈 해석

이 회귀모형은 **텔레그램 도입 여부**가 이벤트 전후 누적초과수익률(CAR) 변화폭의 크기에 미치는 영향을 분석하기 위한 것입니다.  
β < 0 이고 통계적으로 유의하다면 귀무가설이 기각되어 공시 정보가 도입 이후 시장에 보다 신속하게 반영되어 정보 비효율성이 감소했음을 시사하며, 텔레그램 공시 알림 서비스가 시장 효율성 향상에 실질적으로 기여했음을 보여줍니다.


In [6]:
import numpy as np
import pandas as pd
import statsmodels.api as sm

# df_total이 이미 존재한다고 가정
horizons = [10, 20, 30, 40, 50, 60]
rows = []

for h in horizons:
    post_col = f"abn_ret_{h}m"
    pre_col  = f"abn_ret_minus_{h}m"
    if post_col not in df_total.columns or pre_col not in df_total.columns:
        raise KeyError(f"'{post_col}' 또는 '{pre_col}' 컬럼을 찾을 수 없습니다.")

    y = (df_total[post_col] - df_total[pre_col]).abs()
    X = sm.add_constant(df_total["period_dummy"], has_constant="add")

    valid = ~(y.isna() | X.isna().any(axis=1))
    y_ = y[valid]
    X_ = X[valid]

    model = sm.OLS(y_, X_).fit(cov_type="HC1")

    beta = model.params.get("period_dummy", np.nan)
    se   = model.bse.get("period_dummy", np.nan)
    tval = model.tvalues.get("period_dummy", np.nan)
    pval = model.pvalues.get("period_dummy", np.nan)

    rows.append({
        "horizon_min": h,
        "beta": beta,
        "std_err": se,
        "t_value": tval,
        "p_value": pval,
        "sig_0.05": "*" if (not pd.isna(pval) and pval < 0.05) else "",
        "n_obs": int(model.nobs),
        "r_squared": model.rsquared
    })

result_table = pd.DataFrame(rows).sort_values("horizon_min").reset_index(drop=True)

# ✅ p-value 포맷 조정 (지수표기 없이 소수점 8자리, 0은 "<1e-8"로 표시)
result_table["p_value"] = result_table["p_value"].apply(
    lambda x: f"{x:.12f}" if (x > 0 and not pd.isna(x)) else "<1e-8"
)

# 보기 좋게 출력
display_cols = ["horizon_min", "beta", "std_err", "t_value", "p_value", "sig_0.05", "n_obs", "r_squared"]
print(result_table[display_cols].to_string(index=False))

 horizon_min      beta  std_err   t_value        p_value sig_0.05  n_obs  r_squared
          10 -0.284615 0.033934 -8.387221 0.000000000000        *  12329   0.005971
          20 -0.278708 0.038763 -7.190126 0.000000000001        *  11454   0.004773
          30 -0.268489 0.043548 -6.165326 0.000000000703        *  10596   0.003830
          40 -0.444945 0.057169 -7.782917 0.000000000000        *   9728   0.006982
          50 -0.396699 0.058895 -6.735650 0.000000000016        *   8895   0.005539
          60 -0.371746 0.063477 -5.856352 0.000000004731        *   8069   0.004634


# 방향 정합성

### 📘 모형식 정의

$$
\text{logit}^{-1}(x) = \frac{1}{1 + e^{-x}}
$$

$$
Pr(hit_{i,t} = 1) = \text{logit}^{-1}(\alpha + \beta \cdot period\_dummy_i)
$$
---

### 📊 변수 설명


이벤트 *i*, 창 *t* 에 대해 Delta CAR, post CAR 2개의 지표를 사용

$$
t \in \{10,\,20,\,\dots,\,60\}
$$

$$
\Delta CAR_{i,t} = CAR_{post,i,t} - CAR_{pre,i,t}  
$$

$$
CAR_{post,i,t}
$$

$$
label\_sign_i =
\begin{cases}
+1, & \text{긍정 (1)} \\
0, & \text{중립 (0)} \\
-1, & \text{부정 (-1)} 
\end{cases}
$$



$$
hit_{i,t} =
\begin{cases}
1, & \text{if } \operatorname{sign}(\Delta CAR_{i,t}) = label\_sign_i \\
0, & \text{otherwise}
\end{cases}
$$

---

### 📑 가설 설정

$$
\begin{aligned}
H_0 &: \beta = 0 \quad &(\text{도입 전후 차이 없음}) \\
H_1 &: \beta > 0 \quad &(\text{도입 후 정합률 상승})
\end{aligned}
$$

---

### 📈 해석

- delta CAR: β > 0 이고 **통계적으로 유의**하다면  → “텔레그램 서비스 출시 이후 과잉반응 감소 / 즉시 반영 후 빠른 수렴 → 정보의 선반영 및 사전 확산이 강화 → 정보 효율성 향상”을 의미한다고 볼 수 있다.
- Post CAR: β > 0 이고 **통계적으로 유의**하다면  → “텔레그램 서비스 출시 이후 이벤트 발표 이후의 정합률(정확도) 증가 → 시장 반응의 즉시 반영 강화 → 정보 효율성 향상”을 의미한다고 해석할 수 있다.


In [10]:
import numpy as np
import pandas as pd
import statsmodels.api as sm

def print_sample_summary(df, label_col="label_sign"):
    """
    중립 이벤트 제거 후 샘플 요약 통계 출력
    """
    n_total = len(df)
    n_neutral = (df[label_col] == 0).sum()
    df_nn = df[df[label_col] != 0]
    n_used = len(df_nn)
    pos_cnt = (df_nn[label_col] == 1).sum()
    neg_cnt = (df_nn[label_col] == -1).sum()
    
    print("=== Sample summary (neutral removed) ===")
    print(f"Total events:        {n_total:,}")
    print(f"Removed neutrals:    {n_neutral:,}")
    print(f"Used (non-neutral):  {n_used:,}")
    print(f"  - Positive (1):    {pos_cnt:,}")
    print(f"  - Negative (-1):   {neg_cnt:,}")
    print()
    return df_nn

def logistic_hit_delta(df_subset, t):
    """
    ΔCAR 기준: ΔCAR_{i,t} = CAR_{post,i,t} - CAR_{pre,i,t}
    hit_{i,t} = 1 if sign(ΔCAR_{i,t}) == label_sign_i else 0
    logistic regression on period_dummy
    """
    delta = df_subset[f"abn_ret_{t}m"] - df_subset[f"abn_ret_minus_{t}m"]
    realized_sign = np.sign(delta)
    hit = (realized_sign == df_subset["label_sign"]).astype(int)
    X = sm.add_constant(df_subset["period_dummy"])
    model = sm.Logit(hit, X).fit(disp=0)
    beta = float(model.params["period_dummy"])
    pval = float(model.pvalues["period_dummy"])
    odds_ratio = float(np.exp(beta))
    X0 = X.copy(); X0["period_dummy"] = 0
    X1 = X.copy(); X1["period_dummy"] = 1
    p0 = float(model.predict(X0).mean())
    p1 = float(model.predict(X1).mean())
    diff_pp = (p1 - p0) * 100.0
    return {
        "window": t,
        "beta": beta,
        "p_value": pval,
        "odds_ratio": odds_ratio,
        "p_before": p0,
        "p_after": p1,
        "diff_pp": diff_pp,
        "n_obs": int(model.nobs)
    }

def logistic_hit_postCAR(df_subset, h):
    """
    post CAR 기준: 0→+h 누적초과수익의 부호 부합 여부
    hit = 1 if sign(abn_ret_{h}m) == label_sign_i else 0
    logistic regression on period_dummy
    """
    realized_sign = np.sign(df_subset[f"abn_ret_{h}m"])
    hit = (realized_sign == df_subset["label_sign"]).astype(int)
    X = sm.add_constant(df_subset["period_dummy"])
    model = sm.Logit(hit, X).fit(disp=0)
    beta = float(model.params["period_dummy"])
    pval = float(model.pvalues["period_dummy"])
    odds_ratio = float(np.exp(beta))
    X0 = X.copy(); X0["period_dummy"] = 0
    X1 = X.copy(); X1["period_dummy"] = 1
    p0 = float(model.predict(X0).mean())
    p1 = float(model.predict(X1).mean())
    diff_pp = (p1 - p0) * 100.0
    return {
        "window": h,
        "beta": beta,
        "p_value": pval,
        "odds_ratio": odds_ratio,
        "p_before": p0,
        "p_after": p1,
        "diff_pp": diff_pp,
        "n_obs": int(model.nobs)
    }

def run_logistic_table(logistic_fn, df_nn, windows, label="ΔCAR"):
    """
    창(t)별 로지스틱 회귀 수행 및 요약 테이블 리턴
    """
    results = []
    for w in windows:
        results.append(logistic_fn(df_nn, w))
    results_df = pd.DataFrame(results)
    results_df["sig_0.05"] = np.where(results_df["p_value"] < 0.05, "*", "")
    print(f"\n--- {label} 기준 결과 ---")
    print(
        results_df[
            ["window", "beta", "p_value", "sig_0.05", "odds_ratio", "p_before", "p_after", "diff_pp", "n_obs"]
        ]
        .rename(columns={"window": "t"})
        .round({
            "beta": 4, "p_value": 12, "odds_ratio": 3, "p_before": 3, "p_after": 3, "diff_pp": 2
        })
        .to_string(index=False)
    )
    return results_df

# === 전체 파이프라인 ===

df = df_total.copy()
df["label_sign"] = df["label"]
df_nn = print_sample_summary(df, "label_sign")

# ΔCAR 기준 (pre/post window 차이 사용)
windows = [10, 20, 30, 40, 50, 60]
results_delta = run_logistic_table(logistic_hit_delta, df_nn, windows, label="ΔCAR")

# post CAR 기준 (공시 이후만 사용, 기존 코드 - file_context_0)
results_post = run_logistic_table(logistic_hit_postCAR, df_nn, windows, label="post CAR")


=== Sample summary (neutral removed) ===
Total events:        13,153
Removed neutrals:    3,690
Used (non-neutral):  9,463
  - Positive (1):    6,995
  - Negative (-1):   2,468


--- ΔCAR 기준 결과 ---
 t    beta      p_value sig_0.05  odds_ratio  p_before  p_after  diff_pp  n_obs
10 -0.3571 0.000000e+00        *       0.700     0.667    0.584    -8.34   9463
20 -0.3164 1.000000e-12        *       0.729     0.608    0.531    -7.74   9463
30 -0.3387 0.000000e+00        *       0.713     0.561    0.476    -8.43   9463
40 -0.3724 0.000000e+00        *       0.689     0.527    0.434    -9.27   9463
50 -0.3166 1.000000e-12        *       0.729     0.468    0.391    -7.74   9463
60 -0.3276 0.000000e+00        *       0.721     0.430    0.352    -7.78   9463

--- post CAR 기준 결과 ---
 t   beta  p_value sig_0.05  odds_ratio  p_before  p_after  diff_pp  n_obs
10 0.1994 0.000006        *       1.221     0.429    0.478     4.94   9463
20 0.2077 0.000003        *       1.231     0.398    0.448     5.06 

정보의 선반영(anticipation)·사전확산 강화
- 텔레그램 도입 이후, 공시 직전에 감성 방향으로 미리 움직이는 비율/크기가 커졌을 가능성
- 그래서 공시 순간의 post CAR 방향 정합은 높아지지만(β_post>0), “직전 대비 추가로 얼마나 같은 방향으로 더 가느냐”를 보는 ΔCAR 기준에서는 추가분이 줄어 **정합성↓**로 관측(β_Δ<0)

과잉반응 감소 / 즉시 반영 후 빠른 수렴
- 도입 전에는 공시 직후 같은 방향으로 과도하게 더 밀던 패턴이, 도입 후에는 이미 반영되어 추가 밀림이 줄어듦 → Δ 기준 정합성 감소(β_Δ<0).
- 하지만 “초기 방향” 자체는 더 정확해짐 → post 기준 정합성 증가(β_post>0).


도입 후 시장은 감성 방향으로 “빠르게 맞게 움직이지만”, 직전에 이미 반영된 탓에 공시 순간의 “추가” 같은 방향 밀림은 줄어든다. 이는 “정보의 더 빠른 확산·반영”이라는 효율성 개선 스토리와 일관됩니다.