In [1]:
import sys
import os
sys.path.append('../dags')

from database import create_database_engine


# 환경 변수 설정
os.environ["POSTGRES_USER"] = "airflow"
os.environ["POSTGRES_PASSWORD"] = "airflow"
os.environ["POSTGRES_DB"] = "events"
os.environ["POSTGRES_PORT"] = "5433"

engine = create_database_engine(host="127.0.0.1")
print("Successfully connected to the database")

Successfully connected to the database


In [2]:
import pandas as pd
from datetime import datetime


# 나머지는 변수로 관리
label_table = "kind"
return_table = "abnormal_return_kind"
label_col = "label"

# 포함할 abnormal return 컬럼
abn_return_cols = [
    "abn_ret_minus_10m", "abn_ret_minus_20m", "abn_ret_minus_30m", 
    "abn_ret_minus_40m", "abn_ret_minus_50m", "abn_ret_minus_60m",
    "abn_ret_10m", "abn_ret_20m", "abn_ret_30m", 
    "abn_ret_40m", "abn_ret_50m", "abn_ret_60m"
]

# SQL 컬럼 문자열을 만듬 (event_id, event_ts는 하드코딩)
sql_columns = (
    [f"ar.event_ts"] +
    [f"ar.{col}" for col in abn_return_cols] +
    [f"k.{label_col}"]
)
sql_columns_str = ",\n    ".join(sql_columns)

query = f"""
SELECT {sql_columns_str}
FROM {return_table} ar
JOIN "{label_table}" k ON ar.event_id = k.id
ORDER BY ar.event_ts ASC
"""

df_before_telegram = pd.read_sql(query, engine)
df_before_telegram["period_dummy"] = 0
df_before_telegram

DETAIL:  The database was created using collation version 2.36, but the operating system provides version 2.41.
HINT:  Rebuild all objects in this database that use the default collation and run ALTER DATABASE events REFRESH COLLATION VERSION, or build PostgreSQL with the right library version.


Unnamed: 0,event_ts,abn_ret_minus_10m,abn_ret_minus_20m,abn_ret_minus_30m,abn_ret_minus_40m,abn_ret_minus_50m,abn_ret_minus_60m,abn_ret_10m,abn_ret_20m,abn_ret_30m,abn_ret_40m,abn_ret_50m,abn_ret_60m,label,period_dummy
0,2021-01-04 10:08:00+00:00,-2.14,-2.14,-2.14,,,,-0.45,0.17,0.05,0.15,-0.15,0.18,0,0
1,2021-01-04 10:08:00+00:00,-2.14,-2.14,-2.14,,,,-0.45,0.17,0.05,0.15,-0.15,0.18,0,0
2,2021-01-04 10:08:00+00:00,,,,,,,-1.91,-0.66,-1.47,-1.37,-0.60,-0.95,1,0
3,2021-01-04 10:14:00+00:00,-8.40,,,,,,-4.90,-4.20,-3.46,-5.53,-5.16,-6.00,1,0
4,2021-01-04 10:16:00+00:00,0.22,1.55,1.55,1.55,,,0.03,-0.83,-0.54,-0.96,-0.57,-1.83,-1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5067,2022-06-29 14:04:00+00:00,2.30,2.19,2.24,2.43,2.88,2.47,0.55,0.43,-0.82,-0.84,-0.85,-1.63,1,0
5068,2022-06-29 14:20:00+00:00,-0.16,0.05,0.02,0.06,0.08,0.17,-0.03,0.34,0.11,0.32,0.54,0.24,1,0
5069,2022-06-29 14:21:00+00:00,0.00,0.12,0.06,-0.14,-0.51,-0.28,0.06,-0.04,-0.11,-0.32,0.03,,1,0
5070,2022-06-29 14:35:00+00:00,0.14,0.27,0.27,0.19,0.18,0.06,-0.22,0.03,0.03,-0.08,,,1,0


In [None]:
label_table = "label"
return_table = "abnormal_return"
label_col = "label"


all_sql_columns = (
    [f"ar.event_ts"] +
    [f"ar.{col}" for col in abn_return_cols] +
    [f"k.{label_col}"]
)
all_sql_columns_str = ",\n    ".join(all_sql_columns)

query = f"""
SELECT {all_sql_columns_str}
FROM {return_table} ar
JOIN "{label_table}" k ON ar.event_id = k.id
JOIN disclosure_events de ON ar.event_id = de.id
WHERE k.{label_col} IS NOT NULL
ORDER BY ar.event_ts ASC
"""

df_after_telegram = pd.read_sql(query, engine)
df_after_telegram["period_dummy"] = 1
label_map = {0: 1, 1: 0, 2: -1}
df_after_telegram['label'] = df_after_telegram['label'].map(label_map)
df_after_telegram

Unnamed: 0,event_ts,abn_ret_minus_10m,abn_ret_minus_20m,abn_ret_minus_30m,abn_ret_minus_40m,abn_ret_minus_50m,abn_ret_minus_60m,abn_ret_10m,abn_ret_20m,abn_ret_30m,abn_ret_40m,abn_ret_50m,abn_ret_60m,label,period_dummy
0,2022-07-01 01:03:17+00:00,-1.80,-1.61,-2.03,-2.74,-2.32,-1.18,-1.27,-2.41,-2.36,-1.79,-2.98,-1.98,1,1
1,2022-07-01 01:24:45+00:00,-1.57,-1.62,-1.91,-1.33,-2.52,-1.48,0.53,2.43,1.19,1.57,1.38,1.72,1,1
2,2022-07-01 01:27:02+00:00,-0.05,0.39,0.10,-1.01,-0.14,-0.14,0.29,0.24,1.25,1.39,2.02,1.63,1,1
3,2022-07-01 01:43:15+00:00,-0.89,-0.92,-0.82,-0.81,-0.59,0.16,0.08,0.36,0.31,0.12,-0.18,0.23,1,1
4,2022-07-01 02:30:09+00:00,-1.91,-0.73,0.47,0.44,0.08,1.81,-0.40,-1.02,-1.80,-1.24,-1.82,-1.60,-1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8076,2023-12-28 05:53:08+00:00,-0.08,0.80,0.02,0.26,0.25,0.86,0.24,0.24,,,,,1,1
8077,2023-12-28 05:55:03+00:00,-0.31,0.84,0.80,1.16,0.76,0.93,-2.10,-3.25,,,,,0,1
8078,2023-12-28 05:56:27+00:00,0.05,0.08,-0.02,-0.16,-0.06,0.12,-0.28,-0.54,,,,,1,1
8079,2023-12-28 05:56:32+00:00,0.05,0.08,-0.02,-0.16,-0.06,0.12,-0.28,-0.54,,,,,1,1


In [4]:
df_total = pd.concat([df_before_telegram, df_after_telegram])



In [16]:
df_total

Unnamed: 0,event_ts,abn_ret_minus_10m,abn_ret_minus_20m,abn_ret_minus_30m,abn_ret_minus_40m,abn_ret_minus_50m,abn_ret_minus_60m,abn_ret_10m,abn_ret_20m,abn_ret_30m,abn_ret_40m,abn_ret_50m,abn_ret_60m,label,period_dummy
0,2021-01-04 10:08:00+00:00,-2.14,-2.14,-2.14,,,,-0.45,0.17,0.05,0.15,-0.15,0.18,0,0
1,2021-01-04 10:08:00+00:00,-2.14,-2.14,-2.14,,,,-0.45,0.17,0.05,0.15,-0.15,0.18,0,0
2,2021-01-04 10:08:00+00:00,,,,,,,-1.91,-0.66,-1.47,-1.37,-0.60,-0.95,1,0
3,2021-01-04 10:14:00+00:00,-8.40,,,,,,-4.90,-4.20,-3.46,-5.53,-5.16,-6.00,1,0
4,2021-01-04 10:16:00+00:00,0.22,1.55,1.55,1.55,,,0.03,-0.83,-0.54,-0.96,-0.57,-1.83,-1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8076,2023-12-28 05:53:08+00:00,-0.08,0.80,0.02,0.26,0.25,0.86,0.24,0.24,,,,,1,1
8077,2023-12-28 05:55:03+00:00,-0.31,0.84,0.80,1.16,0.76,0.93,-2.10,-3.25,,,,,0,1
8078,2023-12-28 05:56:27+00:00,0.05,0.08,-0.02,-0.16,-0.06,0.12,-0.28,-0.54,,,,,1,1
8079,2023-12-28 05:56:32+00:00,0.05,0.08,-0.02,-0.16,-0.06,0.12,-0.28,-0.54,,,,,1,1


# 회귀분석

## 📘 모형식 (절댓값 기준)

$$
\left| CAR_{post,i} - CAR_{pre,i} \right| = \alpha + \beta \cdot Telegram_i + \epsilon_i
$$

---

## 📊 변수 설명

**종속변수 (Dependent Variable)**  
이벤트 전후 CAR 변화폭의 절댓값:

$$
\left| CAR_{post,i} - CAR_{pre,i} \right|
$$

---

**설명변수 (Key Independent Variable)**  
텔레그램 도입 여부 변수:

$$
Telegram_i =
\begin{cases}
1, & \text{텔레그램 도입 이후 (After introduction)} \\
0, & \text{도입 이전 (Before introduction)}
\end{cases}
$$

---

## 📈 해석

이 회귀모형은 **텔레그램 도입 여부**가  
이벤트 전후 누적초과수익률(CAR) 변화폭의 크기에 미치는 영향을 분석하기 위한 것입니다.

- β가 **양(+)**이면 → 텔레그램 도입 이후 변동 폭이 **커졌음**  
- β가 **음(-)**이면 → 텔레그램 도입 이후 변동 폭이 **작아졌음**


In [17]:
import numpy as np
import pandas as pd
import statsmodels.api as sm

# df_total이 이미 존재한다고 가정
horizons = [10, 20, 30, 40, 50, 60]
rows = []

for h in horizons:
    post_col = f"abn_ret_{h}m"
    pre_col  = f"abn_ret_minus_{h}m"
    if post_col not in df_total.columns or pre_col not in df_total.columns:
        raise KeyError(f"'{post_col}' 또는 '{pre_col}' 컬럼을 찾을 수 없습니다.")

    y = (df_total[post_col] - df_total[pre_col]).abs()
    X = sm.add_constant(df_total["period_dummy"], has_constant="add")

    valid = ~(y.isna() | X.isna().any(axis=1))
    y_ = y[valid]
    X_ = X[valid]

    model = sm.OLS(y_, X_).fit(cov_type="HC1")

    beta = model.params.get("period_dummy", np.nan)
    se   = model.bse.get("period_dummy", np.nan)
    tval = model.tvalues.get("period_dummy", np.nan)
    pval = model.pvalues.get("period_dummy", np.nan)

    rows.append({
        "horizon_min": h,
        "coef_period_dummy": beta,
        "std_err": se,
        "t_value": tval,
        "p_value": pval,
        "sig_0.05": "*" if (not pd.isna(pval) and pval < 0.05) else "",
        "n_obs": int(model.nobs),
        "r_squared": model.rsquared
    })

result_table = pd.DataFrame(rows).sort_values("horizon_min").reset_index(drop=True)

# ✅ p-value 포맷 조정 (지수표기 없이 소수점 8자리, 0은 "<1e-8"로 표시)
result_table["p_value"] = result_table["p_value"].apply(
    lambda x: f"{x:.12f}" if (x > 0 and not pd.isna(x)) else "<1e-8"
)

# 보기 좋게 출력
display_cols = ["horizon_min", "coef_period_dummy", "std_err", "t_value", "p_value", "sig_0.05", "n_obs", "r_squared"]
print(result_table[display_cols].to_string(index=False))

 horizon_min  coef_period_dummy  std_err   t_value        p_value sig_0.05  n_obs  r_squared
          10          -0.284615 0.033934 -8.387221 0.000000000000        *  12329   0.005971
          20          -0.278708 0.038763 -7.190126 0.000000000001        *  11454   0.004773
          30          -0.268489 0.043548 -6.165326 0.000000000703        *  10596   0.003830
          40          -0.444945 0.057169 -7.782917 0.000000000000        *   9728   0.006982
          50          -0.396699 0.058895 -6.735650 0.000000000016        *   8895   0.005539
          60          -0.371746 0.063477 -5.856352 0.000000004731        *   8069   0.004634


# 방향 정합성

In [15]:
import numpy as np, statsmodels.api as sm

def sign_from_label(lbl):
    # 1(긍정)->+1, 0(부정)->-1, -1(중립)->0
    return 1 if lbl==1 else (-1 if lbl==0 else 0)

df = df_total.copy()
df["label_sign"] = df["label"].apply(sign_from_label)

def logistic_hit(h):
    delta = df[f"abn_ret_{h}m"] - df[f"abn_ret_minus_{h}m"]
    realized_sign = np.sign(delta)
    mask = df["label_sign"]!=0  # 중립 제외 버전
    hit = (realized_sign[mask] == df.loc[mask,"label_sign"]).astype(int)
    X = sm.add_constant(df.loc[mask,"period_dummy"])
    m = sm.Logit(hit, X).fit(disp=0)
    return m.params["period_dummy"], m.pvalues["period_dummy"]

for h in [10,20,30,40,50,60]:
    beta, p = logistic_hit(h)
    print(h, beta, p)


10 0.21184568390232825 6.919639020658233e-08
20 0.15386054812677097 8.356090787243819e-05
30 0.05919899858405471 0.13098669157847329
40 0.019897955534859117 0.61406862414535
50 -0.009608897682253046 0.8104761557083726
60 -0.05060412029326476 0.21499638276085986
