In [1]:
import json
import logging
import warnings
import kagglehub
import pandas as pd
import statsmodels.formula.api as smf
from kagglehub import KaggleDatasetAdapter
from sklearn.preprocessing import StandardScaler

warnings.simplefilter("ignore")
logging.getLogger().setLevel(logging.WARNING)


df_companies = kagglehub.dataset_load(
    KaggleDatasetAdapter.PANDAS, "andrewmvd/sp-500-stocks", "sp500_companies.csv"
).drop_duplicates(subset=["Shortname"], keep="first")
companies = df_companies[df_companies["Country"] == "United States"]["Symbol"].values
with open("companies.json", "r", encoding="utf-8") as f:
    data = json.load(f)
missing_companies = [entry["ticker"] for entry in data["missing_companies"]]
companies = [c for c in companies if c not in missing_companies]
search_queries = ["IT capability", "organizational resilience"]
financials_file = "financials.csv"



In [2]:
def rename_columns(df):
    return df.rename(
        columns={
            "IT capability": "IT_similarity",
            "organizational resilience": "OR_similarity",
            "Net Income": "Net_Income",
        }
    )


# 合併資料
def process_data(root):
    global search_queries, financials_file, df_companies

    # 載入 similarity 資料
    df_report_IT = pd.read_csv(root + search_queries[0] + "_report_similarity.csv")
    df_filter_IT = pd.read_csv(root + search_queries[0] + "_filter_similarity.csv")
    df_report_OR = pd.read_csv(root + search_queries[1] + "_report_similarity.csv")
    df_filter_OR = pd.read_csv(root + search_queries[1] + "_filter_similarity.csv")

    # 長格式轉換
    df_report_IT_long = df_report_IT.melt(
        id_vars=["ticker"], var_name="year", value_name="IT_similarity"
    )
    df_filter_IT_long = df_filter_IT.melt(
        id_vars=["ticker"], var_name="year", value_name="IT_similarity"
    )
    df_report_OR_long = df_report_OR.melt(
        id_vars=["ticker"], var_name="year", value_name="OR_similarity"
    )
    df_filter_OR_long = df_filter_OR.melt(
        id_vars=["ticker"], var_name="year", value_name="OR_similarity"
    )

    # 合併 IT 與 OR
    df_report = pd.merge(df_report_IT_long, df_report_OR_long, on=["ticker", "year"])
    df_filter = pd.merge(df_filter_IT_long, df_filter_OR_long, on=["ticker", "year"])
    df_report = df_report.sort_values(["ticker", "year"])
    df_filter = df_filter.sort_values(["ticker", "year"])

    # 計算成長率
    df_report["IT_growth"] = df_report.groupby("ticker")["IT_similarity"].pct_change()
    df_report["OR_growth"] = df_report.groupby("ticker")["OR_similarity"].pct_change()
    df_filter["IT_growth"] = df_filter.groupby("ticker")["IT_similarity"].pct_change()
    df_filter["OR_growth"] = df_filter.groupby("ticker")["OR_similarity"].pct_change()

    # 合併財務資料
    df_financials = pd.read_csv(financials_file)
    df_financials["year"] = df_financials["year"].astype(str)
    df_report = pd.merge(df_report, df_financials, on=["ticker", "year"], how="left")
    df_filter = pd.merge(df_filter, df_financials, on=["ticker", "year"], how="left")

    # 合併公司資料
    df_comp = df_companies[["Symbol", "Sector", "Industry", "Marketcap"]].rename(
        columns={"Symbol": "ticker"}
    )
    df_report = pd.merge(df_report, df_comp, on="ticker", how="left")
    df_filter = pd.merge(df_filter, df_comp, on="ticker", how="left")

    return df_report, df_filter


# 定義迴歸模型函數
def run_regressions(df, label, control=" + C(Sector) + Marketcap"):
    results = {}
    numeric_cols = ["IT_similarity", "OR_similarity", "Net_Income", "Marketcap"]
    df_clean = df.dropna(subset=numeric_cols + ["Sector", "Industry"])
    scaler = StandardScaler()
    df_scaled = df_clean.copy()
    df_scaled[numeric_cols] = scaler.fit_transform(df_scaled[numeric_cols])

    # IT -> OR
    model = smf.ols(f"OR_similarity ~ IT_similarity{control}", data=df_scaled).fit()
    results[f"{label}：IT->OR{control}"] = model

    # IT growth -> OR growth
    model = smf.ols(f"OR_growth ~ IT_growth{control}", data=df_clean).fit()
    results[f"{label}：IT growth -> OR growth{control}"] = model

    # IT -> Revenues
    model = smf.ols(f"Revenues ~ IT_similarity{control}", data=df_scaled).fit()
    results[f"{label}：IT->Revenues{control}"] = model

    # IT -> Net Income
    model = smf.ols(f"Net_Income ~ IT_similarity{control}", data=df_scaled).fit()
    results[f"{label}：IT->Net_Income{control}"] = model

    # OR -> Revenues
    model = smf.ols(f"Revenues ~ OR_similarity{control}", data=df_scaled).fit()
    results[f"{label}：OR->Revenues{control}"] = model

    # OR -> Net Income
    model = smf.ols(f"Net_Income ~ OR_similarity{control}", data=df_scaled).fit()
    results[f"{label}：OR->Net_Income{control}"] = model

    # IT + OR -> Revenues
    model_IT_OR_Net_Income = smf.ols(
        f"Revenues ~ IT_similarity + OR_similarity{control}", data=df_scaled
    ).fit()
    results[f"{label}：IT+OR->Net_Income{control}"] = model_IT_OR_Net_Income

    # IT + OR -> Net Income
    model_IT_OR_Net_Income = smf.ols(
        f"Net_Income ~ IT_similarity + OR_similarity{control}", data=df_scaled
    ).fit()
    results[f"{label}：IT+OR->Net_Income{control}"] = model_IT_OR_Net_Income

    return results


# 定義時間遞移迴歸模型函數
def run_lag_regressions(df, label):
    results = {}

    # 建立 IT_similarity 的 lag 變數（t-1 到 t-5）
    df = df[["ticker", "year", "IT_similarity", "OR_similarity"]].copy()
    df = df.rename(columns={"IT_similarity": "IT", "OR_similarity": "OR"})
    for lag in range(1, 6):
        df[f"IT_lag{lag}"] = df.groupby("ticker")["IT"].shift(lag)

    # 執行迴歸：OR ~ IT_lag{t-lag}
    for lag in range(1, 6):
        col = f"IT_lag{lag}"
        df_lag = df.dropna(subset=["OR", col])
        model = smf.ols(formula=f"OR ~ {col}", data=df_lag).fit()
        results[f"{label}_OR~{col}"] = model

    return results


# 定義成長率時間遞移迴歸模型函數
def run_growth_lag_regressions(df, label):
    results = {}

    # 建立 IT_growth 的 lag 變數（t-1 到 t-5）
    for lag in range(1, 6):
        df[f"IT_growth_lag{lag}"] = df.groupby("ticker")["IT_growth"].shift(lag)

    # 執行迴歸：OR_growth_t ~ IT_growth_{t-lag}
    for lag in range(1, 6):
        col = f"IT_growth_lag{lag}"
        df_lag = df.dropna(subset=["OR_growth", col])
        model = smf.ols(formula=f"OR_growth ~ {col}", data=df_lag).fit()
        results[f"{label}_ORgrowth~{col}"] = model

    return results

In [92]:
df_7_report, df_7_filter = process_data("paper_7/")
df_17_report, df_17_filter = process_data("paper_17/")

In [108]:
df_7_report.to_csv("report.csv", index=False)
df_7_filter.to_csv("filter.csv", index=False)

In [112]:
# 迴歸分析
report_7_results = run_regressions(rename_columns(df_7_report), "Item7_report")
filter_7_results = run_regressions(rename_columns(df_7_filter), "Item7_filter")
report_17_results = run_regressions(rename_columns(df_17_report), "Item17_report")
filter_17_results = run_regressions(rename_columns(df_17_filter), "Item17_filter")

# 顯示所有模型的 summary
all_models = {
    **report_7_results,
    **filter_7_results,
    **report_17_results,
    **filter_17_results,
}
model_summaries = {name: model.summary() for name, model in all_models.items()}
model_summaries

{'Item7_report：IT->OR + C(Sector) + Marketcap': <class 'statsmodels.iolib.summary.Summary'>
 """
                             OLS Regression Results                            
 Dep. Variable:          OR_similarity   R-squared:                       0.497
 Model:                            OLS   Adj. R-squared:                  0.460
 Method:                 Least Squares   F-statistic:                     13.51
 Date:                Fri, 18 Apr 2025   Prob (F-statistic):           1.35e-10
 Time:                        05:10:31   Log-Likelihood:                -95.700
 No. Observations:                  89   AIC:                             205.4
 Df Residuals:                      82   BIC:                             222.8
 Df Model:                           6                                         
 Covariance Type:            nonrobust                                         
                                       coef    std err          t      P>|t|      [0.025      0.975]
 -

In [87]:
# 時間遞移迴歸
report_7_results = run_lag_regressions(rename_columns(df_7_report), "Item7_report")
filter_7_results = run_lag_regressions(rename_columns(df_7_filter), "Item7_filter")
report_17_results = run_lag_regressions(rename_columns(df_17_report), "Item17_report")
filter_17_results = run_lag_regressions(rename_columns(df_17_filter), "Item17_filter")

# 顯示所有模型的 summary
all_models = {
    **report_7_results,
    **filter_7_results,
    **report_17_results,
    **filter_17_results,
}
model_summaries = {name: model.summary() for name, model in all_models.items()}
model_summaries

{'Item7_report_OR~IT_lag1': <class 'statsmodels.iolib.summary.Summary'>
 """
                             OLS Regression Results                            
 Dep. Variable:                     OR   R-squared:                       0.198
 Model:                            OLS   Adj. R-squared:                  0.189
 Method:                 Least Squares   F-statistic:                     21.69
 Date:                Wed, 16 Apr 2025   Prob (F-statistic):           1.13e-05
 Time:                        10:19:54   Log-Likelihood:                 75.467
 No. Observations:                  90   AIC:                            -146.9
 Df Residuals:                      88   BIC:                            -141.9
 Df Model:                           1                                         
 Covariance Type:            nonrobust                                         
                  coef    std err          t      P>|t|      [0.025      0.975]
 ------------------------------------------

In [93]:
def run_growth_regressions(df, label):
    results = {}

    # 執行回歸：OR_growth ~ IT_growth
    df_growth = df.dropna(subset=["IT_growth", "OR_growth"])
    model = smf.ols("OR_growth ~ IT_growth", data=df_growth).fit()

    results[f"{label}_ORgrowth~ITgrowth"] = model
    return results

In [94]:
# 成長率迴歸
report_7_results = run_growth_regressions(rename_columns(df_7_report), "Item7_report")
filter_7_results = run_growth_regressions(rename_columns(df_7_filter), "Item7_filter")
report_17_results = run_growth_regressions(
    rename_columns(df_17_report), "Item17_report"
)
filter_17_results = run_growth_regressions(
    rename_columns(df_17_filter), "Item17_filter"
)

# 顯示所有模型的 summary
all_models = {
    **report_7_results,
    **filter_7_results,
    **report_17_results,
    **filter_17_results,
}
model_summaries = {name: model.summary() for name, model in all_models.items()}
model_summaries

{'Item7_report_ORgrowth~ITgrowth': <class 'statsmodels.iolib.summary.Summary'>
 """
                             OLS Regression Results                            
 Dep. Variable:              OR_growth   R-squared:                       0.033
 Model:                            OLS   Adj. R-squared:                  0.022
 Method:                 Least Squares   F-statistic:                     3.012
 Date:                Wed, 16 Apr 2025   Prob (F-statistic):             0.0862
 Time:                        10:38:50   Log-Likelihood:                 20.203
 No. Observations:                  90   AIC:                            -36.41
 Df Residuals:                      88   BIC:                            -31.41
 Df Model:                           1                                         
 Covariance Type:            nonrobust                                         
                  coef    std err          t      P>|t|      [0.025      0.975]
 -----------------------------------

In [89]:
# 成長率時間遞移迴歸
report_7_results = run_growth_regressions(rename_columns(df_7_report), "Item7_report")
filter_7_results = run_growth_regressions(rename_columns(df_7_filter), "Item7_filter")
report_17_results = run_growth_regressions(
    rename_columns(df_17_report), "Item17_report"
)
filter_17_results = run_growth_regressions(
    rename_columns(df_17_filter), "Item17_filter"
)

# 顯示所有模型的 summary
all_models = {
    **report_7_results,
    **filter_7_results,
    **report_17_results,
    **filter_17_results,
}
model_summaries = {name: model.summary() for name, model in all_models.items()}
model_summaries

{'Item7_report_ORgrowth~ITgrowth': <class 'statsmodels.iolib.summary.Summary'>
 """
                             OLS Regression Results                            
 Dep. Variable:              OR_growth   R-squared:                       0.033
 Model:                            OLS   Adj. R-squared:                  0.022
 Method:                 Least Squares   F-statistic:                     3.012
 Date:                Wed, 16 Apr 2025   Prob (F-statistic):             0.0862
 Time:                        10:19:58   Log-Likelihood:                 20.203
 No. Observations:                  90   AIC:                            -36.41
 Df Residuals:                      88   BIC:                            -31.41
 Df Model:                           1                                         
 Covariance Type:            nonrobust                                         
                  coef    std err          t      P>|t|      [0.025      0.975]
 -----------------------------------