<a href="https://colab.research.google.com/github/cc-huang-0716/batting-analysis/blob/main/batter_csv.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sb
import statsmodels.api as sm
from statsmodels.stats.diagnostic import linear_rainbow
from statsmodels.stats.outliers_influence import variance_inflation_factor
from scipy import stats
# Load the hitters data set from the current working directory.
df = pd.read_csv("Hitters.csv")

# Peek at the first rows. As a bare expression this is only rendered by a
# notebook when it is the last statement of the cell.
df.head()

# Fill gaps in every numeric column with that column's arithmetic mean.
# NOTE(review): this also fills "Salary" (the response used further down)
# if it contains missing values -- confirm that imputing the target is intended.
n_df = df.select_dtypes(include=np.number)
column_means = n_df.mean()
df[n_df.columns] = n_df.fillna(column_means)

# Batting average: hits per at-bat.
df["Batting_Avg"] = df["Hits"].div(df["AtBat"])

# On-base percentage, computed here from hits, walks and at-bats only.
times_on_base = df["Hits"].add(df["Walks"])
plate_appearances = df["AtBat"].add(df["Walks"])
df["OBP"] = times_on_base.div(plate_appearances)

# Candidate predictors: every numeric column except the response "Salary".
numeric_columns = df.select_dtypes(include=np.number).columns
column_data = [name for name in numeric_columns if name != "Salary"]

# Predictors whose correlation with salary reaches the cutoff end up here.
data1 = []

# Minimum absolute Pearson r for a predictor to be kept.
strong_cutoff = 0.45
salary = df["Salary"]

# Correlate each candidate with salary on its own and report the result.
for column in column_data:
    stat, p = stats.pearsonr(df[column], salary)
    print(f"{column}: 相關係數={stat:.3f}, p值={p:.3f}")

    # Keep the ">=" form so a NaN coefficient is classified as weak.
    is_strong = abs(stat) >= strong_cutoff
    if not is_strong:
        print(f"{column} 與薪資關聯性較弱")
        continue

    print(f"{column} 與薪資關聯性較強")
    data1.append(column)

# Multiple regression of Salary on the predictors kept by the correlation
# screen, followed by diagnostics. Skipped entirely when no predictor was kept.
if data1:
    X = df[data1]

    # OLS does not add an intercept by itself, so append a constant column.
    X = sm.add_constant(X)

    y = df["Salary"]
    model = sm.OLS(y, X).fit()
    print("\n--- 多元迴歸結果 ---")
    print(model.summary())

    # Collinearity check: one VIF per column of the design matrix. The "const"
    # row is reported too because X includes the intercept column.
    vif_data = pd.DataFrame()
    vif_data["Feature"] = X.columns
    vif_data["VIF"] = [variance_inflation_factor(X.values, i) for i in range(X.shape[1])]
    print("\n--- 共線性檢測 (VIF) ---")
    print(vif_data)

    # Residual analysis: Shapiro-Wilk test, H0 = residuals are normally distributed.
    residuals = model.resid
    stat, p = stats.shapiro(residuals)
    print("\n--- 殘差常態性檢定（Shapiro-Wilk） ---")
    print(f"Stat = {stat:.3f}, p = {p:.3f}")
    if p > 0.05:
        print("殘差服從常態分配")
    else:
        print("殘差不服從常態分配")

    # Global test: the overall F-test of the fitted model (H0 = all slope
    # coefficients are zero). A SMALL p-value means the model as a whole is
    # significant. The previous code used the Rainbow test here and treated
    # p > 0.05 as "model explains well", which is neither what the Rainbow
    # test measures nor the right direction for a global test.
    print("\n--- Global Test (F Test) ---")
    print(f"Stat = {model.fvalue:.3f}, p = {model.f_pvalue:.3f}")
    if model.f_pvalue < 0.05:
        print("模型解釋度良好")
    else:
        print("模型解釋度不顯著")

    # Rainbow test: a linearity check. H0 = the linear specification is
    # adequate, so p > 0.05 means linearity is not rejected. It says nothing
    # about how much variance the model explains.
    rainbow_stat, rainbow_p = linear_rainbow(model)
    print("\n--- 線性檢定 (Rainbow Test) ---")
    print(f"Stat = {rainbow_stat:.3f}, p = {rainbow_p:.3f}")
    if rainbow_p > 0.05:
        print("未拒絕線性假設，線性模型設定適當")
    else:
        print("拒絕線性假設，線性模型設定可能不適當")

AtBat: 相關係數=0.342, p值=0.000
AtBat 與薪資關聯性較弱
Hits: 相關係數=0.385, p值=0.000
Hits 與薪資關聯性較弱
HmRun: 相關係數=0.312, p值=0.000
HmRun 與薪資關聯性較弱
Runs: 相關係數=0.372, p值=0.000
Runs 與薪資關聯性較弱
RBI: 相關係數=0.402, p值=0.000
RBI 與薪資關聯性較弱
Walks: 相關係數=0.402, p值=0.000
Walks 與薪資關聯性較弱
Years: 相關係數=0.352, p值=0.000
Years 與薪資關聯性較弱
CAtBat: 相關係數=0.468, p值=0.000
CAtBat 與薪資關聯性較強
CHits: 相關係數=0.491, p值=0.000
CHits 與薪資關聯性較強
CHmRun: 相關係數=0.452, p值=0.000
CHmRun 與薪資關聯性較強
CRuns: 相關係數=0.504, p值=0.000
CRuns 與薪資關聯性較強
CRBI: 相關係數=0.497, p值=0.000
CRBI 與薪資關聯性較強
CWalks: 相關係數=0.438, p值=0.000
CWalks 與薪資關聯性較弱
PutOuts: 相關係數=0.271, p值=0.000
PutOuts 與薪資關聯性較弱
Assists: 相關係數=0.024, p值=0.663
Assists 與薪資關聯性較弱
Errors: 相關係數=-0.005, p值=0.928
Errors 與薪資關聯性較弱
Batting_Avg: 相關係數=0.158, p值=0.004
Batting_Avg 與薪資關聯性較弱
OBP: 相關係數=0.200, p值=0.000
OBP 與薪資關聯性較弱

--- 多元迴歸結果 ---
                            OLS Regression Results                            
Dep. Variable:                 Salary   R-squared:                       0.332
Model:                            OLS