### 只有男性、各科目分開
只需執行標註「自動調整參數」(GridSearchCV) 的儲存格即可。

In [2]:
import pandas as pd
from xgboost import XGBRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import matplotlib.pyplot as plt
import statsmodels.api as sm
import numpy as np
from sklearn.model_selection import GridSearchCV

In [4]:
# Load the cleaned student-score table from the dataset folder.
df = pd.read_csv("./dataset/student_scores_clean.csv")

# Keep only the rows whose gender code equals 1 (the notebook title says this
# analysis covers male students only).
df_male = df.loc[df["gender"] == 1]

# Every column whose name starts with "career_" is used as a predictor.
career_columns = list(filter(lambda name: name.startswith("career_"), df.columns))

# Final predictor list: four named columns followed by all career columns.
features = [
    "part_time_job",
    "absence_days",
    "extracurricular_activities",
    "weekly_self_study_hours",
    *career_columns,
]

### xgboost

### 數學

In [28]:
# Predictors and target for the math-score model.
X, y = df_male[features], df_male["math_score"]

# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [6]:
# Hand-tuned hyperparameters (manual baseline).
model = XGBRegressor(
    # Objective and reproducibility.
    objective='reg:squarederror',
    random_state=42,
    # Ensemble size, tree depth and step size.
    n_estimators=300,
    max_depth=2,
    learning_rate=0.01,
    # Row / column sampling per tree.
    subsample=0.8,
    colsample_bytree=0.8,
    # L1 / L2 regularisation strengths.
    reg_alpha=0.1,
    reg_lambda=0.1,
)

# Kept as the cell's last expression so the fitted estimator is displayed.
model.fit(X_train, y_train)

In [7]:
# Evaluate the hand-tuned model on the held-out test split.
y_pred = model.predict(X_test)

# MSE, MAE and R² are all computed from the same (truth, prediction) pair.
mse, mae, r2 = (
    metric(y_test, y_pred)
    for metric in (mean_squared_error, mean_absolute_error, r2_score)
)
rmse = np.sqrt(mse)  # RMSE is the square root of the MSE

# One line per metric, same text and precision as before.
print(
    f"MSE: {mse:.2f}",
    f"MAE: {mae:.2f}",
    f"R² 分數: {r2:.4f}",
    f"RMSE: {rmse:.2f}",
    sep="\n",
)

MSE: 131.45
MAE: 8.95
R² 分數: 0.2919
RMSE: 11.46


In [14]:
# Automatic hyperparameter tuning with GridSearchCV (math score).

# Candidate values per hyperparameter; the search tries every combination.
param_grid = dict(
    n_estimators=[100, 300],
    max_depth=[2, 4, 6],
    learning_rate=[0.01, 0.1],
    subsample=[0.8, 1],
    colsample_bytree=[0.8, 1],
    reg_alpha=[0, 0.1],
    reg_lambda=[0.1, 1],
)

# Base regressor whose hyperparameters are being searched.
xgb_model = XGBRegressor(random_state=42, objective='reg:squarederror')

# 5-fold cross-validated search scored by negative MSE (higher is better),
# with n_jobs=-1 to parallelise the fits.
grid_search = GridSearchCV(
    xgb_model,
    param_grid,
    cv=5,
    scoring='neg_mean_squared_error',
    verbose=1,
    n_jobs=-1,
)
grid_search.fit(X_train, y_train)

# Report the winning parameter combination and its cross-validated score.
print("最佳參數組合 (Best parameters):", grid_search.best_params_, sep="\n")
print("\n最佳模型的負 MSE (Best negative MSE):", grid_search.best_score_, sep="\n")

# Keep the best estimator found by the search; it is scored on the test split
# in the following cell.
math_best_model = grid_search.best_estimator_

Fitting 5 folds for each of 192 candidates, totalling 960 fits
最佳參數組合 (Best parameters):
{'colsample_bytree': 0.8, 'learning_rate': 0.01, 'max_depth': 2, 'n_estimators': 300, 'reg_alpha': 0, 'reg_lambda': 1, 'subsample': 0.8}

最佳模型的負 MSE (Best negative MSE):
-115.15100402832032


In [15]:
# Evaluate the grid-searched math model on the held-out test split.
y_pred = math_best_model.predict(X_test)

# MSE, MAE and R² are all computed from the same (truth, prediction) pair.
mse, mae, r2 = (
    metric(y_test, y_pred)
    for metric in (mean_squared_error, mean_absolute_error, r2_score)
)
rmse = np.sqrt(mse)  # RMSE is the square root of the MSE

# One line per metric, same text and precision as before.
print(
    f"MSE: {mse:.2f}",
    f"MAE: {mae:.2f}",
    f"R² 分數: {r2:.4f}",
    f"RMSE: {rmse:.2f}",
    sep="\n",
)

MSE: 131.46
MAE: 8.95
R² 分數: 0.2918
RMSE: 11.47


### 歷史

In [None]:
# Predictors and target for the history-score model.
X, y = df_male[features], df_male["history_score"]

# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Candidate values per hyperparameter; the search tries every combination.
param_grid = dict(
    n_estimators=[100, 300],
    max_depth=[2, 4, 6],
    learning_rate=[0.01, 0.1],
    subsample=[0.8, 1],
    colsample_bytree=[0.8, 1],
    reg_alpha=[0, 0.1],
    reg_lambda=[0.1, 1],
)

# Base regressor whose hyperparameters are being searched.
xgb_model = XGBRegressor(random_state=42, objective='reg:squarederror')

# 5-fold cross-validated search scored by negative MSE (higher is better),
# with n_jobs=-1 to parallelise the fits.
grid_search = GridSearchCV(
    xgb_model,
    param_grid,
    cv=5,
    scoring='neg_mean_squared_error',
    verbose=1,
    n_jobs=-1,
)
grid_search.fit(X_train, y_train)

# Report the winning parameter combination and its cross-validated score.
print("最佳參數組合 (Best parameters):", grid_search.best_params_, sep="\n")
print("\n最佳模型的負 MSE (Best negative MSE):", grid_search.best_score_, sep="\n")

# Keep the best estimator found by the search for the evaluation cell below.
history_best_model = grid_search.best_estimator_

Fitting 5 folds for each of 192 candidates, totalling 960 fits
最佳參數組合 (Best parameters):
{'colsample_bytree': 1, 'learning_rate': 0.01, 'max_depth': 2, 'n_estimators': 300, 'reg_alpha': 0, 'reg_lambda': 0.1, 'subsample': 0.8}

最佳模型的負 MSE (Best negative MSE):
-136.17791595458985


In [17]:
# Evaluate the grid-searched history model on the held-out test split.
y_pred = history_best_model.predict(X_test)

# MSE, MAE and R² are all computed from the same (truth, prediction) pair.
mse, mae, r2 = (
    metric(y_test, y_pred)
    for metric in (mean_squared_error, mean_absolute_error, r2_score)
)
rmse = np.sqrt(mse)  # RMSE is the square root of the MSE

# One line per metric, same text and precision as before.
print(
    f"MSE: {mse:.2f}",
    f"MAE: {mae:.2f}",
    f"R² 分數: {r2:.4f}",
    f"RMSE: {rmse:.2f}",
    sep="\n",
)

MSE: 149.23
MAE: 10.36
R² 分數: 0.1240
RMSE: 12.22


### 物理

In [18]:
# Predictors and target for the physics-score model.
X, y = df_male[features], df_male["physics_score"]

# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Candidate values per hyperparameter; the search tries every combination.
param_grid = dict(
    n_estimators=[100, 300],
    max_depth=[2, 4, 6],
    learning_rate=[0.01, 0.1],
    subsample=[0.8, 1],
    colsample_bytree=[0.8, 1],
    reg_alpha=[0, 0.1],
    reg_lambda=[0.1, 1],
)

# Base regressor whose hyperparameters are being searched.
xgb_model = XGBRegressor(random_state=42, objective='reg:squarederror')

# 5-fold cross-validated search scored by negative MSE (higher is better),
# with n_jobs=-1 to parallelise the fits.
grid_search = GridSearchCV(
    xgb_model,
    param_grid,
    cv=5,
    scoring='neg_mean_squared_error',
    verbose=1,
    n_jobs=-1,
)
grid_search.fit(X_train, y_train)

# Report the winning parameter combination and its cross-validated score.
print("最佳參數組合 (Best parameters):", grid_search.best_params_, sep="\n")
print("\n最佳模型的負 MSE (Best negative MSE):", grid_search.best_score_, sep="\n")

# Keep the best estimator found by the search for the evaluation cell below.
physics_best_model = grid_search.best_estimator_

Fitting 5 folds for each of 192 candidates, totalling 960 fits
最佳參數組合 (Best parameters):
{'colsample_bytree': 0.8, 'learning_rate': 0.1, 'max_depth': 2, 'n_estimators': 100, 'reg_alpha': 0, 'reg_lambda': 0.1, 'subsample': 1}

最佳模型的負 MSE (Best negative MSE):
-125.2232894897461


In [19]:
# Evaluate the grid-searched physics model on the held-out test split.
y_pred = physics_best_model.predict(X_test)

# MSE, MAE and R² are all computed from the same (truth, prediction) pair.
mse, mae, r2 = (
    metric(y_test, y_pred)
    for metric in (mean_squared_error, mean_absolute_error, r2_score)
)
rmse = np.sqrt(mse)  # RMSE is the square root of the MSE

# One line per metric, same text and precision as before.
print(
    f"MSE: {mse:.2f}",
    f"MAE: {mae:.2f}",
    f"R² 分數: {r2:.4f}",
    f"RMSE: {rmse:.2f}",
    sep="\n",
)

MSE: 149.87
MAE: 10.51
R² 分數: 0.0937
RMSE: 12.24


### 化學

In [20]:
# Predictors and target for the chemistry-score model.
X, y = df_male[features], df_male["chemistry_score"]

# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Candidate values per hyperparameter; the search tries every combination.
param_grid = dict(
    n_estimators=[100, 300],
    max_depth=[2, 4, 6],
    learning_rate=[0.01, 0.1],
    subsample=[0.8, 1],
    colsample_bytree=[0.8, 1],
    reg_alpha=[0, 0.1],
    reg_lambda=[0.1, 1],
)

# Base regressor whose hyperparameters are being searched.
xgb_model = XGBRegressor(random_state=42, objective='reg:squarederror')

# 5-fold cross-validated search scored by negative MSE (higher is better),
# with n_jobs=-1 to parallelise the fits.
grid_search = GridSearchCV(
    xgb_model,
    param_grid,
    cv=5,
    scoring='neg_mean_squared_error',
    verbose=1,
    n_jobs=-1,
)
grid_search.fit(X_train, y_train)

# Report the winning parameter combination and its cross-validated score.
print("最佳參數組合 (Best parameters):", grid_search.best_params_, sep="\n")
print("\n最佳模型的負 MSE (Best negative MSE):", grid_search.best_score_, sep="\n")

# Keep the best estimator found by the search for the evaluation cell below.
chemistry_best_model = grid_search.best_estimator_

Fitting 5 folds for each of 192 candidates, totalling 960 fits
最佳參數組合 (Best parameters):
{'colsample_bytree': 0.8, 'learning_rate': 0.01, 'max_depth': 2, 'n_estimators': 300, 'reg_alpha': 0, 'reg_lambda': 1, 'subsample': 1}

最佳模型的負 MSE (Best negative MSE):
-146.84246215820312


In [21]:
# Evaluate the grid-searched chemistry model on the held-out test split.
y_pred = chemistry_best_model.predict(X_test)

# MSE, MAE and R² are all computed from the same (truth, prediction) pair.
mse, mae, r2 = (
    metric(y_test, y_pred)
    for metric in (mean_squared_error, mean_absolute_error, r2_score)
)
rmse = np.sqrt(mse)  # RMSE is the square root of the MSE

# One line per metric, same text and precision as before.
print(
    f"MSE: {mse:.2f}",
    f"MAE: {mae:.2f}",
    f"R² 分數: {r2:.4f}",
    f"RMSE: {rmse:.2f}",
    sep="\n",
)

MSE: 145.09
MAE: 10.39
R² 分數: 0.1161
RMSE: 12.05


### 生物學

In [22]:
# Predictors and target for the biology-score model.
X, y = df_male[features], df_male["biology_score"]

# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Candidate values per hyperparameter; the search tries every combination.
param_grid = dict(
    n_estimators=[100, 300],
    max_depth=[2, 4, 6],
    learning_rate=[0.01, 0.1],
    subsample=[0.8, 1],
    colsample_bytree=[0.8, 1],
    reg_alpha=[0, 0.1],
    reg_lambda=[0.1, 1],
)

# Base regressor whose hyperparameters are being searched.
xgb_model = XGBRegressor(random_state=42, objective='reg:squarederror')

# 5-fold cross-validated search scored by negative MSE (higher is better),
# with n_jobs=-1 to parallelise the fits.
grid_search = GridSearchCV(
    xgb_model,
    param_grid,
    cv=5,
    scoring='neg_mean_squared_error',
    verbose=1,
    n_jobs=-1,
)
grid_search.fit(X_train, y_train)

# Report the winning parameter combination and its cross-validated score.
print("最佳參數組合 (Best parameters):", grid_search.best_params_, sep="\n")
print("\n最佳模型的負 MSE (Best negative MSE):", grid_search.best_score_, sep="\n")

# Keep the best estimator found by the search for the evaluation cell below.
biology_best_model = grid_search.best_estimator_

Fitting 5 folds for each of 192 candidates, totalling 960 fits
最佳參數組合 (Best parameters):
{'colsample_bytree': 0.8, 'learning_rate': 0.01, 'max_depth': 4, 'n_estimators': 300, 'reg_alpha': 0.1, 'reg_lambda': 1, 'subsample': 1}

最佳模型的負 MSE (Best negative MSE):
-148.5482208251953


In [23]:
# Evaluate the grid-searched biology model on the held-out test split.
y_pred = biology_best_model.predict(X_test)

# MSE, MAE and R² are all computed from the same (truth, prediction) pair.
mse, mae, r2 = (
    metric(y_test, y_pred)
    for metric in (mean_squared_error, mean_absolute_error, r2_score)
)
rmse = np.sqrt(mse)  # RMSE is the square root of the MSE

# One line per metric, same text and precision as before.
print(
    f"MSE: {mse:.2f}",
    f"MAE: {mae:.2f}",
    f"R² 分數: {r2:.4f}",
    f"RMSE: {rmse:.2f}",
    sep="\n",
)

MSE: 172.36
MAE: 10.40
R² 分數: 0.1799
RMSE: 13.13


### 英文

In [24]:
# Predictors and target for the English-score model.
X, y = df_male[features], df_male["english_score"]

# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Candidate values per hyperparameter; the search tries every combination.
param_grid = dict(
    n_estimators=[100, 300],
    max_depth=[2, 4, 6],
    learning_rate=[0.01, 0.1],
    subsample=[0.8, 1],
    colsample_bytree=[0.8, 1],
    reg_alpha=[0, 0.1],
    reg_lambda=[0.1, 1],
)

# Base regressor whose hyperparameters are being searched.
xgb_model = XGBRegressor(random_state=42, objective='reg:squarederror')

# 5-fold cross-validated search scored by negative MSE (higher is better),
# with n_jobs=-1 to parallelise the fits.
grid_search = GridSearchCV(
    xgb_model,
    param_grid,
    cv=5,
    scoring='neg_mean_squared_error',
    verbose=1,
    n_jobs=-1,
)
grid_search.fit(X_train, y_train)

# Report the winning parameter combination and its cross-validated score.
print("最佳參數組合 (Best parameters):", grid_search.best_params_, sep="\n")
print("\n最佳模型的負 MSE (Best negative MSE):", grid_search.best_score_, sep="\n")

# Keep the best estimator found by the search for the evaluation cell below.
english_best_model = grid_search.best_estimator_

Fitting 5 folds for each of 192 candidates, totalling 960 fits
最佳參數組合 (Best parameters):
{'colsample_bytree': 0.8, 'learning_rate': 0.1, 'max_depth': 2, 'n_estimators': 100, 'reg_alpha': 0.1, 'reg_lambda': 1, 'subsample': 1}

最佳模型的負 MSE (Best negative MSE):
-119.53385925292969


In [25]:
# Evaluate the grid-searched English model on the held-out test split.
y_pred = english_best_model.predict(X_test)

# MSE, MAE and R² are all computed from the same (truth, prediction) pair.
mse, mae, r2 = (
    metric(y_test, y_pred)
    for metric in (mean_squared_error, mean_absolute_error, r2_score)
)
rmse = np.sqrt(mse)  # RMSE is the square root of the MSE

# One line per metric, same text and precision as before.
print(
    f"MSE: {mse:.2f}",
    f"MAE: {mae:.2f}",
    f"R² 分數: {r2:.4f}",
    f"RMSE: {rmse:.2f}",
    sep="\n",
)

MSE: 138.10
MAE: 9.89
R² 分數: 0.1381
RMSE: 11.75


### 地理

In [26]:
# Predictors and target for the geography-score model.
X, y = df_male[features], df_male["geography_score"]

# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Candidate values per hyperparameter; the search tries every combination.
param_grid = dict(
    n_estimators=[100, 300],
    max_depth=[2, 4, 6],
    learning_rate=[0.01, 0.1],
    subsample=[0.8, 1],
    colsample_bytree=[0.8, 1],
    reg_alpha=[0, 0.1],
    reg_lambda=[0.1, 1],
)

# Base regressor whose hyperparameters are being searched.
xgb_model = XGBRegressor(random_state=42, objective='reg:squarederror')

# 5-fold cross-validated search scored by negative MSE (higher is better),
# with n_jobs=-1 to parallelise the fits.
grid_search = GridSearchCV(
    xgb_model,
    param_grid,
    cv=5,
    scoring='neg_mean_squared_error',
    verbose=1,
    n_jobs=-1,
)
grid_search.fit(X_train, y_train)

# Report the winning parameter combination and its cross-validated score.
print("最佳參數組合 (Best parameters):", grid_search.best_params_, sep="\n")
print("\n最佳模型的負 MSE (Best negative MSE):", grid_search.best_score_, sep="\n")

# Keep the best estimator found by the search for the evaluation cell below.
geography_best_model = grid_search.best_estimator_

Fitting 5 folds for each of 192 candidates, totalling 960 fits
最佳參數組合 (Best parameters):
{'colsample_bytree': 0.8, 'learning_rate': 0.01, 'max_depth': 2, 'n_estimators': 300, 'reg_alpha': 0.1, 'reg_lambda': 1, 'subsample': 0.8}

最佳模型的負 MSE (Best negative MSE):
-122.80546569824219


In [27]:
# Evaluate the grid-searched geography model on the held-out test split.
y_pred = geography_best_model.predict(X_test)

# MSE, MAE and R² are all computed from the same (truth, prediction) pair.
mse, mae, r2 = (
    metric(y_test, y_pred)
    for metric in (mean_squared_error, mean_absolute_error, r2_score)
)
rmse = np.sqrt(mse)  # RMSE is the square root of the MSE

# One line per metric, same text and precision as before.
print(
    f"MSE: {mse:.2f}",
    f"MAE: {mae:.2f}",
    f"R² 分數: {r2:.4f}",
    f"RMSE: {rmse:.2f}",
    sep="\n",
)

MSE: 134.17
MAE: 9.92
R² 分數: 0.0642
RMSE: 11.58


### Ordinary Least Squares (OLS)
用來分析各特徵的 P-value 與迴歸係數 (coef)

In [30]:
# Design matrix for the math-score regression: the shared predictors plus an
# explicit intercept column added by statsmodels.
# NOTE: this rebinds `X` and `model`, which the XGBoost cells above also use.
X = sm.add_constant(df_male[features])
y = df_male["math_score"]

# Fit ordinary least squares and print only the second summary table, which
# holds the per-feature coefficients and p-values.
model = sm.OLS(endog=y, exog=X).fit()
print(model.summary().tables[1])

                                   coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------------------------
const                           77.8609      1.111     70.056      0.000      75.680      80.042
part_time_job                   -0.0777      1.050     -0.074      0.941      -2.139       1.983
absence_days                    -0.0642      0.154     -0.417      0.677      -0.366       0.238
extracurricular_activities      -0.6776      0.854     -0.793      0.428      -2.354       0.998
weekly_self_study_hours          0.0791      0.052      1.522      0.128      -0.023       0.181
career_Accountant                7.5702      1.372      5.519      0.000       4.879      10.262
career_Artist                    3.8915      2.188      1.778      0.076      -0.403       8.186
career_Banker                    7.9642      1.320      6.036      0.000       5.375      10.554
career_Business Owner         