In [None]:
import pandas as pd
from sklearn.model_selection import KFold
import gurobipy as gp
from gurobipy import GRB
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import KFold
from sklearn.metrics import mean_absolute_error

Read the CSV file

In [None]:
# Load the raw student-performance table.
# NOTE(review): this is an absolute path on one specific machine; the notebook
# will not run elsewhere until the path is made configurable/relative.
df = pd.read_csv("/Users/calvinli/Desktop/MS FIANL/StudentPerformanceFactors-1.csv")

In [None]:
# Display the frame exactly as loaded from the CSV.
df

In [None]:
# Same display as the previous cell (nothing has modified `df` in between).
df

選擇我們要使用的變數們

In [None]:
# One-hot encode the two categorical columns. `drop_first=True` drops the first
# level of each column so that it acts as the baseline category.
# Note: with the default `dummy_na=False`, a missing category yields all-zero
# dummies, i.e. it is encoded the same way as the dropped baseline level.
income_dummies = pd.get_dummies(df['Family_Income'], prefix='income', drop_first=True)
distance_dummies = pd.get_dummies(df['Distance_from_Home'], prefix='distance', drop_first=True)

In [None]:
# Attach the dummy columns, then remove the categorical source columns.
df = (
    pd.concat([df, income_dummies, distance_dummies], axis=1)
    # `get_dummies` turns a missing category into all-zero dummies, which is
    # indistinguishable from the dropped baseline level. Once the source
    # columns are gone no later `dropna()` can detect those rows, so they
    # have to be removed here, while the source columns still exist.
    .dropna(subset=['Family_Income', 'Distance_from_Home'])
    # Non-inplace drop: the cell produces a new frame instead of mutating one.
    .drop(columns=['Family_Income', 'Distance_from_Home'])
)


In [None]:
# Columns kept for modelling: four numeric predictors, the four dummy
# indicators created by the encoding cell, and the regression target.
selected_cols = [
    'Hours_Studied',
    'Sleep_Hours',
    'Previous_Scores',
    'Attendance',
    'income_Low',  # dummy
    'income_Medium',    # dummy
    'distance_Moderate',  # dummy
    'distance_Near',       # dummy
    'Exam_Score'  # target variable
]

# NOTE(review): dummy columns never contain NaN, so this `dropna` only reacts
# to the numeric columns and the target; missing categories must be handled
# before/while encoding.
df = df[selected_cols].dropna()  # drop rows with missing values

In [None]:
# Display the modelling frame after column selection and `dropna`.
df

切分 5-Fold Cross Validation

In [None]:
# Cut the rows into 5 contiguous folds (no shuffling) and keep a private copy
# of every (train, test) pair.
kf = KFold(n_splits=5, shuffle=False)
folds = []
for train_index, test_index in kf.split(df):
    folds.append((df.iloc[train_index].copy(), df.iloc[test_index].copy()))

# The following cells work with `train_df` / `test_df`; bind them explicitly to
# the LAST fold (the same objects the loop would have left behind).
train_df, test_df = folds[-1]

In [None]:
# Display the training split left over from the fold loop (its last iteration).
train_df

In [None]:
# Display the test split left over from the fold loop (its last iteration).
test_df

計算訓練資料中的節點位置

In [None]:
def compute_knots(train_df, feature_names, quantiles=[0.10, 0.25, 0.50, 0.75, 0.90]):
    """Compute quantile-based knot positions for each requested feature.

    Parameters
    ----------
    train_df : DataFrame holding one column per name in `feature_names`.
    feature_names : iterable of column names to compute knots for.
    quantiles : quantile levels in [0, 1]; one knot is produced per level.

    Returns
    -------
    dict mapping feature name -> {quantile level: knot value}, where the knot
    value is `np.quantile` of that column at the given level.
    """
    return {
        name: dict(zip(quantiles, np.quantile(train_df[name].values, quantiles)))
        for name in feature_names
    }


In [None]:
# Numeric variables that get spline (hinge) terms; the target is excluded.
numeric_features = [
    'Hours_Studied',
    'Sleep_Hours',
    'Previous_Scores',
    'Attendance',
   
]

# Compute the spline knots.
# `train_df` here is whatever the fold loop last assigned, i.e. the training
# split of the final fold.
knots = compute_knots(train_df, numeric_features)

# Inspect the resulting {feature: {quantile: knot}} mapping.
import pprint
pprint.pprint(knots)


建立 Hinge 特徵

In [None]:
def add_hinge_features(df, knots_dict):
    """Return a copy of `df` extended with one hinge column per knot.

    For every feature in `knots_dict` and every (quantile, knot) pair a column
    named ``"<feature>_hinge_<quantile>"`` is added, holding
    ``max(feature - knot, 0)``. The input frame is left untouched.

    Parameters
    ----------
    df : DataFrame containing every feature named in `knots_dict`.
    knots_dict : mapping feature name -> {quantile level: knot value}.
    """
    augmented = df.copy()

    for feature in knots_dict:
        base = augmented[feature]
        for q, knot_val in knots_dict[feature].items():
            # Positive part of the distance above the knot.
            augmented[f"{feature}_hinge_{q}"] = (base - knot_val).clip(lower=0)

    return augmented

In [None]:
# Apply the same knot set to both splits so train and test share identical
# hinge definitions.
train_df_hinge = add_hinge_features(train_df, knots)
test_df_hinge = add_hinge_features(test_df, knots)

In [None]:
# List the names of the generated hinge columns.
print([col for col in train_df_hinge.columns if 'hinge' in col])


In [None]:
# Display the training split including its hinge columns.
train_df_hinge


In [None]:
# Display the test split including its hinge columns.
test_df_hinge

In [None]:
# Model inputs: the four dummy indicators plus every hinge column.
# The hinge names are spelled out by hand and must match the
# "<feature>_hinge_<quantile>" names produced by `add_hinge_features` for the
# quantile levels 0.1 / 0.25 / 0.5 / 0.75 / 0.9.
# The raw numeric columns themselves are not part of this list.
features = [
    'income_Medium', 'income_Low',
    'distance_Moderate', 'distance_Near',
    'Hours_Studied_hinge_0.1', 'Hours_Studied_hinge_0.25', 'Hours_Studied_hinge_0.5', 'Hours_Studied_hinge_0.75', 'Hours_Studied_hinge_0.9',
    'Sleep_Hours_hinge_0.1', 'Sleep_Hours_hinge_0.25', 'Sleep_Hours_hinge_0.5', 'Sleep_Hours_hinge_0.75', 'Sleep_Hours_hinge_0.9',
    'Previous_Scores_hinge_0.1', 'Previous_Scores_hinge_0.25', 'Previous_Scores_hinge_0.5', 'Previous_Scores_hinge_0.75', 'Previous_Scores_hinge_0.9',
    'Attendance_hinge_0.1', 'Attendance_hinge_0.25', 'Attendance_hinge_0.5', 'Attendance_hinge_0.75', 'Attendance_hinge_0.9'
]
# Name of the regression target column.
target_col = 'Exam_Score'


In [None]:
# Display the list of model input columns.
features

In [None]:
def train_lp_spline(train_df, features, target_col='Exam_Score'):
    """Fit a linear model by minimising the sum of absolute residuals (LP).

    The model is ``y ≈ beta0 + Σ_j beta_j * x_j``. Each residual is split into
    two non-negative parts so that the absolute value becomes linear:
    ``beta0 + Σ_j beta_j * x_ij + e_neg_i - e_pos_i == y_i`` and the objective
    is ``Σ_i (e_pos_i + e_neg_i)``.

    Parameters
    ----------
    train_df : DataFrame containing every column in `features` and `target_col`.
    features : list of column names used as regressors.
    target_col : name of the response column.

    Returns
    -------
    dict with key ``'beta0'`` (intercept) and one key per feature name, each
    mapped to the fitted coefficient.

    Raises
    ------
    RuntimeError
        If the solver does not finish with an optimal solution.
    """
    m = gp.Model("spline_regression")
    m.setParam("OutputFlag", 0)  # silence the solver log

    # Extract the data once as float arrays. This avoids building a pandas
    # Series for every row and turns boolean dummy columns into 0.0 / 1.0
    # before they are used as LP coefficients.
    X = train_df[features].to_numpy(dtype=float)
    y = train_df[target_col].to_numpy(dtype=float)
    n = len(train_df)

    # One free coefficient per feature, plus the free intercept beta0.
    beta = {f: m.addVar(lb=-GRB.INFINITY, name=f'beta_{f}') for f in features}
    beta0 = m.addVar(lb=-GRB.INFINITY, name='beta0')
    beta_vars = [beta[f] for f in features]

    # Non-negative residual parts: e_pos = over-prediction, e_neg = under-prediction.
    e_pos = [m.addVar(lb=0, name=f"e_pos_{i}") for i in range(n)]
    e_neg = [m.addVar(lb=0, name=f"e_neg_{i}") for i in range(n)]

    # Constraint per observation: y = beta0 + Σ(beta_j * x_ij) + e_neg - e_pos
    for i in range(n):
        fitted = gp.quicksum(b * x for b, x in zip(beta_vars, X[i].tolist()))
        m.addConstr(beta0 + fitted + e_neg[i] - e_pos[i] == float(y[i]))

    # Objective: minimise Σ (e_pos + e_neg), i.e. the total absolute error.
    m.setObjective(gp.quicksum(e_pos[i] + e_neg[i] for i in range(n)), GRB.MINIMIZE)
    m.optimize()

    # Reading `.X` is only meaningful when a solution exists; fail loudly
    # with the status code instead of an obscure attribute error.
    if m.Status != GRB.OPTIMAL:
        raise RuntimeError(
            f"Gurobi did not find an optimal solution (status code {m.Status})."
        )

    # Collect the solution.
    coefficients = {'beta0': beta0.X}
    for f in features:
        coefficients[f] = beta[f].X

    return coefficients


In [None]:
# Train: fit the LP model on the training split that carries the hinge columns.
coef = train_lp_spline(train_df_hinge, features, target_col='Exam_Score')

# 預測
def predict_lp(df, coef, features):
    """Predict with a fitted linear model.

    Parameters
    ----------
    df : DataFrame containing every column named in `features`.
    coef : dict with the intercept under ``'beta0'`` and one coefficient per
        feature name (the structure returned by `train_lp_spline`).
    features : list of column names to use, in accumulation order.

    Returns
    -------
    list of float, one prediction per row of `df`, in row order.
    """
    # Column-wise accumulation instead of a Python loop over rows: each row
    # still evaluates beta0 + c1*x1 + c2*x2 + ... in the same order, but the
    # work is done on whole arrays.
    y_hat = np.full(len(df), coef['beta0'], dtype=float)
    for f in features:
        # dtype=float also converts boolean dummy columns to 0.0 / 1.0.
        y_hat = y_hat + coef[f] * df[f].to_numpy(dtype=float)
    return y_hat.tolist()

# In-sample predictions on the same rows the model was fitted on.
train_preds = predict_lp(train_df_hinge, coef, features)

# Evaluate MAE (training error, not an out-of-sample estimate).
# The import below repeats one already made in the notebook's first cell.
from sklearn.metrics import mean_absolute_error
mae = mean_absolute_error(train_df_hinge['Exam_Score'], train_preds)
print(f"Gurobi LP Train MAE: {mae:.4f}")


In [None]:
def cross_validate_lp(df, features, target_col='Exam_Score', n_splits=5, random_state=42):
    """Estimate out-of-fold MAE of the LP model with shuffled K-fold CV.

    `df` must already contain every column in `features`; the function does
    not recompute any derived columns per fold.

    Parameters
    ----------
    df : DataFrame with the feature columns and `target_col`.
    features : list of regressor column names.
    target_col : name of the response column.
    n_splits : number of folds.
    random_state : seed for the shuffled split.

    Returns
    -------
    (fold_maes, cv_mae) : list of per-fold MAE values and their mean.
    """
    splitter = KFold(n_splits=n_splits, shuffle=True, random_state=random_state)
    fold_maes = []

    for fold_idx, (fit_rows, eval_rows) in enumerate(splitter.split(df)):
        fit_part, eval_part = df.iloc[fit_rows].copy(), df.iloc[eval_rows].copy()

        # Fit on the training rows, score on the held-out rows.
        fitted_coef = train_lp_spline(fit_part, features, target_col)
        mae = mean_absolute_error(
            eval_part[target_col],
            predict_lp(eval_part, fitted_coef, features),
        )
        fold_maes.append(mae)
        print(f"Fold {fold_idx + 1} MAE: {mae:.4f}")

    # Average of the per-fold errors.
    cv_mae = np.mean(fold_maes)
    print(f"\n✅ Overall CV-MAE: {cv_mae:.4f}")

    return fold_maes, cv_mae

In [None]:
# Run 5-fold CV of the LP model.
# NOTE(review): the input is `train_df_hinge`, i.e. only the last fold's
# training split, and its hinge columns use knots computed on that whole
# split, so the knots are not re-estimated inside each CV fold. Confirm this
# is the intended evaluation set-up.
fold_maes, overall_cv_mae = cross_validate_lp(train_df_hinge, features)
