In [None]:
import pandas as pd
from sklearn.model_selection import KFold
import gurobipy as gp
from gurobipy import GRB
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import KFold
from sklearn.metrics import mean_absolute_error

In [None]:
# Location of the raw CSV. NOTE(review): this is an absolute, machine-specific
# path, so the notebook only runs where this exact file exists.
DATA_PATH = "/Users/calvinli/Desktop/MS FIANL/StudentPerformanceFactors-1.csv"
df = pd.read_csv(DATA_PATH)

In [None]:
# Show the freshly loaded frame as the cell output.
df

In [None]:
def _dummies_keep_missing(series, prefix):
    """One-hot encode ``series`` (first level dropped) without hiding missing values.

    ``pd.get_dummies`` with its default ``dummy_na=False`` turns a missing
    category into a row of all zeros, which is indistinguishable from the
    dropped reference level. Rows whose source value is missing are therefore
    set to NaN in every dummy column, so that a later ``dropna()`` on the
    dummy columns can still remove them.

    Parameters
    ----------
    series : pandas.Series
        Categorical column to encode.
    prefix : str
        Prefix for the generated column names (``"<prefix>_<level>"``).

    Returns
    -------
    pandas.DataFrame
        Dummy columns indexed like ``series``. The dtype is whatever
        ``get_dummies`` produces when nothing is missing, and float (with NaN
        rows) when ``series`` contains missing values.
    """
    dummies = pd.get_dummies(series, prefix=prefix, drop_first=True)
    missing = series.isna()
    if missing.any():
        # NaN cannot be stored in bool/int columns, so switch to float first.
        dummies = dummies.astype(float)
        dummies.loc[missing, :] = float("nan")
    return dummies


# Dummy-encode the two categorical predictors; the first level of each is the
# reference category. If either column contains missing values, those rows
# stay marked as missing instead of silently becoming the reference level.
income_dummies = _dummies_keep_missing(df['Family_Income'], prefix='income')
distance_dummies = _dummies_keep_missing(df['Distance_from_Home'], prefix='distance')

In [None]:
# Append the dummy columns, then remove the two raw categorical columns they replace.
df = pd.concat([df, income_dummies, distance_dummies], axis=1).drop(
    columns=['Family_Income', 'Distance_from_Home']
)


In [None]:
# Inspect the frame after the dummy columns were added and the raw categorical columns dropped.
df

In [None]:
# Columns kept for modelling, grouped by role.
selected_cols = (
    ['Hours_Studied', 'Sleep_Hours', 'Previous_Scores', 'Attendance']  # numeric predictors
    + ['income_Low', 'income_Medium']                                   # income dummies
    + ['distance_Moderate', 'distance_Near']                            # distance dummies
    + ['Exam_Score']                                                    # target variable
)

# Keep only those columns and remove rows with missing values.
df = df.loc[:, selected_cols].dropna()

In [None]:
# Inspect the frame after column selection and removal of rows with missing values.
df

In [None]:
# Container for the (train, test) frame pairs, one entry per fold.
folds = list()
# Five shuffled folds; the fixed seed keeps the split reproducible.
kf = KFold(shuffle=True, random_state=42, n_splits=5)

In [None]:
# Split the rows of `df` into the 5 folds defined by `kf`.
# Start from an empty list every time this cell runs; otherwise re-running the
# cell would append a second set of folds to the existing list.
folds = []
for train_index, test_index in kf.split(df):
    # Copy each split so later edits to a fold cannot write back into `df`.
    train_df = df.iloc[train_index].copy()
    test_df = df.iloc[test_index].copy()
    folds.append((train_df, test_df))
# After the loop, `train_df` / `test_df` refer to the split of the final fold.

In [None]:
# Training split left over from the last iteration of the fold loop.
train_df

In [None]:
# Test split left over from the last iteration of the fold loop.
test_df

In [None]:
import numpy as np

def compute_knots(train_df, feature_names, quantiles=(0.10, 0.25, 0.50, 0.75, 0.90)):
    """Compute quantile-based knot locations for each requested feature.

    Parameters
    ----------
    train_df : pandas.DataFrame
        Frame whose columns supply the values the quantiles are taken over.
    feature_names : iterable of str
        Columns of ``train_df`` to compute knots for.
    quantiles : sequence of float, optional
        Quantile levels passed to ``np.quantile``. The default is an immutable
        tuple so the shared default cannot be modified between calls.

    Returns
    -------
    dict
        ``{feature: {quantile: knot_value}}``, with features in the order of
        ``feature_names`` and quantiles in the order given.

    Raises
    ------
    KeyError
        If a name in ``feature_names`` is not a column of ``train_df``.
    """
    # Materialise once so the same levels are reused for every feature, even
    # when a one-shot iterable is passed in.
    quantiles = list(quantiles)
    knots_dict = {}
    for feature in feature_names:
        knot_values = np.quantile(train_df[feature].values, quantiles)
        knots_dict[feature] = dict(zip(quantiles, knot_values))
    return knots_dict


In [None]:
# Numeric predictors that receive spline (hinge) terms; the target variable is excluded.
numeric_features = ['Hours_Studied', 'Sleep_Hours', 'Previous_Scores', 'Attendance']

# Compute the spline knots from the current `train_df`.
knots = compute_knots(train_df, numeric_features)

# Print the resulting knot dictionary to see what it looks like.
import pprint
pprint.pprint(knots)


In [None]:
def add_hinge_features(df, knots_dict):
    """Return a copy of ``df`` extended with one hinge column per (feature, knot).

    ``knots_dict`` maps a feature name to ``{quantile: knot_value}``. For each
    pair a column named ``"<feature>_hinge_<quantile>"`` is added, holding
    ``feature - knot_value`` with negative results clipped to 0. The input
    frame itself is not modified.
    """
    augmented = df.copy()

    # Flatten the nested mapping into (feature, quantile, knot) triples.
    hinge_specs = (
        (name, level, threshold)
        for name, levels in knots_dict.items()
        for level, threshold in levels.items()
    )
    for name, level, threshold in hinge_specs:
        shifted = augmented[name].sub(threshold)
        augmented[f"{name}_hinge_{level}"] = shifted.clip(lower=0)

    return augmented

In [None]:
# Add hinge columns to both splits, using the same knots for each.
train_df_hinge, test_df_hinge = (
    add_hinge_features(split, knots) for split in (train_df, test_df)
)

In [None]:
# Inspect the training split with the hinge columns added.
train_df_hinge

In [None]:
# Dummy (one-hot) indicator columns used as predictors.
dummy_vars = ['income_Low', 'income_Medium', 'distance_Moderate', 'distance_Near']

# Hinge column names, one per (numeric feature, quantile label) pair.
# The quantile labels are strings and must match the "<feature>_hinge_<quantile>"
# names of the hinge columns present in the frames.
hinge_vars = [
    f"{f}_hinge_{q}"
    for f in ['Hours_Studied', 'Sleep_Hours', 'Previous_Scores', 'Attendance']
    for q in ['0.1', '0.25', '0.5', '0.75', '0.9']
]

# NOTE(review): the raw numeric columns are not part of `features`, only their
# hinge terms, so each numeric effect is zero below its lowest knot - confirm
# this is intended.
features = dummy_vars + hinge_vars

In [None]:
# 5-fold cross-validation of a linear regression on the dummy + hinge features.
target_col = 'Exam_Score'
kf = KFold(n_splits=5, shuffle=True, random_state=42)

mae_list = []

for train_index, val_index in kf.split(df):
    train_df = df.iloc[train_index]
    val_df = df.iloc[val_index]

    # Knots are computed from the training split only, so the validation rows
    # do not influence where the hinges are placed.
    knots = compute_knots(
        train_df,
        ['Hours_Studied', 'Sleep_Hours', 'Previous_Scores', 'Attendance'],
    )

    # Build the hinge columns for both splits from those knots.
    train_df_hinge = add_hinge_features(train_df, knots)
    val_df_hinge = add_hinge_features(val_df, knots)

    # Select the feature columns and the target.
    X_train, y_train = train_df_hinge[features], train_df_hinge[target_col]
    X_val, y_val = val_df_hinge[features], val_df_hinge[target_col]

    # Fit the linear regression on the training split.
    model = LinearRegression().fit(X_train, y_train)

    # Predict on the validation split.
    y_pred = model.predict(X_val)

    # Record this fold's mean absolute error.
    mae = mean_absolute_error(y_val, y_pred)
    mae_list.append(mae)
    print(f"Fold MAE: {mae:.4f}")

print(f"Overall CV-MAE: {np.mean(mae_list):.4f}")
