In [48]:
import pandas as pd
from sklearn.model_selection import KFold
import gurobipy as gp
from gurobipy import GRB
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import KFold
from sklearn.metrics import mean_absolute_error

In [3]:
df = pd.read_csv("/Users/calvinli/Desktop/MS FIANL/StudentPerformanceFactors-1.csv")

In [10]:
df

Unnamed: 0,Hours_Studied,Attendance,Parental_Involvement,Access_to_Resources,Extracurricular_Activities,Sleep_Hours,Previous_Scores,Motivation_Level,Internet_Access,Tutoring_Sessions,Family_Income,Teacher_Quality,School_Type,Peer_Influence,Physical_Activity,Learning_Disabilities,Parental_Education_Level,Distance_from_Home,Gender,Exam_Score
0,23,84,Low,High,No,7,73,Low,Yes,0,Low,Medium,Public,Positive,3,No,High School,Near,Male,67
1,19,64,Low,Medium,No,8,59,Low,Yes,2,Medium,Medium,Public,Negative,4,No,College,Moderate,Female,61
2,24,98,Medium,Medium,Yes,7,91,Medium,Yes,2,Medium,Medium,Public,Neutral,4,No,Postgraduate,Near,Male,74
3,29,89,Low,Medium,Yes,8,98,Medium,Yes,1,Medium,Medium,Public,Negative,4,No,High School,Moderate,Male,71
4,19,92,Medium,Medium,Yes,6,65,Medium,Yes,3,Medium,High,Public,Neutral,4,No,College,Near,Female,70
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6602,25,69,High,Medium,No,7,76,Medium,Yes,1,High,Medium,Public,Positive,2,No,High School,Near,Female,68
6603,23,76,High,Medium,No,8,81,Medium,Yes,3,Low,High,Public,Positive,2,No,High School,Near,Female,69
6604,20,90,Medium,Low,Yes,6,65,Low,Yes,3,Low,Medium,Public,Negative,2,No,Postgraduate,Near,Female,68
6605,10,86,High,High,Yes,6,91,High,Yes,2,Low,Medium,Private,Positive,3,No,High School,Far,Female,68


In [11]:
income_dummies = pd.get_dummies(df['Family_Income'], prefix='income', drop_first=True)
distance_dummies = pd.get_dummies(df['Distance_from_Home'], prefix='distance', drop_first=True)

In [12]:
df = pd.concat([df, income_dummies, distance_dummies], axis=1)
df.drop(['Family_Income', 'Distance_from_Home'], axis=1, inplace=True)


In [13]:
df

Unnamed: 0,Hours_Studied,Attendance,Parental_Involvement,Access_to_Resources,Extracurricular_Activities,Sleep_Hours,Previous_Scores,Motivation_Level,Internet_Access,Tutoring_Sessions,...,Peer_Influence,Physical_Activity,Learning_Disabilities,Parental_Education_Level,Gender,Exam_Score,income_Low,income_Medium,distance_Moderate,distance_Near
0,23,84,Low,High,No,7,73,Low,Yes,0,...,Positive,3,No,High School,Male,67,True,False,False,True
1,19,64,Low,Medium,No,8,59,Low,Yes,2,...,Negative,4,No,College,Female,61,False,True,True,False
2,24,98,Medium,Medium,Yes,7,91,Medium,Yes,2,...,Neutral,4,No,Postgraduate,Male,74,False,True,False,True
3,29,89,Low,Medium,Yes,8,98,Medium,Yes,1,...,Negative,4,No,High School,Male,71,False,True,True,False
4,19,92,Medium,Medium,Yes,6,65,Medium,Yes,3,...,Neutral,4,No,College,Female,70,False,True,False,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6602,25,69,High,Medium,No,7,76,Medium,Yes,1,...,Positive,2,No,High School,Female,68,False,False,False,True
6603,23,76,High,Medium,No,8,81,Medium,Yes,3,...,Positive,2,No,High School,Female,69,True,False,False,True
6604,20,90,Medium,Low,Yes,6,65,Low,Yes,3,...,Negative,2,No,Postgraduate,Female,68,True,False,False,True
6605,10,86,High,High,Yes,6,91,High,Yes,2,...,Positive,3,No,High School,Female,68,True,False,False,False


In [14]:
selected_cols = [
    'Hours_Studied',
    'Sleep_Hours',
    'Previous_Scores',
    'Attendance',
    'income_Low',  # dummy
    'income_Medium',    # dummy
    'distance_Moderate',  # dummy
    'distance_Near',       # dummy
    'Exam_Score'  # 目標變數
]

df = df[selected_cols].dropna()  # 移除缺漏值

In [15]:
df

Unnamed: 0,Hours_Studied,Sleep_Hours,Previous_Scores,Attendance,income_Low,income_Medium,distance_Moderate,distance_Near,Exam_Score
0,23,7,73,84,True,False,False,True,67
1,19,8,59,64,False,True,True,False,61
2,24,7,91,98,False,True,False,True,74
3,29,8,98,89,False,True,True,False,71
4,19,6,65,92,False,True,False,True,70
...,...,...,...,...,...,...,...,...,...
6602,25,7,76,69,False,False,False,True,68
6603,23,8,81,76,True,False,False,True,69
6604,20,6,65,90,True,False,False,True,68
6605,10,6,91,86,True,False,False,False,68


In [18]:
kf = KFold(n_splits=5, shuffle=True, random_state=42)
folds = []

In [19]:
# 把 index 分成 5 折
for train_index, test_index in kf.split(df):
    train_df = df.iloc[train_index].copy()
    test_df = df.iloc[test_index].copy()
    folds.append((train_df, test_df))

In [20]:
train_df

Unnamed: 0,Hours_Studied,Sleep_Hours,Previous_Scores,Attendance,income_Low,income_Medium,distance_Moderate,distance_Near,Exam_Score
0,23,7,73,84,True,False,False,True,67
1,19,8,59,64,False,True,True,False,61
2,24,7,91,98,False,True,False,True,74
6,29,7,68,84,True,False,True,False,67
7,25,6,50,78,False,False,False,False,66
...,...,...,...,...,...,...,...,...,...
6602,25,7,76,69,False,False,False,True,68
6603,23,8,81,76,True,False,False,True,69
6604,20,6,65,90,True,False,False,True,68
6605,10,6,91,86,True,False,False,False,68


In [21]:
test_df

Unnamed: 0,Hours_Studied,Sleep_Hours,Previous_Scores,Attendance,income_Low,income_Medium,distance_Moderate,distance_Near,Exam_Score
3,29,8,98,89,False,True,True,False,71
4,19,6,65,92,False,True,False,True,70
5,19,8,89,88,False,True,False,True,71
9,23,8,71,98,False,False,True,False,72
16,14,10,65,60,False,False,False,True,60
...,...,...,...,...,...,...,...,...,...
6581,21,5,61,80,False,False,False,True,67
6584,30,7,97,62,False,False,False,True,66
6591,13,8,98,74,False,False,False,True,66
6593,16,10,53,75,True,False,False,True,64


In [22]:
import numpy as np

def compute_knots(train_df, feature_names, quantiles=[0.10, 0.25, 0.50, 0.75, 0.90]):
    knots_dict = {}
    for feature in feature_names:
        values = train_df[feature].values
        knots = np.quantile(values, quantiles)
        knots_dict[feature] = dict(zip(quantiles, knots))
    return knots_dict


In [30]:
# 要建立 spline 的數值變數（不要包含目標變數）
numeric_features = [
    'Hours_Studied',
    'Sleep_Hours',
    'Previous_Scores',
    'Attendance',
   
]

# 建立 spline knots
knots = compute_knots(train_df, numeric_features)

# 看看結果長怎樣
import pprint
pprint.pprint(knots)


{'Attendance': {0.1: np.float64(64.0),
                0.25: np.float64(70.0),
                0.5: np.float64(80.0),
                0.75: np.float64(90.0),
                0.9: np.float64(96.0)},
 'Hours_Studied': {0.1: np.float64(12.0),
                   0.25: np.float64(16.0),
                   0.5: np.float64(20.0),
                   0.75: np.float64(24.0),
                   0.9: np.float64(28.0)},
 'Previous_Scores': {0.1: np.float64(55.0),
                     0.25: np.float64(63.0),
                     0.5: np.float64(75.0),
                     0.75: np.float64(88.0),
                     0.9: np.float64(95.0)},
 'Sleep_Hours': {0.1: np.float64(5.0),
                 0.25: np.float64(6.0),
                 0.5: np.float64(7.0),
                 0.75: np.float64(8.0),
                 0.9: np.float64(9.0)}}


In [50]:
def add_hinge_features(df, knots_dict):
    df_hinge = df.copy()
    
    for feature, knot_levels in knots_dict.items():
        for q, knot_val in knot_levels.items():
            new_col = f"{feature}_hinge_{q}"
            df_hinge[new_col] = (df_hinge[feature] - knot_val).clip(lower=0)
            
    return df_hinge

In [29]:
train_df_hinge = add_hinge_features(train_df, knots)
test_df_hinge = add_hinge_features(test_df, knots)

In [51]:
train_df_hinge

Unnamed: 0,Hours_Studied,Sleep_Hours,Previous_Scores,Attendance,income_Low,income_Medium,distance_Moderate,distance_Near,Exam_Score,Hours_Studied_hinge_0.1,...,Previous_Scores_hinge_0.1,Previous_Scores_hinge_0.25,Previous_Scores_hinge_0.5,Previous_Scores_hinge_0.75,Previous_Scores_hinge_0.9,Attendance_hinge_0.1,Attendance_hinge_0.25,Attendance_hinge_0.5,Attendance_hinge_0.75,Attendance_hinge_0.9
0,23,7,73,84,True,False,False,True,67,11.0,...,18.0,10.0,0.0,0.0,0.0,20.0,14.0,4.0,0.0,0.0
1,19,8,59,64,False,True,True,False,61,7.0,...,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,24,7,91,98,False,True,False,True,74,12.0,...,36.0,28.0,16.0,3.0,0.0,34.0,28.0,18.0,8.0,2.0
3,29,8,98,89,False,True,True,False,71,17.0,...,43.0,35.0,23.0,10.0,3.0,25.0,19.0,9.0,0.0,0.0
4,19,6,65,92,False,True,False,True,70,7.0,...,10.0,2.0,0.0,0.0,0.0,28.0,22.0,12.0,2.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6597,16,6,72,91,False,False,False,True,70,4.0,...,17.0,9.0,0.0,0.0,0.0,27.0,21.0,11.0,1.0,0.0
6600,12,4,54,98,False,True,False,True,67,0.0,...,0.0,0.0,0.0,0.0,0.0,34.0,28.0,18.0,8.0,2.0
6604,20,6,65,90,True,False,False,True,68,8.0,...,10.0,2.0,0.0,0.0,0.0,26.0,20.0,10.0,0.0,0.0
6605,10,6,91,86,True,False,False,False,68,0.0,...,36.0,28.0,16.0,3.0,0.0,22.0,16.0,6.0,0.0,0.0


In [52]:
dummy_vars = ['income_Low', 'income_Medium', 'distance_Moderate', 'distance_Near']

hinge_vars = []
for f in ['Hours_Studied', 'Sleep_Hours', 'Previous_Scores', 'Attendance']:
    for q in ['0.1', '0.25', '0.5', '0.75', '0.9']:
        hinge_vars.append(f"{f}_hinge_{q}")

features = dummy_vars + hinge_vars

In [53]:
kf = KFold(n_splits=5, shuffle=True, random_state=42)
target_col = 'Exam_Score'

mae_list = []

for train_index, val_index in kf.split(df):
    train_df, val_df = df.iloc[train_index], df.iloc[val_index]
    
    # 先算節點（knots）只用 train 資料
    knots = compute_knots(train_df, ['Hours_Studied', 'Sleep_Hours', 'Previous_Scores', 'Attendance'])
    
    # 加 hinge features
    train_df_hinge = add_hinge_features(train_df, knots)
    val_df_hinge = add_hinge_features(val_df, knots)
    
    # 選取特徵欄位
    X_train = train_df_hinge[features]
    y_train = train_df_hinge[target_col]
    X_val = val_df_hinge[features]
    y_val = val_df_hinge[target_col]
    
    # 訓練線性回歸模型
    model = LinearRegression()
    model.fit(X_train, y_train)
    
    # 預測
    y_pred = model.predict(X_val)
    
    # 評估 MAE
    mae = mean_absolute_error(y_val, y_pred)
    mae_list.append(mae)
    print(f"Fold MAE: {mae:.4f}")

print(f"Overall CV-MAE: {np.mean(mae_list):.4f}")


Fold MAE: 1.3183
Fold MAE: 1.3449
Fold MAE: 1.2872
Fold MAE: 1.4684
Fold MAE: 1.4759
Overall CV-MAE: 1.3790
