任务：
使用线性回归、决策树回归和多项式回归预测学生期末成绩，并比较模型性能。数据集采用UCI机器学习库中的"学生表现数据集"。数据文件：student-mat.csv

实现步骤：
1. 将数据加载到Pandas DataFrame
2. 分离特征(X)和目标变量(y)
3. 按80-20划分训练测试集(random_state=42)
4. 使用StandardScaler标准化特征
5. 训练以下模型：
   - 线性回归
   - 决策树回归
   - 多项式回归（degree=2）
6. 计算各模型的MSE和R2分数
7. 输出评估指标


In [None]:
import pandas as pd
import numpy as np
import os
# Show the current working directory, because the CSV path below is relative to it.
print(os.getcwd())
# The file is read with ";" as the field separator.
data = pd.read_csv("ml_test/datasets/student-mat.csv", sep=";")

In [None]:
# Preview the first rows of the loaded DataFrame.
data.head()

In [None]:
# Print a summary of the DataFrame (columns, dtypes, non-null counts).
data.info()

In [None]:
# Keep only the numeric columns; these feed the first, numeric-only experiment.
data_num = data.select_dtypes(include="number")
data_num.info()

In [None]:
# Keep only the non-numeric columns, to inspect what the numeric-only run leaves out.
data_str = data.select_dtypes(exclude="number")
data_str.info()

In [None]:
# Experiment 1: use only the numeric columns as features.
from sklearn.model_selection import train_test_split

# Features are every numeric column except the last three; the target is the
# last numeric column.
# NOTE(review): presumably the last three columns are G1, G2, G3 — confirm
# against the data_num.info() output.
X, y = data_num.iloc[:, :-3], data_num.iloc[:, -1]

# 80/20 train/test split with a fixed seed so the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:
from sklearn.preprocessing import StandardScaler

# Learn the per-feature mean and standard deviation from the training split only.
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
# Apply the statistics learned on the training split to the test split.
# Refitting the scaler on the test data would scale the two splits differently
# and leak test-set information into the evaluation.
X_test = sc.transform(X_test)

In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import PolynomialFeatures
from sklearn.tree import DecisionTreeRegressor

# Model 1: ordinary least-squares linear regression.
lr = LinearRegression()
lr.fit(X_train, y_train)
y_pred11 = lr.predict(X_test)

# Model 2: decision tree regressor.
# random_state is fixed so repeated runs build the same tree and the reported
# metrics are reproducible, matching the seeded train/test split.
dtr = DecisionTreeRegressor(random_state=42)
dtr.fit(X_train, y_train)
y_pred12 = dtr.predict(X_test)

# Model 3: polynomial regression, i.e. a degree-2 feature expansion followed
# by a linear regression, chained in one pipeline.
pf = make_pipeline(
    PolynomialFeatures(degree=2),
    LinearRegression(),
)
pf.fit(X_train, y_train)
y_pred13 = pf.predict(X_test)

In [None]:
from sklearn.metrics import mean_squared_error

# Mean squared error on the test split for the linear, tree and polynomial
# models (in that order).
mse11, mse12, mse13 = (
    mean_squared_error(y_test, predicted)
    for predicted in (y_pred11, y_pred12, y_pred13)
)
# One score per line, in the same order as above.
print(mse11, mse12, mse13, sep="\n")

In [None]:
from sklearn.metrics import r2_score

# R2 score on the test split for the linear, tree and polynomial models
# (in that order).
r11, r12, r13 = (
    r2_score(y_test, predicted)
    for predicted in (y_pred11, y_pred12, y_pred13)
)
# One score per line, in the same order as above.
print(r11, r12, r13, sep="\n")

In [None]:
# Experiment 2: predict using all columns, numeric and non-numeric.
# Split off features and target: every column except the last three is a
# feature, and the last column is the target.
X, y = data.iloc[:, :-3], data.iloc[:, -1]

# Same 80/20 split and seed as the numeric-only experiment.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

# Column lists are derived from the training features by dtype.
numeric_columns = X_train.select_dtypes(include=[np.number]).columns
categorical_columns = X_train.select_dtypes(exclude=[np.number]).columns

# Each entry is a triple: (label, transformer, column names).
ct = ColumnTransformer([
    # Numeric columns are standardised.
    ("num", StandardScaler(), numeric_columns),
    # Non-numeric columns are one-hot encoded. handle_unknown="ignore" keeps
    # prediction from raising when the test split contains a category that
    # never appeared in the training split.
    ("str", OneHotEncoder(handle_unknown="ignore"), categorical_columns),
])


In [None]:
from sklearn.pipeline import Pipeline

# Each Pipeline step is a pair: (label, transformer or final estimator).
# NOTE: all three pipelines receive the same `ct` object, so the
# preprocessing step is shared between them rather than copied.

# Preprocessing + linear regression.
pl21 = Pipeline([
    ("ct", ct),
    ("reg", LinearRegression()),
])
# Preprocessing + decision tree. random_state is fixed so repeated runs build
# the same tree and the reported metrics are reproducible.
pl22 = Pipeline([
    ("ct", ct),
    ("reg", DecisionTreeRegressor(random_state=42)),
])
# Preprocessing + degree-2 polynomial expansion + linear regression.
pl23 = Pipeline([
    ("ct", ct),
    ("pl", PolynomialFeatures(degree=2)),
    ("reg", LinearRegression()),
])
# Display the three pipelines as the cell output.
pl21, pl22, pl23

In [None]:
# Fit all three pipelines on the training split first ...
for model in (pl21, pl22, pl23):
    model.fit(X_train, y_train)

# ... then predict on the test split with each one, in the same order.
y_pred21, y_pred22, y_pred23 = (
    model.predict(X_test) for model in (pl21, pl22, pl23)
)


In [None]:
# Mean squared error on the test split for the three all-column pipelines
# (linear, tree, polynomial), printed on a single line.
mse21, mse22, mse23 = (
    mean_squared_error(y_test, predicted)
    for predicted in (y_pred21, y_pred22, y_pred23)
)
print(mse21, mse22, mse23)

In [None]:
# R2 score on the test split for the three all-column pipelines
# (linear, tree, polynomial), one score per line.
r21, r22, r23 = (
    r2_score(y_test, predicted)
    for predicted in (y_pred21, y_pred22, y_pred23)
)
print(r21, r22, r23, sep="\n")