In [21]:
# 导入相关库
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split

# 读取数据
df = pd.read_csv('stock_data.csv')
df.set_index('Date', inplace=True)

In [22]:
# 添加标签值
predict_count = 30  # 预测未来30天的收盘价
df['label'] = df['Close'].shift(-predict_count)

In [23]:
# 划分特征空间和目标变量
X = df.drop(['label'], axis=1)[:-predict_count]
y = df['label'][:-predict_count]
print(X)
# 划分训练集和测试集
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

                  Open        High         Low       Close        Volume
Date                                                                    
2020-01-01  100.000000  150.000000   80.000000  120.000000  10049.671415
2020-01-02  100.047985  150.047985   80.038388  120.047985   9995.770499
2020-01-03  100.095969  150.095969   80.076775  120.095969  10083.962712
2020-01-06  100.143954  150.143954   80.115163  120.143954  10181.093773
2020-01-07  100.191939  150.191939   80.153551  120.191939  10014.972378
...                ...         ...         ...         ...           ...
2023-11-13  148.368522  198.368522  118.694818  168.368522  19778.659686
2023-11-14  148.416507  198.416507  118.733205  168.416507  19629.777822
2023-11-15  148.464491  198.464491  118.771593  168.464491  19824.637679
2023-11-16  148.512476  198.512476  118.809981  168.512476  19722.255162
2023-11-17  148.560461  198.560461  118.848369  168.560461  19919.618218

[1013 rows x 5 columns]


In [24]:
# 建立线性回归模型
model = LinearRegression()
model.fit(X_train, y_train)

0,1,2
,fit_intercept,True
,copy_X,True
,tol,1e-06
,n_jobs,
,positive,False


In [25]:
# 预测收盘价
y_pred = model.predict(X_test)
y_pred

array([154.50095969, 146.29558541, 166.54510557, 126.04606526,
       149.17466411, 137.13051823, 164.38579655, 156.51631478,
       131.66026871, 139.24184261, 150.99808061, 162.46641075,
       164.96161228, 152.77351248, 130.26871401, 168.80038388,
       160.25911708, 128.10940499, 129.02111324, 167.40882917,
       135.35508637, 164.57773512, 146.53550864, 146.05566219,
       166.01727447, 156.66026871, 133.29174664, 135.54702495,
       160.88291747, 139.72168906, 167.45681382, 136.74664107,
       124.2706334 , 138.47408829, 138.28214971, 142.07293666,
       153.54126679, 165.48944338, 153.34932821, 131.42034549,
       151.90978887, 157.42802303, 147.06333973, 126.57389635,
       137.17850288, 124.79846449, 133.96353167, 129.78886756,
       121.9193858 , 126.28598848, 167.6487524 , 147.7831094 ,
       140.82533589, 147.01535509, 134.87523992, 127.96545106,
       143.0806142 , 133.91554702, 122.87907869, 146.77543186,
       146.87140115, 125.08637236, 165.82533589, 124.03

In [26]:
# 创建一个DataFrame，用于存储预测结果
result_df = pd.DataFrame({'Actual': y_test, 'Predicted': y_pred})
result_df

Unnamed: 0_level_0,Actual,Predicted
Date,Unnamed: 1_level_1,Unnamed: 2_level_1
2022-08-23,154.500960,154.500960
2021-12-27,146.295585,146.295585
2023-08-09,166.545106,166.545106
2020-05-14,126.046065,126.046065
2022-03-21,149.174664,149.174664
...,...,...
2021-10-15,143.848369,143.848369
2022-11-29,157.859885,157.859885
2022-04-27,150.470250,150.470250
2023-01-02,159.011516,159.011516


In [27]:
# 计算均方误差
mse = mean_squared_error(y_test, y_pred)
mse

1.4444781517315898e-27

In [28]:
# 计算R-squared（决定系数）
r2 = r2_score(y_test, y_pred)
r2

1.0

In [29]:
# 计算均方根误差
rmse = np.sqrt(mse)
rmse

np.float64(3.800629094941507e-14)

In [30]:
# 计算平均绝对误差
mae = np.mean(np.abs(y_test - y_pred))
mae

np.float64(3.108186942635315e-14)