In [1]:
import pandas as pd
import numpy as np
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

In [2]:
df = pd.read_csv('cleaned_data.csv')

# 转换日期列
df['Date'] = pd.to_datetime(df['Date'])

# 创建年、月、日等特征
df['Year'] = df['Date'].dt.year
df['Month'] = df['Date'].dt.month
df['Day'] = df['Date'].dt.day

#df = df.drop('Date', axis=1)

# 将 T10Y2Y 列转换为浮点数
df['T10Y2Y'] = pd.to_numeric(df['T10Y2Y'], errors='coerce')

print(df.dtypes)

Date                  datetime64[ns]
Open                         float64
High                         float64
Low                          float64
Close                        float64
Adj Close                    float64
Volume                         int64
sp500return                  float64
CPIAUCSL                     float64
FEDFUNDS                     float64
mktrf                        float64
smb                          float64
hml                          float64
rf                           float64
umd                          float64
GDP                          float64
PPIACO                       float64
T10Y2Y                       float64
consumer_sentiment           float64
VIXCLS                       float64
WM2NS                        float64
Year                           int32
Month                          int32
Day                            int32
dtype: object


In [3]:
window_size = 20
# 计算移动平均
df['target'] = df['Close'].rolling(window=window_size).mean()

# 计算移动平均的百分比变化
df['target'] = df['target'].pct_change()

# 根据条件设置 'label' 列
df['target'] = df['target'].apply(lambda x: 1 if x > 0.0001 else (-1 if x < -0.0001 else 0))


X = df.drop('target', axis=1)
y = df['target']

start_date = pd.to_datetime("2018-01-01")
end_date = pd.to_datetime("2021-12-31")

# 创建一个布尔序列，指示每行数据是否属于测试集
is_test = (df['Date'] >= start_date) & (df['Date'] <= end_date)

X_train = X[~is_test].drop('Date', axis=1)
y_train = y[~is_test]
X_test = X[is_test].drop('Date', axis=1)
y_test = y[is_test]



#X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [4]:
model = xgb.XGBRegressor(objective ='reg:squarederror', colsample_bytree = 0.3, learning_rate = 0.1,
                         max_depth = 5, alpha = 10, n_estimators = 10)

model.fit(X_train, y_train)

In [5]:
y_pred = model.predict(X_test)

# 评估模型
rmse = mean_squared_error(y_test, y_pred, squared=False)
print("RMSE: %f" % (rmse))

RMSE: 0.853252
