In [21]:
import numpy as np
import torch
import yfinance as yf
from sklearn.preprocessing import MinMaxScaler
from torch.utils.data import DataLoader, TensorDataset
import torch.nn as nn
import torch.optim as optim
from datetime import datetime, timedelta
import xgboost as xgb

# 获取股票数据
ticker = "000001.SS"
stock = yf.Ticker(ticker)
data = stock.history(start="2007-01-01", end="2024-06-14")

if data.empty:
    raise ValueError("No data fetched for the ticker.")

# 过滤数据，只包含交易日
data = data[data['Volume'] > 0]
data.drop(columns=['Dividends', 'Stock Splits'], inplace=True, errors='ignore')

# 确保日期列为字符串格式
data.index = data.index.strftime('%Y-%m-%d')

# 数据归一化
scaler = MinMaxScaler()
features = ['Open', 'High', 'Low', 'Close', 'Volume']
data[features] = scaler.fit_transform(data[features])

# 定义时间序列长度
time_steps = 10

# 创建序列数据
def create_sequences(data, time_steps):
    X, y = [], []
    for i in range(len(data) - time_steps):
        X_seq = data.iloc[i:(i + time_steps)][features].values
        X.append(X_seq)
        y.append(data.iloc[i + time_steps][features].values)
    return np.array(X), np.array(y)

# 创建训练和测试数据
train_data = data.loc[:'2016-12-31']
test_data = data.loc['2017-01-01':]

X_train, y_train = create_sequences(train_data, time_steps)
X_test, y_test = create_sequences(test_data, time_steps)

# 转换为张量
X_train_tensor = torch.Tensor(X_train)
y_train_tensor = torch.Tensor(y_train)
X_test_tensor = torch.Tensor(X_test)
y_test_tensor = torch.Tensor(y_test)

# DataLoader
train_loader = DataLoader(TensorDataset(X_train_tensor, y_train_tensor), batch_size=64, shuffle=True)
test_loader = DataLoader(TensorDataset(X_test_tensor, y_test_tensor), batch_size=64, shuffle=False)

# 定义Attention-based CNN-LSTM模型
class AttentionCNNLSTM(nn.Module):
    def __init__(self, input_size, hidden_size, num_layers, output_size):
        super(AttentionCNNLSTM, self).__init__()
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.conv1 = nn.Conv1d(in_channels=input_size, out_channels=64, kernel_size=3, padding=1)
        self.conv2 = nn.Conv1d(in_channels=64, out_channels=64, kernel_size=3, padding=1)
        self.lstm = nn.LSTM(input_size=64, hidden_size=hidden_size, num_layers=num_layers, batch_first=True)
        self.fc = nn.Linear(hidden_size, output_size)
        self.attention = nn.MultiheadAttention(embed_dim=hidden_size, num_heads=4)

    def forward(self, x):
        x = x.permute(0, 2, 1)
        x = torch.relu(self.conv1(x))
        x = torch.relu(self.conv2(x))
        x = x.permute(0, 2, 1)
        h_0 = torch.zeros(self.num_layers, x.size(0), self.hidden_size).to(x.device)
        c_0 = torch.zeros(self.num_layers, x.size(0), self.hidden_size).to(x.device)
        lstm_out, _ = self.lstm(x, (h_0, c_0))
        attn_output, _ = self.attention(lstm_out, lstm_out, lstm_out)
        out = self.fc(attn_output[:, -1, :])
        return out

# 模型实例化
model = AttentionCNNLSTM(input_size=5, hidden_size=128, num_layers=3, output_size=5)
criterion = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

# 训练模型
def train_model(model, train_loader, val_loader, epochs, patience):
    best_loss = float('inf')
    patience_counter = 0

    for epoch in range(epochs):
        model.train()
        train_loss = 0
        for X_batch, y_batch in train_loader:
            optimizer.zero_grad()
            output = model(X_batch)
            loss = criterion(output, y_batch)
            loss.backward()
            optimizer.step()
            train_loss += loss.item()

        train_loss /= len(train_loader)

        model.eval()
        val_loss = 0
        with torch.no_grad():
            for X_batch, y_batch in val_loader:
                output = model(X_batch)
                loss = criterion(output, y_batch)
                val_loss += loss.item()

        val_loss /= len(val_loader)

        if val_loss < best_loss:
            best_loss = val_loss
            patience_counter = 0
            torch.save(model.state_dict(), 'best_model.pth')
        else:
            patience_counter += 1
            if patience_counter >= patience:
                print(f'Early stopping at epoch {epoch}')
                break

        print(f'Epoch {epoch}, Train Loss: {train_loss:.4f}, Val Loss: {val_loss:.4f}')

# 训练模型
train_model(model, train_loader, test_loader, epochs=50, patience=5)

# 加载最佳模型权重
model.load_state_dict(torch.load('best_model.pth'))

# 检查日期有效性
def is_holiday(date, holidays):
    return date in holidays

# 列出2024年的假期
chinese_holidays_2024 = [
    '2024-01-01',  # New Year's Day
    '2024-02-10', '2024-02-11', '2024-02-12', '2024-02-13', '2024-02-14',  # Chinese New Year
    '2024-04-04', '2024-04-05',  # Qingming Festival
    '2024-05-01',  # Labour Day
    '2024-06-10',  # Dragon Boat Festival
    '2024-09-19', '2024-09-20',  # Mid-Autumn Festival
    '2024-10-01', '2024-10-02', '2024-10-03', '2024-10-04', '2024-10-05', '2024-10-06', '2024-10-07',  # National Day
]

# 找到有效的交易日期
def find_valid_date(start_date, data, holidays):
    date = datetime.strptime(start_date, "%Y-%m-%d")
    while date.weekday() >= 5 or date.strftime("%Y-%m-%d") in holidays or data.loc[date.strftime("%Y-%m-%d")]['Volume'] == 0:
        date -= timedelta(days=1)
    return date.strftime("%Y-%m-%d")

# 预测未来数据
def predict_future(model, start_date, days):
    start_date = find_valid_date(start_date, data, chinese_holidays_2024)
    start_date_dt = datetime.strptime(start_date, "%Y-%m-%d")
    start_index = data.index.get_loc(start_date)

    initial_sequence = data.iloc[start_index - time_steps + 1:start_index][features].values
    initial_sequence_scaled = scaler.transform(initial_sequence)
    initial_sequence_tensor = torch.Tensor(initial_sequence_scaled).unsqueeze(0)

    predictions = []

    for i in range(days):
        current_date = start_date_dt + timedelta(days=i)
        if current_date.weekday() >= 5 or is_holiday(current_date.strftime("%Y-%m-%d"), chinese_holidays_2024):
            predictions.append([0.0, 0.0, 0.0, 0.0, 0.0])
        else:
            with torch.no_grad():
                output = model(initial_sequence_tensor)
            output_unscaled = scaler.inverse_transform(output.squeeze().numpy().reshape(1, -1))
            # 确保体积值非负
            output_unscaled[0, -1] = max(0, output_unscaled[0, -1])
            output_scaled = scaler.transform(output_unscaled)
            predictions.append(output_unscaled.squeeze())
            initial_sequence = np.vstack((initial_sequence[1:], output_scaled))
            initial_sequence_tensor = torch.Tensor(initial_sequence).unsqueeze(0)

    return predictions

# 设置预测开始日期和预测天数
start_date = '2024-06-13'
prediction_days = 10

# 进行预测
try:
    predictions = predict_future(model, start_date, prediction_days)
except ValueError as e:
    print(e)

# 输出预测结果
current_date = datetime.strptime(start_date, "%Y-%m-%d")
for i, pred in enumerate(predictions):
    current_date_str = (current_date + timedelta(days=i)).strftime("%Y-%m-%d")
    print(current_date_str)
    for feature, value in zip(features, pred):
        print(f"{feature}: {value:.2f}")



Epoch 0, Train Loss: 0.0404, Val Loss: 0.0120
Epoch 1, Train Loss: 0.0324, Val Loss: 0.0142
Epoch 2, Train Loss: 0.0322, Val Loss: 0.0132
Epoch 3, Train Loss: 0.0323, Val Loss: 0.0098
Epoch 4, Train Loss: 0.0321, Val Loss: 0.0113
Epoch 5, Train Loss: 0.0320, Val Loss: 0.0136
Epoch 6, Train Loss: 0.0321, Val Loss: 0.0166
Epoch 7, Train Loss: 0.0324, Val Loss: 0.0137
Early stopping at epoch 8
2024-06-13
Open: 2928.15
High: 2976.49
Low: 2903.94
Close: 2962.81
Volume: 168776.06
2024-06-14
Open: 2931.57
High: 2980.02
Low: 2906.67
Close: 2966.39
Volume: 169400.64
2024-06-15
Open: 0.00
High: 0.00
Low: 0.00
Close: 0.00
Volume: 0.00
2024-06-16
Open: 0.00
High: 0.00
Low: 0.00
Close: 0.00
Volume: 0.00
2024-06-17
Open: 2931.50
High: 2979.97
Low: 2906.61
Close: 2966.28
Volume: 169385.19
2024-06-18
Open: 2931.41
High: 2979.89
Low: 2906.51
Close: 2966.15
Volume: 169362.86
2024-06-19
Open: 2931.33
High: 2979.83
Low: 2906.42
Close: 2966.05
Volume: 169342.56
2024-06-20
Open: 2931.25
High: 2979.78
Low: 2

