In [None]:
#pip install torch torchaudio torchvision

In [None]:
import os
import akshare as ak
import pandas as pd
import numpy as np
from tqdm import tqdm
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, TensorDataset
import matplotlib.pyplot as plt

# ==================== 1. Data acquisition ====================
# Ticker and date range for the daily-bar download.
# NOTE(review): the original comment labelled this ticker as Kweichow Moutai,
# which is listed in Shanghai under 600519; 'sz000025' carries a Shenzhen
# prefix. Confirm which instrument is intended before trusting the results.
symbol = 'sz000025'
start_date = '2017-01-01'
end_date = '2025-05-07'

# Download daily bars through AKShare and move the index into a regular column.
df = ak.stock_zh_a_daily(
    symbol=symbol,
    start_date=start_date,
    end_date=end_date,
).reset_index()
print(df)

# Enable tqdm's pandas integration and configure matplotlib for Chinese
# labels (SimHei font) with a plain ASCII minus sign on the axes.
tqdm.pandas()
plt.rcParams.update({
    'font.sans-serif': ['SimHei'],
    'axes.unicode_minus': False,
})

# Map Chinese column headers to the English names used below. Headers that
# are not present in the frame are simply skipped by rename's default
# behaviour, so this is harmless if the columns are already in English.
df = df.rename(columns={
    '日期': 'date',
    '开盘': 'open',
    '最高': 'high',
    '最低': 'low',
    '收盘': 'close',
    '成交量': 'volume',
    '成交额': 'amount',
})
# Parse the date column into datetime values.
df['date'] = pd.to_datetime(df['date'])

# ==================== 2. Daily return ====================
# Order rows by date, then compute close-to-close return:
# today_close / yesterday_close - 1.
df = df.sort_values('date')
df['return'] = df['close'].pct_change()
# Remove every row containing a NaN in any column; this includes the first
# row, whose return is undefined.
df = df.dropna()

# ==================== 3. Merge the sentiment indicator ====================
# Load sentiment data from a local workbook. It must provide a 'date' column;
# prepare_data reads a 'sentiment_score' column when sentiment is enabled.
sent = pd.read_excel('daily_sentiment.xlsx')
sent['date'] = pd.to_datetime(sent['date'])
# Left-join on date, then drop rows with any missing value (which removes
# trading days that have no sentiment entry).
df = pd.merge(df, sent, how='left', on='date')
df = df.dropna()

# Input feature columns and the regression target column.
features = ['open', 'high', 'low', 'close', 'volume', 'amount']
target = 'return'

# ==================== 数据准备函数 ====================
def prepare_data(df, include_sentiment=False):
    """Build standardized train/test tensors for the return regression.

    Inputs are the module-level ``features`` columns (plus
    ``sentiment_score`` when requested); the label is the module-level
    ``target`` column. Rows are split by position: the first 80% become the
    training set and the remaining 20% the test set, with no shuffling, so
    the frame is assumed to already be in chronological order.

    Both scalers are fit on the training rows only. Fitting them on the full
    dataset would leak test-set mean/variance into training and make the
    reported test metrics optimistic.

    Args:
        df: DataFrame containing the feature columns, the target column and,
            if ``include_sentiment`` is true, a ``sentiment_score`` column.
        include_sentiment: append ``sentiment_score`` as an extra feature.

    Returns:
        Tuple ``(X_train_t, X_test_t, y_train_t, y_test_t, scaler_y)``.
        The X tensors are float32 with shape (rows, n_features); the sequence
        axis expected by the LSTM is NOT added here. The y tensors are
        float32 with shape (rows, 1). ``scaler_y`` is the fitted target
        scaler, to be used for ``inverse_transform`` on predictions.

    Raises:
        ValueError: if the frame has too few rows for a non-empty training
            split.
    """
    columns = list(features)
    if include_sentiment:
        columns.append('sentiment_score')

    X = df[columns].values
    # Keep the target 2-D throughout: StandardScaler requires (n, 1) input and
    # the model's output has the same shape.
    y = df[target].values.reshape(-1, 1)

    # Positional 80/20 split.
    split = int(len(X) * 0.8)
    if split == 0:
        raise ValueError(
            f"need at least 2 rows to build a training split, got {len(X)}"
        )
    X_train_raw, X_test_raw = X[:split], X[split:]
    y_train_raw, y_test_raw = y[:split], y[split:]

    # Fit on training rows only, then apply the same transform to both parts.
    scaler_X = StandardScaler().fit(X_train_raw)
    scaler_y = StandardScaler().fit(y_train_raw)

    X_train_t = torch.tensor(scaler_X.transform(X_train_raw), dtype=torch.float32)
    X_test_t = torch.tensor(scaler_X.transform(X_test_raw), dtype=torch.float32)
    y_train_t = torch.tensor(scaler_y.transform(y_train_raw), dtype=torch.float32)
    y_test_t = torch.tensor(scaler_y.transform(y_test_raw), dtype=torch.float32)

    return X_train_t, X_test_t, y_train_t, y_test_t, scaler_y

# ==================== 辅助函数：构建 DataLoader ====================
def get_dataloader(X, y, batch_size=32):
    """Wrap features and labels in a shuffling DataLoader.

    A length-1 axis is inserted at position 1 of ``X`` so each sample is
    presented as a one-step sequence; ``y`` is passed through unchanged.
    """
    as_sequences = torch.unsqueeze(X, 1)
    return DataLoader(
        TensorDataset(as_sequences, y),
        batch_size=batch_size,
        shuffle=True,
    )

# ==================== LSTM 模型定义 ====================
class LSTMRegressor(nn.Module):
    """Stacked LSTM followed by a linear head that emits one value per sample."""

    def __init__(self, input_size, hidden_size=50, num_layers=2):
        super().__init__()
        # batch_first=True: inputs are laid out as (batch, seq, feature).
        self.lstm = nn.LSTM(
            input_size=input_size,
            hidden_size=hidden_size,
            num_layers=num_layers,
            batch_first=True,
        )
        # Maps the hidden vector of the final time step to a single output.
        self.fc = nn.Linear(in_features=hidden_size, out_features=1)

    def forward(self, x):
        """Return a (batch, 1) prediction from the last time step of ``x``."""
        hidden_seq, _state = self.lstm(x)   # (batch, seq, hidden_size)
        final_step = hidden_seq[:, -1]      # keep only the last time step
        return self.fc(final_step)

# ==================== 训练与评估函数 ====================
def train_and_evaluate(include_sentiment=False, epochs=500, lr=0.001,
                       batch_size=32, seed=None):
    """Train an LSTMRegressor on the module-level ``df`` and score it.

    Data comes from ``prepare_data(df, include_sentiment)``; the model is
    trained with Adam on MSE loss over the standardized targets, then
    evaluated on the held-out test split after mapping predictions and
    targets back to the original scale with ``scaler_y``.

    Args:
        include_sentiment: forwarded to ``prepare_data``; when true the
            sentiment column is used as an additional input feature.
        epochs: number of full passes over the training loader. Defaults to
            500, the value that used to be hard-coded.
        lr: learning rate for Adam. Defaults to 0.001.
        batch_size: mini-batch size handed to ``get_dataloader``.
            Defaults to 32.
        seed: optional integer passed to ``torch.manual_seed`` before the
            loader and model are created. Supplying the same seed for the
            with/without-sentiment runs makes them start from the same RNG
            state, so metric differences are not just initialisation or
            shuffling noise. ``None`` (default) leaves the RNG untouched.

    Returns:
        Tuple ``(y_true, y_pred, mse, mae, rmse, r2)`` where ``y_true`` and
        ``y_pred`` are the inverse-transformed test targets and predictions
        and the remaining values are the metrics computed from them.
    """
    if seed is not None:
        torch.manual_seed(seed)

    # Tensors for both splits plus the scaler needed to undo target scaling.
    X_train, X_test, y_train, y_test, scaler_y = prepare_data(df, include_sentiment)
    train_loader = get_dataloader(X_train, y_train, batch_size=batch_size)

    # Model input width equals the number of feature columns.
    model = LSTMRegressor(X_train.shape[1])
    criterion = nn.MSELoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=lr)

    # Training loop.
    model.train()
    for _ in range(epochs):
        for xb, yb in train_loader:
            optimizer.zero_grad()
            loss = criterion(model(xb), yb)
            loss.backward()
            optimizer.step()

    # Predict on the test split; unsqueeze adds the length-1 sequence axis
    # that get_dataloader adds for training batches.
    model.eval()
    with torch.no_grad():
        pred_scaled = model(X_test.unsqueeze(1)).numpy()

    # Undo target standardization to get values on the original return scale.
    y_pred = scaler_y.inverse_transform(pred_scaled)
    y_true = scaler_y.inverse_transform(y_test.numpy())

    # Evaluation metrics.
    mse = mean_squared_error(y_true, y_pred)
    mae = mean_absolute_error(y_true, y_pred)
    rmse = np.sqrt(mse)
    r2 = r2_score(y_true, y_pred)

    return y_true, y_pred, mse, mae, rmse, r2

# ==================== Run both models: without vs. with sentiment ====================
y_true0, y_pred0, mse0, mae0, rmse0, r20 = train_and_evaluate(include_sentiment=False)
y_true1, y_pred1, mse1, mae1, rmse1, r21 = train_and_evaluate(include_sentiment=True)

# ==================== Plot and save ====================
# Dates of the test rows, taken with the same positional 80% cut that
# prepare_data applies to the feature matrix.
_test_start = int(len(df) * 0.8)
times = df['date'].iloc[_test_start:].reset_index(drop=True)

# One figure per model: first without the sentiment feature, then with it.
for _actual, _predicted, _title, _png in (
    (y_true0, y_pred0, '未加入情绪指标的收益率预测', 'pred_without_sentiment.png'),
    (y_true1, y_pred1, '加入情绪指标的收益率预测', 'pred_with_sentiment.png'),
):
    plt.figure()
    plt.plot(times, _actual, label='真实收益率')
    plt.plot(times, _predicted, label='预测收益率')
    plt.title(_title)
    plt.xlabel('日期')
    plt.ylabel('收益率')
    plt.legend()
    plt.savefig(_png)

# ==================== Export results to Excel ====================
# Per-date true and predicted returns for both models.
results = pd.DataFrame({
    'date': times,
    'true_without': y_true0.ravel(),
    'pred_without': y_pred0.ravel(),
    'true_with': y_true1.ravel(),
    'pred_with': y_pred1.ravel(),
})
# One row of evaluation metrics per model.
metrics = pd.DataFrame({
    'model': ['no_sent', 'with_sent'],
    'MSE': [mse0, mse1],
    'MAE': [mae0, mae1],
    'RMSE': [rmse0, rmse1],
    'R2': [r20, r21],
})
# Write both tables into one workbook, one sheet each.
with pd.ExcelWriter('lstm_results.xlsx') as writer:
    for _sheet, _frame in (('predictions', results), ('metrics', metrics)):
        _frame.to_excel(writer, sheet_name=_sheet, index=False)

print('已完成，文件已保存：pred_without_sentiment.png, pred_with_sentiment.png, lstm_results.xlsx')
