In [142]:
import os
import random
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
plt.rcParams["font.sans-serif"]=["WenQuanYi Micro Hei"] #设置字体
plt.rcParams["axes.unicode_minus"]=False #该语句解决图像中的“-”负号的乱码问题

# 禁止随机，结果可复现
random.seed(42)
np.random.seed(42)

## 超参数选择

In [143]:
time_ratio = [1/2, 3/4, 7/8, 1][3]

In [144]:
emo_index = ['上证综合情绪值', '沪深300情绪值', '创业板情绪值'][0]
model = 'Deep-learning/BERT'

ROOT_PATH = '/data/public/fintechlab/zdh/Individual-Stock-Analysis/B_Temporal_Clustering'
Clustering_Method = 'Other_Clusters'
Emotion_Data_PATH = f'{ROOT_PATH}/data/Emotion_Data/{model}'   # 情绪数据路径
Financial_Data_PATH = f'{ROOT_PATH}/data/Financial_Data' # 金融数据路径

os.makedirs(f'{ROOT_PATH}/{Clustering_Method}/个股分析/基于表示的聚类/{emo_index}', exist_ok=True)

print(f"Running with: emo_index={emo_index}, model={model}")

Running with: emo_index=上证综合情绪值, model=Deep-learning/BERT


## 数据准备

In [145]:
"""读取股吧个股的数据"""
all_data = []
file_list = [f for f in os.listdir(Emotion_Data_PATH) if f.endswith('.csv')]

for file in file_list:
    file_path = os.path.join(Emotion_Data_PATH, file)
    df = pd.read_csv(file_path)
    stock_code = os.path.splitext(file)[0] # 获取股票编号（文件名去掉扩展名）
    
    # 提取每一行的日期和情绪值
    for _, row in df.iterrows():
        new_row = {
            '股票编号': stock_code,
            '日期': row['日期'],
            '上证综合情绪值': row['上证综合情绪值'],
            '沪深300情绪值': row['沪深300情绪值'],
            '创业板情绪值': row['创业板情绪值']
        }
        all_data.append(new_row)
        
guba_data = pd.DataFrame(all_data)
guba_data

Unnamed: 0,股票编号,日期,上证综合情绪值,沪深300情绪值,创业板情绪值
0,601933,2024-12-27,0.007,-0.044,-0.015
1,601933,2024-11-18,-0.155,-0.236,-0.354
2,601933,2024-11-17,0.001,0.022,0.004
3,601933,2024-11-17,-0.346,-0.337,-0.407
4,601933,2024-11-17,-0.247,-0.309,-0.358
...,...,...,...,...,...
495326,601919,2021-04-09,0.253,0.327,0.457
495327,601919,2021-04-09,-0.177,-0.190,-0.196
495328,601919,2021-04-08,0.218,0.194,0.167
495329,601919,2021-04-08,0.148,0.142,0.346


In [146]:
# 查看最早和最晚日期
earliest_date = guba_data["日期"].min()
latest_date = guba_data["日期"].max()

print("最早日期：", earliest_date)
print("最晚日期：", latest_date)


最早日期： 2021-01-01
最晚日期： 2024-12-31


In [147]:
guba_data = guba_data.sort_values(by="日期").reset_index(drop=True)
idx_ratio = int(len(guba_data) * time_ratio)  
guba_data = guba_data.iloc[:idx_ratio]
guba_data

Unnamed: 0,股票编号,日期,上证综合情绪值,沪深300情绪值,创业板情绪值
0,601919,2021-01-01,-0.129,-0.132,-0.159
1,601012,2021-01-01,-0.483,-0.513,-0.641
2,601318,2021-01-01,-0.151,-0.183,-0.188
3,000625,2021-01-01,0.131,0.069,0.199
4,000725,2021-01-01,-0.129,-0.132,-0.159
...,...,...,...,...,...
495326,002594,2024-12-31,0.049,0.059,0.135
495327,000725,2024-12-31,-0.103,-0.173,-0.111
495328,300999,2024-12-31,-0.064,-0.067,-0.008
495329,300676,2024-12-31,-0.012,-0.029,0.111


In [148]:
# 查看最早和最晚日期
earliest_date = guba_data["日期"].min()
latest_date = guba_data["日期"].max()

print("最早日期：", earliest_date)
print("最晚日期：", latest_date)


最早日期： 2021-01-01
最晚日期： 2024-12-31


In [149]:
"""读取股票回报率的数据"""
return_data = pd.read_csv(f'{Financial_Data_PATH}/日个股回报率.csv', dtype={'股票编号': str})
return_data

Unnamed: 0,股票编号,日期,交易量,收益率变化
0,000002,2021-06-01,60990961,-0.003745
1,000002,2021-06-02,85354506,0.006015
2,000002,2021-06-03,50594187,-0.003363
3,000002,2021-06-04,71422364,-0.012748
4,000002,2021-06-07,64745280,-0.014812
...,...,...,...,...
154877,688981,2024-11-20,58507495,-0.017071
154878,688981,2024-11-21,56197106,0.002358
154879,688981,2024-11-22,79240108,-0.050588
154880,688981,2024-11-25,76905909,-0.029402


In [150]:
# 进行左连接，guba_data 为主表
merged_data = pd.merge(guba_data, return_data[['股票编号', '日期', '交易量', '收益率变化']], 
                       on=['股票编号', '日期'], 
                       how='left')
merged_data = merged_data.dropna()
merged_data

Unnamed: 0,股票编号,日期,上证综合情绪值,沪深300情绪值,创业板情绪值,交易量,收益率变化
6439,601166,2021-06-01,-0.502,-0.540,-0.649,118168969.0,-0.022068
6440,002241,2021-06-01,-0.015,-0.039,0.031,119791643.0,0.034403
6441,601857,2021-06-01,0.081,0.104,0.160,128459929.0,0.019481
6442,300896,2021-06-01,0.080,0.086,0.235,2932326.0,0.004877
6443,300896,2021-06-01,-0.046,-0.017,-0.087,2932326.0,0.004877
...,...,...,...,...,...,...,...
495131,300236,2024-11-22,0.012,0.006,0.057,8053791.0,-0.045638
495132,000061,2024-11-22,-0.015,-0.036,0.016,27670515.0,0.027221
495133,600600,2024-11-22,-0.364,-0.373,-0.442,4378607.0,-0.029869
495134,300236,2024-11-22,0.005,-0.027,0.042,8053791.0,-0.045638


## 数据预处理

In [151]:
# Step 1: 对情绪值列进行 Min-Max 标准化
def min_max_normalization(df, cols):
    for col in cols:
        min_val = df[col].min()
        max_val = df[col].max()
        df[col] = 2 * (df[col] - min_val) / (max_val - min_val) - 1
    return df

# Step 2: 对每个股票编号的数据进行标准化和按日期汇总
def process_data(df):
    df_processed = pd.DataFrame()  # 用于存储结果
    for stock_code, stock_data in df.groupby('股票编号'):
        # 对每个股票编号内的数据进行标准化
        stock_data = min_max_normalization(stock_data, ['上证综合情绪值', '沪深300情绪值', '创业板情绪值'])
        
        # 按日期汇总数据，同时保留股票编号
        stock_summary = stock_data.groupby('日期').agg({
            '股票编号': 'first',  # 保留股票编号（在同一日期内它是相同的，使用 'first'）
            '上证综合情绪值': 'mean',  # 上证综合情绪值按日期取均值
            '沪深300情绪值': 'mean',  # 沪深300情绪值按日期取均值
            '创业板情绪值': 'mean',  # 创业板情绪值按日期取均值
            '交易量': 'mean',  # 交易量按日期求和
            '收益率变化': 'mean'  # 收益率变化按日期取均值
        }).reset_index(drop=False)
        
        df_processed = pd.concat([df_processed, stock_summary], ignore_index=True)
    
    return df_processed

# 调用处理函数
final_data = process_data(merged_data)
final_data

Unnamed: 0,日期,股票编号,上证综合情绪值,沪深300情绪值,创业板情绪值,交易量,收益率变化
0,2021-06-01,000002,0.316338,0.267833,0.280789,60990961.0,-0.003745
1,2021-06-08,000002,0.347227,0.287914,0.300312,44676494.0,0.004626
2,2021-06-10,000002,0.325725,0.281754,0.289027,53800776.0,-0.010035
3,2021-06-11,000002,0.285945,0.224180,0.235237,75853738.0,-0.014035
4,2021-06-15,000002,0.314849,0.268572,0.283593,89915501.0,-0.020957
...,...,...,...,...,...,...,...
85724,2024-11-12,688981,0.360433,0.328320,0.235728,108866759.0,-0.036864
85725,2024-11-13,688981,0.269057,0.242652,0.142033,80759477.0,-0.019484
85726,2024-11-14,688981,0.286646,0.247683,0.160005,76194102.0,-0.022897
85727,2024-11-15,688981,0.244977,0.218178,0.099522,71066743.0,-0.020233


## 基于表示学习的聚类（ LSTM-Autoencoder ）

In [152]:
from sklearn.cluster import KMeans
from sklearn.preprocessing import MinMaxScaler
from torch import nn, optim
import torch
from torch.utils.data import Dataset, DataLoader
from tqdm import tqdm

# 1. 数据准备：构建每个股票的时间序列（2维: 情绪值 + 收益率）
def build_sequences(df, seq_len=100):
    stock_ids = df['股票编号'].unique()
    sequences = []
    valid_ids = []

    for stock_id in stock_ids:
        sub_df = df[df['股票编号'] == stock_id].sort_values('日期')
        sub_seq = sub_df[['上证综合情绪值', '收益率变化']].values

        if len(sub_seq) >= seq_len:
            sub_seq = sub_seq[-seq_len:]  # 取最近 seq_len 天
            sequences.append(sub_seq)
            valid_ids.append(stock_id)

    sequences = np.stack(sequences)  # shape: (num_stocks, seq_len, 2)
    return sequences, valid_ids

# 2. PyTorch Dataset
class StockDataset(Dataset):
    def __init__(self, sequences):
        self.sequences = torch.tensor(sequences, dtype=torch.float32)
    
    def __len__(self):
        return len(self.sequences)

    def __getitem__(self, idx):
        return self.sequences[idx]

# 3. LSTM-Autoencoder 模型定义
class LSTMAutoencoder(nn.Module):
    def __init__(self, input_dim=2, hidden_dim=64, bottleneck_dim=16):
        super(LSTMAutoencoder, self).__init__()
        self.encoder = nn.LSTM(input_dim, hidden_dim, batch_first=True)
        self.hidden_to_bottleneck = nn.Linear(hidden_dim, bottleneck_dim)
        
        self.bottleneck_to_hidden = nn.Linear(bottleneck_dim, hidden_dim)
        self.decoder = nn.LSTM(input_dim, hidden_dim, batch_first=True)
        self.output_layer = nn.Linear(hidden_dim, input_dim)

    def forward(self, x):
        enc_out, (h, _) = self.encoder(x)  # h: (1, B, H)
        bottleneck = self.hidden_to_bottleneck(h[-1])  # (B, bottleneck_dim)

        dec_input = x
        h_dec = self.bottleneck_to_hidden(bottleneck).unsqueeze(0)  # (1, B, H)
        dec_out, _ = self.decoder(dec_input, (h_dec, torch.zeros_like(h_dec)))
        out = self.output_layer(dec_out)  # (B, T, input_dim)
        return out, bottleneck

# 4. 加载和处理数据
final_data['日期'] = pd.to_datetime(final_data['日期'])
sequences, stock_ids = build_sequences(final_data, seq_len=100)

# 标准化
scaler = MinMaxScaler()
for i in range(sequences.shape[0]):
    sequences[i] = scaler.fit_transform(sequences[i])

dataset = StockDataset(sequences)
dataloader = DataLoader(dataset, batch_size=32, shuffle=True)

# 5. 模型训练
model = LSTMAutoencoder()
optimizer = optim.Adam(model.parameters(), lr=1e-3)
loss_fn = nn.MSELoss()

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

epochs = 100
for epoch in range(epochs):
    model.train()
    epoch_loss = 0
    for batch in dataloader:
        batch = batch.to(device)
        recon, _ = model(batch)
        loss = loss_fn(recon, batch)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        epoch_loss += loss.item() * batch.size(0)
    print(f"Epoch {epoch+1}/{epochs}, Loss: {epoch_loss / len(dataset):.4f}")

Epoch 1/100, Loss: 0.3049
Epoch 2/100, Loss: 0.2352
Epoch 3/100, Loss: 0.1589
Epoch 4/100, Loss: 0.0653
Epoch 5/100, Loss: 0.0445
Epoch 6/100, Loss: 0.0310
Epoch 7/100, Loss: 0.0332
Epoch 8/100, Loss: 0.0294
Epoch 9/100, Loss: 0.0293
Epoch 10/100, Loss: 0.0282
Epoch 11/100, Loss: 0.0278
Epoch 12/100, Loss: 0.0273
Epoch 13/100, Loss: 0.0271
Epoch 14/100, Loss: 0.0266
Epoch 15/100, Loss: 0.0262
Epoch 16/100, Loss: 0.0258
Epoch 17/100, Loss: 0.0254
Epoch 18/100, Loss: 0.0249
Epoch 19/100, Loss: 0.0243
Epoch 20/100, Loss: 0.0238
Epoch 21/100, Loss: 0.0230
Epoch 22/100, Loss: 0.0222
Epoch 23/100, Loss: 0.0212
Epoch 24/100, Loss: 0.0200
Epoch 25/100, Loss: 0.0193
Epoch 26/100, Loss: 0.0190
Epoch 27/100, Loss: 0.0186
Epoch 28/100, Loss: 0.0184
Epoch 29/100, Loss: 0.0181
Epoch 30/100, Loss: 0.0179
Epoch 31/100, Loss: 0.0176
Epoch 32/100, Loss: 0.0174
Epoch 33/100, Loss: 0.0172
Epoch 34/100, Loss: 0.0170
Epoch 35/100, Loss: 0.0168
Epoch 36/100, Loss: 0.0166
Epoch 37/100, Loss: 0.0164
Epoch 38/1

In [153]:
# 6. 提取 bottleneck 表示
model.eval()
all_bottlenecks = []

with torch.no_grad():
    for batch in DataLoader(dataset, batch_size=32):
        batch = batch.to(device)
        _, bottleneck = model(batch)
        all_bottlenecks.append(bottleneck.cpu().numpy())

embeddings = np.vstack(all_bottlenecks)  # shape: (num_stocks, bottleneck_dim)

# 7. KMeans 聚类
k = 3
kmeans = KMeans(n_clusters=k, random_state=0)
cluster_labels = kmeans.fit_predict(embeddings)

In [154]:
# 8. 输出聚类结果
result_df = pd.DataFrame({
    '股票编号': stock_ids,
    '聚类标签': cluster_labels
})

os.makedirs('output', exist_ok=True)
result_df.to_csv(f'output/{time_ratio}.csv')