In [2]:
import os
import pickle
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from tqdm import tqdm

# 数据路径
root_dir = 'D:\\PythonProject\\DLA_ML_project\\sh-DLAndML-project\\Data\\qwen-characterSplit'

# 存储 latent 和 tokenid
latents = []
tokenids = []

# 遍历文件夹收集数据
total_files = sum(
    len([file_name for file_name in os.listdir(os.path.join(root_dir, sub_dir))
         if file_name.endswith('.pkl') and 'latent' in file_name and '_tokenid' in file_name and '_tokenEmbedding' in file_name])
    for sub_dir in os.listdir(root_dir)
    if os.path.isdir(os.path.join(root_dir, sub_dir))
)

with tqdm(total=total_files, desc="Processing files") as pbar:
    for sub_dir in os.listdir(root_dir):
        sub_dir_path = os.path.join(root_dir, sub_dir)
        if os.path.isdir(sub_dir_path):
            for file_name in os.listdir(sub_dir_path):
                if file_name.endswith('.pkl') and 'latent' in file_name and '_tokenid' in file_name and '_tokenEmbedding' in file_name:
                    file_path = os.path.join(sub_dir_path, file_name)
                    with open(file_path, 'rb') as file:
                        loaded_data = pickle.load(file)
                    for eeg, character, latent, tokenid, embed in loaded_data:
                        # 收集 latent 和 tokenid
                        latents.append(latent.flatten())  # 将 latent 展平为 (64,)
                        tokenids.append(tokenid[0])       # tokenid 是 int16
                    pbar.update(1)

# 转换为 NumPy 数组
latents = np.array(latents)  # shape: (num_samples, 64)
tokenids = np.array(tokenids)  # shape: (num_samples,)

Processing files:   0%|          | 0/58 [00:00<?, ?it/s]Exception ignored in: <bound method IPythonKernel._clean_thread_parent_frames of <ipykernel.ipkernel.IPythonKernel object at 0x000002459E8549D0>>
Traceback (most recent call last):
  File "D:\software\coding\anaconda3\envs\DLA_ML_project\lib\site-packages\ipykernel\ipkernel.py", line 775, in _clean_thread_parent_frames
    def _clean_thread_parent_frames(
KeyboardInterrupt: 
Processing files:   3%|▎         | 2/58 [00:05<02:22,  2.54s/it]Exception ignored in: <bound method IPythonKernel._clean_thread_parent_frames of <ipykernel.ipkernel.IPythonKernel object at 0x000002459E8549D0>>
Traceback (most recent call last):
  File "D:\software\coding\anaconda3\envs\DLA_ML_project\lib\site-packages\ipykernel\ipkernel.py", line 775, in _clean_thread_parent_frames
    def _clean_thread_parent_frames(
KeyboardInterrupt: 
Processing files: 100%|██████████| 58/58 [02:23<00:00,  2.48s/it]


In [11]:
from sklearn.decomposition import PCA

In [0]:
# 使用 PCA 将数据降维到 32 维
pca = PCA(n_components=2)
latents_reduced = pca.fit_transform(latents)
# 数据集划分
X_train, X_test, y_train, y_test = train_test_split(latents_reduced, tokenids, test_size=0.1, random_state=42)

In [19]:
# 数据集划分
X_train, X_test, y_train, y_test = train_test_split(latents, tokenids, test_size=0.1, random_state=42)

In [20]:

# 遍历不同的 k 值，寻找最佳 k
best_k = None
best_accuracy = 0
accuracy_results = []
ks = [5,10,50,100,1000]

for k in ks:  # 遍历 k 从 1 到 20
    knn = KNeighborsClassifier(n_neighbors=k)  # 创建 KNN 模型
    knn.fit(X_train, y_train)  # 训练模型
    accuracy = knn.score(X_test, y_test)  # 评估模型
    accuracy_results.append((k, accuracy))  # 保存 k 和对应的准确率

    if accuracy > best_accuracy:  # 如果当前 k 的准确率更高，更新最佳 k
        best_k = k
        best_accuracy = accuracy

    print(f"k={k}, 测试集准确率: {accuracy:.6f}")

# 输出最佳 k 和对应的准确率
print(f"最佳 k 值为: {best_k}, 对应的测试集准确率为: {best_accuracy:.6f}")

# 可选：打印所有 k 值和对应的准确率
print("所有 k 值和对应的准确率:")
for k, acc in accuracy_results:
    print(f"k={k}, 准确率: {acc:.6f}")

k=5, 测试集准确率: 0.016413
k=10, 测试集准确率: 0.018081
k=50, 测试集准确率: 0.026045
k=100, 测试集准确率: 0.028575
k=1000, 测试集准确率: 0.032664
最佳 k 值为: 1000, 对应的测试集准确率为: 0.032664
所有 k 值和对应的准确率:
k=5, 准确率: 0.016413
k=10, 准确率: 0.018081
k=50, 准确率: 0.026045
k=100, 准确率: 0.028575
k=1000, 准确率: 0.032664


In [2]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from tqdm import tqdm

# 数据集划分
X_train, X_test, y_train, y_test = train_test_split(latents, tokenids, test_size=0.1, random_state=42)

# 超参数搜索的例子
max_depths = [5, 10, 15, 20]
for max_depth in tqdm(max_depths, desc="训练决策树"):
    tree_clf = DecisionTreeClassifier(max_depth=max_depth, random_state=42)
    tree_clf.fit(X_train, y_train)
    y_pred = tree_clf.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    print(f"max_depth={max_depth}, 测试集准确率: {accuracy:.6f}")


训练决策树:  25%|██▌       | 1/4 [01:36<04:48, 96.20s/it]

max_depth=5, 测试集准确率: 0.030619


训练决策树:  50%|█████     | 2/4 [04:46<05:03, 151.82s/it]

max_depth=10, 测试集准确率: 0.029274


训练决策树:  75%|███████▌  | 3/4 [09:32<03:32, 212.76s/it]

max_depth=15, 测试集准确率: 0.029382


训练决策树: 100%|██████████| 4/4 [15:51<00:00, 237.82s/it]

max_depth=20, 测试集准确率: 0.028521





In [None]:
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import PolynomialFeatures

from sklearn.decomposition import PCA

from sklearn.ensemble import RandomForestRegressor

# 创建并训练随机森林回归模型
rf_reg = RandomForestRegressor(n_estimators=100, random_state=42)
rf_reg.fit(X_train, y_train)

# 预测并计算 MSE 和 R²
y_pred = rf_reg.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f"随机森林回归测试集均方误差（MSE）: {mse:.6f}")
print(f"随机森林回归测试集决定系数（R²）: {r2:.6f}")



In [0]:
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.model_selection import train_test_split
from torch.utils.data import DataLoader, TensorDataset
from tqdm import tqdm
import numpy as np

# 定义 MLP 模型
class LatentToEmbedModel(nn.Module):
    def __init__(self, input_dim=64, hidden_dims=[512, 1024, 2048], output_dim=1456):
        super(LatentToEmbedModel, self).__init__()
        layers = []
        prev_dim = input_dim
        for hidden_dim in hidden_dims:
            layers.append(nn.Linear(prev_dim, hidden_dim))
            layers.append(nn.BatchNorm1d(hidden_dim))
            layers.append(nn.ReLU())
            layers.append(nn.Dropout(0.3))  # 防止过拟合
            prev_dim = hidden_dim
        layers.append(nn.Linear(prev_dim, output_dim))  # 输出层，维度为类别数
        self.model = nn.Sequential(*layers)
    
    def forward(self, x):
        return self.model(x)

# 数据准备
# 假设 latents 和 tokenids 已经是 NumPy 数组
# latents: shape (num_samples, 64)
# tokenids: shape (num_samples,)
# tokenids 是 np.int64 类型，包含 1456 个不同的值

# 获取 tokenids 的唯一值并创建映射
unique_tokenids = np.unique(tokenids)  # 获取所有唯一的 tokenid 值
print(len(unique_tokenids))
id_to_class = {tokenid: idx for idx, tokenid in enumerate(unique_tokenids)}  # 原始值 -> 类别索引
class_to_id = {idx: tokenid for tokenid, idx in id_to_class.items()}  # 类别索引 -> 原始值

In [20]:

# 将 tokenids 映射为类别索引
mapped_tokenids = np.array([id_to_class[tokenid] for tokenid in tokenids], dtype=np.int64)

# 检查映射后的范围
print(f"Mapped tokenids min value: {mapped_tokenids.min()}, max value: {mapped_tokenids.max()}")

# 数据集划分
X_train, X_test, y_train, y_test = train_test_split(latents, mapped_tokenids, test_size=0.1, random_state=42)

# 转换为 PyTorch 张量
X_train_tensor = torch.tensor(X_train, dtype=torch.float32)
X_test_tensor = torch.tensor(X_test, dtype=torch.float32)
y_train_tensor = torch.tensor(y_train, dtype=torch.long)  # CrossEntropyLoss 需要 Long 类型标签
y_test_tensor = torch.tensor(y_test, dtype=torch.long)

# 创建数据加载器
batch_size = 2048
train_dataset = TensorDataset(X_train_tensor, y_train_tensor)
test_dataset = TensorDataset(X_test_tensor, y_test_tensor)
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

# 模型、损失函数和优化器
input_dim = X_train.shape[1]  # 输入维度
hidden_dims = [512, 1024, 2048]
output_dim = len(unique_tokenids)  # 输出类别数量

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = LatentToEmbedModel(input_dim=input_dim, hidden_dims=hidden_dims, output_dim=output_dim).to(device)
criterion = nn.CrossEntropyLoss()  # 多分类问题使用交叉熵损失
optimizer = optim.Adam(model.parameters(), lr=1e-3)

# 训练模型
num_epochs = 20
for epoch in range(num_epochs):
    model.train()
    running_loss = 0.0
    correct = 0
    total = 0
    for X_batch, y_batch in tqdm(train_loader, desc=f"Epoch {epoch+1}/{num_epochs}"):
        X_batch, y_batch = X_batch.to(device), y_batch.to(device)
        
        # 前向传播
        outputs = model(X_batch)
        loss = criterion(outputs, y_batch)
        
        # 反向传播和优化
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        
        # 统计训练损失和准确率
        running_loss += loss.item()
        _, predicted = torch.max(outputs, 1)
        total += y_batch.size(0)
        correct += (predicted == y_batch).sum().item()
    
    train_loss = running_loss / len(train_loader)
    train_accuracy = correct / total
    print(f"Train Loss: {train_loss:.4f}, Train Accuracy: {train_accuracy:.4f}")

# 测试模型
model.eval()
correct = 0
total = 0
with torch.no_grad():
    for X_batch, y_batch in test_loader:
        X_batch, y_batch = X_batch.to(device), y_batch.to(device)
        outputs = model(X_batch)
        _, predicted = torch.max(outputs, 1)
        total += y_batch.size(0)
        correct += (predicted == y_batch).sum().item()

test_accuracy = correct / total
print(f"Test Accuracy: {test_accuracy:.4f}")

# 将预测结果映射回原始 tokenid
predicted_tokenids = [class_to_id[idx] for idx in predicted.cpu().numpy()]
print(f"Predicted tokenids: {predicted_tokenids}")


1456
Mapped tokenids min value: 0, max value: 1455


Epoch 1/20: 100%|██████████| 2614/2614 [00:43<00:00, 59.49it/s]


Train Loss: 6.1771, Train Accuracy: 0.0309


Epoch 2/20: 100%|██████████| 2614/2614 [00:43<00:00, 60.37it/s]


Train Loss: 5.8725, Train Accuracy: 0.0321


Epoch 3/20: 100%|██████████| 2614/2614 [00:45<00:00, 57.33it/s]


Train Loss: 5.8417, Train Accuracy: 0.0324


Epoch 4/20: 100%|██████████| 2614/2614 [00:50<00:00, 51.56it/s]


Train Loss: 5.8257, Train Accuracy: 0.0325


Epoch 5/20: 100%|██████████| 2614/2614 [00:57<00:00, 45.17it/s]


Train Loss: 5.8143, Train Accuracy: 0.0348


Epoch 6/20: 100%|██████████| 2614/2614 [00:59<00:00, 44.07it/s]


Train Loss: 5.8015, Train Accuracy: 0.0347


Epoch 7/20: 100%|██████████| 2614/2614 [00:59<00:00, 44.25it/s]


Train Loss: 5.7949, Train Accuracy: 0.0363


Epoch 8/20: 100%|██████████| 2614/2614 [01:01<00:00, 42.68it/s]


Train Loss: 5.7874, Train Accuracy: 0.0363


Epoch 9/20: 100%|██████████| 2614/2614 [01:01<00:00, 42.75it/s]


Train Loss: 5.7810, Train Accuracy: 0.0362


Epoch 10/20: 100%|██████████| 2614/2614 [01:02<00:00, 41.92it/s]


Train Loss: 5.7770, Train Accuracy: 0.0366


Epoch 11/20: 100%|██████████| 2614/2614 [01:02<00:00, 41.79it/s]


Train Loss: 5.7722, Train Accuracy: 0.0365


Epoch 12/20: 100%|██████████| 2614/2614 [01:03<00:00, 41.41it/s]


Train Loss: 5.7683, Train Accuracy: 0.0370


Epoch 13/20: 100%|██████████| 2614/2614 [01:05<00:00, 40.16it/s]


Train Loss: 5.7632, Train Accuracy: 0.0373


Epoch 14/20: 100%|██████████| 2614/2614 [01:04<00:00, 40.33it/s]


Train Loss: 5.7615, Train Accuracy: 0.0368


Epoch 15/20: 100%|██████████| 2614/2614 [01:07<00:00, 38.46it/s]


Train Loss: 5.7572, Train Accuracy: 0.0370


Epoch 16/20: 100%|██████████| 2614/2614 [01:17<00:00, 33.88it/s]


Train Loss: 5.7563, Train Accuracy: 0.0374


Epoch 17/20: 100%|██████████| 2614/2614 [01:07<00:00, 38.86it/s]


Train Loss: 5.7505, Train Accuracy: 0.0373


Epoch 18/20: 100%|██████████| 2614/2614 [01:04<00:00, 40.43it/s]


Train Loss: 5.7489, Train Accuracy: 0.0376


Epoch 19/20: 100%|██████████| 2614/2614 [01:05<00:00, 40.16it/s]


Train Loss: 5.7460, Train Accuracy: 0.0372


Epoch 20/20: 100%|██████████| 2614/2614 [01:05<00:00, 39.84it/s]


Train Loss: 5.7453, Train Accuracy: 0.0375
Test Accuracy: 0.0264
Predicted tokenids: [np.int64(1940), np.int64(1940), np.int64(1940), np.int64(1940), np.int64(35946), np.int64(35946), np.int64(1940), np.int64(1940), np.int64(1940), np.int64(35946), np.int64(9370), np.int64(1940), np.int64(1940), np.int64(1940), np.int64(1940), np.int64(35946), np.int64(35946), np.int64(1940), np.int64(1940), np.int64(1940), np.int64(35946), np.int64(1940), np.int64(35946)]


In [1]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
from sklearn.neighbors import KNeighborsClassifier
from matplotlib.colors import Normalize
import matplotlib.cm as cm

# 假设以下变量已经存在：
# X_train, y_train: 训练集特征和标签
# X_test, y_test: 测试集特征和标签

# 1. 使用最佳 k 值训练 KNN 模型
best_k = 5  # 假设之前找到的最佳 k 值为 5
knn = KNeighborsClassifier(n_neighbors=best_k)
knn.fit(X_train, y_train)

# 2. 获取测试集的预测结果
y_pred = knn.predict(X_test)

# 3. 使用 PCA 将测试集特征降维到 2 维
pca = PCA(n_components=2)
X_test_2d = pca.fit_transform(X_test)  # shape: (num_samples, 2)

# 4. 定义颜色映射
num_classes = len(np.unique(y_test))  # 类别总数
norm = Normalize(vmin=0, vmax=num_classes - 1)  # 归一化类别索引范围
cmap = cm.get_cmap('hsv', num_classes)  # 使用 HSV 色轮作为颜色映射

# 5. 检测并移除离群点（使用分位数方法）
# 计算每个点到数据中心的距离
center = np.mean(X_test_2d, axis=0)  # 计算数据中心
distances = np.linalg.norm(X_test_2d - center, axis=1)  # 计算欧几里得距离

# 使用分位数定义距离阈值（例如，95% 分位数）
distance_threshold = np.percentile(distances, 95)

# 筛选出非离群点的索引
inlier_mask = distances <= distance_threshold

# 过滤掉离群点
X_test_2d_filtered = X_test_2d[inlier_mask]
y_test_filtered = y_test[inlier_mask]
y_pred_filtered = y_pred[inlier_mask]

# 6. 绘制过滤后的散点图
plt.figure(figsize=(16, 8))

# 左图：真实类别
plt.subplot(1, 2, 1)
sc1 = plt.scatter(
    X_test_2d_filtered[:, 0],  # 第一主成分
    X_test_2d_filtered[:, 1],  # 第二主成分
    c=y_test_filtered,  # 类别索引
    cmap=cmap,  # 使用颜色映射
    norm=norm,  # 归一化
    s=5,
    alpha=0.7
)
plt.title("True Labels (PCA 2D, Outliers Removed)", fontsize=16)
plt.xlabel("Principal Component 1", fontsize=14)
plt.ylabel("Principal Component 2", fontsize=14)
plt.grid(alpha=0.3)

# 添加颜色条
cbar1 = plt.colorbar(sc1, ticks=np.linspace(0, num_classes - 1, min(num_classes, 10)))  # 最多显示 10 个类别
cbar1.set_label("Class Index", fontsize=12)

# # 右图：预测类别
# plt.subplot(1, 2, 2)
# sc2 = plt.scatter(
#     X_test_2d_filtered[:, 0],  # 第一主成分
#     X_test_2d_filtered[:, 1],  # 第二主成分
#     c=y_pred_filtered,  # 类别索引
#     cmap=cmap,  # 使用颜色映射
#     norm=norm,  # 归一化
#     s=5,
#     alpha=0.7
# )
# plt.title("KNN Predictions (PCA 2D, Outliers Removed)", fontsize=16)
# plt.xlabel("Principal Component 1", fontsize=14)
# plt.ylabel("Principal Component 2", fontsize=14)
# plt.grid(alpha=0.3)
# 
# # 添加颜色条
# cbar2 = plt.colorbar(sc2, ticks=np.linspace(0, num_classes - 1, min(num_classes, 10)))  # 最多显示 10 个类别
# cbar2.set_label("Class Index", fontsize=12)
plt.savefig("example_plot.pdf", format="pdf", bbox_inches="tight")
plt.tight_layout()
plt.show()


NameError: name 'X_train' is not defined

In [3]:
import os
import pickle
import numpy as np
from tqdm import tqdm
import random

# 数据路径
root_dir = 'D:\\PythonProject\\DLA_ML_project\\sh-DLAndML-project\\Data\\qwen-characterSplit'

# 无效的子文件夹
invalid_subs = {'sub11', 'sub12', 'sub15'}

# 获取有效的子文件夹
sub_dirs = [sub_dir for sub_dir in os.listdir(root_dir) 
            if os.path.isdir(os.path.join(root_dir, sub_dir)) and sub_dir not in invalid_subs]

# 确保每次运行生成的分组一致（可选）
random.seed(44)

# 生成三组训练集和测试集
splits = []
for _ in range(3):
    random.shuffle(sub_dirs)
    train_subs = sub_dirs[:8]
    test_subs = sub_dirs[8:]
    splits.append((train_subs, test_subs))

print(splits)

# 存储三组数据
all_data = []

for split_idx, (train_subs, test_subs) in enumerate(splits):
    print(f"\nProcessing Split {split_idx + 1}...")

    # 初始化存储
    train_latents, train_tokenids = [], []
    test_latents, test_tokenids = [], []

    # 处理训练集
    print("Processing training data...")
    for sub_dir in tqdm(train_subs, desc="Training Subfolders", leave=False):
        sub_dir_path = os.path.join(root_dir, sub_dir)
        for file_name in tqdm(os.listdir(sub_dir_path), desc=f"Files in {sub_dir}", leave=False):
            if file_name.endswith('.pkl') and 'latent' in file_name and '_tokenid' in file_name and '_tokenEmbedding' in file_name:
                file_path = os.path.join(sub_dir_path, file_name)
                with open(file_path, 'rb') as file:
                    loaded_data = pickle.load(file)
                for eeg, character, latent, tokenid, embed in loaded_data:
                    train_latents.append(latent.flatten())  # 将 latent 展平为 (64,)
                    train_tokenids.append(tokenid[0])       # tokenid 是 int16

    # 处理测试集
    print("Processing testing data...")
    for sub_dir in tqdm(test_subs, desc="Testing Subfolders", leave=False):
        sub_dir_path = os.path.join(root_dir, sub_dir)
        for file_name in tqdm(os.listdir(sub_dir_path), desc=f"Files in {sub_dir}", leave=False):
            if file_name.endswith('.pkl') and 'latent' in file_name and '_tokenid' in file_name and '_tokenEmbedding' in file_name:
                file_path = os.path.join(sub_dir_path, file_name)
                with open(file_path, 'rb') as file:
                    loaded_data = pickle.load(file)
                for eeg, character, latent, tokenid, embed in loaded_data:
                    test_latents.append(latent.flatten())  # 将 latent 展平为 (64,)
                    test_tokenids.append(tokenid[0])       # tokenid 是 int16

    # 转换为 NumPy 数组
    train_latents = np.array(train_latents)
    train_tokenids = np.array(train_tokenids)
    test_latents = np.array(test_latents)
    test_tokenids = np.array(test_tokenids)

    # 保存到 all_data
    all_data.append({
        "train_latents": train_latents,
        "train_tokenids": train_tokenids,
        "test_latents": test_latents,
        "test_tokenids": test_tokenids
    })




[(['sub08', 'sub09', 'sub04', 'sub06', 'sub14', 'sub07', 'sub13', 'sub05'], ['sub10']), (['sub13', 'sub07', 'sub06', 'sub04', 'sub14', 'sub10', 'sub05', 'sub08'], ['sub09']), (['sub07', 'sub08', 'sub04', 'sub14', 'sub09', 'sub13', 'sub06', 'sub10'], ['sub05'])]

Processing Split 1...
Processing training data...


Training Subfolders:   0%|          | 0/8 [00:00<?, ?it/s]
Files in sub08:   0%|          | 0/7 [00:00<?, ?it/s][A
Files in sub08:  14%|█▍        | 1/7 [00:02<00:17,  2.84s/it][A
Files in sub08:  29%|██▊       | 2/7 [00:05<00:12,  2.60s/it][A
Files in sub08:  43%|████▎     | 3/7 [00:07<00:09,  2.38s/it][A
Files in sub08:  57%|█████▋    | 4/7 [00:10<00:07,  2.53s/it][A
Files in sub08:  71%|███████▏  | 5/7 [00:11<00:04,  2.03s/it][A
Files in sub08:  86%|████████▌ | 6/7 [00:13<00:02,  2.07s/it][A
Files in sub08: 100%|██████████| 7/7 [00:16<00:00,  2.24s/it][A
Training Subfolders:  12%|█▎        | 1/8 [00:16<01:52, 16.04s/it]
Files in sub09:   0%|          | 0/4 [00:00<?, ?it/s][A
Files in sub09:  25%|██▌       | 1/4 [00:02<00:07,  2.45s/it][A
Files in sub09:  50%|█████     | 2/4 [00:03<00:03,  1.79s/it][A
Files in sub09:  75%|███████▌  | 3/4 [00:06<00:02,  2.02s/it][A
Files in sub09: 100%|██████████| 4/4 [00:08<00:00,  2.31s/it][A
Training Subfolders:  25%|██▌       | 2/8 [00

Processing testing data...


Testing Subfolders:   0%|          | 0/1 [00:00<?, ?it/s]
Files in sub10:   0%|          | 0/7 [00:00<?, ?it/s][A
Files in sub10:  14%|█▍        | 1/7 [00:02<00:15,  2.62s/it][A
Files in sub10:  29%|██▊       | 2/7 [00:04<00:12,  2.44s/it][A
Files in sub10:  43%|████▎     | 3/7 [00:06<00:09,  2.26s/it][A
Files in sub10:  57%|█████▋    | 4/7 [00:09<00:07,  2.36s/it][A
Files in sub10:  71%|███████▏  | 5/7 [00:10<00:03,  1.95s/it][A
Files in sub10:  86%|████████▌ | 6/7 [00:13<00:02,  2.06s/it][A
Files in sub10: 100%|██████████| 7/7 [00:15<00:00,  2.26s/it][A
                                                                 


Processing Split 2...
Processing training data...


Training Subfolders:   0%|          | 0/8 [00:00<?, ?it/s]
Files in sub13:   0%|          | 0/5 [00:00<?, ?it/s][A
Files in sub13:  20%|██        | 1/5 [00:00<00:01,  2.19it/s][A
Files in sub13:  40%|████      | 2/5 [00:00<00:01,  2.47it/s][A
Files in sub13:  60%|██████    | 3/5 [00:01<00:01,  1.97it/s][A
Files in sub13:  80%|████████  | 4/5 [00:02<00:00,  1.85it/s][A
Files in sub13: 100%|██████████| 5/5 [00:02<00:00,  1.76it/s][A
Training Subfolders:  12%|█▎        | 1/8 [00:02<00:18,  2.67s/it]
Files in sub07:   0%|          | 0/7 [00:00<?, ?it/s][A
Files in sub07:  14%|█▍        | 1/7 [00:00<00:02,  2.61it/s][A
Files in sub07:  29%|██▊       | 2/7 [00:00<00:01,  2.50it/s][A
Files in sub07:  43%|████▎     | 3/7 [00:01<00:02,  1.97it/s][A
Files in sub07:  57%|█████▋    | 4/7 [00:02<00:01,  1.77it/s][A
Files in sub07:  71%|███████▏  | 5/7 [00:02<00:00,  2.00it/s][A
Files in sub07:  86%|████████▌ | 6/7 [00:03<00:00,  1.90it/s][A
Files in sub07: 100%|██████████| 7/7 [00:03<0

Processing testing data...


Testing Subfolders:   0%|          | 0/1 [00:00<?, ?it/s]
Files in sub09:   0%|          | 0/4 [00:00<?, ?it/s][A
Files in sub09:  25%|██▌       | 1/4 [00:02<00:08,  2.76s/it][A
Files in sub09:  50%|█████     | 2/4 [00:04<00:03,  1.98s/it][A
Files in sub09:  75%|███████▌  | 3/4 [00:06<00:02,  2.14s/it][A
Files in sub09: 100%|██████████| 4/4 [00:09<00:00,  2.42s/it][A
                                                                 


Processing Split 3...
Processing training data...


Training Subfolders:   0%|          | 0/8 [00:00<?, ?it/s]
Files in sub07:   0%|          | 0/7 [00:00<?, ?it/s][A
Files in sub07:  14%|█▍        | 1/7 [00:02<00:15,  2.51s/it][A
Files in sub07:  29%|██▊       | 2/7 [00:05<00:13,  2.62s/it][A
Files in sub07:  43%|████▎     | 3/7 [00:07<00:09,  2.42s/it][A
Files in sub07:  57%|█████▋    | 4/7 [00:10<00:07,  2.51s/it][A
Files in sub07:  71%|███████▏  | 5/7 [00:11<00:04,  2.10s/it][A
Files in sub07:  86%|████████▌ | 6/7 [00:13<00:02,  2.15s/it][A
Files in sub07: 100%|██████████| 7/7 [00:16<00:00,  2.37s/it][A
Training Subfolders:  12%|█▎        | 1/8 [00:16<01:55, 16.49s/it]
Files in sub08:   0%|          | 0/7 [00:00<?, ?it/s][A
Files in sub08:  14%|█▍        | 1/7 [00:00<00:03,  1.94it/s][A
Files in sub08:  29%|██▊       | 2/7 [00:00<00:02,  2.12it/s][A
Files in sub08:  43%|████▎     | 3/7 [00:01<00:02,  1.38it/s][A
Files in sub08:  57%|█████▋    | 4/7 [00:02<00:02,  1.23it/s][A
Files in sub08:  71%|███████▏  | 5/7 [00:03<0

Processing testing data...


Testing Subfolders:   0%|          | 0/1 [00:00<?, ?it/s]
Files in sub05:   0%|          | 0/7 [00:00<?, ?it/s][A
Files in sub05:  14%|█▍        | 1/7 [00:02<00:15,  2.64s/it][A
Files in sub05:  29%|██▊       | 2/7 [00:05<00:12,  2.52s/it][A
Files in sub05:  43%|████▎     | 3/7 [00:07<00:09,  2.42s/it][A
Files in sub05:  57%|█████▋    | 4/7 [00:10<00:07,  2.55s/it][A
Files in sub05:  71%|███████▏  | 5/7 [00:11<00:04,  2.16s/it][A
Files in sub05:  86%|████████▌ | 6/7 [00:13<00:02,  2.22s/it][A
Files in sub05: 100%|██████████| 7/7 [00:17<00:00,  2.50s/it][A
                                                                 

In [5]:
# 获取所有唯一的 tokenid 并创建映射
unique_tokenids = np.unique(tokenids)  # 获取所有唯一的 tokenid 值
print(f"Total unique tokenids: {len(unique_tokenids)}")

id_to_class = {tokenid: idx for idx, tokenid in enumerate(unique_tokenids)}  # 原始值 -> 类别索引
class_to_id = {idx: tokenid for tokenid, idx in id_to_class.items()}  # 类别索引 -> 原始值

Total unique tokenids: 1456


In [7]:

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
from tqdm import tqdm
import matplotlib.pyplot as plt
import pickle

# 定义 MLP 模型
class LatentToTokenidModel(nn.Module):
    def __init__(self, input_dim=64, hidden_dims=[512, 1024, 2048], output_dim=1456):
        super(LatentToTokenidModel, self).__init__()
        layers = []
        prev_dim = input_dim
        for hidden_dim in hidden_dims:
            layers.append(nn.Linear(prev_dim, hidden_dim))
            layers.append(nn.BatchNorm1d(hidden_dim))
            layers.append(nn.ReLU())
            layers.append(nn.Dropout(0.3))  # 防止过拟合
            prev_dim = hidden_dim
        layers.append(nn.Linear(prev_dim, output_dim))  # 输出层，维度为类别数
        self.model = nn.Sequential(*layers)
    
    def forward(self, x):
        return self.model(x)

# 更新 all_data 中的 tokenid 为映射后的类别索引
for split_data in all_data:
    split_data["train_tokenids"] = np.array([id_to_class[tokenid] for tokenid in split_data["train_tokenids"]], dtype=np.int64)
    split_data["test_tokenids"] = np.array([id_to_class[tokenid] for tokenid in split_data["test_tokenids"]], dtype=np.int64)

KeyError: np.int64(1)

In [13]:


# 模型、损失函数和优化器设置
input_dim = 64  # 输入维度
hidden_dims = [512, 1024, 2048]
output_dim = 1456  # 输出类别数量

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)

# 训练、验证和测试
num_epochs = 200
batch_size = 2048

for split_idx, split_data in enumerate(all_data):
    print(f"\nProcessing Split {split_idx + 1}...")

    # 从 all_data 中获取训练集、验证集和测试集
    train_latents = split_data["train_latents"]
    train_tokenids = split_data["train_tokenids"]
    test_latents = split_data["test_latents"]
    test_tokenids = split_data["test_tokenids"]

    # 划分验证集（从训练集中划分 10%）
    val_size = int(0.1 * len(train_latents))
    val_latents = train_latents[:val_size]
    val_tokenids = train_tokenids[:val_size]
    train_latents = train_latents[val_size:]
    train_tokenids = train_tokenids[val_size:]

    # 转换为 PyTorch 张量
    train_dataset = TensorDataset(torch.tensor(train_latents, dtype=torch.float32),
                                   torch.tensor(train_tokenids, dtype=torch.long))
    val_dataset = TensorDataset(torch.tensor(val_latents, dtype=torch.float32),
                                 torch.tensor(val_tokenids, dtype=torch.long))
    test_dataset = TensorDataset(torch.tensor(test_latents, dtype=torch.float32),
                                  torch.tensor(test_tokenids, dtype=torch.long))

    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)
    test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

    # 初始化模型、损失函数和优化器
    model = LatentToTokenidModel(input_dim=input_dim, hidden_dims=hidden_dims, output_dim=output_dim).to(device)
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(model.parameters(), lr=1e-3)

    # 初始化绘图
    train_losses = []
    val_losses = []
    plt.figure(figsize=(10, 6))
    plt.title(f"Loss Curve for Split {split_idx + 1}")
    plt.xlabel("Epoch")
    plt.ylabel("Loss")
    plt.ion()  # 开启交互模式

    # 训练模型
    for epoch in range(num_epochs):
        model.train()
        running_loss = 0.0
        correct_train = 0
        total_train = 0
        for X_batch, y_batch in tqdm(train_loader, desc=f"Epoch {epoch+1}/{num_epochs} (Train)", leave=False):
            X_batch, y_batch = X_batch.to(device), y_batch.to(device)

            # 前向传播
            outputs = model(X_batch)
            loss = criterion(outputs, y_batch)

            # 反向传播和优化
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            # 统计训练损失
            running_loss += loss.item()

        train_loss = running_loss / len(train_loader)
        train_losses.append(train_loss)

        # 验证模型
        model.eval()
        val_loss = 0.0
        with torch.no_grad():
            for X_batch, y_batch in val_loader:
                X_batch, y_batch = X_batch.to(device), y_batch.to(device)
                outputs = model(X_batch)
                loss = criterion(outputs, y_batch)
                val_loss += loss.item()

        val_loss /= len(val_loader)
        val_losses.append(val_loss)

        # 更新实时绘图
        plt.clf()  # 清除当前图像
        plt.plot(range(1, len(train_losses) + 1), train_losses, label="Train Loss", marker='o')
        plt.plot(range(1, len(val_losses) + 1), val_losses, label="Validation Loss", marker='o')
        plt.legend()
        plt.pause(0.1)  # 暂停以更新图像

        print(f"Epoch {epoch+1}/{num_epochs} - Train Loss: {train_loss:.4f}, Train Accuracy: {train_accuracy:.4f}, "
              f"Val Loss: {val_loss:.4f}, Val Accuracy: {val_accuracy:.4f}")

        # 保存模型
        torch.save(model.state_dict(), f"model_split_{split_idx + 1}_epoch_{epoch + 1}.pth")

    # 测试模型
    model.eval()
    correct = 0
    total = 0
    with torch.no_grad():
        for X_batch, y_batch in test_loader:
            X_batch, y_batch = X_batch.to(device), y_batch.to(device)
            outputs = model(X_batch)
            _, predicted = torch.max(outputs, 1)
            total += y_batch.size(0)
            correct += (predicted == y_batch).sum().item()

    test_accuracy = correct / total
    print(f"Test Accuracy for Split {split_idx + 1}: {test_accuracy:.4f}")

    # 保存最终的 Loss 曲线
    plt.ioff()  # 关闭交互模式
    plt.savefig(f"loss_curve_split_{split_idx + 1}.png")
    plt.close()


KeyError: np.int64(1)

In [9]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
from tqdm import tqdm
import matplotlib.pyplot as plt
import pickle
import numpy as np

# 定义 MLP 模型
class LatentToTokenidModel(nn.Module):
    def __init__(self, input_dim=64, hidden_dims=[512, 1024, 2048], output_dim=1456):
        super(LatentToTokenidModel, self).__init__()
        layers = []
        prev_dim = input_dim
        for hidden_dim in hidden_dims:
            layers.append(nn.Linear(prev_dim, hidden_dim))
            layers.append(nn.BatchNorm1d(hidden_dim))
            layers.append(nn.ReLU())
            layers.append(nn.Dropout(0.3))  # 防止过拟合
            prev_dim = hidden_dim
        layers.append(nn.Linear(prev_dim, output_dim))  # 输出层，维度为类别数
        self.model = nn.Sequential(*layers)
    
    def forward(self, x):
        return self.model(x)

save_dir = "./parameter"
os.makedirs(save_dir, exist_ok=True)


# 模型、损失函数和优化器设置
input_dim = 64  # 输入维度
hidden_dims = [512, 1024, 2048]
output_dim = 1456  # 输出类别数量

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)

# 训练、验证和测试
num_epochs = 200
batch_size = 2048

for split_idx, split_data in enumerate(all_data):
    print(f"\nProcessing Split {split_idx + 1}...")

    # 从 all_data 中获取训练集、验证集和测试集
    train_latents = split_data["train_latents"]
    train_tokenids = split_data["train_tokenids"]
    test_latents = split_data["test_latents"]
    test_tokenids = split_data["test_tokenids"]

    # 划分验证集（从训练集中划分 10%）
    val_size = int(0.1 * len(train_latents))
    val_latents = train_latents[:val_size]
    val_tokenids = train_tokenids[:val_size]
    train_latents = train_latents[val_size:]
    train_tokenids = train_tokenids[val_size:]

    # 转换为 PyTorch 张量
    train_dataset = TensorDataset(torch.tensor(train_latents, dtype=torch.float32),
                                   torch.tensor(train_tokenids, dtype=torch.long))
    val_dataset = TensorDataset(torch.tensor(val_latents, dtype=torch.float32),
                                 torch.tensor(val_tokenids, dtype=torch.long))
    test_dataset = TensorDataset(torch.tensor(test_latents, dtype=torch.float32),
                                  torch.tensor(test_tokenids, dtype=torch.long))

    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)
    test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

    # 初始化模型、损失函数和优化器
    model = LatentToTokenidModel(input_dim=input_dim, hidden_dims=hidden_dims, output_dim=output_dim).to(device)
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(model.parameters(), lr=1e-3)

cpu

Processing Split 1...

Processing Split 2...

Processing Split 3...


In [None]:

    # 初始化绘图
    train_losses = []
    val_losses = []
    plt.figure(figsize=(10, 6))
    plt.title(f"Loss Curve for Split {split_idx + 1}")
    plt.xlabel("Epoch")
    plt.ylabel("Loss")
    plt.ion()  # 开启交互模式

    # 训练模型
    for epoch in range(num_epochs):
        model.train()
        running_loss = 0.0
        correct_train = 0
        total_train = 0
        for X_batch, y_batch in tqdm(train_loader, desc=f"Epoch {epoch+1}/{num_epochs} (Train)", leave=False):
            X_batch, y_batch = X_batch.to(device), y_batch.to(device)

            # 前向传播
            outputs = model(X_batch)
            loss = criterion(outputs, y_batch)

            # 反向传播和优化
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            # 统计训练损失和准确率
            running_loss += loss.item()
            _, predicted = torch.max(outputs, 1)
            total_train += y_batch.size(0)
            correct_train += (predicted == y_batch).sum().item()

        train_loss = running_loss / len(train_loader)
        train_accuracy = correct_train / total_train
        train_losses.append(train_loss)

        # 验证模型
        model.eval()
        val_loss = 0.0
        correct_val = 0
        total_val = 0
        with torch.no_grad():
            for X_batch, y_batch in val_loader:
                X_batch, y_batch = X_batch.to(device), y_batch.to(device)
                outputs = model(X_batch)
                loss = criterion(outputs, y_batch)
                val_loss += loss.item()
                _, predicted = torch.max(outputs, 1)
                total_val += y_batch.size(0)
                correct_val += (predicted == y_batch).sum().item()

        val_loss /= len(val_loader)
        val_accuracy = correct_val / total_val
        val_losses.append(val_loss)

        # 更新实时绘图
        plt.clf()  # 清除当前图像
        plt.plot(range(1, len(train_losses) + 1), train_losses, label="Train Loss", marker='o')
        plt.plot(range(1, len(val_losses) + 1), val_losses, label="Validation Loss", marker='o')
        plt.legend()
        plt.pause(0.1)  # 暂停以更新图像

        print(f"Epoch {epoch+1}/{num_epochs} - Train Loss: {train_loss:.4f}, Train Accuracy: {train_accuracy:.4f}, "
              f"Val Loss: {val_loss:.4f}, Val Accuracy: {val_accuracy:.4f}")

        # 保存模型
        model_save_path = os.path.join(save_dir, f"model_split_{split_idx + 1}_epoch_{epoch + 1}.pth")
        torch.save(model.state_dict(), model_save_path)

    # 测试模型
    model.eval()
    correct_test = 0
    total_test = 0
    with torch.no_grad():
        for X_batch, y_batch in test_loader:
            X_batch, y_batch = X_batch.to(device), y_batch.to(device)
            outputs = model(X_batch)
            _, predicted = torch.max(outputs, 1)
            total_test += y_batch.size(0)
            correct_test += (predicted == y_batch).sum().item()

    test_accuracy = correct_test / total_test
    print(f"Test Accuracy for Split {split_idx + 1}: {test_accuracy:.4f}")

    # 保存最终的 Loss 曲线
    plt.ioff()  # 关闭交互模式
    plt.savefig(f"loss_curve_split_{split_idx + 1}.png")
    plt.close()


In [10]:
# 加载第10轮模型并进行测试
split_idx = 1  # 假设我们要测试第一个数据分组（Split 1）
epoch_to_test = 10  # 要测试的轮次

# 构造模型保存路径
model_path = f"./parameter/model_split_{split_idx}_epoch_{epoch_to_test}.pth"

# 初始化模型
model = LatentToTokenidModel(input_dim=input_dim, hidden_dims=hidden_dims, output_dim=output_dim).to(device)

# 加载模型权重
model.load_state_dict(torch.load(model_path, map_location=device))
print(f"Loaded model from {model_path}")

# 从 all_data 中获取测试集
split_data = all_data[split_idx - 1]  # split_idx 从 1 开始，数组索引从 0 开始
test_latents = split_data["test_latents"]
test_tokenids = split_data["test_tokenids"]

# 转换为 PyTorch 张量
test_dataset = TensorDataset(torch.tensor(test_latents, dtype=torch.float32),
                              torch.tensor(test_tokenids, dtype=torch.long))
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

# 测试模型
model.eval()
correct_test = 0
total_test = 0
with torch.no_grad():
    for X_batch, y_batch in test_loader:
        X_batch, y_batch = X_batch.to(device), y_batch.to(device)
        outputs = model(X_batch)
        _, predicted = torch.max(outputs, 1)
        total_test += y_batch.size(0)
        correct_test += (predicted == y_batch).sum().item()

test_accuracy = correct_test / total_test
print(f"Test Accuracy for Split {split_idx} (Epoch {epoch_to_test}): {test_accuracy:.4f}")



  model.load_state_dict(torch.load(model_path, map_location=device))


Loaded model from ./parameter/model_split_1_epoch_10.pth
Test Accuracy for Split 1 (Epoch 10): 0.0331
