In [5]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset
import pandas as pd

In [2]:
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

In [3]:
device

device(type='cuda', index=0)

In [4]:
def process_text(text, target_length=1057):
    numbers = list(map(int, text.split()))
    if len(numbers) < target_length:
        numbers.extend([0] * (target_length - len(numbers)))  # 填充0
    elif len(numbers) > target_length:
        numbers = numbers[:target_length]  # 截断
    return numbers

In [4]:
train = pd.read_csv('/root/data/train_set.csv',sep='\t') 

In [3]:
# 1. 数据预处理，转换成嵌入层能够处理的格式
# 1.1 text转换成张量
text = train['text'].tolist()

text_numbers = [process_text(item) for item in text]
text_tensor = torch.tensor(text_numbers, dtype=torch.long)

NameError: name 'train' is not defined

In [6]:
text_tensor.to(device)

tensor([[2967, 6758,  339,  ..., 4741,  900, 1635],
        [4464,  486, 6352,  ...,    0,    0,    0],
        [7346, 4068, 5074,  ...,    0,    0,    0],
        ...,
        [6811, 1580, 7539,  ..., 7261, 2255, 5997],
        [6405, 3203, 6644,  ...,    0,    0,    0],
        [4350, 3878, 3268,  ..., 5028, 4939, 4109]], device='cuda:0')

In [6]:
## 1.2 转化成dataloader
class LabelDataset(Dataset):
    def __init__(self, label_sequences, multi_class_labels):
        self.label_sequences = [torch.tensor(seq, dtype=torch.long) for seq in label_sequences]
        self.multi_class_labels = multi_class_labels
 
    def __len__(self):
        return len(self.label_sequences)
 
    def __getitem__(self, idx):
        return self.label_sequences[idx], self.multi_class_labels[idx]

In [8]:
# 创建数据集和数据加载器
dataset = LabelDataset(text_tensor, train['label'])
dataloader = DataLoader(dataset, batch_size=2, shuffle=True)

  self.label_sequences = [torch.tensor(seq, dtype=torch.long) for seq in label_sequences]


In [7]:
# 2. 定义模型
class LSTMClassifier(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim, num_classes):
        super(LSTMClassifier, self).__init__()
        self.embedding = nn.Embedding(num_embeddings=vocab_size, embedding_dim=embedding_dim)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, batch_first=True)
        self.fc = nn.Linear(hidden_dim, num_classes)

    def forward(self, x):
        embedded = self.embedding(x)  # 嵌入层
        lstm_out, _ = self.lstm(embedded)  # LSTM 层
        # 取 LSTM 的最后一个时间步的输出
        last_output = lstm_out[:, -1, :]
        output = self.fc(last_output)  # 全连接层
        return output

In [9]:
# 参数设置
vocab_size = 10000
embedding_dim = 128
hidden_dim = 20
num_classes = 14

In [None]:
# 实例化模型
model = LSTMClassifier(vocab_size, embedding_dim, hidden_dim, num_classes)
model.to(device)

In [11]:
# 3. 模型训练
# 损失函数和优化器
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

In [14]:
# 训练循环
num_epochs = 5
for epoch in range(num_epochs):
    for label_seq, label in dataloader:
        # 前向传播
        outputs = model(label_seq.to(device))
        loss = criterion(outputs, label.to(device))

        # 反向传播和优化
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    print(f'Epoch [{epoch+1}/{num_epochs}], Loss: {loss.item():.4f}')

Epoch [1/5], Loss: 0.4295
Epoch [2/5], Loss: 0.0144
Epoch [3/5], Loss: 0.3233
Epoch [4/5], Loss: 0.1459
Epoch [5/5], Loss: 0.0215


In [10]:
# 4. 模型存储与加载
## 4.1 模型存储
# torch.save(model.state_dict(), 'lstm_classifier.pth')

## 4.2 模型加载
# 定义模型结构
model = LSTMClassifier(vocab_size, embedding_dim, hidden_dim, num_classes)
# 加载模型状态字典
model.load_state_dict(torch.load('lstm_classifier.pth'))
# 将模型移动到 GPU（如果可用）
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

LSTMClassifier(
  (embedding): Embedding(10000, 128)
  (lstm): LSTM(128, 20, batch_first=True)
  (fc): Linear(in_features=20, out_features=14, bias=True)
)

In [11]:
# 5. 模型预测
test = pd.read_csv('/root/data/test_a.csv', sep = '\t')

## 5.1 对test数据集进行预处理
test_text = test['text'].tolist()
test_numbers = [process_text(item) for item in test_text]
test_tensor = torch.tensor(test_numbers, dtype=torch.long)
test_tensor.to(device)

tensor([[5399, 3117, 1070,  ...,    0,    0,    0],
        [2491, 4109, 1757,  ..., 5681,  900, 1635],
        [2673, 5076, 6835,  ...,    0,    0,    0],
        ...,
        [5338, 1952, 3117,  ...,    0,    0,    0],
        [ 893, 3469, 5775,  ...,    0,    0,    0],
        [2400, 4409, 4412,  ...,    0,    0,    0]], device='cuda:0')

In [None]:
## 5.2 记载到dotaloader里面

In [12]:
with torch.no_grad():  # 禁用梯度计算，减少内存消耗
    outputs = model(test_tensor.to(device))

OutOfMemoryError: CUDA out of memory. Tried to allocate 25.20 GiB. GPU 0 has a total capacity of 31.50 GiB of which 5.20 GiB is free. Including non-PyTorch memory, this process has 0 bytes memory in use. Of the allocated memory 26.00 GiB is allocated by PyTorch, and 9.44 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)

In [None]:
pred

In [None]:
## 5.2 结果保存
test['label'] = pred
test['label'].to_csv("/root/data/countVectorRidgeclassification.csv", index=False)  