# LSTM网络情感二分类

In [1]:
import torch
from torch import nn
import tools
import pandas
import data_process


## 一、定义网络

自己写的embedding+lstm的网络，效果较差

In [2]:
class lstm(nn.Module):
    def __init__(self,num_embeddings,embedding_dim,hidden_size,device,num_layers=1,*args, **kwargs):
        super().__init__(*args, **kwargs)
        self.embedding = nn.Embedding(num_embeddings,embedding_dim)
        self.lstm = nn.LSTM(embedding_dim,hidden_size,num_layers)
        self.ff = nn.Linear(hidden_size,2)
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.device = device
    def forward(self,X):
        h0 = torch.randn(self.num_layers,X.shape[0],self.hidden_size).to(self.device)
        c0 = torch.randn(self.num_layers,X.shape[0],self.hidden_size).to(self.device)
        X = self.embedding(X)
        X = X.transpose(0,1)
        output,_ = self.lstm(X,(h0,c0))
        return self.ff(output[-1])
        
        

改进的lstm，加了四个地方:
- lstm改成双向
- lstm加了dropout层
- 隐藏状态初始化为0
- 增加最大池化

In [3]:
class LSTM(nn.Module):
    def __init__(self, num_embeddings, embedding_dim, hidden_size, device, num_layers=1, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.embedding = nn.Embedding(num_embeddings, embedding_dim)
        self.lstm = nn.LSTM(embedding_dim, hidden_size, num_layers, bidirectional=True, dropout=0.5, batch_first=True)#增加了droput层和双向
        self.ff = nn.Linear(2 * hidden_size, 2)  # 双向LSTM的输出需要乘以2
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.device = device

    def forward(self, X):
        h0 = torch.zeros(self.num_layers * 2, X.shape[0], self.hidden_size).to(self.device)  # 隐藏状态初始化为0
        c0 = torch.zeros(self.num_layers * 2, X.shape[0], self.hidden_size).to(self.device)
        
        X = self.embedding(X)
        output, (hn, cn) = self.lstm(X, (h0, c0))

        # 使用最后一个时间步的输出进行分类
        output_pooled = torch.max(output, dim=1)[0]  # 这里使用最大池化
        return self.ff(output_pooled)


对四种优化进行单独测试

In [109]:
class LSTM_Bidirectional(nn.Module):
    def __init__(self, num_embeddings, embedding_dim, hidden_size, device, num_layers=1, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.embedding = nn.Embedding(num_embeddings, embedding_dim)
        
        # 双向LSTM
        self.lstm = nn.LSTM(
            embedding_dim, 
            hidden_size, 
            num_layers, 
            bidirectional=True,  # 双向
            batch_first=True  # 保证 batch_size 是第一维度
        )
        
        # 双向LSTM输出的维度是 2 * hidden_size，因此全连接层输入为 2 * hidden_size
        self.ff = nn.Linear(2 * hidden_size, 2)  # 输出 2 类
        self.device = device
        self.num_layers = num_layers
        self.hidden_size = hidden_size

    def forward(self, X):
        batch_size = X.shape[0]  # 获取 batch_size
        h0 = torch.randn(self.num_layers * 2, batch_size, self.hidden_size).to(self.device)  # 双向LSTM需要乘以2
        c0 = torch.randn(self.num_layers * 2, batch_size, self.hidden_size).to(self.device)
        
        X = self.embedding(X)
        output, _ = self.lstm(X, (h0, c0))
        
        # 使用 LSTM 输出的最后一个时间步的隐藏状态
        last_hidden_state = output[:, -1, :]  # 获取最后一个时间步的输出
        
        return self.ff(last_hidden_state)


In [110]:
class LSTM_Dropout(nn.Module):
    def __init__(self, num_embeddings, embedding_dim, hidden_size, device, num_layers=1, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.embedding = nn.Embedding(num_embeddings, embedding_dim)
        
        # 添加Dropout层
        self.lstm = nn.LSTM(
            embedding_dim, 
            hidden_size, 
            num_layers, 
            dropout=0.5,  # Dropout层
            batch_first=True
        )
        
        self.ff = nn.Linear(hidden_size, 2)  # 假设是二分类任务
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.device = device

    def forward(self, X):
        batch_size = X.shape[0]  # 获取 batch_size
        h0 = torch.randn(self.num_layers, batch_size, self.hidden_size).to(self.device)  # 隐藏状态初始化为0
        c0 = torch.randn(self.num_layers, batch_size, self.hidden_size).to(self.device)  # 细胞状态初始化为0
        
        X = self.embedding(X)  # 将X映射到embedding空间
        
        output, (hn, cn) = self.lstm(X, (h0, c0))  # 获取LSTM输出和最后的h0, c0

        # 使用LSTM输出的最后一个时间步的隐藏状态进行分类
        last_hidden_state = output[:, -1, :]  # 选择最后一个时间步的输出（适用于单向LSTM）
        
        return self.ff(last_hidden_state)  # 使用全连接层进行分类


In [111]:
class LSTM_ZeroHiddenState(nn.Module):
    def __init__(self, num_embeddings, embedding_dim, hidden_size, device, num_layers=1, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.embedding = nn.Embedding(num_embeddings, embedding_dim)
        
        self.lstm = nn.LSTM(
            embedding_dim, 
            hidden_size, 
            num_layers, 
            batch_first=True
        )
        
        self.ff = nn.Linear(hidden_size, 2)  # 假设是二分类问题
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.device = device

    def forward(self, X):
        batch_size = X.shape[0]  # 获取 batch_size
        # 将隐藏状态初始化为0
        h0 = torch.zeros(self.num_layers, batch_size, self.hidden_size).to(self.device)
        c0 = torch.zeros(self.num_layers, batch_size, self.hidden_size).to(self.device)
        
        # 获取嵌入层的输出
        X = self.embedding(X)
        
        # LSTM 层的输出
        output, _ = self.lstm(X, (h0, c0))
        
        # 取最后一个时间步的输出（对于 batch_first=True, 是 X.shape[0] 个序列的最后一个时间步）
        # 这里的 `output` 形状是 (batch_size, seq_len, hidden_size)
        last_hidden_state = output[:, -1, :]  # 获取每个序列的最后一个时间步的隐藏状态
        
        return self.ff(last_hidden_state)  # 使用全连接层进行分类


In [112]:
class LSTM_MaxPooling(nn.Module):
    def __init__(self, num_embeddings, embedding_dim, hidden_size, device, num_layers=1, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.embedding = nn.Embedding(num_embeddings, embedding_dim)
        
        self.lstm = nn.LSTM(
            embedding_dim, 
            hidden_size, 
            num_layers, 
            batch_first=True
        )
        
        self.ff = nn.Linear(hidden_size, 2)  # 假设是二分类问题
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.device = device

    def forward(self, X):
        batch_size = X.shape[0]  # 获取 batch_size
        
        # 将隐藏状态初始化为零
        h0 = torch.randn(self.num_layers, batch_size, self.hidden_size).to(self.device)
        c0 = torch.randn(self.num_layers, batch_size, self.hidden_size).to(self.device)
        
        # 通过嵌入层得到输入
        X = self.embedding(X)
        
        # LSTM 输出
        output, _ = self.lstm(X, (h0, c0))
        
        # 使用最大池化操作，在时间维度（dim=1）上进行池化
        output_pooled = torch.max(output, dim=1)[0]  # [0] 返回最大值
        
        # 通过全连接层输出分类结果
        return self.ff(output_pooled)


## 二、包装数据

In [113]:
data  = pandas.read_csv('./motionClassify.csv')
vocab = data_process.gen_vocab(data)
data_train  =  data_process.gen_dataset(data[:40000],vocab)
data_test = data_process.gen_dataset(data[40000:],vocab)
Batch_size = 64
train_iter = torch.utils.data.DataLoader(data_train,Batch_size,shuffle=True)
test_iter = torch.utils.data.DataLoader(data_test,Batch_size,shuffle=True)

## 三、训练参数设置

In [114]:


lr = 0.1
criterion = torch.nn.CrossEntropyLoss()
device = torch.device('cpu' if not torch.cuda.is_available() else 'cuda:0')
net1 = lstm(num_embeddings=len(vocab),embedding_dim=256,hidden_size=256,device=device)
optimizer1 = torch.optim.SGD(net1.parameters(),lr)
net2 = LSTM(num_embeddings=len(vocab),embedding_dim=256,hidden_size=256,device=device)
optimizer2 = torch.optim.SGD(net2.parameters(),lr)
net3=LSTM_Bidirectional(num_embeddings=len(vocab),embedding_dim=256,hidden_size=256,device=device)
optimizer3 = torch.optim.SGD(net3.parameters(),lr)
net4=LSTM_Dropout(num_embeddings=len(vocab),embedding_dim=256,hidden_size=256,device=device)
optimizer4 = torch.optim.SGD(net4.parameters(),lr)
net5=LSTM_ZeroHiddenState(num_embeddings=len(vocab),embedding_dim=256,hidden_size=256,device=device)
optimizer5 = torch.optim.SGD(net5.parameters(),lr)
net6=LSTM_MaxPooling(num_embeddings=len(vocab),embedding_dim=256,hidden_size=256,device=device)
optimizer6 = torch.optim.SGD(net6.parameters(),lr)



## 四、训练和测试

第一个网络训练和测试的结果

In [115]:
tools.train(net1,train_iter,device,optimizer1,criterion)

  0%|          | 0/625 [00:00<?, ?it/s]

 16%|█▋        | 103/625 [00:03<00:18, 28.26it/s]

batch100,loss = 0.6900129318237305


 32%|███▏      | 203/625 [00:06<00:15, 28.12it/s]

batch200,loss = 0.6934603452682495


 48%|████▊     | 303/625 [00:10<00:10, 29.31it/s]

batch300,loss = 0.6932564377784729


 65%|██████▍   | 405/625 [00:13<00:07, 28.23it/s]

batch400,loss = 0.691300094127655


 81%|████████  | 505/625 [00:17<00:03, 32.20it/s]

batch500,loss = 0.6803222298622131


 96%|█████████▋| 603/625 [00:20<00:00, 27.01it/s]

batch600,loss = 0.6966476440429688


100%|██████████| 625/625 [00:21<00:00, 29.00it/s]


In [116]:
tools.test(net1,test_iter,device)

100%|██████████| 157/157 [00:02<00:00, 65.04it/s]


accuracy = 0.5006999969482422


准确率50%，接近自然概率，训练没有效果

第二个网络训练以及测试的结果

In [117]:
tools.train(net2,train_iter,device,optimizer2,criterion)

  0%|          | 0/625 [00:00<?, ?it/s]

 17%|█▋        | 105/625 [00:03<00:18, 28.29it/s]

batch100,loss = 0.9045735597610474


 33%|███▎      | 206/625 [00:07<00:15, 27.52it/s]

batch200,loss = 0.6236743330955505


 49%|████▊     | 304/625 [00:11<00:12, 25.36it/s]

batch300,loss = 0.5665561556816101


 64%|██████▍   | 403/625 [00:15<00:08, 26.84it/s]

batch400,loss = 0.8793125748634338


 81%|████████  | 504/625 [00:19<00:04, 25.97it/s]

batch500,loss = 0.3266298770904541


 96%|█████████▋| 602/625 [00:23<00:00, 24.79it/s]

batch600,loss = 0.48741859197616577


100%|██████████| 625/625 [00:24<00:00, 25.70it/s]


In [118]:
tools.test(net2,test_iter,device)

100%|██████████| 157/157 [00:02<00:00, 60.28it/s]

accuracy = 0.8192999958992004





仅双向网络训练以及测试的结果

In [119]:
tools.train(net3,train_iter,device,optimizer3,criterion)

  0%|          | 0/625 [00:00<?, ?it/s]

 16%|█▋        | 102/625 [00:04<00:21, 24.30it/s]

batch100,loss = 0.6927682757377625


 32%|███▏      | 203/625 [00:08<00:17, 23.96it/s]

batch200,loss = 0.6973859667778015


 49%|████▊     | 304/625 [00:11<00:13, 23.90it/s]

batch300,loss = 0.688214123249054


 65%|██████▍   | 404/625 [00:15<00:08, 24.97it/s]

batch400,loss = 0.7006278038024902


 81%|████████  | 505/625 [00:19<00:04, 26.56it/s]

batch500,loss = 0.7037380337715149


 96%|█████████▋| 603/625 [00:23<00:00, 26.11it/s]

batch600,loss = 0.6939540505409241


100%|██████████| 625/625 [00:24<00:00, 25.47it/s]


In [120]:
tools.test(net3,test_iter,device)

100%|██████████| 157/157 [00:02<00:00, 60.98it/s]

accuracy = 0.49289998412132263





正确率50%，无效


仅添加dropout层网络训练以及测试的结果

In [121]:
tools.train(net4,train_iter,device,optimizer4,criterion)

  0%|          | 0/625 [00:00<?, ?it/s]

 17%|█▋        | 104/625 [00:03<00:17, 28.96it/s]

batch100,loss = 0.7074372172355652


 33%|███▎      | 205/625 [00:07<00:15, 27.69it/s]

batch200,loss = 0.688654899597168


 49%|████▉     | 305/625 [00:10<00:11, 28.26it/s]

batch300,loss = 0.6902520656585693


 65%|██████▍   | 405/625 [00:13<00:07, 30.56it/s]

batch400,loss = 0.6955899596214294


 80%|████████  | 503/625 [00:17<00:04, 29.51it/s]

batch500,loss = 0.6946659088134766


 96%|█████████▋| 603/625 [00:20<00:00, 29.44it/s]

batch600,loss = 0.6857205033302307


100%|██████████| 625/625 [00:21<00:00, 29.25it/s]


In [122]:
tools.test(net4,test_iter,device)

100%|██████████| 157/157 [00:02<00:00, 65.82it/s]

accuracy = 0.4992999732494354





正确率50%，无效

仅隐藏状态初始化为0网络训练以及测试的结果

In [123]:
tools.train(net5,train_iter,device,optimizer5,criterion)

  0%|          | 0/625 [00:00<?, ?it/s]

 16%|█▋        | 103/625 [00:03<00:18, 27.87it/s]

batch100,loss = 0.7070851922035217


 33%|███▎      | 206/625 [00:06<00:13, 31.03it/s]

batch200,loss = 0.696898877620697


 48%|████▊     | 303/625 [00:10<00:11, 27.32it/s]

batch300,loss = 0.6916971802711487


 64%|██████▍   | 403/625 [00:13<00:07, 28.23it/s]

batch400,loss = 0.6911929249763489


 81%|████████  | 505/625 [00:17<00:04, 28.74it/s]

batch500,loss = 0.6963237524032593


 97%|█████████▋| 604/625 [00:20<00:00, 29.32it/s]

batch600,loss = 0.6935922503471375


100%|██████████| 625/625 [00:21<00:00, 29.51it/s]


In [124]:
tools.test(net5,test_iter,device)

100%|██████████| 157/157 [00:02<00:00, 63.19it/s]

accuracy = 0.5006999969482422





正确率50%，无效

仅增加最大池化网络训练以及测试的结果

In [125]:
tools.train(net6,train_iter,device,optimizer6,criterion)

  0%|          | 0/625 [00:00<?, ?it/s]

 16%|█▋        | 103/625 [00:03<00:19, 27.22it/s]

batch100,loss = 0.7003673315048218


 33%|███▎      | 205/625 [00:07<00:14, 28.53it/s]

batch200,loss = 0.7299268245697021


 49%|████▉     | 305/625 [00:10<00:10, 29.74it/s]

batch300,loss = 0.6113128662109375


 64%|██████▍   | 403/625 [00:13<00:07, 28.76it/s]

batch400,loss = 0.5593522787094116


 81%|████████  | 505/625 [00:17<00:03, 31.66it/s]

batch500,loss = 0.39529335498809814


 96%|█████████▋| 603/625 [00:20<00:00, 28.45it/s]

batch600,loss = 0.441714346408844


100%|██████████| 625/625 [00:21<00:00, 29.19it/s]


In [126]:
tools.test(net6,test_iter,device)

100%|██████████| 157/157 [00:02<00:00, 60.92it/s]


accuracy = 0.7549999952316284


正确率75%,大大提高

综上可见，优化中起主要作用的为增加最大池化

改进：可以增加训练的轮数(epoch)
可以做的任务：
- 可以把第一个网络一步步修改成第二个，看看到底是哪一个改进起了作用，实验发现仅仅改成双向网络效果还是差
- 可以在test函数里加入更多的benchmark，例如召回率（recall），F1-score，等等，见[春招算法题](./2024春招算法题.pdf)
- 对于第二个网络，可以通过增加训练轮数,改变学习率(lr)、词嵌入维度(embedding_size)、lstm的隐藏层神经元个数(hidden_size),优化器(optimizer)的种类等等参数