# LSTM网络情感二分类

In [1]:
import torch
from torch import nn
import tools
import pandas
import data_process


## 一、定义网络

自己写的embedding+lstm的网络，效果较差

In [2]:
class lstm(nn.Module):
    def __init__(self,num_embeddings,embedding_dim,hidden_size,device,num_layers=1,*args, **kwargs):
        super().__init__(*args, **kwargs)
        self.embedding = nn.Embedding(num_embeddings,embedding_dim)
        self.lstm = nn.LSTM(embedding_dim,hidden_size,num_layers)
        self.ff = nn.Linear(hidden_size,2)
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.device = device
    def forward(self,X):
        h0 = torch.randn(self.num_layers,X.shape[0],self.hidden_size).to(self.device)
        c0 = torch.randn(self.num_layers,X.shape[0],self.hidden_size).to(self.device)
        X = self.embedding(X)
        X = X.transpose(0,1)
        output,_ = self.lstm(X,(h0,c0))
        return self.ff(output[-1])
        
        

改进的lstm，加了四个地方:
- lstm改成双向
- lstm加了dropout层
- 隐藏状态初始化为0
- 增加最大池化

In [3]:
class LSTM(nn.Module):
    def __init__(self, num_embeddings, embedding_dim, hidden_size, device, num_layers=1, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.embedding = nn.Embedding(num_embeddings, embedding_dim)
        self.lstm = nn.LSTM(embedding_dim, hidden_size, num_layers, bidirectional=True, dropout=0.5, batch_first=True)#增加了droput层和双向
        self.ff = nn.Linear(2 * hidden_size, 2)  # 双向LSTM的输出需要乘以2
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.device = device

    def forward(self, X):
        h0 = torch.zeros(self.num_layers * 2, X.shape[0], self.hidden_size).to(self.device)  # 隐藏状态初始化为0
        c0 = torch.zeros(self.num_layers * 2, X.shape[0], self.hidden_size).to(self.device)
        
        X = self.embedding(X)
        output, (hn, cn) = self.lstm(X, (h0, c0))

        # 使用最后一个时间步的输出进行分类
        output_pooled = torch.max(output, dim=1)[0]  # 这里使用最大池化
        return self.ff(output_pooled)


In [4]:
class LSTM_Bidirectional(nn.Module):
    def __init__(self, num_embeddings, embedding_dim, hidden_size, device, num_layers=1, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.embedding = nn.Embedding(num_embeddings, embedding_dim)
        
        # 双向LSTM
        self.lstm = nn.LSTM(
            embedding_dim, 
            hidden_size, 
            num_layers, 
            bidirectional=True,  # 双向
            batch_first=True  # 保证 batch_size 是第一维度
        )
        
        # 双向LSTM输出的维度是 2 * hidden_size，因此全连接层输入为 2 * hidden_size
        self.ff = nn.Linear(2 * hidden_size, 2)  # 输出 2 类
        self.device = device
        self.num_layers = num_layers
        self.hidden_size = hidden_size

    def forward(self, X):
        batch_size = X.shape[0]  # 获取 batch_size
        h0 = torch.zeros(self.num_layers * 2, batch_size, self.hidden_size).to(self.device)  # 双向LSTM需要乘以2
        c0 = torch.zeros(self.num_layers * 2, batch_size, self.hidden_size).to(self.device)
        
        X = self.embedding(X)
        output, _ = self.lstm(X, (h0, c0))
        
        # 使用 LSTM 输出的最后一个时间步的隐藏状态
        last_hidden_state = output[:, -1, :]  # 获取最后一个时间步的输出
        
        return self.ff(last_hidden_state)


In [5]:
class LSTM_Dropout(nn.Module):
    def __init__(self, num_embeddings, embedding_dim, hidden_size, device, num_layers=1, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.embedding = nn.Embedding(num_embeddings, embedding_dim)
        
        # 添加Dropout层
        self.lstm = nn.LSTM(
            embedding_dim, 
            hidden_size, 
            num_layers, 
            dropout=0.5,  # Dropout层
            batch_first=True
        )
        
        self.ff = nn.Linear(hidden_size, 2)
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.device = device

    def forward(self, X):
        batch_size = X.shape[0]  # 获取 batch_size
        h0 = torch.randn(self.num_layers, X.shape[0], self.hidden_size).to(self.device)
        c0 = torch.randn(self.num_layers, X.shape[0], self.hidden_size).to(self.device)
        
        X = self.embedding(X)
        #X = X.transpose(0, 1) batch_first=true后就不用再把X转置 by zzy
        output, _ = self.lstm(X, (h0, c0))#因为batch_size在第一个维度，output 形状：(batch_size,num_step,num_hiddens) by zzy
        return self.ff(output[:,-1,:])#取最后一个num_step  by zzy


In [6]:
class LSTM_ZeroHiddenState(nn.Module):
    def __init__(self, num_embeddings, embedding_dim, hidden_size, device, num_layers=1, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.embedding = nn.Embedding(num_embeddings, embedding_dim)
        
        self.lstm = nn.LSTM(
            embedding_dim, 
            hidden_size, 
            num_layers, 
            batch_first=True
        )
        
        self.ff = nn.Linear(hidden_size, 2)
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.device = device

    def forward(self, X):
        batch_size = X.shape[0]  # 获取 batch_size
        # 将隐藏状态初始化为0
        h0 = torch.zeros(self.num_layers, X.shape[0], self.hidden_size).to(self.device)
        c0 = torch.zeros(self.num_layers, X.shape[0], self.hidden_size).to(self.device)
        
        X = self.embedding(X)
       # X = X.transpose(0, 1)同理不用转置 by zzy
        output, _ = self.lstm(X, (h0, c0))
        return self.ff(output[:,-1,:])# by zzy


In [7]:
class LSTM_MaxPooling(nn.Module):
    def __init__(self, num_embeddings, embedding_dim, hidden_size, device, num_layers=1, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.embedding = nn.Embedding(num_embeddings, embedding_dim)
        
        self.lstm = nn.LSTM(
            embedding_dim, 
            hidden_size, 
            num_layers, 
            batch_first=True
        )
        
        self.ff = nn.Linear(hidden_size, 2)
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.device = device

    def forward(self, X):
        batch_size = X.shape[0]  # 获取 batch_size
        h0 = torch.randn(self.num_layers, X.shape[0], self.hidden_size).to(self.device)
        c0 = torch.randn(self.num_layers, X.shape[0], self.hidden_size).to(self.device)
        
        X = self.embedding(X)
       # X = X.transpose(0, 1)
        output, _ = self.lstm(X, (h0, c0))
        
        # 使用最大池化
        output_pooled = torch.max(output, dim=1)[0]
        return self.ff(output_pooled)


## 二、包装数据

In [8]:
data  = pandas.read_csv('./motionClassify.csv')
vocab = data_process.gen_vocab(data)
data_train  =  data_process.gen_dataset(data[:40000],vocab)
data_test = data_process.gen_dataset(data[40000:],vocab)
Batch_size = 64
train_iter = torch.utils.data.DataLoader(data_train,Batch_size,shuffle=True)
test_iter = torch.utils.data.DataLoader(data_test,Batch_size,shuffle=True)

## 三、训练参数设置

In [9]:


lr = 0.1
criterion = torch.nn.CrossEntropyLoss()
device = torch.device('cpu' if not torch.cuda.is_available() else 'cuda:0')
net1 = lstm(num_embeddings=len(vocab),embedding_dim=256,hidden_size=256,device=device)
optimizer1 = torch.optim.SGD(net1.parameters(),lr)
net2 = LSTM(num_embeddings=len(vocab),embedding_dim=256,hidden_size=256,device=device)
optimizer2 = torch.optim.SGD(net2.parameters(),lr)
net3=LSTM_Bidirectional(num_embeddings=len(vocab),embedding_dim=256,hidden_size=256,device=device)
optimizer3 = torch.optim.SGD(net3.parameters(),lr)
net4=LSTM_Dropout(num_embeddings=len(vocab),embedding_dim=256,hidden_size=256,device=device)
optimizer4 = torch.optim.SGD(net4.parameters(),lr)
net5=LSTM_ZeroHiddenState(num_embeddings=len(vocab),embedding_dim=256,hidden_size=256,device=device)
optimizer5 = torch.optim.SGD(net5.parameters(),lr)
net6=LSTM_MaxPooling(num_embeddings=len(vocab),embedding_dim=256,hidden_size=256,device=device)
optimizer6 = torch.optim.SGD(net6.parameters(),lr)



## 四、训练和测试

第一个网络训练和测试的结果

In [10]:
tools.train(net1,train_iter,device,optimizer1,criterion)

 17%|█▋        | 106/625 [00:01<00:08, 64.04it/s]

batch100,loss = 0.6998286247253418


 33%|███▎      | 208/625 [00:03<00:07, 54.19it/s]

batch200,loss = 0.6912508606910706


 50%|████▉     | 310/625 [00:05<00:06, 50.34it/s]

batch300,loss = 0.6914330124855042


 65%|██████▍   | 406/625 [00:07<00:04, 49.20it/s]

batch400,loss = 0.6928887367248535


 81%|████████  | 506/625 [00:09<00:02, 49.94it/s]

batch500,loss = 0.6951295137405396


 97%|█████████▋| 608/625 [00:11<00:00, 50.08it/s]

batch600,loss = 0.6934190988540649


100%|██████████| 625/625 [00:11<00:00, 52.34it/s]


In [11]:
tools.test(net1,test_iter,device)

100%|██████████| 157/157 [00:01<00:00, 153.43it/s]

accuracy = 0.5005999803543091





准确率50%，接近自然概率，训练没有效果

第二个网络训练以及测试的结果

In [12]:
tools.train(net2,train_iter,device,optimizer2,criterion)

  0%|          | 0/625 [00:00<?, ?it/s]




OutOfMemoryError: CUDA out of memory. Tried to allocate 586.00 MiB. GPU 0 has a total capacity of 47.50 GiB of which 132.44 MiB is free. Process 2277446 has 4.09 GiB memory in use. Process 2520565 has 42.01 GiB memory in use. Process 2521085 has 1.25 GiB memory in use. Of the allocated memory 829.12 MiB is allocated by PyTorch, and 100.88 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)

In [13]:
tools.test(net2,test_iter,device)

100%|██████████| 157/157 [00:01<00:00, 139.04it/s]

accuracy = 0.8289999961853027





仅双向

In [None]:
tools.train(net3,train_iter,device,optimizer3,criterion)

  0%|          | 0/625 [00:00<?, ?it/s]

 17%|█▋        | 109/625 [00:02<00:09, 54.08it/s]

batch100,loss = 0.6975179314613342


 33%|███▎      | 205/625 [00:04<00:10, 39.92it/s]

batch200,loss = 0.6889088153839111


 48%|████▊     | 303/625 [00:06<00:09, 35.30it/s]

batch300,loss = 0.6896477937698364


 65%|██████▍   | 404/625 [00:09<00:06, 32.39it/s]

batch400,loss = 0.6942225694656372


 81%|████████▏ | 508/625 [00:12<00:02, 51.19it/s]

batch500,loss = 0.6928038597106934


 97%|█████████▋| 608/625 [00:14<00:00, 45.29it/s]

batch600,loss = 0.6925061941146851


100%|██████████| 625/625 [00:15<00:00, 40.99it/s]


In [15]:
tools.test(net3,test_iter,device)

100%|██████████| 157/157 [00:01<00:00, 142.62it/s]

accuracy = 0.49959999322891235





In [None]:
tools.train(net4,train_iter,device,optimizer4,criterion)

  0%|          | 0/625 [00:00<?, ?it/s]


ValueError: Expected input batch_size (600) to match target batch_size (64).

In [48]:
tools.test(net4,test_iter,device)

100%|██████████| 157/157 [00:00<00:00, 169.21it/s]

accuracy = 0.7942999601364136





In [65]:
tools.train(net5,train_iter,device,optimizer5,criterion)

  0%|          | 0/625 [00:00<?, ?it/s]




RuntimeError: Expected hidden[0] size (1, 600, 256), got [1, 64, 256]

In [50]:
tools.test(net5,test_iter,device)

100%|██████████| 157/157 [00:00<00:00, 169.50it/s]

accuracy = 0.8159999847412109





In [None]:
tools.train(net6,train_iter,device,optimizer6,criterion)

  0%|          | 0/625 [00:00<?, ?it/s]




RuntimeError: Expected hidden[0] size (1, 600, 256), got [1, 64, 256]

In [52]:
tools.test(net6,test_iter,device)

100%|██████████| 157/157 [00:00<00:00, 167.46it/s]

accuracy = 0.8211999535560608





改进：可以增加训练的轮数(epoch)
可以做的任务：
- 可以把第一个网络一步步修改成第二个，看看到底是哪一个改进起了作用，实验发现仅仅改成双向网络效果还是差
- 可以在test函数里加入更多的benchmark，例如召回率（recall），F1-score，等等，见[春招算法题](./2024春招算法题.pdf)
- 对于第二个网络，可以通过增加训练轮数,改变学习率(lr)、词嵌入维度(embedding_size)、lstm的隐藏层神经元个数(hidden_size),优化器(optimizer)的种类等等参数