In [None]:
import torch
import pandas as pd
import numpy as np
import torch.nn as nn

读入数据，查看数据

In [None]:
# Load the review dataset from a CSV file; the path is relative to the notebook's working directory.
data_set=pd.read_csv('../data/IMDB_Dataset.csv')
# Show the first rows as the cell output.
data_set.head()

数据预处理：
- 大小写转换
- 删除HTML标签
- 标点
- stopword
- 分词
- 保留词干(stemmer)

In [None]:
## Lower-case every review, then drop rows that are exact duplicates
data_set = (
    data_set
    .assign(review=data_set['review'].str.lower())
    .drop_duplicates()
)


In [None]:
def remove_HTML(text):
    """Strip HTML tags from ``text``, leaving a single space where each tag was.

    Args:
        text: Review string that may contain markup such as ``<br />``.

    Returns:
        The string with every ``<...>`` span replaced by one space.
    """
    import re  # function-scope import kept: the notebook has no top-level `import re`

    # Substitute a space rather than '': a tag such as "<br />" often sits
    # directly between two words ("end.<br /><br />Next"), and deleting it
    # outright would glue those words into a single token. The extra spaces are
    # harmless because the later stopword step splits on whitespace.
    return re.sub(r'<.*?>', ' ', text)

In [None]:
## Strip HTML tags from every review
data_set['review'] = data_set['review'].apply(remove_HTML)

In [None]:
from string import punctuation ## punctuation是一个字符串，包含了所有的英文标点符号

def remove_punctuation(text):
    """Return ``text`` with every character from ``string.punctuation`` deleted."""
    # In a translation table, mapping a code point to None deletes that character.
    drop_table = dict.fromkeys(map(ord, punctuation))
    return text.translate(drop_table)

In [None]:
## Remove punctuation from every review
data_set['review'] = data_set['review'].apply(remove_punctuation)

In [None]:
# import nltk
# nltk.download('stopwords')  ## 下载stopwords
from nltk.corpus import stopwords  ## 从nltk.corpus语料库中导入stopwords

## English stopwords from NLTK, held in a set for fast membership checks
st_words = set(stopwords.words('english'))


def remove_stopwords(text):
    """Drop stopwords from a whitespace-separated string.

    Each word is compared in lower case against ``st_words``; the surviving
    words are joined back together with single spaces.
    """
    kept = filter(lambda word: word.lower() not in st_words, text.split())
    return " ".join(kept)

In [None]:
## Remove stopwords from every review
data_set['review'] = data_set['review'].apply(remove_stopwords)

In [None]:
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer

stemmer = PorterStemmer()

## Tokenise each review with NLTK's word_tokenize (each cell becomes a list of tokens)
data_set['review'] = data_set['review'].apply(word_tokenize)
## Replace every token with its Porter stem
data_set['review'] = data_set['review'].apply(lambda tokens: list(map(stemmer.stem, tokens)))

In [None]:
# Inspect the first rows after tokenisation and stemming.
data_set.head()

In [None]:
def encode_label(label):
    """Map a sentiment label to an integer class.

    Returns 1 when ``label`` equals ``'positive'`` and 0 for any other value.
    """
    return 1 if label == 'positive' else 0
    

In [None]:
# Convert the sentiment strings to integer classes with encode_label.
# NOTE(review): this overwrites the column, and encode_label returns 0 for anything other
# than 'positive', so running this cell a second time would turn every label into 0.
data_set['sentiment'] = data_set['sentiment'].apply(encode_label)

In [None]:
# Inspect the first rows after the sentiment column has been encoded.
data_set.head()

建立词频表

In [None]:
# Global word -> occurrence-count table, filled in by count_words.
vocab = {}


def count_words(words):
    """Add one occurrence to the global ``vocab`` for every item in ``words``."""
    for token in words:
        if token in vocab:
            vocab[token] += 1
        else:
            vocab[token] = 1


In [None]:
# Accumulate word counts over every review's token list.
# NOTE(review): count_words adds to the global `vocab`, so re-running this cell
# without re-creating `vocab` counts every review again.
for sentence in data_set['review']:
    count_words(sentence)

In [None]:
# Number of most-frequent words kept when the vocabulary is truncated.
vocab_num=10000

In [None]:
## Rank the vocabulary by frequency, highest count first
sorted_vocab_list = sorted(vocab.items(), key=lambda item: item[1], reverse=True)
# Keep only the vocab_num most frequent words (word -> count).
sorted_vocab = dict(sorted_vocab_list[:vocab_num])
# Word -> integer id, numbered from 1 in frequency order.
index_vocab = {word: rank for rank, word in enumerate(sorted_vocab, start=1)}

In [None]:
import json

# Persist the truncated frequency table. The `with` block closes (and therefore
# flushes) the file deterministically instead of leaving the handle open, so
# the cell that reads the file back always sees complete content. The encoding
# is stated explicitly to match the one used when the file is read.
with open('vocab_10000.json', 'w', encoding='UTF-8') as f:
    json.dump(sorted_vocab, f)

In [None]:
import json 

# Reload the saved frequency table (word -> count).
with open("vocab_10000.json",'r', encoding='UTF-8') as f:
    sorted_vocab = json.load(f)

# Rebuild the word -> id mapping from the loaded table so that this cell works
# on its own, without re-running the counting and sorting cells. json.load
# keeps the key order of the file, so the ids match the ones derived when the
# table was saved (1 = first entry, i.e. the most frequent word).
index_vocab = {key: i + 1 for i, key in enumerate(sorted_vocab)}


使用词频表将评论中的每个词映射为整数索引（注意：这里得到的是整数索引编码，而不是严格意义上的 one-hot 向量；词向量由后面的 Embedding 层生成）

In [None]:
def onehot_encoding(sentence, index_vocab):
    """Translate a token sequence into vocabulary ids.

    Despite the name, the result is a list with one integer id per token, not
    one-hot vectors. Tokens missing from ``index_vocab`` are encoded as 0.
    """
    ids = []
    for token in sentence:
        ids.append(index_vocab.get(token, 0))
    return ids

In [None]:
# Replace every token list with its list of vocabulary ids (0 for tokens not in index_vocab).
data_set['review'] = data_set['review'].apply(lambda x: onehot_encoding(x, index_vocab))

In [None]:
# Inspect the first rows after the reviews have been converted to id lists.
data_set.head()

In [None]:
def padding(sentence, seq_len):
    """Force ``sentence`` to length ``seq_len``.

    Sequences longer than ``seq_len`` are cut to their first ``seq_len``
    items; shorter ones get zeros added at the front.
    """
    missing = seq_len - len(sentence)
    if missing < 0:
        return sentence[:seq_len]
    return [0] * missing + sentence

In [None]:
## Bring every review to length seq_len: longer ones are truncated, shorter ones are padded with 0
seq_len=128
data_set['review'] = data_set['review'].apply(lambda x: padding(x, seq_len))


In [None]:
# True when PyTorch reports CUDA as available; passed to train() to decide whether to use the GPU.
train_on_gpu=torch.cuda.is_available()
train_on_gpu

In [None]:
class LSTM_model(nn.Module):
    """LSTM binary classifier over sequences of token ids.

    Pipeline: embedding -> (bi)LSTM -> dropout -> linear head -> sigmoid.

    Args:
        num_embeddings: Size of the embedding table (vocabulary size).
        embedding_dim: Dimension of each word vector.
        state_dim: Dimension of the LSTM hidden state.
        n_layers: Number of stacked LSTM layers.
        biderctional: Whether to use a bidirectional LSTM. The spelling is
            kept as in the original signature so keyword callers keep working.
    """

    def __init__(self, num_embeddings,embedding_dim,state_dim, n_layers,biderctional=False):
        super(LSTM_model, self).__init__()

        self.state_dim=state_dim
        self.n_layers=n_layers
        self.embedding=nn.Embedding(num_embeddings=num_embeddings, embedding_dim=embedding_dim)

        self.LSTM=nn.LSTM(input_size=embedding_dim, hidden_size=state_dim, num_layers=n_layers, batch_first=True, bidirectional=biderctional)
        self.dropout=nn.Dropout(0.3)
        # Both heads are always created (so the set of parameters is the same as
        # before); forward() uses fc1 for a unidirectional LSTM and fc2 for a
        # bidirectional one, whose feature vector is twice as wide.
        self.fc1=nn.Linear(state_dim, 1)
        self.fc2=nn.Linear(2*state_dim, 1)
        self.sigmoid=nn.Sigmoid()
        self.bidirectional=biderctional

    def forward(self, x):
        """Compute the positive-class probability for each sequence.

        Args:
            x: Integer tensor of token ids, laid out batch-first
                (batch, seq_len), as required by ``batch_first=True``.

        Returns:
            A tuple ``(output, hidden)``: ``output`` has shape (batch, 1) and
            holds sigmoid probabilities; ``hidden`` is the ``(h_n, c_n)``
            tuple returned by ``nn.LSTM``.
        """
        embedded=self.embedding(x)
        _, hidden=self.LSTM(embedded)
        h_n=hidden[0]
        if self.bidirectional:
            # Use the final hidden state of EACH direction of the last layer.
            # The previous code read r_out[:, -1, :], where the backward half
            # has only processed the last token of the sequence.
            features=torch.cat((h_n[-2], h_n[-1]), dim=1)
            head=self.fc2
        else:
            # For a unidirectional LSTM h_n[-1] equals r_out[:, -1, :].
            features=h_n[-1]
            head=self.fc1
        # The dropout layer was constructed but never applied before; it is an
        # identity in eval mode, so inference results are unaffected by it.
        output=self.sigmoid(head(self.dropout(features)))
        return output, hidden


In [None]:
# Embedding table size: ids 1..vocab_num are vocabulary words, and id 0 is used
# both for padding and for tokens that are not in the vocabulary.
num_embeddings=vocab_num+1
embedding_dim=128
state_dim=256
n_layers=1
bidirectional=False

# Unidirectional LSTM classifier built from the settings above.
lstm_model=LSTM_model(num_embeddings,embedding_dim,state_dim,n_layers,bidirectional)

In [None]:
from torch.utils.data import DataLoader, TensorDataset
from sklearn.model_selection import train_test_split

In [None]:
batch_size = 64

# 70% train; the remaining 30% is split in half into test and validation.
train_data, temp_data = train_test_split(data_set, test_size=0.3, random_state=42)
test_data, valid_data = train_test_split(temp_data, test_size=0.5, random_state=42)


def _as_tensor_dataset(frame):
    """Wrap the 'review' id lists and 'sentiment' labels of ``frame`` in a TensorDataset."""
    reviews = torch.from_numpy(np.array(frame['review'].tolist()))
    labels = torch.from_numpy(np.array(frame['sentiment'].tolist()))
    return TensorDataset(reviews, labels)


train_data_set = _as_tensor_dataset(train_data)
valid_data_set = _as_tensor_dataset(valid_data)
test_data_set = _as_tensor_dataset(test_data)

train_data_loader = DataLoader(train_data_set, batch_size=batch_size, shuffle=True)
test_data_loader = DataLoader(test_data_set, batch_size=batch_size, shuffle=True)
valid_data_loader = DataLoader(valid_data_set, batch_size=batch_size, shuffle=True)

Training

In [None]:
# Learning rate for Adam.
lr=0.001
# Binary cross-entropy on probabilities; the model's forward() already ends with a sigmoid.
criterion=nn.BCELoss()
# Optimizer over the parameters of the current lstm_model.
optimizer=torch.optim.Adam(lstm_model.parameters(), lr=lr)


In [None]:
def _evaluate(rnn_model, data_loader, criterion, train_on_gpu):
    """Run one pass over ``data_loader`` in eval mode without tracking gradients.

    Returns:
        ``(mean_loss, accuracy_percent)``. The model is switched back to train
        mode before returning.
    """
    losses = []
    n_correct = 0
    n_seen = 0
    rnn_model.eval()
    with torch.no_grad():  # no autograd graph is needed for evaluation
        for val_input, val_label in data_loader:
            if train_on_gpu:
                val_input, val_label = val_input.cuda(), val_label.cuda()
            val_output, _ = rnn_model(val_input)
            # reshape(-1) keeps a 1-D (batch,) tensor even when batch == 1.
            val_prob = val_output.reshape(-1)
            losses.append(criterion(val_prob, val_label.float()).item())
            n_correct += (torch.round(val_prob) == val_label).sum().item()
            n_seen += val_label.size(0)
    rnn_model.train()
    mean_loss = float(np.mean(losses)) if losses else float('nan')
    accuracy = 100 * n_correct / n_seen if n_seen else 0.0
    return mean_loss, accuracy


def train(epochs, train_data_loader, valid_data_loader, rnn_model, optimizer, criterion, train_on_gpu,print_every=100):
    """Train ``rnn_model`` and collect per-epoch metrics.

    Args:
        epochs: Number of passes over ``train_data_loader``.
        train_data_loader: Iterable yielding ``(inputs, labels)`` training batches.
        valid_data_loader: Iterable yielding ``(inputs, labels)`` validation batches.
        rnn_model: Model whose forward returns ``(output, hidden)``; ``output``
            must hold one probability per sample.
        optimizer: Optimizer over ``rnn_model``'s parameters.
        criterion: Loss called as ``criterion(probabilities, float_labels)``.
        train_on_gpu: If true, the model and every batch are moved to CUDA.
        print_every: Print a progress line (including a validation pass) every
            this many training steps.

    Returns:
        ``(train_acc, train_loss, valid_acc, valid_loss)``: four lists with one
        entry per epoch. Accuracies are percentages. ``train_loss`` is the mean
        batch loss of the epoch; the validation values come from a validation
        pass run at the end of each epoch.
    """
    counter=0
    clip=5  # maximum gradient norm passed to clip_grad_norm_
    if train_on_gpu:
        rnn_model.to(torch.device('cuda'))
    rnn_model.train()
    train_acc=[]
    train_loss=[]
    valid_acc=[]
    valid_loss=[]
    for i in range(epochs):
        correct = 0
        total = 0
        epoch_losses = []
        for inputs, label in train_data_loader:
            counter+=1
            if train_on_gpu:
                inputs, label=inputs.cuda(), label.cuda()
            rnn_model.zero_grad()
            output, _=rnn_model(inputs)
            # reshape(-1) instead of squeeze(): squeeze() turns a final batch
            # of size 1 into a 0-dim tensor, which no longer matches the
            # (1,)-shaped label and makes the loss raise.
            prob=output.reshape(-1)
            loss=criterion(prob, label.float())
            loss.backward()
            nn.utils.clip_grad_norm_(rnn_model.parameters(), clip)
            optimizer.step()

            epoch_losses.append(loss.item())
            predicted = torch.round(prob.detach())
            total += label.size(0)
            correct += (predicted == label).sum().item()

            if counter % print_every ==0:
                step_val_loss, step_val_acc = _evaluate(rnn_model, valid_data_loader, criterion, train_on_gpu)
                print("Epoch: {}/{}...".format(i+1, epochs),
                    "Step: {}...".format(counter),
                    "Loss: {:.6f}...".format(loss.item()),
                    "Accuracy: {:.2f}%...".format(100 * correct / total),
                    "Val Loss: {:.6f}...".format(step_val_loss),
                    "Val Accuracy: {:.2f}%".format(step_val_acc))

        # Always validate at the end of the epoch. Previously the epoch metrics
        # reused whatever the last periodic validation had produced, which was
        # stale and was undefined if no periodic validation had run yet.
        epoch_val_loss, epoch_val_acc = _evaluate(rnn_model, valid_data_loader, criterion, train_on_gpu)
        train_acc.append(100 * correct / total if total else 0.0)
        # Mean over the epoch (not just the last batch), so it is comparable
        # with the validation loss, which is also a mean over batches.
        train_loss.append(float(np.mean(epoch_losses)) if epoch_losses else float('nan'))
        valid_acc.append(epoch_val_acc)
        valid_loss.append(epoch_val_loss)
    return train_acc, train_loss, valid_acc, valid_loss

In [None]:
# Train the current lstm_model for `epochs` epochs and keep the per-epoch metric lists for plotting.
epochs=15
train_acc, train_loss, valid_acc, valid_loss=train(epochs, train_data_loader, valid_data_loader, lstm_model, optimizer, criterion, train_on_gpu,print_every=100)

In [None]:
import matplotlib.pyplot as plt
def show(train_loss, valid_loss, train_acc, valid_acc, epochs):
    """Draw two figures: train/validation loss, then train/validation accuracy.

    Each argument except ``epochs`` is a sequence with one value per epoch;
    ``epochs`` sets the x-axis ticks, labelled 1..epochs.
    """
    # (train series, validation series, train label, validation label, y label, title)
    panels = (
        (train_loss, valid_loss, 'Train Loss', 'Validation Loss',
         'Loss', 'Training and Validation Loss'),
        (train_acc, valid_acc, 'Train Accuracy', 'Validation Accuracy',
         'Accuracy', 'Training and Validation Accuracy'),
    )
    for train_series, valid_series, train_label, valid_label, y_label, title in panels:
        plt.figure(figsize=(10, 5))
        plt.plot(train_series, label=train_label)
        plt.plot(valid_series, label=valid_label)
        plt.xlabel('Epochs')
        plt.ylabel(y_label)
        plt.title(title)
        # Tick positions are 0-based indices; tick labels are 1-based epoch numbers.
        plt.xticks(range(epochs), range(1, epochs + 1))
        plt.legend()
        plt.show()

In [None]:
# Plot the loss and accuracy curves returned by the training run above.
show(train_loss, valid_loss, train_acc, valid_acc, epochs)

双向LSTM模型

In [None]:
# Build a new model with a bidirectional LSTM and the same remaining settings.
# This rebinds `lstm_model`, so the previously trained model is no longer reachable under that name.
bidirectional=True
lstm_model=LSTM_model(num_embeddings,embedding_dim,state_dim,n_layers,bidirectional)

In [None]:
# Re-create the loss and the optimizer: the optimizer must be built from the
# parameters of the new lstm_model, not those of the earlier model.
lr=0.001
criterion=nn.BCELoss()
optimizer=torch.optim.Adam(lstm_model.parameters(), lr=lr)


In [None]:
# Train the bidirectional model; the metric lists from the earlier run are overwritten.
epochs=15
train_acc, train_loss, valid_acc, valid_loss=train(epochs, train_data_loader, valid_data_loader, lstm_model, optimizer, criterion, train_on_gpu,print_every=100)

In [None]:
# Plot the loss and accuracy curves of the bidirectional run.
show(train_loss, valid_loss, train_acc, valid_acc, epochs)