##### 导入必要的包

In [1]:
import torch
import torchtext
import torch.nn as nn
import torch.nn.functional as F
import numpy as np
import pandas as pd
import re
import jieba
from   sklearn.model_selection import train_test_split
from   matplotlib import pyplot as plt

##### 读取数据

In [2]:
data = pd.read_csv("waimai_10k.csv")

In [3]:
'''自定义文本处理函数'''
def pre_text(text):
    text = text.replace('！','').replace('，','').replace('。','')
    '''对文本直接做分词'''
    return jieba.lcut(text)

In [4]:
'''在data的review属性上使用pre_text方法'''
data['review'] = data.review.apply(pre_text)

Building prefix dict from the default dictionary ...
Dumping model to file cache C:\Users\20613\AppData\Local\Temp\jieba.cache
Loading model cost 0.562 seconds.
Prefix dict has been built successfully.


In [5]:
# 统计不同的单词出现的次数
word_count    = pd.value_counts(np.concatenate(data.review.values))

In [6]:
# 利用布尔过滤将word_count中的出现次数较少的单词删去
word_count    = word_count[word_count>2]

In [7]:
'''对文本进行编码'''
word_list     = list(word_count.index)

In [8]:
word_to_index = dict((w,word_list.index(w)+1) for w in word_list)

In [9]:
text          = data.review.apply(lambda t:[word_to_index.get(w,0) for w in t]) 

In [10]:
# 为了进单处理 将textlen人为设置为20
text_len = 20
pad_text = [l + (text_len - len(l)) * [0] if len(l)<=text_len else l[:text_len] for l in text]

In [11]:
pad_text = np.array(pad_text)

In [12]:
labels   = data.label.values

In [13]:
''' 切分数据集为测试&训练两部分 '''
x_train,x_test,y_train,y_test = train_test_split(pad_text,labels)

In [14]:
class MyDataSet(torch.utils.data.Dataset):
    def __init__(self,text,label):
        self.text_array = text
        self.label_array= label
    
    def __getitem__(self,index):
        text = torch.LongTensor(self.text_array[index])
        label= self.label_array[index]
        return text,label
        
    def __len__(self):
        return len(self.text_array)

In [15]:
train_ds   = MyDataSet(x_train,y_train)
test_ds    = MyDataSet(x_test,y_test)

In [16]:
BATCH_SIZE = 32

In [17]:
train_dl   = torch.utils.data.DataLoader(train_ds,batch_size=BATCH_SIZE,shuffle=True)
test_dl    = torch.utils.data.DataLoader(test_ds ,batch_size=BATCH_SIZE,shuffle=True)

##### 创建模型 并测试(基于LSTM模型)

In [18]:
embedding_dim = 100
hidden_size   = 200
max_word      = len(word_to_index)+1

In [19]:
class Net(nn.Module):
    def __init__(self,max_word,embedding_dim):
        super(Net,self).__init__()
        self.em   = nn.Embedding(max_word,embedding_dim) #将max_word个单词编码到embedding_dim维向量中  batch*maxlen*100
        self.rnn  = nn.LSTM(embedding_dim,
                            hidden_size,
                            num_layers=3,                #优化策略 RNN循环层堆叠
                            dropout=0.5,
                            bidirectional=True           #设置双向RNN
                           )             
        #self.fc1 = nn.Linear(hidden_size,128)
        self.fc1  = nn.Linear(hidden_size*2,128)         #双向RNN的时候隐藏层会变成原来的两倍
        self.fc2  = nn.Linear(128,2)
    
    def forward(self,x):
        x     = self.em(x)
        r_o,_ = self.rnn(x)
        r_o   = r_o[-1]
        x     = F.dropout(F.relu(self.fc1(r_o)))
        x     = self.fc2(x)
        return x

In [20]:
model      = Net(max_word, embedding_dim)
model      = model.to("cuda")
loss_fn    = nn.CrossEntropyLoss()
optimizer  = torch.optim.Adam(model.parameters(), lr=0.01)
epochs     = 30
train_loss = []
train_acc  = []
test_loss  = []
test_acc   = []

In [21]:
def fit(epoch, model, trainloader, testloader):
    correct = 0
    total = 0
    running_loss = 0
    
    model.train()
    for x, y in trainloader:
        if torch.cuda.is_available():
            x = x.permute(1, 0)
            x, y = x.to('cuda'), y.to('cuda')
        y_pred = model(x)
        loss = loss_fn(y_pred, y)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        with torch.no_grad():
            y_pred = torch.argmax(y_pred, dim=1)
            correct += (y_pred == y).sum().item()
            total += y.size(0)
            running_loss += loss.item()
#    exp_lr_scheduler.step()
    epoch_loss = running_loss / len(trainloader.dataset)
    epoch_acc = correct / total
        
        
    test_correct = 0
    test_total = 0
    test_running_loss = 0 
    
    model.eval()
    with torch.no_grad():
        for x, y in testloader:
            if torch.cuda.is_available():
                x = x.permute(1, 0)
                x, y = x.to('cuda'), y.to('cuda')
            y_pred = model(x)
            loss = loss_fn(y_pred, y)
            y_pred = torch.argmax(y_pred, dim=1)
            test_correct += (y_pred == y).sum().item()
            test_total += y.size(0)
            test_running_loss += loss.item()
    
    epoch_test_loss = test_running_loss / len(testloader.dataset)
    epoch_test_acc = test_correct / test_total
    
        
    print('epoch:\t', epoch,'loss:\t', round(epoch_loss, 3),'accuracy:\t', round(epoch_acc, 3),'test_loss:\t', round(epoch_test_loss, 3),'test_accuracy:\t', round(epoch_test_acc, 3))
        
    return epoch_loss, epoch_acc, epoch_test_loss, epoch_test_acc

In [22]:
for epoch in range(epochs):
    epoch_loss, epoch_acc, epoch_test_loss, epoch_test_acc = fit(epoch,model,train_dl,test_dl)
    train_loss.append(epoch_loss)
    train_acc.append(epoch_acc)
    test_loss.append(epoch_test_loss)
    test_acc.append(epoch_test_acc)

epoch:	 0 loss:	 0.015 accuracy:	 0.789 test_loss:	 0.011 test_accuracy:	 0.863
epoch:	 1 loss:	 0.01 accuracy:	 0.888 test_loss:	 0.011 test_accuracy:	 0.879
epoch:	 2 loss:	 0.009 accuracy:	 0.903 test_loss:	 0.01 test_accuracy:	 0.876
epoch:	 3 loss:	 0.008 accuracy:	 0.912 test_loss:	 0.012 test_accuracy:	 0.877
epoch:	 4 loss:	 0.008 accuracy:	 0.917 test_loss:	 0.01 test_accuracy:	 0.881
epoch:	 5 loss:	 0.007 accuracy:	 0.925 test_loss:	 0.011 test_accuracy:	 0.883
epoch:	 6 loss:	 0.007 accuracy:	 0.925 test_loss:	 0.012 test_accuracy:	 0.878
epoch:	 7 loss:	 0.008 accuracy:	 0.919 test_loss:	 0.013 test_accuracy:	 0.859
epoch:	 8 loss:	 0.008 accuracy:	 0.913 test_loss:	 0.011 test_accuracy:	 0.872
epoch:	 9 loss:	 0.008 accuracy:	 0.916 test_loss:	 0.012 test_accuracy:	 0.86
epoch:	 10 loss:	 0.009 accuracy:	 0.904 test_loss:	 0.013 test_accuracy:	 0.869
epoch:	 11 loss:	 0.009 accuracy:	 0.902 test_loss:	 0.014 test_accuracy:	 0.797
epoch:	 12 loss:	 0.01 accuracy:	 0.885 te