### 数据集: 某外卖平台收集的用户评价，正向 4000 条，负向 约 8000 条

## 字段说明

| 字段 | 说明 |
| ---- | ---- |
| label | 1 表示正向评论，0 表示负向评论 |
| review | 评论内容 |

安装 jieba 和 pandas

     pip install pandas jieba sklearn -i https://pypi.doubanio.com/simple

In [1]:
import torch
import torchtext
import torch.nn as nn
import torch.nn.functional as F
import numpy as np
import pandas as pd
import jieba
from torch.utils.data import DataLoader

In [35]:
nn.GRU?

In [2]:
data = pd.read_csv('waimai_10k.csv')

In [3]:
data.head()

Unnamed: 0,label,review
0,1,很快，好吃，味道足，量大
1,1,没有送水没有送水没有送水
2,1,非常快，态度好。
3,1,方便，快捷，味道可口，快递给力
4,1,菜味道很棒！送餐很及时！


In [4]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11987 entries, 0 to 11986
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   label   11987 non-null  int64 
 1   review  11987 non-null  object
dtypes: int64(1), object(1)
memory usage: 187.4+ KB


In [5]:
data.label.value_counts()

0    7987
1    4000
Name: label, dtype: int64

In [6]:
def pre_text(text):
    text = text.replace('！', '').replace('，', '').replace('。', '')
    return jieba.lcut(text)

In [7]:
data['review'] = data.review.apply(pre_text)

Building prefix dict from the default dictionary ...
Loading model from cache C:\Users\guanghua\AppData\Local\Temp\jieba.cache
Loading model cost 0.534 seconds.
Prefix dict has been built successfully.


In [8]:
data.review

0                                      [很快, 好吃, 味道, 足量, 大]
1                                 [没有, 送水, 没有, 送水, 没有, 送水]
2                                           [非常, 快, 态度, 好]
3                                 [方便快捷, 味道, 可口, 快, 递给, 力]
4                                   [菜, 味道, 很棒, 送餐, 很, 及时]
                               ...                        
11982                   [以前, 几乎, 天天, 吃, 现在, 调料, 什么, 都, 不放]
11983    [昨天, 订, 凉皮, 两份, 什么, 调料, 都, 没有, 放, 就, 放, 了, 点, ...
11984                                  [凉皮, 太辣, ,, 吃不下, 都]
11985                                [本来, 迟到, 了, 还, 自己, 点]
11986    [肉夹馍, 不错, 羊肉, 泡馍, 酱肉, 包, 很, 一般, 凉面, 没, 想象, 中, ...
Name: review, Length: 11987, dtype: object

In [9]:
from torchtext.vocab import build_vocab_from_iterator     # 创建词表工具

In [10]:
def yield_tokens(data):
    for text in data:
        yield text

In [11]:
vocab = build_vocab_from_iterator(yield_tokens(data.review), specials=["<pad>", "<unk>"], min_freq=2)

In [12]:
vocab.set_default_index(vocab["<unk>"])

In [13]:
vocab_size = len(vocab)

In [14]:
data.review[0]

['很快', '好吃', '味道', '足量', '大']

In [15]:
vocab(data.review[0])

[55, 14, 13, 5228, 114]

In [16]:
i = int(len(data)*0.8)

In [17]:
train_data = data.sample(i)

In [18]:
data.index

RangeIndex(start=0, stop=11987, step=1)

In [19]:
train_data.index

Int64Index([  392,  1105,  8297,  1976,  8521,  2179,   689,  2716,  9566,
             3510,
            ...
             1387,  7721,  6657,  5624,  1640,   955, 11862,  6361,  4117,
              453],
           dtype='int64', length=9589)

In [20]:
test_data = data.iloc[data.index[~data.index.isin(train_data.index)]]

In [21]:
test_data.values

array([[1,
        list(['超级', '快', '就', '送到', '了', '这么', '冷', '的', '天气', '骑士', '们', '辛苦', '了', '谢谢你们', '麻辣', '香锅', '依然', '很', '好吃'])],
       [1,
        list(['经过', '上次', '晚', '了', '2', '小时', '这次', '超级', '快', '20', '分钟', '就', '送到', '了', '…', '…'])],
       [1, list(['味道', '好', '送', '餐快', '分量', '足'])],
       ...,
       [0, list(['谢谢', '速度', '很快', '辛苦', '了'])],
       [0,
        list(['外送员', '很', '赞', '商家', '能', '不能', '仔细', '看', '订单', '啊', '点', '的', '干', '拌面', '送来', '的', '是', '汤面', '说', '了', '粉', '汤', '羊血要', '多加', '辣椒', '送来', '的', '一点儿', '辣', '没有'])],
       [0, list(['肉夹馍', '肉太少'])]], dtype=object)

创建 dataset

In [22]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [23]:
def collate_batch(batch):
    label_list, text_list = [], []
    for (_label, _text) in batch:
        label_list.append(_label)
        precess_text = torch.tensor(vocab(_text), dtype=torch.int64)
        text_list.append(precess_text)
    label_list = torch.tensor(label_list)
    text_list = torch.nn.utils.rnn.pad_sequence(text_list)
    return label_list.to(device), text_list.to(device)

In [24]:
train_dataloader = DataLoader(train_data.values, batch_size=64, shuffle=True, collate_fn=collate_batch)
test_dataloader = DataLoader(test_data.values, batch_size=64, shuffle=False, collate_fn=collate_batch)

Embeding : 把文本映射为一个密集向量

In [25]:
embeding_dim = 100
hidden_size = 200

In [26]:
class BIRNN_Net(nn.Module):
    def __init__(self, vocab_size, embeding_dim, hidden_size):
        super(RNN_Net, self).__init__()
        self.em = nn.Embedding(vocab_size, embeding_dim)   
        self.rnn = nn.LSTM(embeding_dim, hidden_size, bidirectional=True)
        self.fc1 = nn.Linear(hidden_size*2, 64)
        self.fc2 = nn.Linear(64, 2)

    def forward(self, inputs):
        x = self.em(inputs)
        x = F.dropout(x)
        x, _ = self.rnn(x)
        x = F.dropout(F.relu(self.fc1(x[-1])))
        x = self.fc2(x)
        return x

In [27]:
model = RNN_Net(vocab_size, embeding_dim, hidden_size).to(device)

In [28]:
loss_fn = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), betas=(0.5, 0.5), lr=0.005)

In [29]:
def train(dataloader):
    total_acc, total_count, total_loss, = 0, 0, 0
    model.train()
    for label, text in dataloader:
        predicted_label = model(text)
        loss = loss_fn(predicted_label, label)
        # Backpropagation
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        
        with torch.no_grad():
            total_acc += (predicted_label.argmax(1) == label).sum().item()
            total_count += label.size(0)
            total_loss += loss.item()*label.size(0)
    return total_loss/total_count, total_acc/total_count

In [30]:
def test(dataloader):
    model.eval()
    total_acc, total_count, total_loss, = 0, 0, 0

    with torch.no_grad():
        for label, text in dataloader:
            predicted_label = model(text)
            loss = loss_fn(predicted_label, label)
            total_acc += (predicted_label.argmax(1) == label).sum().item()
            total_count += label.size(0)
            total_loss += loss.item()*label.size(0)
    return total_loss/total_count, total_acc/total_count

In [31]:
def fit(epochs, train_dl, test_dl):
    train_loss = []
    train_acc = []
    test_loss = []
    test_acc = []

    for epoch in range(epochs):
        epoch_loss, epoch_acc = train(train_dl)
        epoch_test_loss, epoch_test_acc = test(test_dl)
        train_loss.append(epoch_loss)
        train_acc.append(epoch_acc)
        test_loss.append(epoch_test_loss)
        test_acc.append(epoch_test_acc)
        template = ("epoch:{:2d}, train_loss: {:.5f}, train_acc: {:.1f}% ," 
                    "test_loss: {:.5f}, test_acc: {:.1f}%")
        print(template.format(
              epoch, epoch_loss, epoch_acc*100, epoch_test_loss, epoch_test_acc*100))
    print("Done!")
    
    return train_loss, test_loss, train_acc, test_acc

In [32]:
EPOCHS = 25

In [33]:
train_loss, test_loss, train_acc, test_acc = fit(EPOCHS, train_dataloader, test_dataloader)

torch.Size([61, 64, 400])
torch.Size([90, 64, 400])
torch.Size([92, 64, 400])
torch.Size([61, 64, 400])
torch.Size([59, 64, 400])
torch.Size([78, 64, 400])
torch.Size([92, 64, 400])
torch.Size([46, 64, 400])
torch.Size([53, 64, 400])
torch.Size([153, 64, 400])
torch.Size([84, 64, 400])
torch.Size([47, 64, 400])
torch.Size([62, 64, 400])
torch.Size([47, 64, 400])
torch.Size([85, 64, 400])
torch.Size([43, 64, 400])
torch.Size([57, 64, 400])
torch.Size([61, 64, 400])
torch.Size([151, 64, 400])
torch.Size([60, 64, 400])
torch.Size([37, 64, 400])
torch.Size([66, 64, 400])
torch.Size([84, 64, 400])
torch.Size([62, 64, 400])
torch.Size([81, 64, 400])
torch.Size([80, 64, 400])
torch.Size([112, 64, 400])
torch.Size([87, 64, 400])
torch.Size([67, 64, 400])
torch.Size([51, 64, 400])
torch.Size([65, 64, 400])
torch.Size([63, 64, 400])
torch.Size([90, 64, 400])
torch.Size([68, 64, 400])
torch.Size([74, 64, 400])
torch.Size([52, 64, 400])
torch.Size([51, 64, 400])
torch.Size([163, 64, 400])
torch.Si

KeyboardInterrupt: 

In [None]:
optimizer = torch.optim.Adam(model.parameters(), betas=(0.5, 0.5), lr=0.0005)

In [None]:
train_loss, test_loss, train_acc, test_acc = fit(EPOCHS, train_dataloader, test_dataloader)

In [None]:
train_loss, test_loss, train_acc, test_acc = fit(EPOCHS, train_dataloader, test_dataloader)