### 数据集: 某外卖平台收集的用户评价，正向 4000 条，负向 约 8000 条

## 字段说明

| 字段 | 说明 |
| ---- | ---- |
| label | 1 表示正向评论，0 表示负向评论 |
| review | 评论内容 |

安装 jieba 和 pandas

     pip install pandas jieba sklearn -i https://pypi.doubanio.com/simple

In [1]:
import torch
import torchtext
import torch.nn as nn
import torch.nn.functional as F
import numpy as np
import pandas as pd
import jieba
from torch.utils.data import DataLoader

In [112]:
data = pd.read_csv('waimai_10k.csv')

In [113]:
data.head()

Unnamed: 0,label,review
0,1,很快，好吃，味道足，量大
1,1,没有送水没有送水没有送水
2,1,非常快，态度好。
3,1,方便，快捷，味道可口，快递给力
4,1,菜味道很棒！送餐很及时！


In [114]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11987 entries, 0 to 11986
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   label   11987 non-null  int64 
 1   review  11987 non-null  object
dtypes: int64(1), object(1)
memory usage: 187.4+ KB


In [115]:
data.label.value_counts()

0    7987
1    4000
Name: label, dtype: int64

In [116]:
def pre_text(text):
    text = text.replace('！', '').replace('，', '').replace('。', '')
    return jieba.lcut(text)

In [117]:
data['review'] = data.review.apply(pre_text)

In [118]:
data.review

0                                      [很快, 好吃, 味道, 足量, 大]
1                                 [没有, 送水, 没有, 送水, 没有, 送水]
2                                           [非常, 快, 态度, 好]
3                                 [方便快捷, 味道, 可口, 快, 递给, 力]
4                                   [菜, 味道, 很棒, 送餐, 很, 及时]
                               ...                        
11982                   [以前, 几乎, 天天, 吃, 现在, 调料, 什么, 都, 不放]
11983    [昨天, 订, 凉皮, 两份, 什么, 调料, 都, 没有, 放, 就, 放, 了, 点, ...
11984                                  [凉皮, 太辣, ,, 吃不下, 都]
11985                                [本来, 迟到, 了, 还, 自己, 点]
11986    [肉夹馍, 不错, 羊肉, 泡馍, 酱肉, 包, 很, 一般, 凉面, 没, 想象, 中, ...
Name: review, Length: 11987, dtype: object

In [119]:
from torchtext.vocab import build_vocab_from_iterator     # 创建词表工具

In [120]:
def yield_tokens(data):
    for text in data:
        yield text

In [121]:
vocab = build_vocab_from_iterator(yield_tokens(data.review), specials=["<pad>", "<unk>"], min_freq=2)

In [122]:
vocab.set_default_index(vocab["<unk>"])

In [161]:
vocab_size = len(vocab)

In [124]:
data.review[0]

['很快', '好吃', '味道', '足量', '大']

In [125]:
vocab(data.review[0])

[55, 14, 13, 5228, 114]

In [128]:
i = int(len(data)*0.8)

In [130]:
train_data = data.sample(i)

In [135]:
data.index

RangeIndex(start=0, stop=11987, step=1)

In [134]:
train_data.index

Int64Index([ 3168,  8984,  7230,  6565,  8710,  5368,  5727,  3483, 10759,
             4860,
            ...
             3625,  2622, 11514,  3844,   341,  8928,  6757,  8872,  8203,
             4401],
           dtype='int64', length=9589)

In [148]:
test_data = data.iloc[data.index[~data.index.isin(train_data.index)]]

In [155]:
test_data.values

array([[1, list(['非常', '快', '态度', '好'])],
       [1, list(['方便快捷', '味道', '可口', '快', '递给', '力'])],
       [1,
        list(['超级', '快', '就', '送到', '了', '这么', '冷', '的', '天气', '骑士', '们', '辛苦', '了', '谢谢你们', '麻辣', '香锅', '依然', '很', '好吃'])],
       ...,
       [0,
        list(['差', '的', '无法形容', '我花', '18', '元', '买', '的', '羊肉汤', '结果', '拿来', '里面', '就', '一块', '肉', '粉丝', '也', '就', '那么', '几根', '从来', '没见', '过', '这么', '坑爹', '的', '商家', '了', '简直', '像', '别人', '吃', '剩下', '的', '给', '我', '送来', '的', '？', '恶心', '的', '要命', '卤蛋', '也', '是', '不', '新鲜', '的', '？', '这个', '好', '在', '没', '几元', '钱', '啊', '等', '倒闭', '吧', '你们'])],
       [0,
        list(['肉夹馍', '没', '送来', '告诉', '我', '十分钟', '内', '送到', '可是', '二十分钟', '了', '还', '没到'])],
       [0, list(['本来', '迟到', '了', '还', '自己', '点'])]], dtype=object)

创建 dataset

In [149]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [271]:
def collate_batch(batch):
    label_list, text_list = [], []
    for (_label, _text) in batch:
        label_list.append(_label)
        precess_text = torch.tensor(vocab(_text), dtype=torch.int64)
        text_list.append(precess_text)
    label_list = torch.tensor(label_list)
    text_list = torch.nn.utils.rnn.pad_sequence(text_list, batch_first=True)
    return label_list.to(device), text_list.to(device)

In [272]:
train_dataloader = DataLoader(train_data.values, batch_size=64, shuffle=True, collate_fn=collate_batch)
test_dataloader = DataLoader(test_data.values, batch_size=64, shuffle=False, collate_fn=collate_batch)

Embeding : 把文本映射为一个密集向量

In [273]:
embeding_dim = 100
hidden_size = 16

In [350]:
class RNN_Net(nn.Module):
    def __init__(self, vocab_size, embeding_dim, hidden_size):
        super(RNN_Net, self).__init__()
        self.em = nn.Embedding(vocab_size, embeding_dim)   
        self.conv1 = nn.Conv1d(in_channels=embeding_dim, out_channels=64, kernel_size=7)
        self.pool1 = nn.MaxPool1d(kernel_size=2)
        self.conv2 = nn.Conv1d(in_channels=64,out_channels=128, kernel_size=7)
        self.avgpool = nn.AdaptiveAvgPool1d(output_size=5)
        self.fc1 = nn.Linear(128*5, 64)
        self.fc2 = nn.Linear(64, 2)

    def forward(self, inputs):
        x = self.em(inputs)
        x = x.permute(0, 2, 1)
        x = self.conv1(x)
        x = self.pool1(x)
        x = self.conv2(x)
        x = self.avgpool(x)
        x = x.view(-1, 128*5)
        x = F.dropout(F.relu(self.fc1(x)))
        x = self.fc2(x)
        return x

In [380]:
model = RNN_Net(vocab_size, embeding_dim, hidden_size).to(device)

In [381]:
weight = torch.tensor([1, 2], dtype=torch.float32)
loss_fn = nn.CrossEntropyLoss(weight=weight)
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

In [382]:
def train(dataloader):
    total_acc, total_count, total_loss, = 0, 0, 0
    model.train()
    for label, text in dataloader:
        predicted_label = model(text)
        loss = loss_fn(predicted_label, label)
        # Backpropagation
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        
        with torch.no_grad():
            total_acc += (predicted_label.argmax(1) == label).sum().item()
            total_count += label.size(0)
            total_loss += loss.item()*label.size(0)
    return total_loss/total_count, total_acc/total_count

In [383]:
def test(dataloader):
    model.eval()
    total_acc, total_count, total_loss, = 0, 0, 0

    with torch.no_grad():
        for label, text in dataloader:
            predicted_label = model(text)
            loss = loss_fn(predicted_label, label)
            total_acc += (predicted_label.argmax(1) == label).sum().item()
            total_count += label.size(0)
            total_loss += loss.item()*label.size(0)
    return total_loss/total_count, total_acc/total_count

In [384]:
def fit(epochs, train_dl, test_dl):
    train_loss = []
    train_acc = []
    test_loss = []
    test_acc = []

    for epoch in range(epochs):
        epoch_loss, epoch_acc = train(train_dl)
        epoch_test_loss, epoch_test_acc = test(test_dl)
        train_loss.append(epoch_loss)
        train_acc.append(epoch_acc)
        test_loss.append(epoch_test_loss)
        test_acc.append(epoch_test_acc)
        template = ("epoch:{:2d}, train_loss: {:.5f}, train_acc: {:.1f}% ," 
                    "test_loss: {:.5f}, test_acc: {:.1f}%")
        print(template.format(
              epoch, epoch_loss, epoch_acc*100, epoch_test_loss, epoch_test_acc*100))
    print("Done!")
    
    return train_loss, test_loss, train_acc, test_acc

In [385]:
EPOCHS = 25

In [386]:
train_loss, test_loss, train_acc, test_acc = fit(EPOCHS, train_dataloader, test_dataloader)

epoch: 0, train_loss: 0.63849, train_acc: 60.5% ,test_loss: 0.54427, test_acc: 78.9%
epoch: 1, train_loss: 0.51010, train_acc: 75.1% ,test_loss: 0.55379, test_acc: 71.1%
epoch: 2, train_loss: 0.39867, train_acc: 83.8% ,test_loss: 0.40319, test_acc: 83.5%
epoch: 3, train_loss: 0.34196, train_acc: 86.7% ,test_loss: 0.45021, test_acc: 84.2%
epoch: 4, train_loss: 0.29826, train_acc: 88.8% ,test_loss: 0.40772, test_acc: 83.3%
epoch: 5, train_loss: 0.28663, train_acc: 88.6% ,test_loss: 0.39047, test_acc: 87.6%
epoch: 6, train_loss: 0.23601, train_acc: 91.1% ,test_loss: 0.44591, test_acc: 83.9%
epoch: 7, train_loss: 0.20755, train_acc: 92.2% ,test_loss: 0.48363, test_acc: 88.2%
epoch: 8, train_loss: 0.18460, train_acc: 93.5% ,test_loss: 0.65321, test_acc: 86.1%
epoch: 9, train_loss: 0.26408, train_acc: 90.5% ,test_loss: 0.62031, test_acc: 65.6%
epoch:10, train_loss: 0.22891, train_acc: 90.5% ,test_loss: 0.66760, test_acc: 84.2%
epoch:11, train_loss: 0.13704, train_acc: 95.2% ,test_loss: 0.668