# 推荐评论展示任务

**任务描述**<br>
本次推荐评论展示任务的目标是从真实的用户评论中，挖掘合适作为推荐理由的短句。点评软件展示的推荐理由具有长度限制，而真实用户评论语言通顺、信息完整。综合来说，两者都具有用户情感的正负向，但是展示推荐理由的内容相关性高于评论，需要较强的文本吸引力。

**数据集**<br>
本次推荐评论展示任务所采用的数据集是点评软件中，用户中文评论的集合。

**数据样例**<br>
本次任务要求将这些评论分为两类，即“展示”和“不展示”，分别以数字1和0作为标注，如下图所示：

**文档说明**<br>
数据集文件分为训练集和测试集部分，对应文件如下：

- 带标签的训练数据：`train_shuffle.txt` 
- 不带标签的测试数据：`test_handout.txt`

`test_handout.txt`文件的行索引从0开始，对应于ID一列，评论内容为“展示”的预测概率应于Prediction一列。

需要注意的是，由于数据在标注时存在主观偏好，标记为“不展示”（0）的评论不一定是真正的负面评论，反之亦然。但是这种情况的存在，不会对任务造成很大的歧义，通过基准算法我们可以在测试集上实现很高的性能。

## 读取数据

In [1]:
import collections
import os
import random
import time
from tqdm import tqdm
import torch
from torch import nn
import pandas as pd
import torchtext.vocab as Vocab
import torch.utils.data as Data
import torch.nn.functional as F
import argparse
import torch
import torchtext.data as data
from torchtext.vocab import Vectors
import sys 
import re
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all" 


In [2]:
train = pd.read_csv('/home/cc/holdshy/XJQ/Pytorch/Dive_into_DL/text_cnn/train.csv', header=0)
valid = pd.read_csv('/home/cc/holdshy/XJQ/Pytorch/Dive_into_DL/text_cnn/valid.csv', header=0)
test = pd.read_csv('/home/cc/holdshy/XJQ/Pytorch/Dive_into_DL/text_cnn/test.csv', header=0)
train[:5]

Unnamed: 0,LABEL,TEXT
0,0,酸菜鱼不错
1,0,轻食素食都是友善的饮食方式
2,0,完爆中午吃的农家乐
3,1,烤鱼很入味
4,0,有种入口即化的感觉


# 预处理数据
## 词语切分

In [3]:
def load_word_vectors(model_name, model_path):
    vectors = Vectors(name=model_name, cache=model_path)
    print(len(vectors))
    return vectors

regex = re.compile(r'[^\u4e00-\u9fa5aA-Za-z0-9]')
def word_cut(text):
    text = regex.sub(' ', text)
    return [word for word in jieba.cut(text) if word.strip()]

## 构建字典

In [4]:
import jieba
def read_train(data_root):
    label, train = [], []
    with open(data_root, 'r', encoding='utf-8') as f:
        for line in f.readlines():
            line = line.strip()
            label.append(int(line.split('\t')[0]))
            train.append(list(jieba.cut(line.split('\t')[1])))
    return label, train
train_label, train = read_train('/home/cc/holdshy/XJQ/Pytorch/Dive_into_DL/text_cnn/review_train.txt')
valid_label, valid = read_train('/home/cc/holdshy/XJQ/Pytorch/Dive_into_DL/text_cnn/review_valid.txt')
# all_label, all_train = read_train('/home/kesci/train_shuffle.txt')

test = []
with open('/home/cc/holdshy/XJQ/Pytorch/Dive_into_DL/text_cnn/test_handout.txt', 'r', encoding='utf-8') as f:
    for line in f.readlines():
        line = line.strip()
        test.append(list(jieba.cut(line)))

def get_vocab(train):
    '''
    @params: data: 同上
    @return: 数据集上的词典，Vocab 的实例（freqs, stoi, itos）
    @counter: Counter({'的': 6935, '很': 3553, '不错': 2728, '恰到好处': 2243, '好': 1851, '味道': 1590, '了': 872, '都': 826, '好吃': 817, 
                     '是': 806, '一如既往': 770, '和': 711, '非常': 634, '装修': 514, '环境': 514, '还': 486, '吃': 484, '特别': 412, 
    '''
    counter = collections.Counter([word for sen in train for word in sen])
#     print(counter)
    return Vocab.Vocab(counter, min_freq=3)

vocab = get_vocab(train)
print('# words in vocab:', len(vocab))

Building prefix dict from the default dictionary ...
Loading model from cache /tmp/jieba.cache
Loading model cost 0.694 seconds.
Prefix dict has been built successfully.


# words in vocab: 2510


## 交叉验证

In [5]:
def MyDataset(root, state='Train', k=0):
    if state == 'Train':
        path = root + 'train_shuffle.csv'
        train_data = pd.read_csv(path, names = ['all'], header = None)
        data = pd.concat(
            [train_data[:int((k % 5) * len(train_data) / 5)],       # 10折：[:0]; [:1]
             train_data[int((k % 5 + 1) * len(train_data) / 5):]])  # 10折：[1:]; [2:]
        
        data['label'] = data['all'].str[0]
        data['text'] = data['all'].str[2:]
        data.pop('all')
        data = data.reset_index(drop = True)

        label = data['label'].tolist()
        label = [int(x) for x in label]
        text = []
        for sen in data['text'].values:
            text.append(list(jieba.cut(sen)))

    if state == 'Valid':
        path = root + 'train_shuffle.csv'
        train_data = pd.read_csv(path, names = ['all'], header = None)
        data = train_data[int((k % 10) * len(train_data) /10) : int((k % 10 + 1) * len(train_data) /10)]# [:1]; [1:2]
        
        data['label'] = data['all'].str[0]
        data['text'] = data['all'].str[2:]
        data.pop('all')
        data = data.reset_index(drop = True)

        label = data['label'].tolist()
        label = [int(x) for x in label]
        text = []
        for sen in data['text'].values:
            text.append(list(jieba.cut(sen)))
    
    return label, text


In [None]:
root = '/home/cc/holdshy/XJQ/Pytorch/Dive_into_DL/text_cnn/'
train_k0_label, train_k0_text = MyDataset(root, state='Train', k=0)
valid_k0_label, valid_k0_text = MyDataset(root, state='Valid', k=0)
# train_k0_label
# type(train_k0_text)
# len(train_k0_text)
# valid_k0_label
valid_k0_text

In [None]:
test[:5]
# len(all_train)
len(vocab)

词典和词语的索引创建好后，就可以将数据集的文本从字符串的形式转换为单词下标序列的形式，以待之后的使用。

## 文本截取（补0）

In [6]:
def preprocess_imdb(data, vocab):
    '''
    @params:
        data: 同上，原始的读入数据
        vocab: 训练集上生成的词典
    @return:
        features: 单词下标序列，形状为 (n, max_l) 的整数张量
        labels: 情感标签，形状为 (n,) 的0/1整数张量
    '''
    max_l = 12  # 将每条评论通过截断或者补0，使得长度变成500
    def pad(x):
        return x[:max_l] if len(x) > max_l else x + [0] * (max_l - len(x))
    features = torch.tensor([pad([vocab.stoi[word] for word in words]) for words in data])
    return features

train_ = preprocess_imdb(train, vocab)
train_y_ = torch.tensor(train_label)
valid_ = preprocess_imdb(valid, vocab)
valid_y_ = torch.tensor(valid_label)
pre_ = preprocess_imdb(test, vocab)

all_train_ = preprocess_imdb(train, vocab)
all_train_y = torch.tensor(train_label)
pre_

tensor([[   0,    3,  219,  ...,    0,    0,    0],
        [ 331,   93,    0,  ...,    0,    0,    0],
        [   7,  871,  351,  ...,    0,    0,    0],
        ...,
        [  28,  112,    6,  ...,    0,    0,    0],
        [  30,   14,    4,  ...,    0,    0,    0],
        [1020,  108,    2,  ...,    0,    0,    0]])

## 创建数据迭代器

利用 [`torch.utils.data.TensorDataset`](https://pytorch.org/docs/stable/data.html?highlight=tensor%20dataset#torch.utils.data.TensorDataset)，可以创建 PyTorch 格式的数据集，从而创建数据迭代器。

In [7]:
train_set = Data.TensorDataset(train_, train_y_)
test_set = Data.TensorDataset(valid_, valid_y_)
pre_set = Data.TensorDataset(pre_)

# 上面的代码等价于下面的注释代码
# train_features, train_labels = preprocess_imdb(train_data, vocab)
# test_features, test_labels = preprocess_imdb(test_data, vocab)
# train_set = Data.TensorDataset(train_features, train_labels)
# test_set = Data.TensorDataset(test_features, test_labels)

# len(train_set) = features.shape[0] or labels.shape[0]
# train_set[index] = (features[index], labels[index])
# print(train_set)

batch_size = 64
train_iter = Data.DataLoader(train_set, batch_size, shuffle=True)
test_iter = Data.DataLoader(test_set, batch_size)
pre_iter = Data.DataLoader(pre_set, 1)

for X, y in train_iter:
    print('X', X.shape, 'y', y.shape)
    break
print('#batches:', len(train_iter))

X torch.Size([64, 12]) y torch.Size([64])
#batches: 219


## 定义预处理函数（交叉验证使用）

In [8]:
def package(vocab, state, k):  # state='Train','Valid'; k=0~9
    label, spl_train = MyDataset('/home/cc/holdshy/XJQ/Pytorch/Dive_into_DL/text_cnn/', state, k)
    idx_train = preprocess_imdb(spl_train, vocab)
    train_y = torch.tensor(label)
    train_set = Data.TensorDataset(idx_train, train_y)
    batch_size = 64
    train_iter = Data.DataLoader(train_set, batch_size, shuffle=True)
#     for X, y in train_iter:
#         print('X', X.shape, 'y', y.shape)
#         break
#     print('#batches:', len(train_iter))
    return train_iter

# 训练模型
训练时可以调用之前编写的 `train` 及 `evaluate_accuracy` 函数。

In [9]:
def evaluate_accuracy(data_iter, net, device=None):
    if device is None and isinstance(net, torch.nn.Module):
        device = list(net.parameters())[0].device 
    acc_sum, n = 0.0, 0
    with torch.no_grad():
        for X, y in data_iter:
            if isinstance(net, torch.nn.Module):
                net.eval()
                acc_sum += (net(X.to(device)).argmax(dim=1) == y.to(device)).float().sum().cpu().item()
                net.train()
            else:
                if('is_training' in net.__code__.co_varnames):
                    acc_sum += (net(X, is_training=False).argmax(dim=1) == y).float().sum().item() 
                else:
                    acc_sum += (net(X).argmax(dim=1) == y).float().sum().item() 
            n += y.shape[0]
    return acc_sum / n

def train(train_iter, test_iter, net, loss, optimizer, device, num_epochs):
    net = net.to(device)
    print("training on ", device)
    batch_count = 0
    pre, lab = [], []
    for epoch in range(num_epochs):
        train_l_sum, train_acc_sum, n, start = 0.0, 0.0, 0, time.time()
        for X, y in train_iter:
            X = X.to(device)
            y = y.to(device)
            y_hat = net(X)
            pre.append(y_hat)
            lab.append(y)
            
            l = loss(y_hat, y) 
            optimizer.zero_grad()
            l.backward()
            optimizer.step()
            train_l_sum += l.cpu().item()
            train_acc_sum += (y_hat.argmax(dim=1) == y).sum().cpu().item()
            n += y.shape[0]
            batch_count += 1
        test_acc = evaluate_accuracy(test_iter, net)
        print('epoch %d, loss %.4f, train acc %.3f, test acc %.3f, time %.1f sec'
              % (epoch + 1, train_l_sum / batch_count, train_acc_sum / n, test_acc, time.time() - start))
    return pre, lab

# TextCNN模型
## MaxPooling

In [10]:
class GlobalMaxPool1d(nn.Module):
    def __init__(self):
        super(GlobalMaxPool1d, self).__init__()
    def forward(self, x):
        '''
        @params:
            x: 输入，形状为 (batch_size, n_channels, seq_len) 的张量
        @return: 时序最大池化后的结果，形状为 (batch_size, n_channels, 1) 的张量
        '''
        return F.max_pool1d(x, kernel_size=x.shape[2]) # kenerl_size=seq_len

## 加载预训练的词向量

由于预训练词向量的词典及词语索引与我们使用的数据集并不相同，所以需要根据目前的词典及索引的顺序来加载预训练词向量。

In [11]:
cache_dir = '/home/cc/holdshy/XJQ/Pytorch/Dive_into_DL/text_cnn/.vector_cache/'
glove_vocab = Vocab.GloVe(name='6B', dim=100, cache=cache_dir)

def load_pretrained_embedding(words, pretrained_vocab):
    '''
    @params:
        words: 需要加载词向量的词语列表，以 itos (index to string) 的词典形式给出
        pretrained_vocab: 预训练词向量
    @return:
        embed: 加载到的词向量
    '''
    embed = torch.zeros(len(words), pretrained_vocab.vectors[0].shape[0]) # 初始化为0
    oov_count = 0 # out of vocabulary
    for i, word in enumerate(words):
        try:
            idx = pretrained_vocab.stoi[word]
            embed[i, :] = pretrained_vocab.vectors[idx]
        except KeyError:
            oov_count += 1
    if oov_count > 0:
        print("There are %d oov words." % oov_count)
    return embed


## TextCNN网络

In [14]:
class TextCNN(nn.Module):
    def __init__(self, vocab, embed_size, kernel_sizes, num_channels):
        '''
        @params:
            vocab: 在数据集上创建的词典，用于获取词典大小
            embed_size: 嵌入维度大小
            kernel_sizes: 卷积核大小列表
            num_channels: 卷积通道数列表
        '''
        super(TextCNN, self).__init__()
        self.embedding = nn.Embedding(len(vocab), embed_size) # 参与训练的嵌入层
        self.constant_embedding = nn.Embedding(len(vocab), embed_size) # 不参与训练的嵌入层
        
        self.pool = GlobalMaxPool1d() # 时序最大池化层没有权重，所以可以共用一个实例
        self.convs = nn.ModuleList()  # 创建多个一维卷积层
        for c, k in zip(num_channels, kernel_sizes):
            self.convs.append(nn.Conv1d(in_channels = 2*embed_size, 
                                        out_channels = c, 
                                        kernel_size = k))
        self.softmax = nn.Softmax()
        self.sigmoid = nn.Sigmoid()
        self.decoder = nn.Linear(sum(num_channels), 2)
        self.dropout = nn.Dropout(0.5) # 丢弃层用于防止过拟合

    def forward(self, inputs):
        '''
        @params:
            inputs: 词语下标序列，形状为 (batch_size, seq_len) 的整数张量
        @return:
            outputs: 对文本情感的预测，形状为 (batch_size, 2) 的张量
        '''
        embeddings = torch.cat((
            self.embedding(inputs), 
            self.constant_embedding(inputs)), dim=2) # (batch_size, seq_len, 2*embed_size)
        # 根据一维卷积层要求的输入格式，需要将张量进行转置
        embeddings = embeddings.permute(0, 2, 1) # (batch_size, 2*embed_size, seq_len)
        
        encoding = torch.cat([
            self.pool(F.relu(conv(embeddings))).squeeze(-1) for conv in self.convs], dim=1)
        # encoding = []
        # for conv in self.convs:
        #     out = conv(embeddings) # (batch_size, out_channels, seq_len-kernel_size+1)
        #     out = self.pool(F.relu(out)) # (batch_size, out_channels, 1)
        #     encoding.append(out.squeeze(-1)) # (batch_size, out_channels)
        # encoding = torch.cat(encoding) # (batch_size, out_channels_sum)
        
        # outputs = self.softmax(self.dropout(encoding))
        # print()
        # 应用丢弃法后使用全连接层得到输出
        outputs = self.decoder(self.dropout(encoding))
        outputs = self.sigmoid(outputs)
        # print(outputs)
        return outputs

embed_size, kernel_sizes, nums_channels = 300, [3, 4], [100, 100]
net = TextCNN(vocab, embed_size, kernel_sizes, nums_channels)

zhihu = Vectors(name='sgns.zhihu.bigram', cache='/home/cc/holdshy/XJQ/Pytorch/Dive_into_DL/')
net.embedding.weight.data.copy_(load_pretrained_embedding(vocab.itos, zhihu))

# # net.embedding.weight.data.copy_(load_pretrained_embedding(vocab.itos, zhihu))
net.embedding.weight.requires_grad = False # 直接加载预训练好的, 所以不需要更新它

There are 190 oov words.


tensor([[ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
        [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
        [-0.6517,  0.5360,  0.3403,  ...,  0.8054,  0.1046,  0.1937],
        ...,
        [ 0.0179, -0.2923, -0.2269,  ...,  0.4407,  0.9959, -0.2199],
        [ 0.0693, -0.1167,  0.1685,  ...,  0.2784,  0.0289, -0.0813],
        [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000]])

# 训练并评价模型

In [18]:
lr, num_epochs = 0.001, 10
optimizer = torch.optim.Adam(filter(lambda p: p.requires_grad, net.parameters()), lr=lr)
loss = nn.CrossEntropyLoss()

# for k in range(5):
#     t_iter = package(vocab, 'Train', k)
#     v_iter = package(vocab, 'Valid', k)
#     pre, lab = train(t_iter, v_iter, net, loss, optimizer, device, num_epochs)

pre, lab = train(train_iter, test_iter, net, loss, optimizer, device, num_epochs)

training on  cuda
epoch 1, loss 0.4217, train acc 0.888, test acc 0.874, time 1.6 sec
epoch 2, loss 0.2099, train acc 0.889, test acc 0.877, time 1.6 sec
epoch 3, loss 0.1398, train acc 0.889, test acc 0.877, time 1.6 sec
epoch 4, loss 0.1046, train acc 0.892, test acc 0.873, time 1.6 sec
epoch 5, loss 0.0840, train acc 0.887, test acc 0.869, time 1.6 sec
epoch 6, loss 0.0698, train acc 0.887, test acc 0.870, time 1.6 sec
epoch 7, loss 0.0595, train acc 0.893, test acc 0.874, time 0.8 sec
epoch 8, loss 0.0522, train acc 0.891, test acc 0.869, time 0.7 sec
epoch 9, loss 0.0464, train acc 0.888, test acc 0.875, time 0.7 sec
epoch 10, loss 0.0415, train acc 0.894, test acc 0.870, time 0.7 sec


# 输出预测结果

In [None]:
test[:5]
pre_[:5]
# tensor([[   0,    3,  219,    0,    0,    0,    0,    0,    0,    0,    0,    0],
#         [ 331,   93, 3438,   17,   14,  155,    0,    0,    0,    0,    0,    0],
#         [   7,  871,  351,    5,    0,    0,    0,    0,    0,    0,    0,    0],
#         [8491,   27,    2,    5,    0,    0,    0,    0,    0,    0,    0,    0],
#         [ 548,  219,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0]])

In [17]:
df = pd.read_csv ("/home/cc/holdshy/XJQ/Pytorch/Dive_into_DL/text_cnn/submission.csv" , encoding = "utf-8")


In [None]:
net = net.to(device)
print("training on ", device)
pre = []
for X in pre_iter:
    # print(X)
    X = torch.tensor(X[0])
    X = X.to(device)
    y_hat = net(X)
    pre.append(y_hat.detach().cpu().numpy())

In [None]:
sub = []
for j in range(len(pre)):
    item = pre[j][0][1].item()
    sub.append(item)

# len(pre)
# pre[5][0][1].item()
sub

https://www.cnblogs.com/everfight/p/create_dataframe_from_different_type.html

In [None]:
ID = []
for i in range(0, 4189):
    ID.append(i)
ID[-1]
list_ = zip(ID, sub)
list_
# diction = dict(zip(ID,sub))
# diction
output = pd.DataFrame.from_records(list_, columns=['ID', 'Prediction'])
# output
# df = pd.DataFrame.from_records(sub, columns=['Prediction'])
# df

In [None]:
import csv
# index=False
output.to_csv ("/home/cc/holdshy/XJQ/Pytorch/Dive_into_DL/text_cnn/submission.csv" , encoding = "utf-8", index=False)
dfr = pd.read_csv ("/home/cc/holdshy/XJQ/Pytorch/Dive_into_DL/text_cnn/submission.csv" , encoding = "utf-8")
print (dfr)