In [150]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.utils.data.dataloader as dataloader
import torch.optim as optim
import time
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import jieba
import time
from itertools import chain
from sklearn.metrics import accuracy_score

In [151]:
train_data = pd.read_csv('data/train.tsv', sep='\t')
valid_data = pd.read_csv('data/dev.tsv', sep='\t')
test_data = pd.read_csv('data/test.tsv', sep='\t') 
x_train, y_train = train_data.text_a.values, train_data.label.values # 训练集
x_valid, y_valid = valid_data.text_a.values, valid_data.label.values # 验证集
x_test, y_test = test_data.text_a.values, test_data.label.values # 测试集

In [152]:
vocab = set()
cut_docs = [" ".join(jieba.cut(x)) for x in train_data.text_a.values]
cut_docs = list(cut_docs)
print(cut_docs[0])

# 创建停用词列表
stopword_path = 'data/chinese_stopwords.txt'
def get_stop_words():
    stopwords = [line.strip() for line in open(stopword_path, encoding='UTF-8').readlines()]
    stopwords += [",", "'"]
    return stopwords
stopwords = get_stop_words()

# 去除停用词
segs_without_stop = []
for sentence in cut_docs:
    sentWords = [x.strip() for x in sentence.split(' ') if x.strip() and x.strip() not in stopwords]
    segs_without_stop.append(' '.join(sentWords))

print(segs_without_stop[0])

for doc in segs_without_stop:
    for word in doc:
        if word.strip():
            vocab.add(word.strip())

segs = segs_without_stop

# ---------------------------------------------------------------------------------------------------
from collections import Counter
print('-' * 20 + 'TEXT ' + '-' * 20)
print("Total number: {}".format(len(segs)))
print("Average length: {}".format(np.mean([len(sentence.split()) for sentence in segs])))
print("Max length: {}".format(np.max([len(sentence.split()) for sentence in segs])))
print("Min length: {}".format(np.min([len(sentence.split()) for sentence in segs])))
pos_text_seg = " ".join(segs)
c = Counter(pos_text_seg.split()).most_common(100)
print("Most common words : \n{} \n".format(c))
# ---------------------------------------------------------------------------------------------------

# 将词表写入本地vocab.txt文件
with open('data/vocab.txt', 'w') as file:
    for word in  vocab:
        file.write(word)
        file.write('\n')

选择 珠江 花园 的 原因 就是 方便 ， 有 电动 扶梯 直接 到达 海边 ， 周围 餐馆 、 食廊 、 商场 、 超市 、 摊位 一应俱全 。 酒店 装修 一般 ， 但 还 算 整洁 。   泳池 在 大堂 的 屋顶 ， 因此 很小 ， 不过 女儿 倒 是 喜欢 。   包 的 早餐 是 西式 的 ， 还 算 丰富 。   服务 吗 ， 一般
选择 珠江 花园 原因 方便 电动 扶梯 直接 到达 海边 周围 餐馆 食廊 商场 超市 摊位 一应俱全 酒店 装修 算 整洁 泳池 大堂 屋顶 很小 女儿 倒 喜欢 包 早餐 西式 算 丰富 服务
--------------------TEXT --------------------
Total number: 9146
Average length: 36.85928274655587
Max length: 13236
Min length: 0
Most common words : 
[('酒店', 4312), ('没有', 3197), ('房间', 3077), ('不错', 2666), ('说', 2135), ('本书', 1860), ('一个', 1848), ('感觉', 1818), ('比较', 1690), ('买', 1681), ('服务', 1601), ('书', 1522), ('住', 1487), ('非常', 1378), ('喜欢', 1192), ('入住', 1191), ('!', 1164), ('没', 1146), ('会', 1119), ('知道', 959), ('太', 958), ('有点', 957), ('觉得', 956), ('孩子', 936), ('问题', 935), ('）', 912), ('很多', 901), ('价格', 899), ('中', 892), ('（', 845), ('系统', 802), ('想', 802), ('时', 791), ('前台', 788), ('早餐', 771), ('方便', 768), ('月', 762), ('携程', 748), ('里', 722), ('服务员', 718), ('差', 682), ('发现', 681), ('不能', 670), ('东西', 65

In [153]:
# 构建词表
vocab = set(vocab)
vocab_size = len(vocab)
word_to_idx = {word: i + 1 for i, word in enumerate(vocab)}
word_to_idx['<unk>'] = 0
idx_to_word = {i + 1: word for i, word in enumerate(vocab)}
idx_to_word[0] = '<unk>'

In [154]:
def encode_samples(tokenized_samples, vocab):
    features = []
    for sample in tokenized_samples:
        feature = []
        for token in sample:
            if token in word_to_idx:
                feature.append(word_to_idx[token])
            else:
                feature.append(0)
        features.append(feature)
    return features

def pad_samples(features, maxlen=500, PAD=0):
    padded_features = []
    for feature in features:
        if len(feature) >= maxlen:
            padded_feature = feature[:maxlen]
        else:
            padded_feature = feature
            while(len(padded_feature) < maxlen):
                padded_feature.append(PAD)
        padded_features.append(padded_feature)
    return padded_features

In [155]:
train_features = torch.tensor(pad_samples(encode_samples(x_train, vocab)))
train_labels = torch.tensor([score for score in y_train])
test_features = torch.tensor(pad_samples(encode_samples(x_test, vocab)))
test_labels = torch.tensor([score for score in y_test])
valid_features = torch.tensor(pad_samples(encode_samples(x_valid, vocab)))
valid_labels = torch.tensor([score for score in y_valid])

In [156]:
print(train_features)
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

tensor([[1725, 1818, 2998,  ...,    0,    0,    0],
        [1932, 1841,  505,  ...,    0,    0,    0],
        [3151,   61, 1240,  ...,    0,    0,    0],
        ...,
        [2848, 2699, 2272,  ...,    0,    0,    0],
        [2432, 1573, 1363,  ...,    0,    0,    0],
        [1997, 2688, 1799,  ...,    0,    0,    0]])


In [157]:
class SentimentNet(nn.Module):
    def __init__(self, vocab_size, embed_size, num_hiddens, num_layers,
                 bidirectional, labels, **kwargs):
        super(SentimentNet, self).__init__(**kwargs)
        self.num_hiddens = num_hiddens
        self.num_layers = num_layers
        self.bidirectional = bidirectional
        self.embedding = nn.Embedding(vocab_size, embed_size)
        self.encoder = nn.LSTM(input_size=embed_size, hidden_size=self.num_hiddens,
                               num_layers=num_layers, bidirectional=self.bidirectional,
                               dropout=0)
        if self.bidirectional:
            self.decoder = nn.Linear(num_hiddens * 4, labels)
        else:
            self.decoder = nn.Linear(num_hiddens * 2, labels)

    def forward(self, inputs):
        embeddings = self.embedding(inputs)
        # states: [num_layers * num_directions, batch_size, num_hiddens],
        # hidden: [num_layers * num_directions, batch_size, num_hiddens]
        states, hidden = self.encoder(embeddings.permute([1, 0, 2]))
        # encoding: [batch_size, num_hiddens]
        encoding = torch.cat([states[0], states[-1]], dim=1)
        outputs = self.decoder(encoding)
        return outputs
    
    def single_predict(self, samples):
        samples = encode_samples(samples, vocab)
        padded_samples = pad_samples(samples, 500, PAD = 0)
        padded_samples = torch.tensor(padded_samples)
        padded_samples = padded_samples.reshape(len(samples), 1, -1)
        padded_samples = padded_samples.to(device)
        result_list = []
        for i in range(len(samples)):
            outputs = self.forward(padded_samples[i])
            result_list.append(outputs.argmax(dim=1).item())
        return result_list
    

        # inputs = jieba.cut(inputs)
        # # drop the stop words
        # inputs = [word for word in inputs if word not in stopwords]
        # # convert the words to indices
        # feature = []
        # for token in inputs:
        #     if token in word_to_idx:
        #         feature.append(word_to_idx[token])
        #     else:
        #         feature.append(0)
        # # padding
        # if len(feature) >= 500:
        #     padded_feature = feature[:500]
        # else:
        #     padded_feature = feature
        #     while(len(padded_feature) < 500):
        #         padded_feature.append(0)
        # padded_feature = torch.tensor(padded_feature)
        # padded_feature = padded_feature.reshape(1, -1)
        # # sent to device
        # padded_feature = padded_feature.to(device)
        # # predict
        # outputs = self.forward(padded_feature)
        # return outputs.argmax(dim=1).item()

In [158]:
num_epochs = 50
embed_size = 100
num_hiddens = 100
num_layers = 2
bidirectional = True
batch_size = 128
labels = 2
lr = 0.3
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

net = SentimentNet(vocab_size=(vocab_size+1), embed_size=embed_size,
                   num_hiddens=num_hiddens, num_layers=num_layers,
                   bidirectional=bidirectional,
                   labels=labels)
net.to(device)
loss_function = nn.CrossEntropyLoss()
optimizer = optim.SGD(net.parameters(), lr=lr)

In [159]:
train_set = torch.utils.data.TensorDataset(train_features, train_labels)
test_set = torch.utils.data.TensorDataset(test_features, test_labels)

train_iter = torch.utils.data.DataLoader(train_set, batch_size=batch_size,
                                         shuffle=True)
test_iter = torch.utils.data.DataLoader(test_set, batch_size=batch_size,
                                        shuffle=False)
valid_set = torch.utils.data.TensorDataset(valid_features, valid_labels)
valid_iter = torch.utils.data.DataLoader(valid_set, batch_size=batch_size,
                                            shuffle=False)


In [160]:
best_valid_acc = 0
for epoch in range(num_epochs):
    start = time.time()
    train_loss, valid_losses = 0, 0
    train_acc, valid_acc = 0, 0
    n, m = 0, 0
    for feature, label in train_iter:
        n += 1
        net.zero_grad()
        feature = feature.cuda()
        label = label.cuda()
        score = net(feature)
        # print(score.type)
        score = score.to(device=device)
        loss = loss_function(score, label)
        loss.backward()
        optimizer.step()
        train_acc += accuracy_score(torch.argmax(score.cpu().data,
                                                 dim=1), label.cpu())
        train_loss += loss
    with torch.no_grad():
        # use valid set to test
            for valid_feature, valid_label in valid_iter:
                m += 1
                valid_feature = valid_feature.cuda()
                valid_label = valid_label.cuda()
                valid_score = net(valid_feature)
                valid_loss = loss_function(valid_score, valid_label)
                valid_acc += accuracy_score(torch.argmax(valid_score.cpu().data,
                                                        dim=1), valid_label.cpu())
        # use test set to test
    # save the best model
    if (valid_acc / m > 0.82) and (valid_acc / m > best_valid_acc):
        torch.save(net, './checkpoints/best_model_bilstm.pkl')
        best_valid_acc = valid_acc / m
    end = time.time()
    runtime = end - start
    # print the result
    print('epoch %d, train loss %.4f, train acc %.3f, valid loss %.4f, valid acc %.3f, time %.1f sec'
            % (epoch + 1, train_loss / n, train_acc / n, valid_loss / m, valid_acc / m, runtime))

epoch 1, train loss 0.6908, train acc 0.527, valid loss 0.0687, valid acc 0.560, time 35.0 sec
epoch 2, train loss 0.6770, train acc 0.582, valid loss 0.0667, valid acc 0.581, time 35.0 sec
epoch 3, train loss 0.6387, train acc 0.625, valid loss 0.0628, valid acc 0.657, time 35.1 sec
epoch 4, train loss 0.6047, train acc 0.665, valid loss 0.0622, valid acc 0.639, time 35.2 sec
epoch 5, train loss 0.5816, train acc 0.690, valid loss 0.0605, valid acc 0.650, time 35.1 sec
epoch 6, train loss 0.5512, train acc 0.716, valid loss 0.0572, valid acc 0.720, time 35.4 sec
epoch 7, train loss 0.5272, train acc 0.740, valid loss 0.0566, valid acc 0.732, time 35.2 sec
epoch 8, train loss 0.5131, train acc 0.752, valid loss 0.0489, valid acc 0.749, time 35.5 sec
epoch 9, train loss 0.4859, train acc 0.770, valid loss 0.0570, valid acc 0.722, time 36.0 sec
epoch 10, train loss 0.4709, train acc 0.779, valid loss 0.0475, valid acc 0.749, time 36.3 sec
epoch 11, train loss 0.4578, train acc 0.784, val

In [190]:
# load the best model and test with 
from sklearn.metrics import classification_report
net_bilstm = torch.load('./checkpoints/best_model_bilstm.pkl')
net_bilstm.eval()
result = []
with torch.no_grad():
    for test_feature, test_label in test_iter:
        test_feature = test_feature.cuda()
        test_label = test_label.cuda()
        test_score = net_bilstm(test_feature)
        test_acc = accuracy_score(torch.argmax(test_score.cpu().data,
                                                dim=1), test_label.cpu())
        result.extend(torch.argmax(test_score.cpu().data, dim=1).numpy().tolist())

print(classification_report(y_test, result, digits=4))

              precision    recall  f1-score   support

           0     0.8615    0.8615    0.8615       592
           1     0.8651    0.8651    0.8651       608

    accuracy                         0.8633      1200
   macro avg     0.8633    0.8633    0.8633      1200
weighted avg     0.8633    0.8633    0.8633      1200



In [193]:
net_bilstm.single_predict(['房间差劲','绝绝子'])

[0, 1]

In [163]:
num_epochs = 40
embed_size = 200
num_hiddens = 100
num_layers = 2
bidirectional = False
batch_size = 128
labels = 2
lr = 0.2
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

net = SentimentNet(vocab_size=(vocab_size+1), embed_size=embed_size,
                   num_hiddens=num_hiddens, num_layers=num_layers,
                   bidirectional=bidirectional,
                   labels=labels)
net.to(device)
loss_function = nn.CrossEntropyLoss()
optimizer = optim.SGD(net.parameters(), lr=lr)

In [164]:
best_valid_acc = 0
for epoch in range(num_epochs):
    start = time.time()
    train_loss, valid_losses = 0, 0
    train_acc, valid_acc = 0, 0
    n, m = 0, 0
    for feature, label in train_iter:
        n += 1
        net.zero_grad()
        feature = feature.cuda()
        label = label.cuda()
        score = net(feature)
        # print(score.type)
        score = score.to(device=device)
        loss = loss_function(score, label)
        loss.backward()
        optimizer.step()
        train_acc += accuracy_score(torch.argmax(score.cpu().data,
                                                 dim=1), label.cpu())
        train_loss += loss
    with torch.no_grad():
        # use valid set to test
            for valid_feature, valid_label in valid_iter:
                m += 1
                valid_feature = valid_feature.cuda()
                valid_label = valid_label.cuda()
                valid_score = net(valid_feature)
                valid_loss = loss_function(valid_score, valid_label)
                valid_acc += accuracy_score(torch.argmax(valid_score.cpu().data,
                                                        dim=1), valid_label.cpu())
        # use test set to test
    # save the best model
    if (valid_acc / m > 0.65) and (valid_acc / m > best_valid_acc):
        torch.save(net, './checkpoints/best_model_lstm.pkl')
        best_valid_acc = valid_acc / m
    end = time.time()
    runtime = end - start
    # print the result
    print('epoch %d, train loss %.4f, train acc %.3f, valid loss %.4f, valid acc %.3f, time %.1f sec'
            % (epoch + 1, train_loss / n, train_acc / n, valid_loss / m, valid_acc / m, runtime))

epoch 1, train loss 0.6931, train acc 0.511, valid loss 0.0693, valid acc 0.529, time 15.1 sec
epoch 2, train loss 0.6926, train acc 0.516, valid loss 0.0694, valid acc 0.543, time 14.9 sec
epoch 3, train loss 0.6915, train acc 0.520, valid loss 0.0696, valid acc 0.505, time 15.0 sec
epoch 4, train loss 0.6905, train acc 0.524, valid loss 0.0697, valid acc 0.501, time 15.0 sec
epoch 5, train loss 0.6886, train acc 0.543, valid loss 0.0698, valid acc 0.553, time 15.0 sec
epoch 6, train loss 0.6865, train acc 0.559, valid loss 0.0701, valid acc 0.566, time 15.0 sec
epoch 7, train loss 0.6825, train acc 0.575, valid loss 0.0704, valid acc 0.557, time 15.1 sec
epoch 8, train loss 0.6762, train acc 0.587, valid loss 0.0709, valid acc 0.586, time 15.3 sec
epoch 9, train loss 0.6686, train acc 0.595, valid loss 0.0708, valid acc 0.577, time 15.4 sec
epoch 10, train loss 0.6592, train acc 0.611, valid loss 0.0711, valid acc 0.595, time 15.4 sec
epoch 11, train loss 0.6505, train acc 0.618, val

In [165]:
# load the best model and test with 
from sklearn.metrics import classification_report
net_lstm = torch.load('./checkpoints/best_model_lstm.pkl')
net_lstm.eval()
result = []
with torch.no_grad():
    for test_feature, test_label in test_iter:
        test_feature = test_feature.cuda()
        test_label = test_label.cuda()
        test_score = net_lstm(test_feature)
        test_acc = accuracy_score(torch.argmax(test_score.cpu().data,
                                                dim=1), test_label.cpu())
        result.extend(torch.argmax(test_score.cpu().data, dim=1).numpy().tolist())

print(classification_report(y_test, result, digits=4))

              precision    recall  f1-score   support

           0     0.6307    0.6030    0.6166       592
           1     0.6293    0.6562    0.6425       608

    accuracy                         0.6300      1200
   macro avg     0.6300    0.6296    0.6295      1200
weighted avg     0.6300    0.6300    0.6297      1200



In [177]:
net.single_predict(['房间太小。其他的都一般。。。。。。。。。'])

[1]