In [3]:
import json
import requests

import torch
import torch.nn as nn
import torch.optim

import re
import jieba
from collections import Counter

import matplotlib.pyplot as plt
import numpy as np

In [None]:
# No need to run this crawler yourself; the already-downloaded data can be used directly.
def get_comments(url, timeout=10):
    """Download one page of comments and return their text.

    The response is decoded as GBK. Everything after the first ``(`` and before
    the last two characters of the body is parsed as JSON (presumably a JSONP
    wrapper such as ``callback(...);`` -- confirm against the endpoint), and the
    ``content`` field of each entry under the ``comments`` key is collected.

    Args:
        url: Address of the comment page to fetch.
        timeout: Seconds to wait for the server. Without a timeout
            ``requests.get`` can block indefinitely on a stalled connection.

    Returns:
        A list of comment strings, each terminated by a newline. The list is
        empty when the request fails, the status code is not 200, the body is
        empty, or the payload cannot be parsed.
    """
    try:
        resp = requests.get(url, timeout=timeout)
    except requests.RequestException as err:
        # One bad page should not abort a crawl over hundreds of pages.
        print('request failed: {}'.format(err))
        return []
    resp.encoding = 'gbk'

    if resp.status_code != 200:
        return []

    content = resp.text
    if not content:
        return []

    ind = content.find('(')
    payload = content[ind + 1 : -2]
    try:
        comment_infos = json.loads(payload)['comments']
    except (ValueError, KeyError, TypeError):
        # ValueError: malformed JSON; KeyError/TypeError: unexpected payload shape.
        # Deliberately narrower than a bare ``except`` so Ctrl-C still interrupts.
        print('error')
        return []

    return [comment_info['content'] + '\n' for comment_info in comment_infos]

good_comments = []
# URL templates for the review pages; each one is formatted with the page index.
# NOTE(review): the templates are blank in this copy of the notebook.
good_comment_url_templates = [
    '',
    '',
    '',
    '',
    ''
]

j = 0
for good_comment_url_template in good_comment_url_templates:
    # Simulate turning the page 100 times
    for i in range(100):
        url = good_comment_url_template.format(i)
        good_comments += get_comments(url)
        print('第 {} 条记录，总文本长度: {}'.format(j, len(good_comments)))
        j += 1

# The context manager flushes and closes the file even if writing fails.
with open('data/good.txt', 'w', encoding='utf-8') as fw:
    fw.writelines(good_comments)

In [12]:
# Paths of the two review text files passed to prepare_data (opened there as UTF-8).
good_file = 'emotion_classifier_data/good.txt'
bad_file = 'emotion_classifier_data/bad.txt'

def filter_punc(sentence):
    """Return ``sentence`` with whitespace and punctuation runs removed.

    Two character classes are stripped: whitespace plus common ASCII and
    quotation punctuation, and a set of full-width (Chinese) punctuation marks.
    """
    punctuation_runs = re.compile(
        "[\\s+\\.\\!\\/_,$%^*(+\\\"\\'“”《》?“]+|[+——！，。？、~@#￥%……&*（）：]+"
    )
    return punctuation_runs.sub("", sentence)

def _load_tokenized(path, is_filter):
    """Tokenize every line of ``path`` with jieba.

    Returns:
        ``(sentences, words, line_count)`` where ``sentences`` holds one token
        list per line that produced tokens, ``words`` is the flat list of all
        tokens in reading order, and ``line_count`` is the number of lines read.
    """
    sentences = []
    words_seen = []
    line_count = 0  # counted explicitly so an empty file reports 0 lines instead of raising
    with open(path, 'r', encoding='utf-8') as fr:
        for line in fr:
            line_count += 1
            if is_filter:
                line = filter_punc(line)
            words = jieba.lcut(line)  # word segmentation
            if words:
                words_seen.extend(words)
                sentences.append(words)
    return sentences, words_seen, line_count


def prepare_data(good_file, bad_file, is_filter=True):
    """Tokenize both review files and build the vocabulary.

    Args:
        good_file: Path of the file whose lines become ``pos_sentences``.
        bad_file: Path of the file whose lines become ``neg_sentences``.
        is_filter: When true, each line is passed through ``filter_punc``
            before segmentation.

    Returns:
        ``(pos_sentences, neg_sentences, diction)``. The sentence lists contain
        one token list per non-empty line. ``diction`` maps each word to
        ``[id, frequency]``; ids follow the order of first appearance, reading
        ``good_file`` first and ``bad_file`` second.
    """
    pos_sentences, pos_words, pos_lines = _load_tokenized(good_file, is_filter)
    print('{0} include {1} lines, {2} words.'.format(good_file, pos_lines, len(pos_words)))

    neg_sentences, neg_words, neg_lines = _load_tokenized(bad_file, is_filter)
    print('{0} include {1} lines, {2} words.'.format(bad_file, neg_lines, len(neg_words)))

    # Build the dictionary: word -> [id, frequency]
    cnt = Counter(pos_words + neg_words)  # frequency count; iterates in first-seen order
    diction = {}
    for word, freq in cnt.items():
        diction[word] = [len(diction), freq]
    print('diction size: {}'.format(len(diction)))
    return (pos_sentences, neg_sentences, diction)
            
# Tokenize both corpora with punctuation filtering enabled and build the vocabulary.
pos_sentences, neg_sentences, diction = prepare_data(good_file, bad_file, is_filter=True)
# (frequency, word) pairs in ascending order.
st = sorted((entry[1], word) for word, entry in diction.items())

def word2index(word, diction):
    """Return the id stored for ``word`` in ``diction``, or -1 if it is absent.

    ``diction`` maps a word to a sequence whose first element is the id.
    """
    return diction[word][0] if word in diction else -1

def index2word(index, diction):
    """Return the first word whose stored id equals ``index``, or None.

    This is a linear scan over ``diction`` in iteration order.
    """
    matches = (word for word, entry in diction.items() if entry[0] == index)
    return next(matches, None)

emotion_classifier_data/good.txt include 8089 lines, 100839 words.
emotion_classifier_data/bad.txt include 5076 lines, 56070 words.
diction size: 7135


In [28]:
# Look a word up by text, then look an id up again; each result is printed on its own line.
print(word2index('不错', diction), index2word(13, diction), sep='\n')

13
不错


In [108]:
# Sentence vectorization
# The vector has one slot per dictionary entry; slot i holds the relative
# frequency of word id i within the sentence.
def sentence2vec(sentence, dictionary):
    """Turn a list of word ids into a normalized bag-of-words vector.

    Args:
        sentence: Sequence of integer word ids, each a valid index into a
            vector of length ``len(dictionary)``.
        dictionary: The vocabulary; only its size is used.

    Returns:
        A float array of length ``len(dictionary)`` whose entries are the
        occurrence counts divided by ``len(sentence)``, so every entry lies in
        [0, 1]. An empty sentence yields the all-zero vector rather than NaNs
        from a 0/0 division.
    """
    vector = np.zeros(len(dictionary))
    if len(sentence) == 0:
        return vector
    for word_id in sentence:
        vector[word_id] += 1
    return 1.0 * vector / len(sentence)  # scale into the 0~1 range

dataset = []
labels = []
sentences = []  # original tokenized sentences, kept for debugging

# Vectorize positive reviews (label 0) first, then negative reviews (label 1).
for group, label in ((pos_sentences, 0), (neg_sentences, 1)):
    for sentence in group:  # already segmented into words
        new_sentence_index = [word2index(l, diction) for l in sentence if l in diction]
        dataset.append(sentence2vec(new_sentence_index, diction))
        labels.append(label)
        sentences.append(sentence)

# Shuffle with a fixed seed so the train/validation/test split built from this
# order is the same on every run.
SHUFFLE_SEED = 0
indices = np.random.RandomState(SHUFFLE_SEED).permutation(len(dataset))
dataset = [dataset[i] for i in indices]
labels = [labels[i] for i in indices]
sentences = [sentences[i] for i in indices]
print(len(dataset[0]))
print(dataset[0][30:128])
print(sentences[0])

<class 'list'>
7135
[0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.33333333 0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.        ]
['布料', '很', 

In [101]:
# One tenth of the data each for validation and test; the rest is for training.
test_size = len(dataset) // 10

# First tenth: validation set
validate_data, validate_label = dataset[:test_size], labels[:test_size]
# Second tenth: test set
test_data, test_label = dataset[test_size:2 * test_size], labels[test_size:2 * test_size]
# Everything after that: training set
train_data, train_label = dataset[2 * test_size:], labels[2 * test_size:]

print(len(validate_data))

1303


In [None]:
# Build the neural network: a vocabulary-sized bag-of-words input -> 10 hidden units -> 2 classes
# NOTE(review): `m` is not referenced by any cell visible here — confirm before removing.
m = nn.ReLU()
model = nn.Sequential(
    nn.Linear(len(diction), 10),
    nn.ReLU(), # max(0,x)
    nn.Linear(10, 2),
    nn.LogSoftmax(dim=1), # dim=1 normalizes along the second dimension, i.e. per sample across the classes; the outputs are log-probabilities
)

# Classification accuracy helper
def rightness(predictions, labels):
    """Count how many predictions match their labels.

    Args:
        predictions: Tensor of per-class scores with the classes along
            dimension 1; the index of the largest score is the predicted class.
        labels: Tensor of class indices, viewed in the shape of the predicted
            indices before comparison.

    Returns:
        ``(rights, count)``: the number of correct predictions as a 0-d tensor
        and ``len(labels)``.
    """
    # torch.max along dim 1 returns (values, indices); only the indices matter.
    _, pred = torch.max(predictions.data, 1)
    target = labels.data.view_as(pred)
    rights = (pred == target).sum()
    return rights, len(labels)

# Classification loss: NLLLoss on the log-probabilities from LogSoftmax, i.e. cross-entropy
cost = nn.NLLLoss()
# Plain SGD with a fixed learning rate (it does not adapt the rate by itself)
optimizer = torch.optim.SGD(model.parameters(), lr=0.01)
records = []  # one [mean training loss, mean validation loss, validation accuracy] per checkpoint

losses = []  # every per-sample training loss since the start of training
for epoch in range(10):
    for i, data in enumerate(zip(train_data, train_label)):
        x, y = data
        x = torch.FloatTensor(x).view(1, -1)  # shape [1, len(diction)]
        y = torch.LongTensor(np.array([y]))  # shape [1]

        optimizer.zero_grad()  # clear gradients left over from the previous step
        predict = model(x)
        loss = cost(predict, y)

        losses.append(loss.item())

        loss.backward()  # back-propagate gradients
        optimizer.step()  # update the parameters

        # Every 3000 samples, evaluate on the validation set. This is evaluation only,
        # so no autograd graph is built and no gradients flow into training.
        if i % 3000 == 0:
            val_losses = []
            rights = []
            with torch.no_grad():
                for val_x, val_y in zip(validate_data, validate_label):
                    val_x = torch.FloatTensor(val_x).view(1, -1)  # shape [1, len(diction)]
                    val_y = torch.LongTensor(np.array([val_y]))  # shape [1]

                    val_predict = model(val_x)
                    rights.append(rightness(val_predict, val_y))  # (correct, count) for this sample
                    val_losses.append(cost(val_predict, val_y).item())

            # Mean accuracy over the validation set
            correct = sum(int(r[0]) for r in rights)
            total = sum(r[1] for r in rights)
            right_ratio = 1.0 * correct / total if total else float('nan')

            print('第 {} 轮，训练损失: {:.2f}，校验损失: {:.2f}，校验准确率: {:.2f}'
                  .format(epoch, np.mean(losses), np.mean(val_losses), right_ratio))
            records.append([np.mean(losses), np.mean(val_losses), right_ratio])
            

第 0 轮，训练损失: 0.89，校验损失: 0.75，校验准确率: 0.39
第 0 轮，训练损失: 0.66，校验损失: 0.64，校验准确率: 0.61
第 0 轮，训练损失: 0.64，校验损失: 0.56，校验准确率: 0.69
第 0 轮，训练损失: 0.60，校验损失: 0.48，校验准确率: 0.73
第 1 轮，训练损失: 0.58，校验损失: 0.43，校验准确率: 0.87
第 1 轮，训练损失: 0.55，校验损失: 0.40，校验准确率: 0.87
第 1 轮，训练损失: 0.53，校验损失: 0.38，校验准确率: 0.88
第 1 轮，训练损失: 0.51，校验损失: 0.37，校验准确率: 0.87
第 2 轮，训练损失: 0.50，校验损失: 0.36，校验准确率: 0.89
第 2 轮，训练损失: 0.49，校验损失: 0.35，校验准确率: 0.89
第 2 轮，训练损失: 0.48，校验损失: 0.34，校验准确率: 0.89
第 2 轮，训练损失: 0.47，校验损失: 0.35，校验准确率: 0.88
第 3 轮，训练损失: 0.47，校验损失: 0.33，校验准确率: 0.90
第 3 轮，训练损失: 0.46，校验损失: 0.33，校验准确率: 0.90
第 3 轮，训练损失: 0.45，校验损失: 0.32，校验准确率: 0.90
第 3 轮，训练损失: 0.44，校验损失: 0.33，校验准确率: 0.88
第 4 轮，训练损失: 0.44，校验损失: 0.32，校验准确率: 0.90
第 4 轮，训练损失: 0.43，校验损失: 0.31，校验准确率: 0.90
第 4 轮，训练损失: 0.43，校验损失: 0.31，校验准确率: 0.90
第 4 轮，训练损失: 0.42，校验损失: 0.32，校验准确率: 0.89
第 5 轮，训练损失: 0.42，校验损失: 0.31，校验准确率: 0.90
第 5 轮，训练损失: 0.42，校验损失: 0.30，校验准确率: 0.90
第 5 轮，训练损失: 0.41，校验损失: 0.30，校验准确率: 0.90
第 5 轮，训练损失: 0.41，校验损失: 0.31，校验准确率: 0.89
第 6 轮，训练损失: 0.41，校验损失: 0.30，校验准确率: 0.90
