In [7]:
import pandas as pd
from sklearn.model_selection import train_test_split
from utils import *

In [8]:
# Settings
# ----------------------------------------------
# Input files: the labelled Weibo CSV and the stop-word list.
data_path = 'data/weibo_senti_100k.csv'
stopwords_path = 'data/baidu_stopwords.txt'
# Label id -> display name (0: negative, 1: positive).
moods = {0: '负向', 1: '正向'}
# Passed as N= to pre_process and as the first argument of DNN in later cells.
N = 3000


# Load the dataset and report its size
# ----------------------------------------------
pd_all = pd.read_csv(data_path)

# Overall row count, followed by one line per sentiment class.
total_rows = len(pd_all)
print('数目（总体）：%d' % total_rows)
for label_id in moods:
    # Summing the boolean mask counts the rows carrying this label.
    class_rows = int((pd_all.label == label_id).sum())
    print('数目（{}): {}'.format(moods[label_id], class_rows))

数目（总体）：119988
数目（负向): 59995
数目（正向): 59993


In [9]:
# Draw a 10,000-row subsample to keep preprocessing and training fast.
# The draw is seeded (same seed as the train/test split in the training cell)
# so a fresh Restart & Run All works on the same rows every time.
s = pd_all.sample(10000, random_state=42)

# The text is read positionally from the second column (position 1), as the
# labels are read from position 0 in the training cell.
texts = s.iloc[:, 1].tolist()

# pre_process comes from utils; it returns the encoded texts and the
# dictionary it built. N is forwarded as the N= keyword.
text_encoding, dictionary = pre_process(texts, stopwords_path, N=N)
print(text_encoding.shape)

# Persist the dictionary, one entry per line. The entries include Chinese
# tokens, so the encoding is pinned to UTF-8 instead of relying on the
# platform default (which can fail or produce a non-UTF-8 file, e.g. on
# Windows).
with open('dictionary.txt', 'w', encoding='utf-8') as dict_file:
    dict_file.writelines(str(item) + '\n' for item in dictionary)

停用词：1396
字典： ['[', ']', ' ', '，', '@', '的', '/', ':', '了', '！', '。', '哈哈', '泪', '是', '在', '？', '嘻嘻', '啊', '都', '有', '?', '不', '就', '好', '爱', '也', '~', '抓狂', '吧', '人', '回复', '-', '鼓掌', '要', '去', '#', '：', '还', '太', '和', '衰', '来', '_', '给', '说', '怒', '晕', '吃', '小', '看', '很', '.', '到', '偷笑', '啦', '…', '一个', '又', '大', '个', '没', '多', '呢', '吗', '想', '会', '上', '一', '着', 'cn', 'http', 't', '微', '被', '就是', '博', '让', '、', '得', '可爱', '哦', '大家', '能', '开心', '们', '北京', '可以', '还是', '“', '中', '没有', '做', '”', '把', '!', '最', '真', '过', '呀', '转发', '不是', '才', '中国', '再', '对', '心', '【', '】', '喜欢', '还有', '里', '为', '之', '等', '下', '谢谢', '月', '一起', '知道', '点', '真的', '汗', ',', '赞', '看到', '请', '么', '已经', '快', '朋友', '...', '与', '用', '老', '～', '带', '叫', '更', '开始', '后', '时候', '一下', '力', '走', '呵呵', '不能', '跟', '但', '哈', '美食', '）', '起来', 'good', '一定', '》', '《', '可', '死', '年', '真是', '不要', '买', '（', '笑', '新', '美', '威武', '找', '时间', '比', '生活', '感觉', '只', '转', '从', '旅游', '出来', '微博', '不错', '酒店', '馋嘴', '孩子', '觉得', '可怜', '老师', '

In [10]:
from model.DecisionTree import DecisionTree
from model.DNN import DNN
from model.NaiveBayes import NaiveBayes
from model.SVM import SVM

# Features are the encoded texts; labels are read from the first column
# (position 0) of the sampled rows.
# NOTE(review): this assumes pre_process keeps the encoded rows in the same
# order as the sampled rows - confirm against utils.
X = text_encoding
y = [item[0] for item in s.values]

# Hold out 20% of the sample for evaluation; the split is seeded.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

SEPARATOR = '-------------------------------------------'


def _fit_and_report(name, model):
    """Train `model` on the training split, then print its held-out report.

    Args:
        name: Heading printed above the evaluation report.
        model: Object exposing `train(X, y)`; it is handed to `test` from utils.

    Returns:
        The same model instance, now trained, so the call can be assigned.
    """
    model.train(X_train, y_train)
    print(name)
    test(model, X_test, y_test)
    print(SEPARATOR)
    return model


# Each model is constructed, trained and reported in turn, in this order.
decision_tree = _fit_and_report('DecisionTree', DecisionTree())
naive_bayes = _fit_and_report('NaiveBayes', NaiveBayes())
# SVM is left disabled, as it was in the original run:
# svm = _fit_and_report('SVM', SVM())
dnn = _fit_and_report('DNN', DNN(N, len(moods)))


# Evaluate the combined models on the held-out split only. Scoring on the
# full X, y would include the 80% of rows the models were trained on, which
# inflates the result and makes it incomparable with the per-model reports.
# NOTE(review): assumes assemble_test only evaluates the given data and does
# no splitting of its own - confirm against utils.
models = [decision_tree, naive_bayes, dnn]
print('Assemble')
assemble_test(models, X_test, y_test)
print(SEPARATOR)
    

DecisionTree
Accuracy: 0.966
              precision    recall  f1-score   support

           0       0.97      0.96      0.97      1038
           1       0.96      0.97      0.96       962

    accuracy                           0.97      2000
   macro avg       0.97      0.97      0.97      2000
weighted avg       0.97      0.97      0.97      2000

-------------------------------------------
NaiveBayes
Accuracy: 0.942
              precision    recall  f1-score   support

           0       0.94      0.95      0.94      1038
           1       0.95      0.93      0.94       962

    accuracy                           0.94      2000
   macro avg       0.94      0.94      0.94      2000
weighted avg       0.94      0.94      0.94      2000

-------------------------------------------
DNN
Accuracy: 0.9565
              precision    recall  f1-score   support

           0       0.96      0.95      0.96      1038
           1       0.95      0.96      0.95       962

    accuracy     

In [11]:
# Persist each trained model under param/: naive Bayes first, then the
# decision tree, then the DNN.
for trained_model, target_path in (
    (naive_bayes, 'param/naive_bayes_model.pth'),
    (decision_tree, 'param/decision_tree_model.pth'),
    (dnn, 'param/dnn_model.pth'),
):
    trained_model.save(target_path)

保存到: param/naive_bayes_model.pth
保存到: param/decision_tree_model.pth
保存到: param/dnn_model.pth
