In [17]:
import pandas as pd
import jieba
import jieba.posseg as posseg
import numpy as np
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import train_test_split
import torch



In [18]:
# Load the two-label Weibo sentiment corpus; `moods` maps label ids to display names.
pd_all = pd.read_csv('weibo_senti_100k.csv')
moods = {0: '负向', 1: '正向'}

# Alternative four-label corpus, kept for reference (swap both assignments to use it):
# pd_all = pd.read_csv('simplifyweibo_4_moods.csv')
# moods = {0: '喜悦', 1: '愤怒', 2: '厌恶', 3: '低落'}

# Overall row count first, then one line per label.
total_rows = len(pd_all)
print('微博数目（总体）：%d' % total_rows)

for label_id in moods:
    rows_with_label = int((pd_all.label == label_id).sum())
    print('微博数目（{}): {}'.format(moods[label_id], rows_with_label))

微博数目（总体）：119988
微博数目（负向): 59995
微博数目（正向): 59993


In [19]:


def get_top_words(texts, stop_words, remove=None, top=3000):
    """Return the most frequent words across ``texts``, ranked by document frequency.

    Every text is segmented with ``posseg.cut``. A word is counted at most once
    per text, so the score of a word is the number of texts that contain it.

    Args:
        texts: iterable of strings, ``[text1, text2, ...]``.
        stop_words: ``[word1, word2, ...]``. Accepted for interface
            compatibility but currently NOT applied; the stop-word filter is
            disabled in this implementation.
        remove: part-of-speech initials to drop, e.g. ``['a', 'n']``. Matching is
            case-insensitive and uses only the first character of each tag.
            ``None`` (the default) removes nothing.
        top: maximum number of words to return.

    Returns:
        A list of at most ``top`` words, highest document frequency first.
        Ties are ordered by the word itself so repeated runs over the same
        texts produce the same vocabulary in the same order.
    """
    from collections import Counter

    # A set gives constant-time membership tests for every token.
    removed_initials = {tag.lower() for tag in (remove or ())}

    doc_freq = Counter()
    for text in texts:
        # flag[:1] (not flag[0]) so an empty tag cannot raise IndexError.
        kept = {
            word
            for word, flag in posseg.cut(text)
            if flag[:1].lower() not in removed_initials
        }
        # Purely numeric tokens are never part of the vocabulary.
        doc_freq.update(word for word in kept if not word.isdigit())

    # Explicit tie-break: set iteration order would otherwise decide which of
    # several equally frequent words make the cut.
    ranked = sorted(doc_freq.items(), key=lambda kv: (-kv[1], kv[0]))
    return [word for word, _ in ranked[:top]]


def get_text_encoding(texts, dictionary):
    """Encode texts as binary bag-of-words rows over ``dictionary``.

    Args:
        texts: sized iterable of strings, ``[text1, text2, ...]``.
        dictionary: ``[word1, word2, ...]``; position ``i`` becomes column ``i``.

    Returns:
        A NumPy array of shape ``(len(texts), len(dictionary))`` created with
        ``np.zeros``. Entry ``[t, i]`` is 1 when ``jieba.cut`` yields
        ``dictionary[i]`` for text ``t`` and 0 otherwise.
    """
    # Word -> column lookup built once, instead of scanning the whole dictionary
    # for every word. setdefault keeps the FIRST column of a repeated word,
    # which is what a linear scan with an early break would select.
    column_of = {}
    for column, word in enumerate(dictionary):
        column_of.setdefault(word, column)

    text_encoding = np.zeros((len(texts), len(dictionary)))
    for row, text in enumerate(texts):
        for word in jieba.cut(text):
            column = column_of.get(word)
            if column is not None:
                text_encoding[row, column] = 1

    return text_encoding


def pre_process(texts, stop_words_path='baidu_stopwords.txt', remove=('r', 't'), top=10000):
    """Build a vocabulary from ``texts`` and encode them against it.

    Args:
        texts: ``[text1, text2, ...]``.
        stop_words_path: UTF-8 text file with one stop word per line.
        remove: part-of-speech initials forwarded to ``get_top_words``.
        top: maximum vocabulary size forwarded to ``get_top_words``.

    Returns:
        ``(text_encoding, dictionary)``: the result of ``get_text_encoding``
        for ``texts`` and the word list returned by ``get_top_words``.
    """
    with open(stop_words_path, 'r', encoding='utf-8') as f:
        stop_words = [line.strip() for line in f]

    print('停用词：{}'.format(len(stop_words)))

    # The stop words are handed to get_top_words; whether they are applied is
    # decided there.
    dictionary = get_top_words(texts, stop_words, remove=list(remove or ()), top=top)
    print('字典：', dictionary)

    text_encoding = get_text_encoding(texts, dictionary)

    return text_encoding, dictionary


In [20]:
# Fixed seed so the sampled rows, and therefore the vocabulary and every model
# trained on it, are the same after a kernel restart.
s = pd_all.sample(100000, random_state=42)
# Position 1 of each row is taken as the text; position 0 is used as the target
# `y` in a later cell.
texts = [item[1] for item in s.values]
text_encoding, dictionary = pre_process(texts)
print(text_encoding.shape)

# Persist the vocabulary one word per line so it can be reloaded later.
# No explicit encoding is passed: the cell that reads this file back also opens
# it with the platform default, and the two must stay in agreement.
with open('dictionary.txt', 'w') as file:
    file.writelines(str(item) + '\n' for item in dictionary)

    

停用词：1396
字典： [']', '[', ' ', '，', '@', '的', '/', ':', '！', '了', '。', '哈哈', '是', '泪', '在', '？', '嘻嘻', '啊', '都', '有', '?', '不', '好', '就', '爱', '也', '~', '抓狂', '鼓掌', '人', '-', '回复', '吧', '要', '#', '去', '：', '太', '还', '和', '衰', '_', '来', '说', '给', '怒', '晕', '吃', '小', '很', '看', '.', '到', '…', '偷笑', '啦', '大', '个', '吗', '又', '呢', '一个', '想', '没', '多', 'cn', 'http', '会', 't', '上', '一', '微', '就是', '着', '哦', '能', '博', '、', '被', '可爱', '得', '让', '大家', '“', '可以', '”', '开心', '们', '北京', '还是', '做', '没有', '中', '把', '真', '最', '!', '不是', '呀', '转发', '过', '心', '再', '中国', '【', '】', '对', '才', '为', '还有', '知道', '喜欢', '之', '谢谢', '里', ',', '真的', '点', '赞', '老', '等', '快', '下', '月', '么', '已经', '汗', '用', '看到', '请', '力', '朋友', '更', '一起', '...', '与', '～', '时候', '一下', '叫', '走', '但', '年', '后', '带', '跟', '开始', '一定', '买', '不能', '从', '不要', '）', '美食', '呵呵', '》', '《', '威武', '新', '起来', 'good', '（', '旅游', '哈', '可', '转', '老师', '围观', '地', '找', '只', '笑', '觉得', '美', '馋嘴', '死', '真是', '酒店', '时间', '儿', '哈哈哈', '微博', '希望', '生活', '出来', '

In [None]:
from model.DecisionTree import DecisionTree
from model.DNN import DNN
from model.NaiveBayes import NaiveBayes
from model.SVM import SVM

In [None]:
def test(model, X, y):
    y_pred = model.predict(X)
    accuracy = accuracy_score(y, y_pred)
    print(f"Accuracy: {accuracy}")
    print(classification_report(y, y_pred))


# Features are the bag-of-words rows; targets are position 0 of each sampled row.
X = text_encoding
y = [row[0] for row in s.values]

# Hold out 20% of the sample for evaluation, with a fixed split seed.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)


def _fit_and_report(title, model):
    """Train `model` on the training split, then print its held-out report."""
    model.train(X_train, y_train)
    print(title)
    test(model, X_test, y_test)
    print('-------------------------------------------')


decision_tree = DecisionTree()
_fit_and_report('DecisionTree', decision_tree)

naive_bayes = NaiveBayes()
_fit_and_report('NaiveBayes', naive_bayes)


# svm = SVM()
# svm.train(X_train, y_train)
# print('SVM')
# test(svm, X_test, y_test)
# print('-------------------------------------------')
    

DecisionTree
Accuracy: 0.959
              precision    recall  f1-score   support

           0       0.96      0.96      0.96       979
           1       0.96      0.96      0.96      1021

    accuracy                           0.96      2000
   macro avg       0.96      0.96      0.96      2000
weighted avg       0.96      0.96      0.96      2000

-------------------------------------------
NaiveBayes
Accuracy: 0.931
              precision    recall  f1-score   support

           0       0.92      0.94      0.93       979
           1       0.94      0.92      0.93      1021

    accuracy                           0.93      2000
   macro avg       0.93      0.93      0.93      2000
weighted avg       0.93      0.93      0.93      2000

-------------------------------------------


In [None]:
# Build the neural-network classifier.
# NOTE(review): 10000 equals the `top=10000` vocabulary cap used in pre_process
# and 2 equals the number of labels in `moods` — presumably these are the
# network's input width and class count; confirm against model/DNN.py.
dnn = DNN(10000, 2)

In [None]:
# Fit the network on the training split produced by train_test_split earlier.
dnn.train(X_train, y_train)

  prob = F.softmax(out)


In [None]:
# Report the network on the held-out split, framed like the other models'
# reports (title line, metrics, separator line).
print('DNN')
test(dnn, X_test, y_test)
print('-------------------------------------------')

DNN
Accuracy: 0.9435
              precision    recall  f1-score   support

           0       0.92      0.97      0.94       979
           1       0.97      0.92      0.94      1021

    accuracy                           0.94      2000
   macro avg       0.94      0.94      0.94      2000
weighted avg       0.94      0.94      0.94      2000

-------------------------------------------


In [None]:
# Persist each trained model under param/; a later cell reloads these same paths.
for trained_model, target_path in (
    (naive_bayes, 'param/naive_bayes_model.pth'),
    (decision_tree, 'param/decision_tree_model.pth'),
    (dnn, 'param/dnn_model.pth'),
):
    trained_model.save(target_path)


保存到: param/naive_bayes_model.pth
保存到: param/decision_tree_model.pth
保存到: param/dnn_model.pth


In [None]:
from model.DecisionTree import DecisionTree
from model.DNN import DNN
from model.NaiveBayes import NaiveBayes
from model.SVM import SVM
import numpy as np
import jieba



def get_text_encoding(texts, dictionary):
    """Encode texts as binary bag-of-words rows over ``dictionary``.

    NOTE(review): a function with this name is already defined earlier in the
    notebook; this later definition shadows it, so the two must behave alike.

    Args:
        texts: sized iterable of strings, ``[text1, text2, ...]``.
        dictionary: ``[word1, word2, ...]``; position ``i`` becomes column ``i``.

    Returns:
        A NumPy array of shape ``(len(texts), len(dictionary))`` created with
        ``np.zeros``. Entry ``[t, i]`` is 1 when ``jieba.cut`` yields
        ``dictionary[i]`` for text ``t`` and 0 otherwise.
    """
    # Word -> column lookup built once, instead of scanning the whole dictionary
    # for every word. setdefault keeps the FIRST column of a repeated word,
    # which is what a linear scan with an early break would select.
    column_of = {}
    for column, word in enumerate(dictionary):
        column_of.setdefault(word, column)

    text_encoding = np.zeros((len(texts), len(dictionary)))
    for row, text in enumerate(texts):
        for word in jieba.cut(text):
            column = column_of.get(word)
            if column is not None:
                text_encoding[row, column] = 1

    return text_encoding


def assemble_test(models, X, y):
    """Evaluate a majority vote over several 0/1 classifiers on ``(X, y)``.

    Each model's ``predict(X)`` output is summed per sample; a sample is
    labelled 1 when at least half of the models voted 1 (so an even split
    resolves to 1) and 0 otherwise. The accuracy and the classification report
    are printed; nothing is returned.

    Args:
        models: non-empty sequence of objects exposing ``predict(X)``.
        X: feature input passed unchanged to every model.
        y: true labels, one per sample.

    Raises:
        ValueError: if ``models`` is empty. Without this check every sample
            would be labelled 1, because ``0 >= 0`` holds.
    """
    num = len(models)
    if num == 0:
        raise ValueError('assemble_test requires at least one model')

    votes = np.zeros(len(y))
    for model in models:
        votes += np.asarray(model.predict(X))

    # Vectorised threshold; converted back to a plain list of ints.
    y_pred = (votes >= num / 2).astype(int).tolist()

    accuracy = accuracy_score(y, y_pred)
    print(f"Accuracy: {accuracy}")
    print(classification_report(y, y_pred))


# Reload the vocabulary saved earlier. Only the trailing newline is removed:
# a plain strip() would turn whitespace tokens (such as the ' ' entry) into '',
# so that column could never match a segmented word again and the features
# would differ from the ones the models were trained on.
with open('dictionary.txt', 'r') as file:
    dictionary = [line.rstrip('\n') for line in file]
print(dictionary)

# NOTE(review): this sample is drawn from the same `pd_all` frame the training
# sample came from, so these rows may overlap the training data.
s = pd_all.sample(1000)
texts = [item[1] for item in s.values]
text_encoding = get_text_encoding(texts, dictionary)
X = text_encoding
y = [item[0] for item in s.values]


# Fresh model objects whose state comes from the files under param/.
# NOTE(review): DNN is built with no arguments here but with (10000, 2) in the
# training cell — presumably its defaults match; confirm against model/DNN.py.
decision_tree = DecisionTree()
naive_bayes = NaiveBayes()
dnn = DNN()

decision_tree.load('param/decision_tree_model.pth')
naive_bayes.load('param/naive_bayes_model.pth')
dnn.load('param/dnn_model.pth')


models = [decision_tree, naive_bayes, dnn]
assemble_test(models, X, y)



# def predict(text):
#     text_encoding = get_text_encoding([text], dictionary)
#     return [decision_tree.predict(text_encoding)[0], 
#             naive_bayes.predict(text_encoding)[0], 
#             dnn.predict(text_encoding)[0]]


# print(predict('没有见过这么'))

[']', '[', '', '，', '@', '的', '/', ':', '！', '了', '。', '哈哈', '是', '泪', '？', '在', '啊', '嘻嘻', '有', '都', '不', '?', '好', '就', '也', '爱', '~', '抓狂', '回复', '鼓掌', '-', '人', '吧', '太', '#', '要', '：', '去', '还', '和', '衰', '说', '给', '_', '吃', '怒', '来', '晕', '看', '很', '小', '到', '.', '大', '偷笑', '…', '又', '吗', '个', '一个', '没', '啦', 'http', '呢', 't', 'cn', '会', '多', '想', '一', '上', '着', '能', '就是', '博', '微', '、', '哦', '得', '可爱', '大家', '让', '”', '“', '被', '可以', '们', '开心', '北京', '做', '还是', '没有', '真', '过', '呀', '把', '!', '心', '中', '不是', '转发', '最', '再', '中国', '为', '才', '【', '】', '对', '还有', '里', '谢谢', '真的', '知道', '赞', '用', '之', ',', '老', '已经', '汗', '更', '下', '点', '喜欢', '力', '朋友', '么', '月', '请', '看到', '与', '等', '...', '年', '后', '快', '叫', 'good', '一起', '不要', '时候', '呵呵', '～', '新', '起来', '美食', '买', '一下', '走', '但', '从', '开始', '威武', '围观', '馋嘴', '旅游', '不能', '《', '只', '儿', '老师', '》', '一定', '带', '时间', '真是', '失望', '地', '跟', '哈哈哈', '（', '哈', '美', '）', '希望', '家', '酒店', '找', '笑', '出来', '觉得', '微博', '不错', '转', '拍', '世界', '不过

  prob = F.softmax(out)


In [None]:
from model.BERT import BERT

# NOTE(review): the output recorded for this cell is
# "AttributeError: 'BERT' object has no attribute 'model'", with only these two
# calls active — presumably the object needs training or load() before it can
# be saved; confirm against model/BERT.py.
bert = BERT()
bert.save('param/bert_model/')
# bert.load('param/bert_model')

# s = pd_all.sample(2000)
# X = [item[1] for item in s.values]
# y = [item[0] for item in s.values]

# def test(model, X, y):
#     y_pred = model.predict(X)
#     accuracy = accuracy_score(y, y_pred)
#     print(f"Accuracy: {accuracy}")
#     print(classification_report(y, y_pred))

# test(bert, X, y)


AttributeError: 'BERT' object has no attribute 'model'