In [1]:
import jieba
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfTransformer,CountVectorizer
from sklearn.naive_bayes import MultinomialNB,BernoulliNB
import random

# 文本处理，将label和content分开

In [2]:
# Load the raw corpus; every line is expected to look like "<label>\t<content>".
path = './cnews.test.txt'
with open(path,'r',encoding='UTF-8') as f:
    cnews_test = f.readlines()

# Keep six 500-line slices of the file (3000 lines in total).
# NOTE(review): presumably the file is grouped by class in runs of 1000 lines,
# so this keeps 500 samples from each of six classes -- confirm against the data.
cnews_test = (
    cnews_test[500:1000] + cnews_test[1500:2000] + cnews_test[2500:3000]
    + cnews_test[3500:4000] + cnews_test[4500:5000] + cnews_test[5500:6000]
)

# Shuffle the sample order with a private, seeded RNG so that the later
# train/test split (and therefore the reported accuracies) is reproducible on
# "Restart & Run All", without disturbing the global `random` state.
SHUFFLE_SEED = 42
n = list(range(len(cnews_test)))
random.Random(SHUFFLE_SEED).shuffle(n)

# Separate each line into its label and its text content.
test_label,test_x = [],[]
for i in n:
    # Split on the first tab only, so content that itself contains a tab is
    # kept whole; drop the trailing newline left behind by readlines().
    label, content = cnews_test[i].rstrip('\n').split('\t', 1)
    test_label.append(label)
    test_x.append(content)

对文本内容进行分词，并以空格（" "）将各个词连接成字符串

In [3]:
# Segment every document with jieba's default (accurate) mode and join the
# tokens with single spaces -- the whitespace-delimited form that
# CountVectorizer tokenises later on.
# jieba is already imported in the first cell, so it is not re-imported here.
# NOTE: test_x is overwritten in place; re-running this cell on its own would
# segment already-segmented text, so re-run the loading cell first.
test_x = [' '.join(jieba.cut(each)) for each in test_x]

Building prefix dict from the default dictionary ...
Loading model from cache C:\Users\lejon\AppData\Local\Temp\jieba.cache
Loading model cost 0.630 seconds.
Prefix dict has been built succesfully.


将3000个样本划分为2500条训练（train）数据和500条测试（test）数据

In [4]:
# Number of samples (taken from the front of the shuffled data) used for
# training; everything after that is held out for testing.
TRAIN_SIZE = 2500

train_X, test_X = test_x[:TRAIN_SIZE], test_x[TRAIN_SIZE:]
train_y, test_y = test_label[:TRAIN_SIZE], test_label[TRAIN_SIZE:]

多项式模型计算文本分类

In [5]:
# Bag-of-words counts: vector_matrix[i, j] is how often vocabulary term j
# occurs in training document i.
# NOTE(review): CountVectorizer's default token_pattern keeps only tokens of
# two or more word characters, so single-character Chinese words are dropped --
# confirm that this is intended.
count_vector = CountVectorizer()
vector_matrix = count_vector.fit_transform(train_X)

# use_idf=False switches IDF weighting off, so this step only normalises each
# document's term counts (normalised term frequency, not a full TF-IDF weight).
tfidf_transformer = TfidfTransformer(use_idf=False)
train_tfidf = tfidf_transformer.fit_transform(vector_matrix)

# Train the multinomial Naive Bayes classifier.
clf = MultinomialNB().fit(train_tfidf,train_y)

# Evaluate: reuse the vectorizer and transformer fitted on the training data
# (transform only), so nothing is ever fitted on the held-out samples.
test_vector = count_vector.transform(test_X)
test_tfidf = tfidf_transformer.transform(test_vector)
predict_result = clf.predict(test_tfidf)

以正确分类的样本数占测试样本总数的比例，简单评测模型预测结果的准确率

In [6]:
# Evaluate prediction quality.
def accuracy_(test_y,predict):
    """Return the fraction of predictions that equal the true labels.

    Args:
        test_y: Sequence of ground-truth labels.
        predict: Sequence of predicted labels, aligned with ``test_y``
            position by position.

    Returns:
        The number of positions where both sequences agree, divided by the
        number of samples, as a float between 0.0 and 1.0.

    Raises:
        ValueError: If ``test_y`` is empty (accuracy is undefined) or the two
            sequences differ in length.
    """
    num = len(test_y)
    if num == 0:
        raise ValueError('accuracy_ needs at least one sample')
    if len(predict) != num:
        raise ValueError(
            'test_y and predict differ in length: %d != %d' % (num, len(predict))
        )
    correct = sum(1 for truth, guess in zip(test_y, predict) if truth == guess)
    return correct/num

多项式模型分类效果

In [7]:
# Report the share of held-out samples the multinomial model labelled correctly.
multinomial_accuracy = accuracy_(test_y, predict_result)
print('多项式模型分类效果：%f' % multinomial_accuracy)

多项式模型分类效果：0.970000


伯努利模型计算文本分类

In [8]:
# Bag-of-words counts: vector_matrix[i, j] is how often vocabulary term j
# occurs in training document i.
count_vector = CountVectorizer()
vector_matrix = count_vector.fit_transform(train_X)

# use_idf=False switches IDF weighting off, so this step only normalises each
# document's term counts (normalised term frequency, not a full TF-IDF weight).
tfidf_transformer = TfidfTransformer(use_idf=False)
train_tfidf = tfidf_transformer.fit_transform(vector_matrix)

# Train the Bernoulli Naive Bayes classifier.
# NOTE(review): per the scikit-learn docs BernoulliNB binarises its input
# (default threshold 0.0), so only the presence or absence of a term should
# matter here, not its normalised frequency -- confirm.
clf = BernoulliNB().fit(train_tfidf,train_y)

# Evaluate: reuse the vectorizer and transformer fitted on the training data
# (transform only), so nothing is ever fitted on the held-out samples.
test_vector = count_vector.transform(test_X)
test_tfidf = tfidf_transformer.transform(test_vector)
predict_result = clf.predict(test_tfidf)

伯努利模型分类效果

In [9]:
# Report the share of held-out samples the Bernoulli model labelled correctly.
bernoulli_accuracy = accuracy_(test_y, predict_result)
print('伯努利模型分类效果：%f' % bernoulli_accuracy)

伯努利模型分类效果：0.870000
