In [1]:
from sklearn.datasets import fetch_20newsgroups

In [2]:
news=fetch_20newsgroups(subset='all')

In [3]:
from sklearn.model_selection import train_test_split

In [4]:
X_train,X_test,y_train,y_test=train_test_split(news.data,news.target,test_size=0.25,random_state=33)

# 使用CountVectorizer并且不去掉停用词的条件下，对文本特征进行量化的naive bayes classifier 性能测试

In [5]:
from sklearn.feature_extraction.text import CountVectorizer

In [6]:
count_vec=CountVectorizer()

In [7]:
X_count_train = count_vec.fit_transform(X_train) # X_count_train 中的count指的是CountVectorizer中的count
X_count_test = count_vec.transform(X_test)

In [8]:
from sklearn.naive_bayes import MultinomialNB

In [9]:
mnb_count=MultinomialNB()# mnb_count 中的count指的是CountVectorizer中的count

In [10]:
# 使用朴素贝叶斯分类器，对CountVectorizer(without filtering stop words)后的训练样本进行参数学习
mnb_count.fit(X_count_train,y_train)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [11]:
print("The accuracy of classifying 20newsgroups using Naive Bayes(CountVectorizer without filtering stop words) is: " 
      +str(mnb_count.score(X_count_test,y_test)))


The accuracy of classifying 20newsgroups using Naive Bayes(CountVectorizer without filtering stop words) is: 0.8397707979626485


In [12]:
y_pred_mnb_count=mnb_count.predict(X_count_test)

In [13]:
from sklearn.metrics import classification_report
print(classification_report(y_true=y_test,y_pred=y_pred_mnb_count,target_names=news.target_names))

                          precision    recall  f1-score   support

             alt.atheism       0.86      0.86      0.86       201
           comp.graphics       0.59      0.86      0.70       250
 comp.os.ms-windows.misc       0.89      0.10      0.17       248
comp.sys.ibm.pc.hardware       0.60      0.88      0.72       240
   comp.sys.mac.hardware       0.93      0.78      0.85       242
          comp.windows.x       0.82      0.84      0.83       263
            misc.forsale       0.91      0.70      0.79       257
               rec.autos       0.89      0.89      0.89       238
         rec.motorcycles       0.98      0.92      0.95       276
      rec.sport.baseball       0.98      0.91      0.95       251
        rec.sport.hockey       0.93      0.99      0.96       233
               sci.crypt       0.86      0.98      0.91       238
         sci.electronics       0.85      0.88      0.86       249
                 sci.med       0.92      0.94      0.93       245
         

# 使用TfidfVectorizer (Term Frequency Inverse Document Frequency)并且不去除停用词的条件下，对文本特征进行量化的naive bayes classifier 的性能分析

In [14]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [15]:
tfidf_vec=TfidfVectorizer()

In [16]:
X_tfidf_train=tfidf_vec.fit_transform(X_train)
X_tfidf_test=tfidf_vec.transform(X_test)

In [17]:
from sklearn.naive_bayes import MultinomialNB

In [18]:
mnb_tfidf=MultinomialNB()

In [19]:
mnb_tfidf.fit(X_tfidf_train,y_train)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [20]:
y_pred_mnb_tfidf=mnb_tfidf.predict(X_tfidf_test)

In [21]:
print("The accuracy of classifying 20newsgroups with Naive Bayes (TfidfVectorizer without filtering stop words) is : "
     +str(mnb_tfidf.score(X=X_tfidf_test,y=y_test)))

The accuracy of classifying 20newsgroups with Naive Bayes (TfidfVectorizer without filtering stop words) is : 0.8463497453310697


In [22]:
from sklearn.metrics import classification_report

In [23]:
print(classification_report(y_true=y_test,y_pred=y_pred_mnb_tfidf))

             precision    recall  f1-score   support

          0       0.84      0.67      0.75       201
          1       0.85      0.74      0.79       250
          2       0.82      0.85      0.83       248
          3       0.76      0.88      0.82       240
          4       0.94      0.84      0.89       242
          5       0.96      0.84      0.89       263
          6       0.93      0.69      0.79       257
          7       0.84      0.92      0.88       238
          8       0.98      0.92      0.95       276
          9       0.96      0.91      0.94       251
         10       0.88      0.99      0.93       233
         11       0.73      0.98      0.83       238
         12       0.91      0.83      0.87       249
         13       0.97      0.92      0.95       245
         14       0.89      0.96      0.93       221
         15       0.51      0.97      0.67       232
         16       0.83      0.96      0.89       251
         17       0.92      0.97      0.95   

# 分别使用CountVectorizer 和 TfidfVectorizer（with filtering stop words），对文本特征进行量化的Naive Bayes Classifier 进行performance test

In [24]:
count_filter_vec=CountVectorizer(analyzer='word',stop_words='english')
tfidf_filter_vec=TfidfVectorizer(analyzer='word',stop_words='english')

In [25]:
# 使用带有stopwords停用词filter的CountVectorizer对train和test分别进行量化处理
X_count_filter_train=count_filter_vec.fit_transform(X_train)
X_count_filter_test=count_filter_vec.transform(X_test)

In [26]:
# 使用带有停用词filter的TfidfVectorizer(term frequency inverse document frequency)对train和test分别进行量化处理
X_tfidf_filter_train=tfidf_filter_vec.fit_transform(X_train)
X_tfidf_filter_test=tfidf_filter_vec.transform(X_test)

In [27]:
# 初始化默认配置的朴素贝叶斯分类器，并对CountVectorizer后的数据进行predict和accuracy分析
from sklearn.naive_bayes import MultinomialNB
mnb_count_filter=MultinomialNB()
mnb_count_filter.fit(X_count_filter_train,y_train)
print("The accuracy of classifying 20newsgroups using Naive Bayes(CountVectorizer by filtering stopwords)"
     +str(mnb_count_filter.score(X_count_filter_test,y_test)))

y_pred_mnb_count_filter=mnb_count_filter.predict(X_count_filter_test)

The accuracy of classifying 20newsgroups using Naive Bayes(CountVectorizer by filtering stopwords)0.8637521222410866


In [28]:
# 初始化另一个默认配置的朴素贝叶斯分类器，并对TfidfVectorizer后的数据进行predict和accuracy分析
from sklearn.naive_bayes import MultinomialNB
mnb_tfidf_filter=MultinomialNB()
mnb_tfidf_filter.fit(X_tfidf_filter_train,y_train)
print("The accuracy of classifying 20newsgroup using Naive Bayes(TfidfVectorizer by filtering stopwords)"
     +str(mnb_tfidf_filter.score(X_tfidf_filter_test,y_test)))

y_pred_mnb_tfidf_filter=mnb_tfidf_filter.predict(X=X_tfidf_filter_test)

The accuracy of classifying 20newsgroup using Naive Bayes(TfidfVectorizer by filtering stopwords)0.8826400679117148


In [29]:
# 对以上两个模型进行更加详细的性能评估
from sklearn.metrics import classification_report
# CountVectorizer模型性能评估
print(classification_report(y_true=y_test,y_pred=y_pred_mnb_count_filter))
# TfidfVectorizer模型性能评估
print(classification_report(y_true=y_test,y_pred=y_pred_mnb_tfidf_filter))

             precision    recall  f1-score   support

          0       0.85      0.89      0.87       201
          1       0.62      0.88      0.73       250
          2       0.93      0.22      0.36       248
          3       0.62      0.88      0.73       240
          4       0.93      0.85      0.89       242
          5       0.82      0.85      0.84       263
          6       0.90      0.79      0.84       257
          7       0.91      0.91      0.91       238
          8       0.98      0.94      0.96       276
          9       0.98      0.92      0.95       251
         10       0.92      0.99      0.95       233
         11       0.91      0.97      0.93       238
         12       0.87      0.89      0.88       249
         13       0.94      0.95      0.95       245
         14       0.91      0.96      0.93       221
         15       0.87      0.94      0.90       232
         16       0.89      0.96      0.93       251
         17       0.95      0.98      0.97   