In [1]:
from sklearn.datasets import fetch_20newsgroups
from pprint import pprint
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.naive_bayes import BernoulliNB
from sklearn import metrics
from sklearn.metrics import classification_report

In [2]:
newsgroups_train = fetch_20newsgroups(subset='train', remove=('headers', 'footers', 'quotes'))
newsgroups_test = fetch_20newsgroups(subset='test')
news = fetch_20newsgroups(subset='all')

In [3]:
print(f"Число наблюдений в обучающей выборке {newsgroups_train.filenames.shape}")

Число наблюдений в обучающей выборке (11314,)


In [4]:
print("Число наблюдений в тестовой выборке", newsgroups_test.filenames.shape)
print("Список новостных рубрик\n")
print(list(newsgroups_train.target_names))

Число наблюдений в тестовой выборке (7532,)
Список новостных рубрик

['alt.atheism', 'comp.graphics', 'comp.os.ms-windows.misc', 'comp.sys.ibm.pc.hardware', 'comp.sys.mac.hardware', 'comp.windows.x', 'misc.forsale', 'rec.autos', 'rec.motorcycles', 'rec.sport.baseball', 'rec.sport.hockey', 'sci.crypt', 'sci.electronics', 'sci.med', 'sci.space', 'soc.religion.christian', 'talk.politics.guns', 'talk.politics.mideast', 'talk.politics.misc', 'talk.religion.misc']


In [5]:
from numpy import vectorize
from numpy import arange
vectorizer = CountVectorizer()
sparse_train = vectorizer.fit_transform(newsgroups_train.data)
sparse_test = vectorizer.transform(newsgroups_test.data)

In [6]:
alpha_increment = arange(0.05, 1.00, 0.05)
print(alpha_increment)

[0.05 0.1  0.15 0.2  0.25 0.3  0.35 0.4  0.45 0.5  0.55 0.6  0.65 0.7
 0.75 0.8  0.85 0.9  0.95]


1. подбор альфы для мультиномиальной модели БА

In [7]:
results = []
for alpha in alpha_increment:
  mnb = MultinomialNB(alpha=alpha)
  mnb.fit(sparse_train, newsgroups_train.target)
  pred = mnb.predict(sparse_test)
  accuracy = metrics.accuracy_score(newsgroups_test.target, pred)
  results.append((alpha, accuracy))
  print(f"alpha={alpha:.2f}, accuracy={accuracy:.4f}")

maximum_accuracy = max(results, key=lambda x: x[1])
print(maximum_accuracy)

alpha=0.05, accuracy=0.7388
alpha=0.10, accuracy=0.7395
alpha=0.15, accuracy=0.7386
alpha=0.20, accuracy=0.7362
alpha=0.25, accuracy=0.7331
alpha=0.30, accuracy=0.7286
alpha=0.35, accuracy=0.7242
alpha=0.40, accuracy=0.7191
alpha=0.45, accuracy=0.7136
alpha=0.50, accuracy=0.7084
alpha=0.55, accuracy=0.7013
alpha=0.60, accuracy=0.6957
alpha=0.65, accuracy=0.6877
alpha=0.70, accuracy=0.6815
alpha=0.75, accuracy=0.6737
alpha=0.80, accuracy=0.6665
alpha=0.85, accuracy=0.6573
alpha=0.90, accuracy=0.6476
alpha=0.95, accuracy=0.6417
(np.float64(0.1), 0.7395114179500797)


подбор альфы для модели бернули БА

In [8]:
results = []
for alpha in alpha_increment:
  clf = BernoulliNB(alpha=alpha)
  clf.fit(sparse_train, newsgroups_train.target)
  pred = clf.predict(sparse_test)
  accuracy = metrics.accuracy_score(newsgroups_test.target, pred)
  results.append((alpha, accuracy))
  print(f"alpha={alpha:.2f}, accuracy={accuracy:.4f}")

maximum_accuracy = max(results, key=lambda x: x[1])
print(maximum_accuracy)

alpha=0.05, accuracy=0.7014
alpha=0.10, accuracy=0.6972
alpha=0.15, accuracy=0.6928
alpha=0.20, accuracy=0.6907
alpha=0.25, accuracy=0.6843
alpha=0.30, accuracy=0.6790
alpha=0.35, accuracy=0.6746
alpha=0.40, accuracy=0.6702
alpha=0.45, accuracy=0.6658
alpha=0.50, accuracy=0.6613
alpha=0.55, accuracy=0.6572
alpha=0.60, accuracy=0.6532
alpha=0.65, accuracy=0.6492
alpha=0.70, accuracy=0.6447
alpha=0.75, accuracy=0.6410
alpha=0.80, accuracy=0.6358
alpha=0.85, accuracy=0.6304
alpha=0.90, accuracy=0.6249
alpha=0.95, accuracy=0.6223
(np.float64(0.05), 0.7014073287307488)


In [9]:
from sklearn.metrics import classification_report

# для MultinomialNB
mnb = MultinomialNB(alpha=0.10)
mnb.fit(sparse_train, newsgroups_train.target)
pred_mnb = mnb.predict(sparse_test)
print("MultinomialNB (alpha=0.10):")
print(classification_report(newsgroups_test.target, pred_mnb,
                            target_names=newsgroups_test.target_names))
accMNB = metrics.accuracy_score(newsgroups_test.target, pred_mnb)
print(f"accuracy = {accMNB:.4f}")


MultinomialNB (alpha=0.10):
                          precision    recall  f1-score   support

             alt.atheism       0.68      0.70      0.69       319
           comp.graphics       0.45      0.83      0.59       389
 comp.os.ms-windows.misc       0.20      0.00      0.01       394
comp.sys.ibm.pc.hardware       0.55      0.72      0.62       392
   comp.sys.mac.hardware       0.79      0.62      0.69       385
          comp.windows.x       0.65      0.74      0.70       395
            misc.forsale       0.84      0.80      0.82       390
               rec.autos       0.84      0.84      0.84       396
         rec.motorcycles       0.89      0.89      0.89       398
      rec.sport.baseball       0.98      0.84      0.90       397
        rec.sport.hockey       0.96      0.95      0.95       399
               sci.crypt       0.80      0.90      0.85       396
         sci.electronics       0.73      0.68      0.71       393
                 sci.med       0.69      0.88  

In [10]:
from sklearn.metrics import classification_report

# для BernulilNB
clf = BernoulliNB(alpha=0.05)
clf.fit(sparse_train, newsgroups_train.target)
pred_clf = clf.predict(sparse_test)
print("BernoulliNB (alpha=0.05):")
print(classification_report(newsgroups_test.target, pred_clf,
                            target_names=newsgroups_test.target_names))
accCLF = metrics.accuracy_score(newsgroups_test.target, pred_clf)
print(f"accuracy = {accCLF:.4f}")

BernoulliNB (alpha=0.05):
                          precision    recall  f1-score   support

             alt.atheism       0.71      0.59      0.65       319
           comp.graphics       0.43      0.76      0.55       389
 comp.os.ms-windows.misc       0.22      0.01      0.01       394
comp.sys.ibm.pc.hardware       0.54      0.71      0.61       392
   comp.sys.mac.hardware       0.83      0.58      0.68       385
          comp.windows.x       0.70      0.73      0.72       395
            misc.forsale       0.69      0.84      0.76       390
               rec.autos       0.81      0.80      0.80       396
         rec.motorcycles       0.80      0.83      0.81       398
      rec.sport.baseball       0.94      0.76      0.84       397
        rec.sport.hockey       0.96      0.92      0.94       399
               sci.crypt       0.67      0.84      0.75       396
         sci.electronics       0.62      0.68      0.65       393
                 sci.med       0.77      0.75    

Оптимальные значения: MultinomialNB (α = 0.10, accuracy = 73.95%) и BernoulliNB (α = 0.05, accuracy = 70.14%). С ростом α точность падает — слишком сильное сглаживание "размывает" различия между рубриками, и модель хуже различает классы. Маленькие значения α сохраняют информативность редких слов, характерных для конкретных рубрик.

2. априорные вероятности (сравнение подходов: по долям или равные)

In [11]:
import numpy as np
unique, counts = np.unique(newsgroups_train.target, return_counts=True)
print(counts)
print(counts / counts.sum())

[480 584 591 590 578 593 585 594 598 597 600 595 591 594 593 599 546 564
 465 377]
[0.04242531 0.05161747 0.05223617 0.05214778 0.05108715 0.05241294
 0.05170585 0.05250133 0.05285487 0.05276648 0.05303164 0.05258971
 0.05223617 0.05250133 0.05241294 0.05294326 0.04825879 0.04984974
 0.04109952 0.03332155]


In [12]:
equal_priors = np.ones(20) / 20

In [13]:
class_priors = counts / counts.sum()

In [14]:
# Равные априорные
mnb2 = MultinomialNB(alpha=0.10, class_prior=equal_priors)
mnb2.fit(sparse_train, newsgroups_train.target)
pred2 = mnb2.predict(sparse_test)
acc2 = metrics.accuracy_score(newsgroups_test.target, pred2)

# По долям в выборке
mnb3 = MultinomialNB(alpha=0.10, class_prior=class_priors)
mnb3.fit(sparse_train, newsgroups_train.target)
pred3 = mnb3.predict(sparse_test)
acc3 = metrics.accuracy_score(newsgroups_test.target, pred3)

print("Для мультиномиального:")
print(f"По умолчанию:      accuracy = {accMNB:.6f}")
print(f"Равные априорные:  accuracy = {acc2:.6f}")
print(f"По долям выборки:  accuracy = {acc3:.6f}")

Для мультиномиального:
По умолчанию:      accuracy = 0.739511
Равные априорные:  accuracy = 0.739246
По долям выборки:  accuracy = 0.739511


In [15]:
# Равные априорные
clf2 = BernoulliNB(alpha=0.05, class_prior=equal_priors)
clf2.fit(sparse_train, newsgroups_train.target)
pred2 = clf2.predict(sparse_test)
acc2 = metrics.accuracy_score(newsgroups_test.target, pred2)

# По долям в выборке
clf3 = BernoulliNB(alpha=0.05, class_prior=class_priors)
clf3.fit(sparse_train, newsgroups_train.target)
pred3 = clf3.predict(sparse_test)
acc3 = metrics.accuracy_score(newsgroups_test.target, pred3)

print("Для мультиномиального:")
print(f"По умолчанию:      accuracy = {accCLF:.6f}")
print(f"Равные априорные:  accuracy = {acc2:.6f}")
print(f"По долям выборки:  accuracy = {acc3:.6f}")

Для мультиномиального:
По умолчанию:      accuracy = 0.701407
Равные априорные:  accuracy = 0.701673
По долям выборки:  accuracy = 0.701407


Главный вывод: на сбалансированном датасете разница между подходами пренебрежимо мала. Выбор априорных вероятностей становится важным только при сильном дисбалансе классов.