In [1]:
from sklearn.datasets import fetch_20newsgroups
from pprint import pprint

In [4]:
# Load the predefined 'train' and 'test' subsets of the 20 newsgroups corpus.
# Both calls use the same fixed random_state (42).
train_set, test_set = (
    fetch_20newsgroups(subset=split, random_state=42)
    for split in ('train', 'test')
)

# Raw article texts (.data) and their integer class labels (.target).
X_train, y_train = train_set.data, train_set.target
X_test, y_test = test_set.data, test_set.target

# Quick look at the data: the category names, then the first training
# article together with its label.
print('categories')
pprint(train_set.target_names)
print('article 1', f"news scripts:\n{X_train[0]}", sep='\n')
print('article 1 category', f"text category label {y_train[0]}", sep='\n')

categories
['alt.atheism',
 'comp.graphics',
 'comp.os.ms-windows.misc',
 'comp.sys.ibm.pc.hardware',
 'comp.sys.mac.hardware',
 'comp.windows.x',
 'misc.forsale',
 'rec.autos',
 'rec.motorcycles',
 'rec.sport.baseball',
 'rec.sport.hockey',
 'sci.crypt',
 'sci.electronics',
 'sci.med',
 'sci.space',
 'soc.religion.christian',
 'talk.politics.guns',
 'talk.politics.mideast',
 'talk.politics.misc',
 'talk.religion.misc']
article 1
news scripts:
From: lerxst@wam.umd.edu (where's my thing)
Subject: WHAT car is this!?
Nntp-Posting-Host: rac3.wam.umd.edu
Organization: University of Maryland, College Park
Lines: 15

 I was wondering if anyone out there could enlighten me on this car I saw
the other day. It was a 2-door sports car, looked to be from the late 60s/
early 70s. It was called a Bricklin. The doors were really small. In addition,
the front bumper was separate from the rest of the body. This is 
all I know. If anyone can tellme a model name, engine specs, years
of production, where 

In [5]:
from sklearn.feature_extraction.text import CountVectorizer

# Bag-of-words features: build the vocabulary from the training articles only
# (with the built-in English stop-word list), then encode both splits with
# that same fitted vocabulary.
vectorizer = CountVectorizer(stop_words='english').fit(X_train)
X_train_bow, X_test_bow = (
    vectorizer.transform(docs) for docs in (X_train, X_test)
)

# Show the first training article twice: once as the sparse
# "(row, column)  count" listing and once as a dense count array.
print('(text_num,work_num) count', X_train_bow[0], sep='\n')
print('word vector', X_train_bow[0].toarray(), sep='\n')

(text_num,work_num) count
  (0, 4605)	1
  (0, 16574)	1
  (0, 18299)	1
  (0, 26070)	1
  (0, 34131)	1
  (0, 34943)	1
  (0, 35135)	1
  (0, 35560)	1
  (0, 37378)	1
  (0, 37722)	5
  (0, 40939)	1
  (0, 45232)	1
  (0, 48550)	1
  (0, 48552)	1
  (0, 50039)	1
  (0, 50455)	2
  (0, 51651)	1
  (0, 51714)	1
  (0, 57203)	1
  (0, 63238)	1
  (0, 63970)	1
  (0, 65968)	1
  (0, 67023)	1
  (0, 73061)	1
  (0, 74552)	1
  :	:
  (0, 79519)	1
  (0, 83103)	1
  (0, 86416)	1
  (0, 87451)	1
  (0, 90192)	1
  (0, 91885)	1
  (0, 94962)	1
  (0, 95944)	1
  (0, 98748)	1
  (0, 99619)	1
  (0, 101175)	1
  (0, 104609)	1
  (0, 105907)	1
  (0, 108033)	1
  (0, 109044)	1
  (0, 109354)	1
  (0, 111094)	1
  (0, 113755)	1
  (0, 114195)	1
  (0, 114439)	1
  (0, 118013)	2
  (0, 118714)	1
  (0, 122887)	2
  (0, 124627)	1
  (0, 127721)	1
word vector
[[0 0 0 ... 0 0 0]]


In [6]:
from sklearn.naive_bayes import MultinomialNB

# Multinomial naive Bayes on the bag-of-words counts, with smoothing alpha=0.4.
mnb = MultinomialNB(alpha=0.4).fit(X_train_bow, y_train)

# Report the classifier's score on the data it was fitted on and on the
# held-out test split.
print(
    f"train score: {mnb.score(X_train_bow, y_train)}",
    f"test score: {mnb.score(X_test_bow, y_test)}",
    sep="\n",
)

train score: 0.9512992752342231
test score: 0.8110727562400425


In [9]:
# Same model at two extreme alpha settings, to compare against alpha=0.4:
# a very small value (0.001) and a very large one (100).
mbn_small = MultinomialNB(alpha=0.001).fit(X_train_bow, y_train)
mbn_large = MultinomialNB(alpha=100).fit(X_train_bow, y_train)

# Train / test scores for each setting, in the order small -> large.
print(
    f"small train score: {mbn_small.score(X_train_bow, y_train)}",
    f"small test score: {mbn_small.score(X_test_bow, y_test)}",
    f"large train score: {mbn_large.score(X_train_bow, y_train)}",
    f"large test score: {mbn_large.score(X_test_bow, y_test)}",
    sep="\n",
)

small train score: 0.987802722290967
small test score: 0.798858204992034
large train score: 0.7473926109245183
large test score: 0.6323685608072225
