In [1]:
# We are accessing a built in dataset called 20NG dataset, which contains 18,000 news posts on 20 categories. 
# We will only use 4-category subset in this demo.
import numpy as np

from sklearn.metrics import confusion_matrix, accuracy_score
from sklearn.naive_bayes import GaussianNB
from sklearn.pipeline import make_pipeline
from sklearn.decomposition import TruncatedSVD
from sklearn.preprocessing import Normalizer
from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn.datasets import fetch_20newsgroups

In [2]:
categories = [
    'alt.atheism',
    'talk.religion.misc',
    'comp.graphics',
    'sci.space',
]

dataset = fetch_20newsgroups(subset='all', categories=categories, shuffle=True, random_state=42)

labels = dataset.target
true_k = np.unique(labels).shape[0]
data = dataset.data  

In [3]:
labels.shape

(3387,)

In [4]:
type(dataset)

sklearn.utils.Bunch

In [5]:
# This is to vectorize the text corpus. After these codes, the X object will be the input vector for machine learning models.
# When transform into vectors, we do NOT use the raw count of a word in a document. Instead, we use the word's tf-idf score in a document.
# max_df=0.5 means ignoring words that appear in more than 50% of the documents; min_df=2 means ignoring words that appear in less than 2 documents.

vectorizer = TfidfVectorizer(max_df=0.5, min_df=2, stop_words='english', use_idf=True)

X = vectorizer.fit_transform(data)

In [6]:
# X in the previous step is hi-dimentional data, we need to use some dimentionality reduction technique. 
# In this case, we will use SVD (Singular Value Decomposition), which is a common matrix decomposition technique.
# We want to reduce the dimentionality to 5.
# We have to re-normalize after we run our SVD on the dataset.

n_components = 5
svd = TruncatedSVD(n_components)
normalizer = Normalizer(copy=False)
lsa = make_pipeline(svd, normalizer)

X = lsa.fit_transform(X)

In [7]:
X.shape

(3387, 5)

In [8]:
# Randomly select 80% (3387*80% = 2710) rows from X as the training set
training_idx = np.random.choice(X.shape[0], size=2710, replace=False)
X_training = X[training_idx, :]

In [9]:
# The remaining is the test set
test_idx = list(set(range(X.shape[0])) - set(training_idx))
X_test = X[test_idx, :]

In [10]:
# The same split for the labels list
labels_training = [labels[i] for i in training_idx]
labels_test = [labels[j] for j in test_idx]

In [11]:
# This is to use Naive Bayes Classifier to predict the label of each news article.
gnb = GaussianNB()
gnb.fit(X_training, labels_training)

GaussianNB(priors=None, var_smoothing=1e-09)

In [12]:
# This is the prediction result.
gnb.predict(X_test)

array([3, 1, 2, 2, 1, 3, 0, 2, 2, 0, 0, 0, 2, 1, 1, 3, 1, 1, 2, 3, 2, 2,
       2, 2, 0, 1, 1, 1, 1, 2, 2, 0, 3, 2, 2, 1, 1, 1, 1, 2, 2, 1, 0, 0,
       1, 0, 1, 1, 3, 3, 1, 1, 2, 3, 1, 2, 3, 1, 1, 1, 2, 1, 0, 3, 3, 3,
       0, 3, 2, 1, 1, 1, 3, 1, 0, 2, 2, 1, 0, 3, 1, 3, 0, 2, 2, 1, 1, 2,
       0, 1, 2, 2, 2, 0, 0, 3, 2, 2, 2, 1, 3, 1, 2, 2, 2, 3, 2, 3, 2, 3,
       2, 3, 1, 0, 3, 1, 2, 3, 3, 1, 2, 0, 3, 3, 0, 0, 3, 0, 2, 2, 2, 2,
       2, 1, 2, 2, 0, 1, 0, 3, 1, 0, 1, 2, 2, 2, 2, 1, 1, 3, 2, 1, 3, 3,
       1, 0, 0, 1, 2, 0, 1, 0, 0, 2, 2, 0, 2, 3, 3, 2, 2, 0, 0, 1, 2, 1,
       2, 3, 0, 1, 2, 2, 0, 1, 1, 1, 2, 2, 1, 2, 3, 0, 2, 3, 1, 2, 2, 0,
       0, 1, 1, 3, 1, 2, 2, 3, 1, 1, 1, 3, 2, 1, 2, 2, 1, 1, 2, 1, 1, 2,
       0, 3, 3, 0, 1, 2, 3, 2, 0, 1, 0, 3, 2, 3, 2, 0, 0, 0, 2, 0, 3, 2,
       2, 2, 2, 0, 0, 0, 2, 2, 1, 2, 0, 3, 1, 2, 1, 1, 2, 2, 1, 3, 0, 0,
       2, 1, 3, 2, 1, 1, 2, 2, 0, 1, 0, 2, 2, 0, 1, 3, 3, 1, 3, 1, 0, 2,
       2, 1, 0, 3, 1, 2, 2, 1, 2, 2, 1, 3, 0, 2, 2,

In [13]:
# This is the confusion matrix.
confusion_matrix(gnb.predict(X_test),labels_test)

array([[120,   2,   2,  23],
       [  3, 171,  14,   6],
       [  1,  11, 181,   9],
       [ 45,   6,   2,  81]])

In [14]:
# This is the accuracy score.
accuracy_score(gnb.predict(X_test),labels_test)


0.8168389955686853