In [1]:
from sklearn.datasets import fetch_20newsgroups

In [2]:
# Newsgroup categories to load; passed to fetch_20newsgroups below.
categories = [
    'alt.atheism',
    'soc.religion.christian',
    'comp.graphics',
    'sci.med',
]

In [3]:
# Load the training split for the selected categories.
# A fixed random_state keeps the shuffled order reproducible between runs.
twenty_train = fetch_20newsgroups(
    subset='train',
    categories=categories,
    shuffle=True,
    random_state=42,
)

In [4]:
# Category names, in the order that the integer labels in `target` index into.
twenty_train.target_names

['alt.atheism', 'comp.graphics', 'sci.med', 'soc.religion.christian']

In [5]:
# Number of training documents (one raw text string per post).
len(twenty_train.data)

2257

In [6]:
# Number of source file names; matches the number of documents above.
len(twenty_train.filenames)

2257

In [7]:
# Show only the first three lines of the first document.
# split("\n") breaks the text into a list of lines; "\n".join(...) turns the
# selected lines back into a single string for printing.
first_doc_lines = twenty_train.data[0].split("\n")
print("\n".join(first_doc_lines[:3]))

From: sd345@city.ac.uk (Michael Collier)
Subject: Converting images to HP LaserJet III?
Nntp-Posting-Host: hampton


In [8]:
# target[i] is an integer index into target_names, i.e. the newsgroup
# category of the i-th document.
print(twenty_train.target_names[twenty_train.target[0]])

# Note: `target` (not `target_names`) is the numpy ndarray of integer labels;
# `target_names` is a plain list of strings. The slice is taken once and
# reused, because a bare expression in the middle of a cell is evaluated but
# never displayed.
first_targets = twenty_train.target[:12]

# Each integer label maps back to its category name.
for t in first_targets:
    print(twenty_train.target_names[t])

comp.graphics
comp.graphics
comp.graphics
soc.religion.christian
soc.religion.christian
soc.religion.christian
soc.religion.christian
soc.religion.christian
sci.med
sci.med
sci.med
soc.religion.christian
comp.graphics


In [9]:
# Full raw text of the first training document (headers, body and signature).
twenty_train.data[0]

'From: sd345@city.ac.uk (Michael Collier)\nSubject: Converting images to HP LaserJet III?\nNntp-Posting-Host: hampton\nOrganization: The City University\nLines: 14\n\nDoes anyone know of a good way (standard PC application/PD utility) to\nconvert tif/img/tga files into LaserJet III format.  We would also like to\ndo the same, converting to HPGL (HP plotter) files.\n\nPlease email any response.\n\nIs this the correct group?\n\nThanks in advance.  Michael.\n-- \nMichael Collier (Programmer)                 The Computer Unit,\nEmail: M.P.Collier@uk.ac.city                The City University,\nTel: 071 477-8000 x3769                      London,\nFax: 071 477-8565                            EC1V 0HB.\n'

## Vectorize (matrixfy, tokenize) the data

A bag-of-words representation is a high-dimensional, sparse dataset: the vocabulary across all documents may contain on the order of 100,000 unique words (high-dimensional), but any single document uses only a small fraction of them, so most entries of the count matrix are zero (sparse).

In [10]:
from sklearn.feature_extraction.text import CountVectorizer

In [11]:
# Learn the vocabulary and build the document-term count matrix in one call.
# The result is an [i, j] matrix: i = number of documents, j = number of
# distinct words found across all documents.
count_vect = CountVectorizer()
X_train_counts = count_vect.fit_transform(twenty_train.data)

X_train_counts.shape

(2257, 35788)

In [12]:
# CountVectorizer can also count N-grams of words or consecutive characters;
# background on N-grams:
# https://towardsdatascience.com/introduction-to-language-models-n-gram-e323081503d9
#
# Once fitted, vocabulary_ maps each term to its column index in the count
# matrix, so this looks up the column that holds the counts for "algorithm".
count_vect.vocabulary_.get('algorithm')

4690

## Transforming the Data

In [13]:
from sklearn.feature_extraction.text import TfidfTransformer

* **tf** (Term Frequency): normalizes each word's count by the total number of words in the document, so that longer documents do not get larger values than shorter ones purely because of their length.

* **tf-idf** (Term Frequency times Inverse Document Frequency): a refinement applied on top of tf that downscales the weights of words occurring in many documents, since those are less informative than words occurring in only a small subset of documents.

In [14]:
# Two-step version: fit() lets the transformer learn from the data, then
# transform() applies it. With use_idf=False only the term-frequency scaling
# is applied (no inverse-document-frequency weighting).
tf_transformer = TfidfTransformer(use_idf=False)
tf_transformer.fit(X_train_counts)

# Convert the raw counts into term-frequency features; the shape is unchanged.
X_train_tf = tf_transformer.transform(X_train_counts)

X_train_tf.shape

(2257, 35788)

In [15]:
# Same fit-then-transform pattern, collapsed into a single fit_transform()
# call instead of .fit() followed by .transform().
# This transformer keeps the default settings rather than use_idf=False, so
# the counts are converted to tf-idf weights.
tfidf_transformer = TfidfTransformer()
X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)

X_train_tfidf.shape

(2257, 35788)

## Training the classifier

In [16]:
from sklearn.naive_bayes import MultinomialNB

# Train a multinomial naive Bayes classifier on the tf-idf weighted training set.
clf = MultinomialNB().fit(
    X_train_tfidf,        # features: one row per training document
    twenty_train.target,  # labels: integer category index per document
)

In [17]:
# Two unseen documents to classify.
docs_new = [
    'God is love',
    'OpenGL on the GPU is fast',
]

# Reuse the already-fitted vectorizer (transform, not fit_transform) so the
# new documents are counted against the training vocabulary.
X_new_counts = count_vect.transform(docs_new)

X_new_counts.shape  # one row per new document

(2, 35788)

In [18]:
# Apply the tf-idf weighting fitted on the training counts to the new
# documents (transform only, no re-fitting).
X_new_tfidf = tfidf_transformer.transform(X_new_counts)

In [19]:
# Predict one class index per new document.
predicted = clf.predict(X_new_tfidf)
predicted
# The two new documents are predicted as classes 3 and 1; these are indices
# into twenty_train.target_names.

array([3, 1], dtype=int64)

In [20]:
# Pair each new document with the name of its predicted category.
for doc, category in zip(docs_new, predicted):
    label = twenty_train.target_names[category]
    print('%r => %s' % (doc, label))

'God is love' => soc.religion.christian
'OpenGL on the GPU is fast' => comp.graphics


## Building a Pipeline
Build a pipeline so that we can accomplish vectorization -> transformation -> classifier in one line of code

In [21]:
from sklearn.pipeline import Pipeline

# Chain the three stages so that raw text goes in and predictions come out:
# word counts -> tf-idf weighting -> multinomial naive Bayes.
text_clf = Pipeline(steps=[
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', MultinomialNB()),
])

In [22]:
# Fit every pipeline stage on the raw training text and its labels in one call.
text_clf.fit(twenty_train.data, twenty_train.target)

Pipeline(memory=None,
     steps=[('vect', CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        strip...inear_tf=False, use_idf=True)), ('clf', MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True))])

In [24]:
import numpy as np

# Load the held-out test split with the same categories and shuffle seed.
twenty_test = fetch_20newsgroups(
    subset='test',
    categories=categories,
    shuffle=True,
    random_state=42,
)

# The pipeline takes raw text directly; vectorizing and weighting happen inside it.
predicted = text_clf.predict(twenty_test.data)

# Accuracy: comparing predictions with the true labels gives a boolean array,
# and its mean is the fraction of correct predictions (about 83.489%).
np.mean(predicted == twenty_test.target)

0.8348868175765646

Some helpful articles:

[Linear Kernel (Linear SVM): Why is it recommended for text classification](https://www.svm-tutorial.com/2014/10/svm-linear-kernel-good-text-classification/)

[Is it true that in high dimensions data is easier to separate linearly?](https://stats.stackexchange.com/questions/33437/is-it-true-that-in-high-dimensions-data-is-easier-to-separate-linearly)