In [0]:
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import Normalizer
from sklearn import metrics
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans, MiniBatchKMeans
from nltk.corpus import stopwords
import numpy as np

### Load Data
Download data from 20 news groups 

In [0]:
newsgroups_train = fetch_20newsgroups(subset='train')
print(list(newsgroups_train.target_names))

newsgroups_test = fetch_20newsgroups(subset='train')

['alt.atheism', 'comp.graphics', 'comp.os.ms-windows.misc', 'comp.sys.ibm.pc.hardware', 'comp.sys.mac.hardware', 'comp.windows.x', 'misc.forsale', 'rec.autos', 'rec.motorcycles', 'rec.sport.baseball', 'rec.sport.hockey', 'sci.crypt', 'sci.electronics', 'sci.med', 'sci.space', 'soc.religion.christian', 'talk.politics.guns', 'talk.politics.mideast', 'talk.politics.misc', 'talk.religion.misc']


##Prepare the Data
To keep it simple, let's filter only 5 of the 20 topics. 
We will then convert the unstructured text to a structured vector of thousands of features made up of the words from the documents.  Stop words like “is”, “the”, “it” wil be removed.  

In [0]:
#Categories     0               1                   2               3             4
categories = ['alt.atheism', 'comp.graphics', 'rec.motorcycles', 'sci.space', 'talk.politics.guns']

newsgroups_train = fetch_20newsgroups(subset='train', categories=categories, 
                                      shuffle=True, random_state=2017, remove=('headers', 'footers', 'quotes'))
newsgroups_test = fetch_20newsgroups(subset='test', categories=categories, 
                                     shuffle=True, random_state=2017, remove=('headers', 'footers', 'quotes'))

y_train = newsgroups_train.target
y_test = newsgroups_test.target

# Convert a collection of raw documents to a matrix of TF-IDF features
vectorizer = TfidfVectorizer()    # This one is the basic text to feature vector function, try some of the options
vectorizer = TfidfVectorizer(lowercase=False, stop_words='english')
vectorizer = TfidfVectorizer(smooth_idf = True, max_df=0.5, stop_words='english')
vectorizer = TfidfVectorizer(sublinear_tf=True, smooth_idf = True, max_df=0.5,  ngram_range=(1, 2), stop_words='english')
X_train = vectorizer.fit_transform(newsgroups_train.data)  # Learn vocabulary and idf, return term-document matrix.
X_test = vectorizer.transform(newsgroups_test.data)        # Transform documents to term-document matrix.

print("Train Dataset")
print("%d documents" % len(newsgroups_train.data))
print("%d categories" % len(newsgroups_train.target_names))
print("n_samples: %d, n_features: %d" % X_train.shape)

print("\nTest Dataset")
print("%d documents" % len(newsgroups_test.data))
print("%d categories" % len(newsgroups_test.target_names))
print("n_samples: %d, n_features: %d" % X_test.shape)

Train Dataset
2801 documents
5 categories
n_samples: 2801, n_features: 241036

Test Dataset
1864 documents
5 categories
n_samples: 1864, n_features: 241036


### Decision Tree Model

In [0]:
from sklearn.datasets import fetch_20newsgroups
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import cross_val_score
from sklearn import tree
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X_train, y_train, test_size=0.3, random_state=0)

clf = tree.DecisionTreeClassifier(criterion = 'entropy', random_state=0, min_samples_leaf = 3, max_depth=None, min_samples_split=2)
clf.fit(X_train, y_train)

# generate evaluation metrics
print("Categories: 0=alt.atheism, 1=comp.graphics, 2=rec.motorcycles, 3=sci.space, 4=talk.politics.guns\n")
print ("Train - Accuracy :", metrics.accuracy_score(y_train, clf.predict(X_train)))
print ("Train - Confusion matrix :\n", metrics.confusion_matrix(y_train, clf.predict(X_train)))
print ("Train - classification report :\n", metrics.classification_report(y_train, clf.predict(X_train)))

print ("Test - Accuracy :", metrics.accuracy_score(y_test, clf.predict(X_test)))
print ("Test - Confusion matrix :\n",metrics.confusion_matrix(y_test, clf.predict(X_test)))
print ("Test - classification report :\n", metrics.classification_report(y_test, clf.predict(X_test)))

Categories: 0=alt.atheism, 1=comp.graphics, 2=rec.motorcycles, 3=sci.space, 4=talk.politics.guns

Train - Accuracy : 0.8418367346938775
Train - Confusion matrix :
 [[279   4  30  12   7]
 [ 13 349  31  16   5]
 [ 14  17 370   7   7]
 [ 15  16  37 330  10]
 [ 17   8  36   8 322]]
Train - classification report :
               precision    recall  f1-score   support

           0       0.83      0.84      0.83       332
           1       0.89      0.84      0.86       414
           2       0.73      0.89      0.81       415
           3       0.88      0.81      0.85       408
           4       0.92      0.82      0.87       391

    accuracy                           0.84      1960
   macro avg       0.85      0.84      0.84      1960
weighted avg       0.85      0.84      0.84      1960

Test - Accuracy : 0.6052318668252081
Test - Confusion matrix :
 [[ 86   4  33   8  17]
 [ 12 114  24  17   3]
 [ 17  13 125  17  11]
 [ 14  19  35 106  11]
 [ 24   3  31  19  78]]
Test - classificat

In [0]:
# Now let's look at one example.   Choose a test example by setting tx = value
# Try 0, 1801, 531, 1500, 99, 777
tx = 0

print("newsgroups_test example number", tx, ":")
print(newsgroups_test.data[tx])
#print(X_test.shape)

print("\nThe associated TFIDF vector:")
print(X_test[tx])

print("\nThe model classifies this example as:")
y_test_example = clf.predict(X_test[tx])
print("Category = ", y_test_example, "=", categories[int(y_test_example)])

newsgroups_test example number 0 :


"This is your god" (from John Carpenter's "They Live," natch)



The associated TFIDF vector:
  (0, 29912)	0.26750324743587045
  (0, 29918)	0.31543082231244873
  (0, 31375)	0.26239467065053323
  (0, 98408)	0.16463577011982283
  (0, 98483)	0.31543082231244873
  (0, 120184)	0.1610908752373777
  (0, 120191)	0.29991874236005844
  (0, 152013)	0.17626588255558212
  (0, 152042)	0.288912746481491
  (0, 184984)	0.12335578536189261
  (0, 185233)	0.31543082231244873
  (0, 195494)	0.21057314470311617
  (0, 195510)	0.31543082231244873
  (0, 196318)	0.2335572485629079
  (0, 196319)	0.29991874236005844

The model classifies this example as:
Category =  [2] = rec.motorcycles
