## News Articles

In [1]:
from sklearn.datasets import fetch_20newsgroups

In [2]:
# Restrict the corpus to four newsgroup categories.
categories = [
    'alt.atheism',
    'soc.religion.christian',
    'comp.graphics',
    'sci.med',
]

# Load the train and test subsets for those categories. Shuffling uses a
# fixed random_state so the document order is the same on every run.
twenty_train = fetch_20newsgroups(
    subset='train',
    categories=categories,
    shuffle=True,
    random_state=42,
)
twenty_test = fetch_20newsgroups(
    subset='test',
    categories=categories,
    shuffle=True,
    random_state=42,
)

In [3]:
# List the keys exposed by the loaded training dataset object.
twenty_train.keys()

dict_keys(['data', 'filenames', 'target_names', 'target', 'DESCR'])

In [4]:
# Integer class label of every training document (attribute access, as used
# by the rest of this notebook).
twenty_train.target

array([1, 1, 3, ..., 2, 2, 2])

In [5]:
# Number of training documents that were loaded.
len(twenty_train.data)

2257

## Feature Generation

In [6]:
# Bag-of-words (BoW) feature generation
from sklearn.feature_extraction.text import CountVectorizer

# Create the CountVectorizer object with its default settings
count_vect = CountVectorizer()

# Fit the vectorizer on the training documents and transform them in one step;
# the same fitted count_vect is reused later to transform new and test text
X_train_counts = count_vect.fit_transform(twenty_train.data)

# Check the shape of the resulting count matrix (documents x vocabulary terms)
X_train_counts.shape

(2257, 35788)

## Classification

In [7]:
from sklearn.naive_bayes import MultinomialNB

# Train a multinomial Naive Bayes classifier on the bag-of-words counts,
# using the integer newsgroup labels as targets.
clf = MultinomialNB().fit(X=X_train_counts, y=twenty_train.target)

In [8]:
# Sanity-check the classifier on two hand-written sentences
docs_new = [
    'God is love',
    'OpenGL on the GPU is fast',
]

# Vectorize with the already-fitted vocabulary (transform only, no refit),
# then predict a category index for each sentence
X_new_counts = count_vect.transform(docs_new)
predicted = clf.predict(X_new_counts)

In [9]:
# Predicted category indices for the sentences in docs_new.
predicted

array([3, 1])

In [13]:
# Pair each new sentence with its predicted category index.
list(zip(docs_new, predicted))

[('God is love', 3), ('OpenGL on the GPU is fast', 1)]

In [18]:
# Look up the category name stored at index 0 of target_names.
twenty_train.target_names[0]

'alt.atheism'

In [10]:
# Report each sentence next to the human-readable name of its predicted category.
for doc, category in zip(docs_new, predicted):
    # predicted holds integer indices into target_names
    category_name = twenty_train.target_names[category]
    print('%r => %s' % (doc, category_name))

'God is love' => soc.religion.christian
'OpenGL on the GPU is fast' => comp.graphics


In [17]:
# Model assessment using test data: vectorize the held-out documents with the
# vocabulary fitted on the training set (transform only, no refit)
X_test_counts = count_vect.transform(twenty_test.data)

# Predict a category index for every test document
y_pred = clf.predict(X_test_counts)

In [18]:
# Use scikit-learn's metrics module to score the test-set predictions
from sklearn import metrics

# Accuracy of the predicted labels against the true test labels
metrics.accuracy_score(y_true=twenty_test.target, y_pred=y_pred)

0.9340878828229028

In [21]:
# Macro-averaged precision and recall on the test set, displayed as a
# (precision, recall) pair
(
    metrics.precision_score(twenty_test.target, y_pred, average='macro'),
    metrics.recall_score(twenty_test.target, y_pred, average='macro'),
)

(0.934146210065757, 0.9326014303654366)

## Clustering

In [19]:
# Import KMeans Model
from sklearn.cluster import KMeans

# Cluster the training documents into 4 groups (one per newsgroup category
# loaded in this notebook).
# K-means starts from randomly chosen centroids, so without a seed the cluster
# ids assigned to documents change from run to run; random_state pins them.
# n_init is set explicitly so the number of restarts does not depend on the
# default of whichever scikit-learn version is installed.
kmeans = KMeans(n_clusters=4, n_init=10, random_state=42).fit(X_train_counts)

# Cluster id assigned to each training document
pred_labels = kmeans.labels_

In [20]:
# Cluster id assigned to each training document by k-means.
pred_labels

array([3, 3, 3, ..., 0, 2, 3], dtype=int32)

In [30]:
# True newsgroup labels, shown for comparison with the cluster ids.
twenty_train.target

array([1, 1, 3, ..., 2, 2, 2])

In [31]:
# Cluster ids again, for side-by-side comparison with the true labels above.
# NOTE(review): cluster numbering is arbitrary, so ids need not match label values.
pred_labels

array([0, 0, 0, ..., 3, 2, 0], dtype=int32)

In [21]:
from sklearn import metrics

# Densify the sparse count matrix once and reuse it for both scores; calling
# .toarray() separately for each metric would build the large dense matrix twice.
X_train_dense = X_train_counts.toarray()

# Davies-Bouldin index (lower values indicate better-separated clusters)
dbi = metrics.davies_bouldin_score(X_train_dense, pred_labels)

# Silhouette score with Euclidean distance (ranges from -1 to 1; higher is better)
ss = metrics.silhouette_score(X_train_dense, pred_labels, metric='euclidean')

# Display both scores as a (dbi, ss) pair
dbi, ss

(1.09184700574742, 0.5492415093083978)