In [1]:
from sklearn.datasets import fetch_20newsgroups
from sklearn.decomposition import TruncatedSVD
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import HashingVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import Normalizer
from sklearn import metrics

from sklearn.cluster import KMeans, MiniBatchKMeans

import logging
from optparse import OptionParser
import sys
from time import time

import numpy as np

### Выбираем категории новостей для последующей кластеризации

In [2]:
categories = [
    'alt.atheism',
    'talk.religion.misc',
    'comp.graphics',
    'sci.space',
]

### Загружаем данные по выбранным категориям

In [3]:
dataset = fetch_20newsgroups(subset='all', categories=categories,
                             shuffle=True, random_state=42)

In [16]:
dataset.data[1] # по груупам новостей

'From: agr00@ccc.amdahl.com (Anthony G Rose)\nSubject: Re: Who\'s next?  Mormons and Jews?\nReply-To: agr00@JUTS.ccc.amdahl.com (Anthony G Rose)\nOrganization: Amdahl Corporation, Sunnyvale CA\nLines: 18\n\nIn article <1993Apr20.142356.456@ra.royalroads.ca> mlee@post.RoyalRoads.ca (Malcolm Lee) writes:\n>\n>In article <C5rLps.Fr5@world.std.com>, jhallen@world.std.com (Joseph H Allen) writes:\n>|> In article <1qvk8sINN9vo@clem.handheld.com> jmd@cube.handheld.com (Jim De Arras) writes:\n>|> \n>|> It was interesting to watch the 700 club today.  Pat Robertson said that the\n>|> "Branch Dividians had met the firey end for worshipping their false god." He\n>|> also said that this was a terrible tragedy and that the FBI really blew it.\n>\n>I don\'t necessarily agree with Pat Robertson.  Every one will be placed before\n>the judgement seat eventually and judged on what we have done or failed to do\n>on this earth.  God allows people to choose who and what they want to worship.\n\nI\'m sorry,

In [5]:
labels = dataset.target
true_k = np.unique(labels).shape[0]

In [19]:
labels

array([0, 1, 1, ..., 2, 1, 1], dtype=int64)

In [20]:
true_k

4

### Создаём tf-idf векторизатор и преобразуем набор данных

In [6]:
vectorizer = TfidfVectorizer(max_df=0.5, max_features=10000,
                             min_df=2, stop_words='english',
                             use_idf=True)

In [7]:
print(vectorizer)

TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=0.5, max_features=10000, min_df=2,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words='english', strip_accents=None, sublinear_tf=False,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)


In [8]:
X = vectorizer.fit_transform(dataset.data)

In [9]:
print(X)

  (0, 4280)	0.22992141448640455
  (0, 7902)	0.1889256019903947
  (0, 9919)	0.21328232097817776
  (0, 8794)	0.30132329024982274
  (0, 4283)	0.15150641591152192
  (0, 5020)	0.13393559776294517
  (0, 1450)	0.2820108789924157
  (0, 292)	0.12435662889321993
  (0, 9667)	0.16910493634764495
  (0, 2042)	0.05383593316292927
  (0, 1004)	0.045051336007442246
  (0, 152)	0.07759116315242108
  (0, 9272)	0.22241586118986265
  (0, 4871)	0.22241586118986265
  (0, 7723)	0.22129150325593108
  (0, 8310)	0.14984287145300662
  (0, 6020)	0.21064980213680898
  (0, 2582)	0.055320543195910114
  (0, 9740)	0.08374591684764861
  (0, 68)	0.04808827112166061
  (0, 939)	0.04956170975219452
  (0, 149)	0.047357670652908794
  (0, 198)	0.04962177316072546
  (0, 274)	0.0597785123773888
  (0, 369)	0.1398730113697326
  :	:
  (3386, 9292)	0.30552157148654474
  (3386, 3544)	0.08013051548044868
  (3386, 1728)	0.2125936993383011
  (3386, 1855)	0.1093152679252292
  (3386, 4350)	0.10274431566590712
  (3386, 5957)	0.08620868011292

### Устанавливаем параметры для кластеризации и запускаем обучение модели

In [10]:
km = MiniBatchKMeans(n_clusters=true_k, init='k-means++', n_init=1,
                     init_size=1000, batch_size=1000, verbose=True)

In [11]:
km.fit(X)

Init 1/1 with method: k-means++
Inertia for init 1/1: 965.696104
Minibatch iteration 1/400: mean batch inertia: 0.975016, ewa inertia: 0.975016 
Minibatch iteration 2/400: mean batch inertia: 0.971561, ewa inertia: 0.972976 
Minibatch iteration 3/400: mean batch inertia: 0.969555, ewa inertia: 0.970957 
Minibatch iteration 4/400: mean batch inertia: 0.968936, ewa inertia: 0.969764 
Minibatch iteration 5/400: mean batch inertia: 0.967327, ewa inertia: 0.968325 
Minibatch iteration 6/400: mean batch inertia: 0.967836, ewa inertia: 0.968037 
Minibatch iteration 7/400: mean batch inertia: 0.966869, ewa inertia: 0.967347 
Minibatch iteration 8/400: mean batch inertia: 0.966152, ewa inertia: 0.966642 
Minibatch iteration 9/400: mean batch inertia: 0.965983, ewa inertia: 0.966253 
Minibatch iteration 10/400: mean batch inertia: 0.965226, ewa inertia: 0.965647 
Minibatch iteration 11/400: mean batch inertia: 0.968196, ewa inertia: 0.967151 
Minibatch iteration 12/400: mean batch inertia: 0.966

MiniBatchKMeans(batch_size=1000, compute_labels=True, init='k-means++',
        init_size=1000, max_iter=100, max_no_improvement=10, n_clusters=4,
        n_init=1, random_state=None, reassignment_ratio=0.01, tol=0.0,
        verbose=True)

### Получаем центры кластеров

In [12]:
order_centroids = km.cluster_centers_.argsort()[:, ::-1]

### Выводим признаки, характеризующие центры кластеров

In [13]:
terms = vectorizer.get_feature_names()
for i in range(true_k):
    print("Cluster %d:" % i)
    for ind in order_centroids[i, :10]:
        print(' %s' % terms[ind])

Cluster 0:
 keith
 sgi
 livesey
 caltech
 wpd
 solntze
 cwru
 jon
 com
 cc
Cluster 1:
 graphics
 com
 university
 posting
 host
 nntp
 image
 thanks
 cs
 computer
Cluster 2:
 god
 com
 sandvik
 people
 jesus
 don
 say
 christian
 kent
 bible
Cluster 3:
 space
 henry
 access
 toronto
 digex
 nasa
 pat
 alaska
 shuttle
 zoo
