In [1]:
import numpy as np
from sklearn.datasets import fetch_20newsgroups

newsgroups_train = fetch_20newsgroups(subset='train', remove=('headers', 'footers', 'quotes'))

Downloading 20news dataset. This may take a few minutes.
Downloading dataset from https://ndownloader.figshare.com/files/5975967 (14 MB)


In [3]:
newsgroups_train.target_names

['alt.atheism',
 'comp.graphics',
 'comp.os.ms-windows.misc',
 'comp.sys.ibm.pc.hardware',
 'comp.sys.mac.hardware',
 'comp.windows.x',
 'misc.forsale',
 'rec.autos',
 'rec.motorcycles',
 'rec.sport.baseball',
 'rec.sport.hockey',
 'sci.crypt',
 'sci.electronics',
 'sci.med',
 'sci.space',
 'soc.religion.christian',
 'talk.politics.guns',
 'talk.politics.mideast',
 'talk.politics.misc',
 'talk.religion.misc']

In [4]:
newsgroups_train.target[:10]

array([ 7,  4,  4,  1, 14, 16, 13,  3,  2,  4])

In [6]:
n = 853
print('Topic = {0}\n'.format(newsgroups_train.target_names[newsgroups_train.target[n]]))
print(newsgroups_train.data[n])

Topic = sci.crypt

Archive-name: cryptography-faq/part09
Last-modified: 1993/4/15


FAQ for sci.crypt, part 9: Other Miscellany

This is the ninth of ten parts of the sci.crypt FAQ. The parts are
mostly independent, but you should read the first part before the rest.
We don't have the time to send out missing parts by mail, so don't ask.
Notes such as ``[KAH67]'' refer to the reference list in the last part.

The sections of this FAQ are available via anonymous FTP to rtfm.mit.edu 
as /pub/usenet/news.answers/cryptography-faq/part[xx].  The Cryptography 
FAQ is posted to the newsgroups sci.crypt, sci.answers, and news.answers 
every 21 days.


Contents:

* What is the National Security Agency (NSA)?
* What are the US export regulations?
* What is TEMPEST?
* What are the Beale Ciphers, and are they a hoax?
* What is the American Cryptogram Association, and how do I get in touch?
* Is RSA patented?
* What about the Voynich manuscript?


* What is the National Security Agency (NSA)?

  Th

In [24]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.stop_words import ENGLISH_STOP_WORDS

vectorizer = CountVectorizer(lowercase=True, stop_words=ENGLISH_STOP_WORDS,
                             analyzer='word', binary=True, min_df=20, max_df=.04)
vectorizer.fit(newsgroups_train.data)

CountVectorizer(analyzer='word', binary=True, decode_error='strict',
                dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
                lowercase=True, max_df=0.04, max_features=None, min_df=20,
                ngram_range=(1, 1), preprocessor=None,
                stop_words=frozenset({'a', 'about', 'above', 'across', 'after',
                                      'afterwards', 'again', 'against', 'all',
                                      'almost', 'alone', 'along', 'already',
                                      'also', 'although', 'always', 'am',
                                      'among', 'amongst', 'amoungst', 'amount',
                                      'an', 'and', 'another', 'any', 'anyhow',
                                      'anyone', 'anything', 'anyway',
                                      'anywhere', ...}),
                strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
                tokenizer=None, vocabulary=None)

In [25]:
len(vectorizer.vocabulary_)

5877

In [26]:
vectorizer.vocabulary_

{'wondering': 5813,
 'car': 989,
 'saw': 4673,
 'door': 1791,
 'sports': 5027,
 'looked': 3198,
 'late': 3055,
 'early': 1851,
 'doors': 1792,
 'small': 4928,
 'addition': 360,
 'separate': 4773,
 'rest': 4509,
 'body': 835,
 'model': 3443,
 'engine': 1947,
 'specs': 5003,
 'production': 4152,
 'history': 2591,
 'info': 2778,
 'fair': 2123,
 'souls': 4971,
 'upgraded': 5576,
 'si': 4856,
 'clock': 1156,
 'shared': 4820,
 'experiences': 2069,
 'poll': 4007,
 'brief': 881,
 'message': 3366,
 'procedure': 4136,
 'speed': 5008,
 'cpu': 1452,
 'rated': 4305,
 'add': 357,
 'cards': 991,
 'adapters': 355,
 'heat': 2556,
 'hour': 2637,
 'usage': 5585,
 'floppy': 2245,
 'disk': 1745,
 'functionality': 2333,
 '800': 258,
 'floppies': 2244,
 'especially': 1989,
 'requested': 4472,
 'days': 1538,
 'network': 3583,
 'knowledge': 3020,
 'base': 724,
 'upgrade': 5575,
 'haven': 2539,
 'answered': 497,
 'folks': 2253,
 'mac': 3227,
 'plus': 3991,
 'finally': 2202,
 'gave': 2359,
 'weekend': 5754,
 'st

In [27]:
vectorizer.vocabulary_.get('separate')

4773

In [28]:
text = 'I was wondering if anyone out there could enlighten me on this car I saw'
x = vectorizer.transform([text])

In [29]:
type(x)

scipy.sparse.csr.csr_matrix

In [30]:
x.data

array([1, 1, 1], dtype=int64)

In [31]:
x.nonzero()

(array([0, 0, 0]), array([ 989, 4673, 5813]))

In [32]:
x.toarray()

array([[0, 0, 0, ..., 0, 0, 0]], dtype=int64)

In [33]:
vectorizer.inverse_transform(x)

[array(['car', 'saw', 'wondering'], dtype='<U18')]

In [34]:
X_train = vectorizer.fit_transform(newsgroups_train.data)
X_train.shape

(11314, 5877)

In [35]:
X_train.nnz / np.prod(X_train.shape)

0.006326439400317432

# THE TASK

In [41]:
from tqdm import tqdm

def lda(X, our_topics, alpha, beta, itera=10):
    our_key = np.zeros((our_topics, X.shape[1]))
    our_dk = np.zeros((X.shape[0], our_topics))
    our_k = np.zeros(our_topics)
    
    docs, words = X.nonzero()
    z = np.random.choice(our_topics, len(docs))
    
    for doc, word, current_z in zip(docs, words, z):
        our_dk[doc, current_z] += 1
        our_key[current_z, word] += 1
        our_k[current_z] += 1
    
    for current_itera in tqdm(range(itera)):
        for i in range(len(docs)):
            current_word = words[i]
            current_doc = docs[i]
            current_topic = z[i]
            
            our_dk[current_doc, current_topic] -= 1
            our_key[current_topic, current_word] -= 1
            our_k[current_topic] -= 1
            
            p = (our_dk[current_doc, :] + alpha) * (our_key[:, current_word] + beta[current_word]) / \
                (our_k + beta.sum())
            z[i] = np.random.choice(np.arange(our_topics), p=p / p.sum())
            
            our_dk[current_doc, z[i]] += 1
            our_key[z[i], current_word] += 1
            our_k[z[i]] += 1
    
    return z, our_key, our_dk, our_k


In [42]:
our_topics = 20
z, our_key, our_dk, our_k = lda(X_train, our_topics, 1 * np.ones(our_topics), \
                         1 * np.ones(X_train.shape[1]), 60)

100%|██████████| 60/60 [19:43<00:00, 19.29s/it]


In [45]:
top_words = np.argsort(our_key, axis=1)[:, :-11:-1]

for topic in range(20):
    doc = np.zeros((1, X_train.shape[1]))
    for word in top_words[topic]:
        doc[0, word] = 1
    print('Topic {}:\t{}'.format(topic, '\t'.join(vectorizer.inverse_transform(doc)[0])))

Topic 0:	asked	came	days	happened	later	left	saw	told	took	went
Topic 1:	area	bike	dod	feel	looks	nice	oh	ride	stay	stuff
Topic 2:	banks	cadre	geb	gordon	intellect	pitt	shameful	skepticism	soon	surrender
Topic 3:	clinton	control	crime	federal	gun	guns	law	laws	rights	states
Topic 4:	bible	christ	christian	christians	church	faith	jesus	man	religion	word
Topic 5:	cause	certain	disease	effect	food	large	results	treatment	type	usually
Topic 6:	application	code	file	files	ftp	running	server	user	version	window
Topic 7:	advance	anybody	appreciate	appreciated	hi	info	news	sorry	thank	wondering
Topic 8:	au	ca	cs	david	ed	hi	michael	mr	ms	university
Topic 9:	center	development	earth	nasa	research	science	space	systems	technology	university
Topic 10:	history	israel	israeli	jewish	jews	killed	land	population	today	war
Topic 11:	box	condition	control	current	low	output	radio	sale	sell	sound
Topic 12:	aren	couldn	couple	david	guess	hear	sorry	tried	wonder	wouldn
Topic 13:	algorithm	chip	clipper	enc

Мы можем найти определённые топики, к которым относятся слова из этого списка в заданных темах, например:
Топик 18: электроника.
Топик 17: хоккей или бейсбол.
Топик 16: автомобили.
Топик 13: шифрование.
Топик 10: религия.
Топик 9: космос.
Топик 6: устройство ос windows.
Топик 5: медицина.
Топик 4: христианская религия.
Топик 3: политика.
Топик 1: возможно, мотоциклы.


In [44]:
newsgroups_train.target_names

['alt.atheism',
 'comp.graphics',
 'comp.os.ms-windows.misc',
 'comp.sys.ibm.pc.hardware',
 'comp.sys.mac.hardware',
 'comp.windows.x',
 'misc.forsale',
 'rec.autos',
 'rec.motorcycles',
 'rec.sport.baseball',
 'rec.sport.hockey',
 'sci.crypt',
 'sci.electronics',
 'sci.med',
 'sci.space',
 'soc.religion.christian',
 'talk.politics.guns',
 'talk.politics.mideast',
 'talk.politics.misc',
 'talk.religion.misc']