In [33]:
import numpy as np
from sklearn.datasets import fetch_20newsgroups

# Load the training split of 20 Newsgroups with headers, footers and quoted
# replies stripped from every post, then list the newsgroup names.
newsgroups_train = fetch_20newsgroups(
    subset='train',
    remove=('headers', 'footers', 'quotes'),
)
newsgroups_train.target_names

['alt.atheism',
 'comp.graphics',
 'comp.os.ms-windows.misc',
 'comp.sys.ibm.pc.hardware',
 'comp.sys.mac.hardware',
 'comp.windows.x',
 'misc.forsale',
 'rec.autos',
 'rec.motorcycles',
 'rec.sport.baseball',
 'rec.sport.hockey',
 'sci.crypt',
 'sci.electronics',
 'sci.med',
 'sci.space',
 'soc.religion.christian',
 'talk.politics.guns',
 'talk.politics.mideast',
 'talk.politics.misc',
 'talk.religion.misc']

In [34]:
from sklearn.feature_extraction.text import CountVectorizer
# The old `sklearn.feature_extraction.stop_words` module was deprecated in
# scikit-learn 0.22 and removed afterwards; the constant is available from
# `sklearn.feature_extraction.text`. The import is kept so the name stays
# available to any other cell that may reference it.
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS

# Binary bag-of-words over lower-cased word tokens.
vectorizer = CountVectorizer(
    lowercase=True,
    # 'english' selects scikit-learn's built-in list (the same words as
    # ENGLISH_STOP_WORDS); recent releases reject a frozenset here.
    stop_words='english',
    analyzer='word',
    binary=True,   # record presence/absence of a word, not its count
    min_df=10,     # document-frequency lower cut-off (absolute count)
    max_df=.04,    # document-frequency upper cut-off (fraction of documents)
)
vectorizer.fit(newsgroups_train.data)

CountVectorizer(binary=True, max_df=0.04, min_df=10,
                stop_words=frozenset({'a', 'about', 'above', 'across', 'after',
                                      'afterwards', 'again', 'against', 'all',
                                      'almost', 'alone', 'along', 'already',
                                      'also', 'although', 'always', 'am',
                                      'among', 'amongst', 'amoungst', 'amount',
                                      'an', 'and', 'another', 'any', 'anyhow',
                                      'anyone', 'anything', 'anyway',
                                      'anywhere', ...}))

In [35]:
# Number of distinct terms kept in the fitted vocabulary.
len(vectorizer.vocabulary_)

10299

In [36]:
# Show the fitted term -> column-index mapping.
vectorizer.vocabulary_

{'wondering': 10138,
 'enlighten': 3576,
 'car': 1905,
 'saw': 8221,
 'door': 3288,
 'sports': 8787,
 'looked': 5726,
 'late': 5506,
 '60s': 542,
 'early': 3399,
 '70s': 593,
 'doors': 3289,
 'small': 8633,
 'addition': 824,
 'bumper': 1791,
 'separate': 8381,
 'rest': 7936,
 'body': 1628,
 'model': 6155,
 'engine': 3560,
 'specs': 8750,
 'production': 7337,
 'history': 4668,
 'info': 4995,
 'fair': 3856,
 'brave': 1710,
 'souls': 8705,
 'upgraded': 9730,
 'si': 8518,
 'clock': 2185,
 'oscillator': 6697,
 'shared': 8445,
 'experiences': 3762,
 'poll': 7118,
 'brief': 1731,
 'message': 6021,
 'detailing': 3061,
 'procedure': 7318,
 'speed': 8756,
 'cpu': 2689,
 'rated': 7616,
 'add': 821,
 'cards': 1912,
 'adapters': 818,
 'heat': 4598,
 'hour': 4740,
 'usage': 9748,
 'floppy': 4047,
 'disk': 3202,
 'functionality': 4198,
 '800': 620,
 'floppies': 4046,
 'especially': 3635,
 'requested': 7883,
 'days': 2867,
 'network': 6398,
 'knowledge': 5439,
 'base': 1435,
 'upgrade': 9729,
 'haven'

In [37]:
# Build the document-term matrix for the training posts.
# NOTE(review): fit_transform fits the vectorizer again on the same data it was
# already fitted on in an earlier cell; presumably one of the two fits is redundant.
X_train = vectorizer.fit_transform(newsgroups_train.data)

In [9]:
# (number of rows, number of columns) of the document-term matrix.
X_train.shape

(11314, 10299)

In [39]:
def lda(X, n_tags, n_iter, a, b, random_state=None):
    """Assign a topic to every (document, word) occurrence by collapsed Gibbs sampling.

    Each non-zero cell of ``X`` is treated as exactly one token; the value
    stored in the cell is not used, only its position.

    Parameters
    ----------
    X : matrix-like
        Document-term matrix (rows are documents, columns are words). Must
        expose ``.shape`` and ``.nonzero()``.
    n_tags : int
        Number of topics.
    n_iter : int
        Number of full sweeps over all tokens.
    a : array-like
        Value added to the document-topic counts; must broadcast against a
        vector of length ``n_tags``.
    b : numpy.ndarray
        Per-word value added to the topic-word counts, one entry per column
        of ``X``.
    random_state : int or None, optional
        Seed for a private ``numpy.random.RandomState``. With ``None`` (the
        default) the global ``numpy.random`` state is used.

    Returns
    -------
    n_kw : numpy.ndarray, shape (n_tags, X.shape[1])
        Number of tokens of each word assigned to each topic.
    n_dk : numpy.ndarray, shape (X.shape[0], n_tags)
        Number of tokens of each document assigned to each topic.
    n_k : numpy.ndarray, shape (n_tags,)
        Total number of tokens assigned to each topic.
    tags : numpy.ndarray
        Final topic of every token, in the order returned by ``X.nonzero()``.
    """
    rng = np.random if random_state is None else np.random.RandomState(random_state)

    # Size everything from the `n_tags` argument rather than from a global.
    n_kw = np.zeros((n_tags, X.shape[1]))
    n_dk = np.zeros((X.shape[0], n_tags))
    n_k = np.zeros(n_tags)

    docs, words = X.nonzero()
    # Random initial topic for every token, then build the count tables.
    tags = rng.choice(n_tags, len(docs))
    for w, d, k in zip(words, docs, tags):
        n_kw[k, w] += 1
        n_dk[d, k] += 1
        n_k[k] += 1

    # Loop invariants.
    b_total = b.sum()
    topic_ids = np.arange(n_tags)

    for _ in range(n_iter):
        for i in range(len(docs)):
            d, w, k = docs[i], words[i], tags[i]

            # Take the current token out of the counts before resampling it.
            n_kw[k, w] -= 1
            n_dk[d, k] -= 1
            n_k[k] -= 1

            # Unnormalised conditional probability of each topic for this token.
            p = (n_dk[d, :] + a) * (n_kw[:, w] + b[w]) / (n_k + b_total)

            k = rng.choice(topic_ids, p=p / p.sum())
            tags[i] = k

            # Put the token back under its newly sampled topic.
            n_kw[k, w] += 1
            n_dk[d, k] += 1
            n_k[k] += 1
    return n_kw, n_dk, n_k, tags
        
    

In [40]:
# Seed NumPy's global generator so the sampled topics are the same on every run.
np.random.seed(0)

n = 20  # number of topics
# 50 sampling sweeps; both count offsets (`a` per topic, `b` per word) are all ones.
n_kw, n_dk, n_k, tags = lda(X_train, n, 50, np.ones(n), np.ones(X_train.shape[1]))

In [42]:
# Column indices of the 10 largest counts in every row of n_kw (one row per topic).
res = np.argsort(n_kw, axis=1)[:, -10:]
# Number the topics with enumerate instead of reusing the global `n` as a counter.
for topic_id, tag in enumerate(res):
    # Mark the selected columns in a single-row indicator vector so the
    # vectorizer can map them back to words.
    v = np.zeros((1, X_train.shape[1]))
    v[0, tag] = 1
    print(str(topic_id))
    print(vectorizer.inverse_transform(v)[0])

0
['american' 'control' 'crime' 'gun' 'law' 'laws' 'national' 'public'
 'rights' 'states']
1
['chip' 'clipper' 'encryption' 'key' 'keys' 'phone' 'public' 'secret'
 'secure' 'security']
2
['bike' 'car' 'cars' 'dod' 'engine' 'miles' 'ride' 'road' 'speed' 'turn']
3
['add' 'appreciated' 'current' 'hear' 'oh' 'similar' 'small' 'sorry'
 'sounds' 'stuff']
4
['anybody' 'days' 'hi' 'ones' 'regards' 'seriously' 'sorry' 'sounds'
 'talking' 'wondering']
5
['article' 'david' 'folks' 'mentioned' 'mind' 'somebody' 'sounds' 'thank'
 'understand' 'working']
6
['anybody' 'couldn' 'couple' 'guess' 'haven' 'imagine' 'nice' 'sorry'
 'sort' 'stuff']
7
['game' 'games' 'hockey' 'league' 'play' 'player' 'players' 'season'
 'team' 'win']
8
['agree' 'answer' 'anybody' 'area' 'btw' 'couple' 'knows' 'likely' 'stuff'
 'willing']
9
['bible' 'christ' 'christian' 'christians' 'claim' 'faith' 'jesus' 'man'
 'religion' 'word']
10
['14' '25' 'ad' 'ah' 'hi' 'mb' 'mi' 'mr' 'ms' 'tm']
11
['100' 'buy' 'computer' 'condition' 

Можно увидеть темы: политика (0), криптография (1), машины (2), хоккей (7), религия (9), покупки (11), новости (12), исследования (18) и т. д.