In [1]:
import numpy as np
from sklearn.datasets import fetch_20newsgroups

In [2]:
import ssl
ssl._create_default_https_context = ssl._create_unverified_context

In [3]:
newsgroups_train = fetch_20newsgroups(subset='train', remove=('headers', 'footers', 'quotes'))

In [24]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.stop_words import ENGLISH_STOP_WORDS

vectorizer = CountVectorizer(lowercase=True, stop_words=ENGLISH_STOP_WORDS,
                             analyzer='word', binary=True, min_df=10, max_df=1.0)
vectorizer.fit(newsgroups_train.data);

In [23]:
X_train = vectorizer.fit_transform(newsgroups_train.data)

(11314, 10441)

### Запускаем модель LDA и Gibbs Sampling.

In [6]:
import time

def lda(X, tags_n, alpha, beta, iter_):
    n_kw = np.zeros((tags_n, X.shape[1]))
    n_dk = np.zeros((X.shape[0], tags_n))
    n_k = np.zeros(tags_n)
    d, w = X.nonzero()
    docs, words = X.nonzero()
    z = np.random.choice(tags_n, docs.shape[0])
    # assign random tags
    for d, w, z_ in zip(docs, words, z):
        n_dk[d, z_] += 1
        n_kw[z_, w] += 1
        n_k[z_] += 1
    # run the algo
    for it in range(iter_):
        for idx, d in enumerate(docs):
            w = words[idx]
            t = z[idx]
            n_dk[d, t] -= 1
            n_kw[t, w] -= 1
            n_k[t] -= 1
            p = (n_dk[d, :] + alpha)*(n_kw[:, w] + beta[w])/(n_k + beta.sum())
            z[idx] = np.random.choice(np.arange(tags_n), p=p/p.sum())
            n_dk[d, z[idx]] += 1
            n_kw[z[idx], w] += 1
            n_k[z[idx]] += 1
    return z, n_kw, n_dk, n_k

In [7]:
tags_n = 20
start = time.time()
z, n_kw, n_dk, n_k = lda(X_train, 
                         tags_n, 
                         alpha=np.ones(tags_n), 
                         beta=np.ones(X_train.shape[1]), 
                         iter_=50)
end = time.time()

In [16]:
print(f'Time:{np.round((end - start)/60, 0)} min {np.mod(np.round(end - start, 0),60)} s')

Time:28.0 min 34.0 s


#### Выведем топ 10

In [17]:
top_10 = np.argsort(n_kw, axis=1)[:, -10:]

In [18]:
print('Topics:')
for idx, t in enumerate(top_10):
    d = np.zeros((1, X_train.shape[1]))
    for w in t:
        d[0, w] = 1
    topic = ', '.join(vectorizer.inverse_transform(d)[0])
    print(f'{idx}- {topic};')
print(f'\n targets: {newsgroups_train.target_names}')

Topics:
0- 10, 1993, date, edu, following, information, internet, mail, research, university;
1- actually, did, doesn, doing, heard, let, maybe, sure, tell, understand;
2- agree, believe, case, doesn, fact, mean, point, reason, things, true;
3- ago, believe, com, day, did, edu, ll, long, quite, read;
4- card, disk, drive, edu, hard, mac, monitor, offer, sale, video;
5- believe, bible, christ, christian, christians, god, jesus, life, man, religion;
6- children, country, government, israel, killed, rights, said, state, war, world;
7- bike, buy, car, cars, drive, engine, looking, miles, road, speed;
8- 10, game, games, league, play, players, season, team, win, year;
9- 14, ah, end, ll, ma, max, mi, mr, ms, tm;
10- advance, appreciated, help, hi, mail, problem, running, using, windows, work;
11- american, clinton, government, house, national, president, public, states, support, white;
12- article, com, edu, heard, interested, mail, news, post, really, stuff;
13- available, code, file, ftp,

Среди полученных тем можно разобрать такие как:
4. mac hardware
5. religion
6. politics
7. autos
8. sport
9. os ms-windows
10. os windows
11. politics
13. windows x
14. cryptography
15. pc hardware 