In [15]:
import numpy as np
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import CountVectorizer
import nltk
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import re
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('wordnet')
nltk.download('stopwords')
newsgroups_train = fetch_20newsgroups(
    subset='train',
    remove=('headers', 'footers', 'quotes')
)
data = newsgroups_train.data
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))
def advanced_tokenizer(text):
    text = re.sub(r'[^a-zA-Z\s]', '', text.lower())
    tokens = word_tokenize(text)
    tokens = [t for t in tokens if len(t) > 2 and t not in stop_words]
    lemmas = []
    for t in tokens:
        lemma = lemmatizer.lemmatize(t)           
        lemma = lemmatizer.lemmatize(lemma, 'v')   
        lemma = lemmatizer.lemmatize(lemma, 'a')   
        lemma = lemmatizer.lemmatize(lemma, 'r')   
        lemma = lemmatizer.lemmatize(lemma, 's')   
        lemmas.append(lemma)
    return lemmas
vectorizer = CountVectorizer(
    tokenizer=advanced_tokenizer,
    min_df=10,
    max_df=0.4,
    lowercase=False
)
X_train = vectorizer.fit_transform(data)
feature_names = vectorizer.get_feature_names_out()
D = X_train.shape[0]
V = X_train.shape[1]
docs_idx = []
words_idx = []
for d in range(D):
    row = X_train[d]
    for word_id, count in zip(row.indices, row.data):
        docs_idx.extend([d] * count)
        words_idx.extend([word_id] * count)

docs_idx = np.array(docs_idx, dtype=int)
words_idx = np.array(words_idx, dtype=int)
W = len(docs_idx)
K = 20
alpha = 0.1
beta = 0.01
num_iter = 150     
np.random.seed(42)
z = np.random.randint(0, K, size=W)
ndk = np.zeros((D, K))
nkw = np.zeros((K, V))
nk = np.zeros(K)
for i in range(W):
    d = docs_idx[i]
    w = words_idx[i]
    k = z[i]
    ndk[d, k] += 1
    nkw[k, w] += 1
    nk[k] += 1
beta_V = beta * V
for iteration in range(num_iter):
    print(f"Iteration {iteration + 1}/{num_iter}")
    for i in range(W):
        d = docs_idx[i]
        w = words_idx[i]
        old_k = z[i]
        ndk[d, old_k] -= 1
        nkw[old_k, w] -= 1
        nk[old_k] -= 1
        p = (ndk[d] + alpha) * (nkw[:, w] + beta) / (nk + beta_V)
        p /= p.sum()
        new_k = np.random.choice(K, p=p)
        z[i] = new_k
        ndk[d, new_k] += 1
        nkw[new_k, w] += 1
        nk[new_k] += 1
phi = (nkw + beta) / (nk[:, np.newaxis] + beta_V)
print("Топ 10 слов по каждому тегу")
for k in range(K):
    top_idx = np.argsort(phi[k])[-10:][::-1]
    top_words = [feature_names[i] for i in top_idx]
    print(f"Topic {k:2d}: {' | '.join(top_words)}")

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\zahar\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\zahar\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\zahar\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\zahar\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Iteration 1/150
Iteration 2/150
Iteration 3/150
Iteration 4/150
Iteration 5/150
Iteration 6/150
Iteration 7/150
Iteration 8/150
Iteration 9/150
Iteration 10/150
Iteration 11/150
Iteration 12/150
Iteration 13/150
Iteration 14/150
Iteration 15/150
Iteration 16/150
Iteration 17/150
Iteration 18/150
Iteration 19/150
Iteration 20/150
Iteration 21/150
Iteration 22/150
Iteration 23/150
Iteration 24/150
Iteration 25/150
Iteration 26/150
Iteration 27/150
Iteration 28/150
Iteration 29/150
Iteration 30/150
Iteration 31/150
Iteration 32/150
Iteration 33/150
Iteration 34/150
Iteration 35/150
Iteration 36/150
Iteration 37/150
Iteration 38/150
Iteration 39/150
Iteration 40/150
Iteration 41/150
Iteration 42/150
Iteration 43/150
Iteration 44/150
Iteration 45/150
Iteration 46/150
Iteration 47/150
Iteration 48/150
Iteration 49/150
Iteration 50/150
Iteration 51/150
Iteration 52/150
Iteration 53/150
Iteration 54/150
Iteration 55/150
Iteration 56/150
Iteration 57/150
Iteration 58/150
Iteration 59/150
Iterat

(код работал более 1.5 часов, поэтому я отслеживал итерации)
Чистыми темами получились:
Topic 1: misc.forsale
Topic 3: soc.religion.christian
Topic 4: rec.autos
Topic 8: sci.space
Topic 13: sci.med
Topic 14: rec.sport.hockey
Topic 17: talk.politics.guns
Topic 18: sci.crypt
Topic 19: comp.sys.ibm.pc.hardware

Почти чистыми (есть общие или лишние или мусорные слова) получились:
Topic 2: rec.motorcycles (есть мусорное слово maxaxaxa)
Topic 5: talk.politics.mideast (была раскрыта лишь часть связанная с армяно турецким конфликтом)
Topic 6: talk.politics.mideast (та же тема, но здесь про арабов и евреев, поэтому выделилась в отдельную тему)
Topic 7: talk.politics.misc (есть общие слова)
Topic 9: comp.graphics (пересечения с другими темами)
Topic 11: comp.os.ms-windows.misc / comp.windows.x (сложно определить конкретно)
Остальные в основном состоят из общих слов. 
(можно конечно уменьшить альфа и увеличить итерации, но тогда компьютер будет обрабатывать это более 2 часов) 
