In [18]:
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import NMF

from nltk import word_tokenize, pos_tag, download
from nltk.stem import WordNetLemmatizer

In [19]:
# Download the NLTK resources that the preprocessing step relies on.
# `download` reports packages that are already up to date instead of fetching
# them again, so re-running this cell is cheap.
# NOTE(review): recent NLTK releases reportedly look up `punkt_tab` and
# `averaged_perceptron_tagger_eng` instead of the last two packages below —
# confirm against the installed NLTK version.

download("wordnet")  # WordNet data backing WordNetLemmatizer
download("omw-1.4")  # presumably companion data for WordNet lookups — verify it is still required
download("punkt")  # tokenizer models used by word_tokenize
download("averaged_perceptron_tagger")  # tagger model used by pos_tag

[nltk_data] Downloading package wordnet to /home/max/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /home/max/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package punkt to /home/max/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /home/max/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

In [20]:
# parameters

# n_docs is used as a slice end (`data[:n_docs]`), so "all documents" must be
# None: a value of -1 would silently drop the last document.
n_docs = None # number of documents to use; None = all documents (takes a while to preprocess!)
n_topics = 20 # number of topics to extract (rank)
max_iter = 1000 # maximum number of iterations in the NMF
n_top_words = 10 # number of words to display for each topic

In [21]:
# load dataset
# Only the raw post texts are needed; headers, footers and quoted replies are
# stripped before the texts are returned.
# The texts are read from the returned Bunch instead of unpacking
# `data, _ = ...`: in IPython/Jupyter `_` holds the previous cell output, and
# assigning to it shadows that convenience variable.

data = fetch_20newsgroups(
    shuffle=True,
    random_state=0,
    remove=("headers", "footers", "quotes"),
).data

In [22]:
# preprocess data

def preprocess(text: str) -> str:
    """Reduce a document to a space-separated string of lemmatized nouns.

    The text is lowercased and tokenized, tokens that are not purely
    alphanumeric are discarded, the remaining tokens are POS-tagged, and only
    those tagged NN, NNS or NNP are kept and lemmatized as nouns.

    Args:
        text: Raw document text.

    Returns:
        The lemmatized nouns joined by single spaces, in their original order
        (an empty string if no token survives the filtering).
    """
    noun_tags = ("NN", "NNS", "NNP")
    lemmatizer = WordNetLemmatizer()

    # Tagging happens after lowercasing and after punctuation-like tokens
    # have been removed.
    alnum_tokens = list(filter(str.isalnum, word_tokenize(text.lower())))
    tagged_tokens = pos_tag(alnum_tokens)

    return " ".join(
        lemmatizer.lemmatize(word, pos="n")
        for word, tag in tagged_tokens
        if tag in noun_tags
    )

# `None` or a negative n_docs means "use every document"; slicing with -1
# directly would silently drop the last document.
selected_docs = data if n_docs is None or n_docs < 0 else data[:n_docs]
# Materialize the result as a list: a bare `map` object is a one-shot
# iterator, so re-running a later cell that consumes it would see no documents.
data_pped = [preprocess(doc) for doc in selected_docs]

In [23]:
# Vectorize the preprocessed documents using term frequency-inverse document
# frequency (TF-IDF), dropping scikit-learn's built-in English stop words.
# `vectorizer` is kept around because its fitted vocabulary is needed later
# to map term indices back to words.
vectorizer = TfidfVectorizer(stop_words="english")
doc_vectors = vectorizer.fit_transform(data_pped)  # one row per document, one column per term

In [24]:
# Use NMF to factorize the document vectors into `n_topics` non-negative
# components. `fit_transform` returns the per-document topic weights, while
# the per-topic term weights are stored on `nmf.components_`.
# random_state is pinned so repeated runs start from the same initialization.
nmf = NMF(n_components=n_topics, init="nndsvda", random_state=0, max_iter=max_iter)
topic_vectors = nmf.fit_transform(doc_vectors)

In [26]:
# print each topic with its highest-weighted words
# The vocabulary is fetched once up front: get_feature_names_out() rebuilds
# the full feature-name array on every call, so calling it once per printed
# word is wasteful.
feature_names = vectorizer.get_feature_names_out()
for topic_idx, topic in enumerate(nmf.components_):
    # argsort is ascending, so reverse it and keep the first n_top_words indices
    top_words_idx = topic.argsort()[::-1][:n_top_words]
    top_words = feature_names[top_words_idx]
    print(f"Topic {topic_idx}:", *top_words)

Topic 0: people thing life person armenian world religion point government lot
Topic 1: window application manager font server memory screen display client size
Topic 2: drive disk scsi controller hd ide tape cable mac jumper
Topic 3: team player year league season fan hockey run nhl baseball
Topic 4: chip key encryption clipper phone algorithm escrow government number bit
Topic 5: card video bus slot memory controller port mode pc ram
Topic 6: car engine dealer mile price model owner speed tire power
Topic 7: thanks advance hi info anybody reply help information response question
Topic 8: god jesus christian faith christ belief life religion word truth
Topic 9: file directory format disk image ftp site utility help copy
Topic 10: program image space code help graphic widget job source value
Topic 11: problem error machine memory solution mouse duo apple screen line
Topic 12: driver version printer mouse diamond bb mode site ftp color
Topic 13: law gun government right state crime weap