In [4]:
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
from sklearn.datasets import fetch_20newsgroups

# Cluster the 20 newsgroups posts with K-means over TF-IDF features, then
# print the five highest-weighted vocabulary terms of every centroid.

k = 20  # Number of clusters (you can adjust this)

# Fetch the 20 newsgroups corpus, requesting subset='all'.
newsgroups = fetch_20newsgroups(subset='all')

# Build the TF-IDF matrix: English stop words are removed and the
# vocabulary is capped at 1000 features.
vectorizer = TfidfVectorizer(
    max_features=1000,
    stop_words='english',
)
X = vectorizer.fit_transform(newsgroups.data)

# Fit K-means with a fixed seed; fit() hands back the estimator itself.
kmeans = KMeans(n_clusters=k, random_state=42).fit(X)

# Rank the vocabulary columns of each centroid from largest to smallest
# weight: argsort is ascending, so every row is reversed afterwards.
terms = vectorizer.get_feature_names_out()
order_centroids = np.argsort(kmeans.cluster_centers_, axis=1)[:, ::-1]

# Report the clusters with 1-based numbering.
for i in range(k):
    top_terms = [terms[col] for col in order_centroids[i][:5]]
    print(f"Cluster {i + 1}:", top_terms, sep="\n")

Cluster 1:
['ibm', 'com', 'austin', 'disclaimer', 'edu']
Cluster 2:
['god', 'jesus', 'christian', 'bible', 'christ']
Cluster 3:
['ca', 'canada', 'bnr', 'university', 'organization']
Cluster 4:
['edu', 'subject', 'lines', 'organization', 'com']
Cluster 5:
['windows', 'dos', 'file', 'edu', 'ms']
Cluster 6:
['netcom', 'key', 'clipper', 'com', 'encryption']
Cluster 7:
['uk', 'ac', 'university', 'writes', 'lines']
Cluster 8:
['virginia', 'edu', 'university', 'organization', 'writes']
Cluster 9:
['sale', '00', 'edu', 'offer', 'condition']
Cluster 10:
['edu', 'university', 'host', 'nntp', 'posting']
Cluster 11:
['people', 'edu', 'government', 'don', 'israel']
Cluster 12:
['sgi', 'jon', 'com', 'keith', 'caltech']
Cluster 13:
['access', 'digex', 'pat', 'net', 'communications']
Cluster 14:
['cs', 'edu', 'pitt', 'science', 'article']
Cluster 15:
['drive', 'scsi', 'ide', 'drives', 'controller']
Cluster 16:
['game', 'team', 'games', 'hockey', 'edu']
Cluster 17:
['uiuc', 'cso', 'ohio', 'edu', 'state