# Load and View Newsgroups Data

In [None]:
from sklearn.datasets import fetch_20newsgroups

def load_newsgroup_data(categories=None, subset='train'):
    """Fetch part of the 20 Newsgroups corpus through scikit-learn.

    Args:
        categories: Newsgroup names to keep; forwarded unchanged to
            ``fetch_20newsgroups`` (``None`` is passed through as-is).
        subset: Which split to request; forwarded unchanged.

    Returns:
        Whatever ``fetch_20newsgroups`` returns. Later cells read its
        ``data``, ``target`` and ``target_names`` attributes.
    """
    return fetch_20newsgroups(subset=subset, categories=categories)

# The five newsgroups this notebook works with.
categories = [
    'comp.graphics',
    'talk.politics.guns',
    'alt.atheism',
    'sci.med',
    'sci.space',
]
# subset is not given, so the loader's default ('train') applies.
data = load_newsgroup_data(categories=categories)


# Text Preprocessing

In [None]:
import nltk
import re
import string
from nltk.corpus import stopwords

# Make sure the NLTK stop-word corpus read by preprocess() is available locally.
nltk.download('stopwords')

# Stop-word sets keyed by language, filled lazily so the NLTK corpus is read
# at most once per kernel rather than once per token.
_STOP_WORDS_CACHE = {}

# Translation table that deletes every ASCII punctuation character.
_PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)


def _load_stop_words(language='english'):
    """Return NLTK's stop words for ``language`` as a cached frozenset."""
    if language not in _STOP_WORDS_CACHE:
        _STOP_WORDS_CACHE[language] = frozenset(stopwords.words(language))
    return _STOP_WORDS_CACHE[language]


def preprocess(text, stop_words=None):
    """Normalise one document for bag-of-words vectorisation.

    Steps, in order: lowercase, delete runs of digits, delete ASCII
    punctuation, split on whitespace, drop stop words, re-join with spaces.

    Args:
        text: The raw document as a string.
        stop_words: Optional iterable of lowercase words to drop. When
            ``None`` (the default) the NLTK English stop-word list is used,
            which is what this function always did before the parameter
            existed.

    Returns:
        The cleaned document as a single space-separated string (empty if
        nothing survives filtering).
    """
    if stop_words is None:
        stop_words = _load_stop_words('english')
    elif not isinstance(stop_words, (set, frozenset)):
        # Hash-based membership keeps the filter below linear in the token count.
        stop_words = frozenset(stop_words)

    text = text.lower()
    text = re.sub(r'\d+', '', text)
    # NOTE(review): punctuation is removed before stop-word filtering, so
    # contractions lose their apostrophe first ("don't" -> "dont") and may no
    # longer match the stop-word list. Kept as-is to preserve existing output.
    text = text.translate(_PUNCTUATION_TABLE)
    tokens = [word for word in text.split() if word not in stop_words]
    return ' '.join(tokens)

# Clean every raw post, then print the first cleaned document as a sanity check.
processed_data = list(map(preprocess, data.data))
print(processed_data[0])


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


mhamiltonimitzmcskentedu lawnmowerman subject atf burns dividian ranch survivors keywords nata thing nntppostinghost nimitzmcskentedu replyto matthew hamilton organization kent state university cs lines article rjlttransferstratuscom cdtswstratuscom c tavares writes article aprmcskentedu mhamiltonimitzmcskentedu lawnmowerman writes oh guess shooting kind babies right sick bastard cdtrocketswstratuscom believe speak company cdtvosstratuscom write today special investors packet thanks reply post way never never ever said right shoot kind babies however branch davidian people insisted staying savior yeah right budy boy brainwashed believing ever says truth even means give lives cause therefore davids fault atfs gave days get days many many rest us however sad hear death child unlike sick bastard supposedly matthew r hamilton mhamiltomcskentedu aka cs physics major hksuvxbkentedu lawnmowerman kent state university hksuvxbkentedu look future advicequotessayingsjibberishphilosohy


# TF-IDF Vectorization

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Turn the cleaned documents into a TF-IDF matrix using the vectorizer's
# default settings. X is reused by every clustering and scoring cell below.
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(processed_data)
# Sanity check of the matrix dimensions (rows x columns).
print(X.shape)


(2797, 41964)


# NMF Clustering

In [None]:
from sklearn.decomposition import NMF
import numpy as np
import pandas as pd

# Factorise the TF-IDF matrix with 5 components, matching the five newsgroup
# categories loaded earlier. random_state is fixed for reproducibility.
nmf = NMF(n_components=5, random_state=42)
# W is the per-document output of the factorisation.
W = nmf.fit_transform(X)
# H holds the fitted components. It is not used in the cells visible here.
H = nmf.components_

# Hard-assign each document to the component where its weight in W is largest.
# These cluster ids are arbitrary and are not aligned with data.target indices.
nmf_labels = np.argmax(W, axis=1)

# Map the integer ground-truth targets to readable newsgroup names.
target_names = data.target_names
true_categories = [target_names[label] for label in data.target]

# Side-by-side view of predicted cluster vs. true newsgroup for each document.
nmf_df = pd.DataFrame({
    'Document Index': np.arange(len(nmf_labels)),
    'Predicted Cluster (NMF)': nmf_labels,
    'True Category': true_categories
})

print(nmf_df.head(20))


    Document Index  Predicted Cluster (NMF)       True Category
0                0                        3  talk.politics.guns
1                1                        2       comp.graphics
2                2                        2           sci.space
3                3                        2           sci.space
4                4                        3  talk.politics.guns
5                5                        3           sci.space
6                6                        4             sci.med
7                7                        3  talk.politics.guns
8                8                        3  talk.politics.guns
9                9                        3             sci.med
10              10                        2       comp.graphics
11              11                        2           sci.space
12              12                        2       comp.graphics
13              13                        2           sci.space
14              14                      

# Manual KMeans Clustering

In [None]:
def kmeans_custom(X, n_clusters, max_iter=100, random_state=42):
    """Cluster the rows of ``X`` with Lloyd's k-means (Euclidean distance).

    Initial centroids are ``n_clusters`` distinct rows of ``X`` drawn with a
    private random generator, so the global NumPy random state is left
    untouched. For a given ``random_state`` the rows drawn are the same as
    with the previous ``np.random.seed``-based implementation.

    Args:
        X: 2-D array-like of shape (n_samples, n_features). Objects exposing
            ``toarray()`` (e.g. SciPy sparse matrices) are densified first.
        n_clusters: Number of clusters; must not exceed the number of rows.
        max_iter: Maximum number of assign/update rounds.
        random_state: Seed for the centroid initialisation.

    Returns:
        1-D integer array of length n_samples with values in
        ``[0, n_clusters)`` giving each row's cluster.

    Raises:
        ValueError: If ``n_clusters`` is larger than the number of rows
            (raised by the sampling without replacement).
    """
    if hasattr(X, 'toarray'):
        X = X.toarray()
    X = np.asarray(X)
    if not np.issubdtype(X.dtype, np.floating):
        # Centroids are means, so integer input must be promoted or the
        # updated centroids would be truncated.
        X = X.astype(float)

    # A private RandomState yields the same draw as np.random.seed(...) +
    # np.random.choice(...) without mutating the global generator.
    rng = np.random.RandomState(random_state)
    initial_idx = rng.choice(X.shape[0], n_clusters, replace=False)
    centroids = X[initial_idx]

    def assign(current_centroids):
        # One (n_samples,) distance vector per centroid. This avoids building
        # the (n_samples, n_clusters, n_features) array that broadcasting
        # X[:, None] - centroids would create.
        distances = np.stack(
            [np.linalg.norm(X - centre, axis=1) for centre in current_centroids],
            axis=1,
        )
        return np.argmin(distances, axis=1)

    labels = None
    for _ in range(max_iter):
        labels = assign(centroids)

        # Start from the previous centroids so that a cluster which lost all
        # of its members keeps its position instead of becoming NaN (a NaN
        # centroid would win every subsequent argmin and collapse the result).
        new_centroids = centroids.copy()
        for i in range(n_clusters):
            members = X[labels == i]
            if members.shape[0] > 0:
                new_centroids[i] = members.mean(axis=0)

        if np.allclose(centroids, new_centroids, atol=1e-4):
            break
        centroids = new_centroids

    if labels is None:
        # max_iter <= 0: no round ran, so fall back to the initial assignment.
        labels = assign(centroids)
    return labels

# kmeans_custom works on a dense NumPy array, so densify the sparse TF-IDF
# matrix first. NOTE(review): at the shape printed by the TF-IDF cell this
# dense copy is large (rows x vocabulary) — confirm memory headroom.
X_array = X.toarray()
custom_kmeans_labels = kmeans_custom(X_array, n_clusters=5)

# Side-by-side view of predicted cluster vs. true newsgroup for each document.
custom_kmeans_df = pd.DataFrame({
    'Document Index': np.arange(len(custom_kmeans_labels)),
    'Predicted Cluster (Custom KMeans)': custom_kmeans_labels,
    'True Category': true_categories
})

print(custom_kmeans_df.head(20))


    Document Index  Predicted Cluster (Custom KMeans)       True Category
0                0                                  1  talk.politics.guns
1                1                                  0       comp.graphics
2                2                                  0           sci.space
3                3                                  0           sci.space
4                4                                  1  talk.politics.guns
5                5                                  0           sci.space
6                6                                  1             sci.med
7                7                                  1  talk.politics.guns
8                8                                  1  talk.politics.guns
9                9                                  0             sci.med
10              10                                  0       comp.graphics
11              11                                  0           sci.space
12              12                    

# Scikit-Learn KMeans Clustering

In [None]:
from sklearn.cluster import KMeans

# Reference implementation: scikit-learn's KMeans fitted directly on the
# sparse TF-IDF matrix, with the same cluster count and seed as the custom run.
# NOTE(review): n_init is left at the library default, which has changed
# between scikit-learn releases — pin it if results must match across versions.
kmeans = KMeans(n_clusters=5, random_state=42)
kmeans_labels = kmeans.fit_predict(X)

# Side-by-side view of predicted cluster vs. true newsgroup for each document.
kmeans_df = pd.DataFrame({
    'Document Index': np.arange(len(kmeans_labels)),
    'Predicted Cluster (Sklearn KMeans)': kmeans_labels,
    'True Category': true_categories
})

print(kmeans_df.head(20))


    Document Index  Predicted Cluster (Sklearn KMeans)       True Category
0                0                                   3  talk.politics.guns
1                1                                   2       comp.graphics
2                2                                   1           sci.space
3                3                                   1           sci.space
4                4                                   3  talk.politics.guns
5                5                                   1           sci.space
6                6                                   0             sci.med
7                7                                   3  talk.politics.guns
8                8                                   3  talk.politics.guns
9                9                                   1             sci.med
10              10                                   2       comp.graphics
11              11                                   1           sci.space
12              12       

# Final Comparison of All Methods

In [None]:
from sklearn.metrics import adjusted_rand_score, adjusted_mutual_info_score, silhouette_score
import pandas as pd

def _clustering_scores(labels):
    """Score one label assignment against the notebook's ground truth.

    Returns a tuple ``(ARI, AMI, silhouette)``: ARI and AMI compare ``labels``
    with ``data.target``; the silhouette is computed on the TF-IDF matrix ``X``
    with ``silhouette_score``'s default settings.
    """
    return (
        adjusted_rand_score(data.target, labels),
        adjusted_mutual_info_score(data.target, labels),
        silhouette_score(X, labels),
    )


# Same three metrics for each method; the individual names are kept so that
# any other cell referring to them keeps working.
ari_nmf, ami_nmf, silhouette_nmf = _clustering_scores(nmf_labels)
ari_custom, ami_custom, silhouette_custom = _clustering_scores(custom_kmeans_labels)
ari_kmeans, ami_kmeans, silhouette_kmeans = _clustering_scores(kmeans_labels)

# One row per method, one column per metric.
results = pd.DataFrame({
    'Method': ['NMF', 'Custom KMeans', 'Sklearn KMeans'],
    'ARI': [ari_nmf, ari_custom, ari_kmeans],
    'AMI': [ami_nmf, ami_custom, ami_kmeans],
    'Silhouette': [silhouette_nmf, silhouette_custom, silhouette_kmeans],
})

print(results)


           Method       ARI       AMI  Silhouette
0             NMF  0.401679  0.556622    0.006952
1   Custom KMeans  0.335531  0.463245    0.006565
2  Sklearn KMeans  0.483305  0.604758    0.006843
