### Task 3

In [5]:
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
from sklearn.metrics import confusion_matrix
import numpy as np

In [6]:
# Fetch both official splits of 20 Newsgroups (train first, then test).
newsgroups_train, newsgroups_test = (
    fetch_20newsgroups(subset=part) for part in ("train", "test")
)

# Merge the two splits into one corpus; documents and their true labels are
# joined in the same order (train followed by test) so they stay aligned.
data = [*newsgroups_train.data, *newsgroups_test.data]
labels = np.concatenate((newsgroups_train.target, newsgroups_test.target))

# Turn the raw text into a TF-IDF matrix, dropping English stop words.
vectorizer = TfidfVectorizer(stop_words='english')
X = vectorizer.fit_transform(data)

# Partition the documents into 20 clusters with K-Means.
# random_state is fixed so repeated runs give the same assignment.
num_clusters = 20
kmeans = KMeans(
    n_clusters=num_clusters,
    random_state=42,
    n_init=10,
)
cluster_labels = kmeans.fit_predict(X)

# Cross-tabulate true classes (rows) against assigned clusters (columns).
conf_matrix = confusion_matrix(y_true=labels, y_pred=cluster_labels)
print("True class vs Cluster distribution:", conf_matrix, sep="\n")

True class vs Cluster distribution:
[[136  44   0 155   0   1   4 227  14   0   1   0   0   0  27   0   0  18
   38 134]
 [  1 600   0   0  82   0   0  18   2   2  17   0   0   1   7   1   0  14
    0 228]
 [  0 207   0   0 554   0   0  21   6   2   2   9   0   5  16   6   0   8
    0 149]
 [  0 479   0   0  93   0   0   9   6   3   9 191   0   2  31   1   2   4
    0 152]
 [  0 450   0   0  12   0   0  15  20  14  11  70   0   1  26   1   0   9
    0 334]
 [  0 650   0   0 139   3   0   2  20   2  28   0   0   1   4   2   0   0
    0 137]
 [  0 331   0   0  21   0   0  27  14  32   3  42   0   5  14  13  10  10
    0 453]
 [  0 149   0   0   2   0   1 499  19  31  10   0   0   0  22   1   0  41
    0 215]
 [  0 146   0   0   0   0   0 633  11   6  18   0   0   1   9   0   0   2
    0 170]
 [  0  79   0   0   0   0   0 141   8   7   3   0   0   1   3  14 366  19
    0 353]
 [  0  73   0   0   0   0   0  20  54   6   4   0   0   0   6  19 715   2
    0 100]
 [  0 146   0   0  10 538   1

In [7]:
# Score the original clusters: each cluster (a column of the confusion matrix)
# is credited with the count of its largest true class, and the credited
# counts are summed and divided by the total number of documents.
correctly_clustered = conf_matrix.max(axis=0)
accuracy = correctly_clustered.sum() / len(data)
print(f"\nAccuracy: {accuracy * 100:.2f}%")


Accuracy: 32.29%


### Summary:
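The K-Means clustering algorithm (20 clusters on TF-IDF features) applied to the 20 Newsgroups dataset resulted in an accuracy of approximately 32.29%. Here, accuracy is computed by treating the most frequent true class within each cluster as that cluster's correct label and counting the documents that match it. The confusion matrix shows the distribution of true class labels (rows) versus assigned cluster labels (columns).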