In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.datasets import fetch_20newsgroups

In [None]:
# Load three 20 Newsgroups categories from the training split, with headers,
# footers and quoted replies removed from each post.
categories = ['sci.space', 'comp.graphics', 'rec.sport.baseball']
newsgroups = fetch_20newsgroups(subset='train', categories=categories,
                              remove=('headers', 'footers', 'quotes'))

# Number of documents to compare (tune here).
N_DOCS = 10

# Removing headers/footers/quotes can leave a post with no text at all. A blank
# post would become an all-zero TF-IDF row and show 0 similarity to every
# document, so keep the first N_DOCS posts that still have content.
docs = [text for text in newsgroups.data if text.strip()][:N_DOCS]

In [None]:
# Build the TF-IDF representation of the documents: English stop words are
# dropped and the vocabulary is capped at 1000 features.
vectorizer = TfidfVectorizer(
    max_features=1000,
    stop_words='english',
)
tfidf_matrix = vectorizer.fit_transform(docs)

In [None]:
# Cosine similarity of the TF-IDF matrix against itself: entry (i, j) is the
# score between document i and document j.
cos_sim = cosine_similarity(X=tfidf_matrix, Y=tfidf_matrix)

# Label rows and columns "Doc 1" ... "Doc N" and wrap the scores in a DataFrame
doc_names = [f"Doc {number}" for number in range(1, len(docs) + 1)]
similarity_df = pd.DataFrame(data=cos_sim, index=doc_names, columns=doc_names)

In [None]:
# Heatmap of the document-by-document similarity scores; every cell is
# annotated with its value formatted to two decimal places.
fig, ax = plt.subplots(figsize=(8, 6))
sns.heatmap(similarity_df, ax=ax, cmap='coolwarm', annot=True, fmt='.2f')
ax.set_title("Document Similarity Heatmap")
plt.show()

In [None]:
# Grouped bar plot of document similarities.
# Each x tick is a document; within a tick there is one bar per reference
# document (the legend entries), showing cos_sim[reference, tick].
# Drawing every series at the same x positions would stack the bars on top of
# each other, so each series is shifted by its own offset inside the group.
n_series = len(doc_names)
group_positions = np.arange(n_series)
bar_width = 0.8 / max(n_series, 1)  # groups span 80% of the tick spacing

fig, ax = plt.subplots(figsize=(12, 6))
for i, name in enumerate(doc_names):
    # Centre the group of bars on the tick
    offset = (i - (n_series - 1) / 2) * bar_width
    ax.bar(group_positions + offset, cos_sim[i], width=bar_width, label=name)

ax.set_xticks(group_positions)
ax.set_xticklabels(doc_names)
ax.set_xlabel("Compared Document")
ax.set_ylabel("Cosine Similarity")
ax.set_title("Document Similarities by Reference Document")
ax.legend(title="Reference Document", ncol=2)
plt.show()

In [None]:
# Pairwise Similarity Distribution
# cos_sim is symmetric and its diagonal holds each document's similarity to
# itself, so flattening the whole matrix would count every pair twice and mix
# in the self-scores. Keep only the entries strictly above the diagonal, which
# gives exactly one score per distinct pair of documents.
upper_rows, upper_cols = np.triu_indices_from(cos_sim, k=1)
pairwise_scores = cos_sim[upper_rows, upper_cols]

fig, ax = plt.subplots(figsize=(8, 6))
sns.histplot(pairwise_scores, bins=10, kde=True, color='purple', ax=ax)
ax.set_xlabel("Similarity Score")
ax.set_ylabel("Frequency")
ax.set_title("Distribution of Document Similarities")
plt.show()

In [None]:
# Print the labelled similarity matrix as plain text beneath a heading
print("Cosine Similarity Matrix:", similarity_df, sep="\n")