In [None]:
from google.colab import files

uploaded = files.upload()

# Echo the name and the size in bytes of every file returned by the upload dialog.
for fn, contents in uploaded.items():
  report = 'User uploaded file "{name}" with length {length} bytes'.format(
      name=fn, length=len(contents))
  print(report)

Saving News_Category_Dataset_v3.json to News_Category_Dataset_v3.json
User uploaded file "News_Category_Dataset_v3.json" with length 87295572 bytes


In [None]:
# This script contains the necessary commands to install all required
# libraries for the Semantic News Clustering project in a Google Colab environment.

# 1. Install core Sentence-BERT (SBERT) library
!pip install sentence-transformers

# 2. Install NetworkX and the community detection algorithm (python-louvain)
# Note: 'python-louvain' is the package name for the 'community' import.
!pip install networkx python-louvain

# 3. Install UMAP (Uniform Manifold Approximation and Projection) for dimensionality reduction
# Note: 'umap-learn' is the package name.
!pip install umap-learn

# 4. Install Plotly for interactive visualizations
!pip install plotly

# 5. Install the standard data science libraries (often pre-installed in Colab, but included for completeness)
!pip install pandas numpy scikit-learn

print("Installation complete. You should now be able to run all steps of the Semantic News Clustering script.")


Installation complete. You should now be able to run all steps of the Semantic News Clustering script.


In [None]:
# Semantic News Clustering using Graph + Sentence-BERT (Hybrid Approach)
# Author: Your Group
# Libraries required: pandas, numpy, scikit-learn, networkx, plotly, umap-learn, sentence-transformers, community
# NOTE: The libraries networkx, plotly, umap, and sentence-transformers were unavailable during execution,
# so the clustering and visualization steps will result in errors if run in an environment without them.
# The code below is provided as the FINAL INTENDED SCRIPT.

import re
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score

# --- Import place holders for unavailable libraries ---
# Every optional dependency is imported inside try/except. When one is missing,
# a warning is printed and a minimal stand-in with the same name is defined so
# that the import section itself does not raise. The stand-ins only mimic the
# calls noted on each of them; they are not functional replacements.
try:
    from sentence_transformers import SentenceTransformer
except ImportError:
    print("Warning: 'sentence_transformers' not found. Sentence-BERT steps will fail.")
    class SentenceTransformer:
        """Stand-in encoder: encode() returns an all-zero (len(articles), 128) array."""
        def __init__(self, model_name): pass
        def encode(self, articles, show_progress_bar=False): return np.zeros((len(articles), 128))

try:
    import networkx as nx
except ImportError:
    print("Warning: 'networkx' not found. Graph steps will fail.")
    class nx:
        # Graph is plain `object`, so nx.Graph() has none of the graph methods.
        Graph = object
        connected_components = lambda g: [[1]]
        # `seed` is optional so the call works with or without an explicit seed.
        spring_layout = lambda g, k, iterations, seed=None: {}
        best_partition = lambda g: {}

try:
    # 'python-louvain' installs a module named 'community'; Step 5 calls it
    # through the alias 'community_louvain', so the alias must exist here.
    import community as community_louvain
except ImportError:
    print("Warning: 'python-louvain' or 'community' not found. Louvain steps will fail.")
    class community_louvain:
        """Stand-in: best_partition() puts every node of the graph in community 0."""
        @staticmethod
        def best_partition(g, random_state=None):
            # Objects without number_of_nodes() (e.g. the nx stand-in) yield no labels.
            if hasattr(g, 'number_of_nodes'):
                return {i: 0 for i in range(g.number_of_nodes())}
            return {}

try:
    import umap
except ImportError:
    print("Warning: 'umap-learn' not found. UMAP steps will fail.")
    class umap:
        class UMAP:
            """Stand-in reducer: fit_transform() returns an all-zero (n_rows, 2) array."""
            # A nested class (not a function taking `self`) so that
            # umap.UMAP(n_neighbors=..., min_dist=..., random_state=...) is callable.
            def __init__(self, n_neighbors=15, min_dist=0.1, random_state=None): pass
            def fit_transform(self, data): return np.zeros((data.shape[0], 2))

try:
    import plotly.graph_objs as go
except ImportError:
    print("Warning: 'plotly' not found. Visualization steps will fail.")
    class go:
        # Plain `object` placeholders: they carry no plotting behavior.
        Figure = object
        Scatter = object
# ---------------------------------------------------


# --- Text Preprocessing Function ---
def preprocess_text(text):
    """Normalize raw text to lowercase words made of a-z and 0-9, separated by single spaces.

    Args:
        text: Value to clean. Non-string values are converted with str();
            None and float NaN are treated as missing text.

    Returns:
        The cleaned string, or "" for missing text.
    """
    # Missing values would otherwise become the literal words "none" / "nan".
    if text is None or (isinstance(text, float) and text != text):
        return ""
    text = str(text).lower()
    # Hyphens, underscores, slashes and dashes join separate words; turn them into
    # spaces first so "omicron-targeted" stays two tokens instead of "omicrontargeted".
    text = re.sub(r'[-_/\u2013\u2014]+', ' ', text)
    # Remove the remaining special characters, keeping whitespace
    text = re.sub(r'[^a-z0-9\s]', '', text)
    # Collapse runs of whitespace (including newlines and tabs) into single spaces.
    return ' '.join(text.split())


# -------------------------------------------------
# Step 1: Load Dataset, Preprocessing, and Subset
# -------------------------------------------------
# The file is read as JSON Lines (one JSON record per line).
data = pd.read_json("News_Category_Dataset_v3.json", lines=True)

# Re-introduce subset_size = 5000 for efficient execution
subset_size = 5000
data_subset = data.head(subset_size).copy()

# Combine headline and short_description and apply cleaning.
# The concatenation is column-wise and missing values are replaced by '' first,
# so a record lacking either field no longer raises a TypeError.
data_subset['combined_text'] = (
    data_subset['headline'].fillna('')
    + ' '
    + data_subset['short_description'].fillna('')
).map(preprocess_text)

articles = data_subset['combined_text'].tolist()
# Whitespace tokens; used by the word-level one-hot encoding in Step 2.
tokenized_articles = [article.split() for article in articles]

# Print unique categories
print("--- Unique Categories in Dataset Subset ---")
unique_categories = data_subset['category'].unique()
print(unique_categories)
print(f"Total unique categories: {len(unique_categories)}\n")

# Print 5 preprocessed articles
print("--- 5 Preprocessed Articles ---")
for i, article in enumerate(articles[:5]):
    print(f"Article {i+1}: {article}")
print("\n")


# -------------------------------------------------
# Step 2: Classical Vectorization
# -------------------------------------------------
# Four classical representations are built from the same articles; for each one
# the overall shape and a small 5x5 sample are printed.
print("--- Step 2: Classical Vectorization Results ---")
MAX_FEATURES_LIMIT = 5000  # vocabulary cap shared by the scikit-learn vectorizers

# i. BOW (binary): 1 when a term occurs in an article, regardless of how often
vectorizer_bow_binary = CountVectorizer(max_features=MAX_FEATURES_LIMIT, binary=True)
X_bow_binary = vectorizer_bow_binary.fit_transform(articles)
feature_names = vectorizer_bow_binary.get_feature_names_out()
binary_sample = X_bow_binary[:5, :5].toarray()
print(f"i. BOW (binary) shape: {X_bow_binary.shape}")
print("   Sample Features:", feature_names[:5])
print("   Sample Matrix:\n", binary_sample)


# ii. BOW (frequency): raw term counts
vectorizer_bow_freq = CountVectorizer(max_features=MAX_FEATURES_LIMIT, binary=False)
X_bow_freq = vectorizer_bow_freq.fit_transform(articles)
freq_features = vectorizer_bow_freq.get_feature_names_out()
freq_sample = X_bow_freq[:5, :5].toarray()
print(f"\nii. BOW (frequency) shape: {X_bow_freq.shape}")
print("   Sample Features:", freq_features[:5])
print("   Sample Matrix:\n", freq_sample)


# iii. One-Hot Encoding (word level) - Full vocabulary
# The vocabulary covers every article, but vectors are only materialized for
# the first five articles (the rows that are printed).
vocab = sorted({token for tokens in tokenized_articles for token in tokens})
first_five_token_sets = [set(tokens) for tokens in tokenized_articles[:5]]
one_hot_vectors = [
    [int(term in token_set) for term in vocab]
    for token_set in first_five_token_sets
]
one_hot_array = np.array(one_hot_vectors)

# Print using features from BOW for consistent sampling
vocab_position = {term: position for position, term in enumerate(vocab)}
feature_indices_to_print = [
    vocab_position[name] for name in feature_names[:5] if name in vocab_position
]
sample_one_hot = one_hot_array[:, feature_indices_to_print]

# The printed shape is the conceptual (articles x vocabulary) size, not the
# shape of one_hot_array, which only holds the first five rows.
print(f"\niii. One-Hot Encoding shape: ({len(articles)} x {len(vocab)})")
print("   Sample Features:", [vocab[i] for i in feature_indices_to_print])
print("   Sample Matrix:\n", sample_one_hot)


# iv. TF-IDF
tfidf = TfidfVectorizer(max_features=MAX_FEATURES_LIMIT)
X_tfidf = tfidf.fit_transform(articles)
tfidf_features = tfidf.get_feature_names_out()
tfidf_sample = X_tfidf[:5, :5].toarray()
print(f"\niv. TF-IDF shape: {X_tfidf.shape}")
print("   Sample Features:", tfidf_features[:5])
print("   Sample Matrix:\n", tfidf_sample)
print("\n")

# -------------------------------------------------
# Step 3: Semantic Embeddings (Sentence-BERT)
# -------------------------------------------------
print("--- Step 3: Generating Sentence-BERT Embeddings ---")
model = SentenceTransformer('all-MiniLM-L6-v2')
# One embedding row per article.
embeddings = model.encode(articles, show_progress_bar=True)
print(f"Embeddings shape: {embeddings.shape}")


# -------------------------------------------------
# Step 4: Build Similarity Graph
# -------------------------------------------------
print("--- Step 4: Building Similarity Graph ---")
# Pairwise cosine similarity between all articles (an n x n matrix).
sim_matrix = cosine_similarity(embeddings)
threshold = 0.7  # only pairs strictly above this similarity are connected
G = nx.Graph()
# Register every article first so that articles without any edge are still nodes.
G.add_nodes_from((i, {"label": text}) for i, text in enumerate(articles))
# Pick all pairs (i < j) above the threshold in one vectorized pass over the
# strict upper triangle instead of a Python loop over every pair.
edge_rows, edge_cols = np.nonzero(np.triu(sim_matrix > threshold, k=1))
G.add_weighted_edges_from(
    (int(i), int(j), sim_matrix[i, j]) for i, j in zip(edge_rows, edge_cols)
)
print(f"Graph created with {G.number_of_nodes()} nodes and {G.number_of_edges()} edges.")


# -------------------------------------------------
# Step 5: Graph-Based Clustering (Louvain)
# -------------------------------------------------
print("--- Step 5: Graph-Based Clustering (Louvain) ---")
# Louvain is randomized; a fixed random_state keeps the communities (and the
# cluster count handed to K-Means below) the same between runs, matching the
# seeds already used for K-Means and UMAP.
partition = community_louvain.best_partition(G, random_state=42)
# partition maps node id -> community id; list the labels in article order.
graph_labels = [partition[i] for i in range(len(articles))]
num_clusters_louvain = len(set(graph_labels))
print(f"Louvain found {num_clusters_louvain} clusters.")


# -------------------------------------------------
# Step 6: Traditional Clustering (K-Means)
# -------------------------------------------------
print("--- Step 6: Traditional Clustering (K-Means) ---")
# K-Means is given the same number of clusters that Louvain found.
# NOTE(review): articles without any edge each form their own community, so a
# sparse graph makes this count close to the number of articles - confirm that
# this is the intended k.
num_clusters = num_clusters_louvain
kmeans = KMeans(n_clusters=num_clusters, random_state=42, n_init=10)
kmeans_labels = kmeans.fit_predict(embeddings)
print(f"K-Means ran with {num_clusters} clusters.")


# -------------------------------------------------
# Step 7: Dimensionality Reduction for Visualization
# -------------------------------------------------
print("--- Step 7: Dimensionality Reduction (UMAP) ---")
# Project the embeddings to two dimensions for plotting.
reducer = umap.UMAP(random_state=42, n_neighbors=15, min_dist=0.1)
embedding_2d = reducer.fit_transform(embeddings)
print(f"UMAP 2D embedding shape: {embedding_2d.shape}")


# -------------------------------------------------
# Step 8: Plot Interactive Graph-Based Clusters
# -------------------------------------------------
# One marker per article, colored by its Louvain community; hovering shows the text.
cluster_scatter_kwargs = dict(
    x=embedding_2d[:, 0],
    y=embedding_2d[:, 1],
    mode='markers',
    marker=dict(color=graph_labels, colorscale='Viridis', size=8),
    text=articles,
    hoverinfo='text',
)
fig = go.Figure()
fig.add_trace(go.Scatter(**cluster_scatter_kwargs))
fig.update_layout(title="Semantic News Clusters (Graph-Based)")
fig.show()


# -------------------------------------------------
# Step 9: Evaluate Clustering
# -------------------------------------------------
print("\n--- Step 9: Evaluate Clustering ---")
# Both label sets are scored against the same embeddings.
silhouette_graph = silhouette_score(embeddings, graph_labels)
silhouette_kmeans = silhouette_score(embeddings, kmeans_labels)

print(f"Silhouette Score - Graph-Based (Louvain): {silhouette_graph:.3f}")
print(f"Silhouette Score - K-Means: {silhouette_kmeans:.3f}")


# -------------------------------------------------
# Step 10: Optional: Visualize Network Graph (Interactive)
# -------------------------------------------------
# Draws the similarity graph G with a force-directed layout and colors every
# node by its Louvain community. This code is live (not commented out) and
# requires networkx and plotly to be installed.

# Fixed seed so the layout is identical between runs.
pos = nx.spring_layout(G, k=0.15, iterations=20, seed=42)

# Edge trace: one line segment per edge; the None entries break the line
# between consecutive segments.
edge_x = []
edge_y = []
for edge in G.edges():
    x0, y0 = pos[edge[0]]
    x1, y1 = pos[edge[1]]
    edge_x += [x0, x1, None]
    edge_y += [y0, y1, None]

edge_trace = go.Scatter(
    x=edge_x, y=edge_y,
    line=dict(width=0.5, color='#888'),
    hoverinfo='none',
    mode='lines')

# Node trace: position and community id of every node, in G.nodes() order.
node_x = []
node_y = []
node_color = []
for node in G.nodes():
    x, y = pos[node]
    node_x.append(x)
    node_y.append(y)
    node_color.append(partition[node])

node_trace = go.Scatter(
    x=node_x, y=node_y,
    mode='markers',
    marker=dict(size=8, color=node_color, colorscale='Viridis'),
    # Hover text is the 'label' attribute stored on each node.
    text=[G.nodes[n]['label'] for n in G.nodes()],
    hoverinfo='text'
)

fig2 = go.Figure(data=[edge_trace, node_trace])
fig2.update_layout(title='Interactive Graph Visualization of News Clusters', showlegend=False)
fig2.show()

--- Unique Categories in Dataset Subset ---
['U.S. NEWS' 'COMEDY' 'PARENTING' 'WORLD NEWS' 'CULTURE & ARTS' 'TECH'
 'SPORTS' 'ENTERTAINMENT' 'POLITICS' 'WEIRD NEWS' 'ENVIRONMENT'
 'EDUCATION' 'CRIME' 'SCIENCE' 'WELLNESS' 'BUSINESS' 'STYLE & BEAUTY'
 'FOOD & DRINK' 'MEDIA' 'QUEER VOICES' 'HOME & LIVING' 'WOMEN'
 'BLACK VOICES' 'TRAVEL' 'MONEY' 'RELIGION' 'LATINO VOICES' 'IMPACT']
Total unique categories: 28

--- 5 Preprocessed Articles ---
Article 1: over 4 million americans roll up sleeves for omicrontargeted covid boosters health experts said it is too early to predict whether demand would match up with the 171 million doses of the new boosters the us ordered for the fall
Article 2: american airlines flyer charged banned for life after punching flight attendant on video he was subdued by passengers and crew when he fled to the back of the aircraft after the confrontation according to the us attorneys office in los angeles
Article 3: 23 of the funniest tweets about cats and dogs this w

Batches:   0%|          | 0/157 [00:00<?, ?it/s]

Embeddings shape: (5000, 384)
--- Step 4: Building Similarity Graph ---
Graph created with 5000 nodes and 582 edges.
--- Step 5: Graph-Based Clustering (Louvain) ---
Louvain found 4643 clusters.
--- Step 6: Traditional Clustering (K-Means) ---
K-Means ran with 4643 clusters.
--- Step 7: Dimensionality Reduction (UMAP) ---



n_jobs value 1 overridden to 1 by setting random_state. Use no seed for parallelism.



UMAP 2D embedding shape: (5000, 2)



--- Step 9: Evaluate Clustering ---
Silhouette Score - Graph-Based (Louvain): 0.015
Silhouette Score - K-Means: 0.018


In [None]:
# Semantic News Clustering using Graph + Sentence-BERT (Hybrid Approach)
# Author: Your Group
# Libraries required: pandas, numpy, scikit-learn, networkx, plotly, umap-learn, sentence-transformers, community
# NOTE: The libraries networkx, plotly, umap, and sentence-transformers were unavailable during execution,
# so the clustering and visualization steps will result in errors if run in an environment without them.
# The code below is provided as the FINAL INTENDED SCRIPT.

import re
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score

# --- Import place holders for unavailable libraries ---
# Every optional dependency is imported inside try/except. When one is missing,
# a warning is printed and a minimal stand-in with the same name is defined so
# that the import section itself does not raise. The stand-ins only mimic the
# calls noted on each of them; they are not functional replacements.
try:
    from sentence_transformers import SentenceTransformer
except ImportError:
    print("Warning: 'sentence_transformers' not found. Sentence-BERT steps will fail.")
    class SentenceTransformer:
        """Stand-in encoder: encode() returns an all-zero (len(articles), 128) array."""
        def __init__(self, model_name): pass
        def encode(self, articles, show_progress_bar=False): return np.zeros((len(articles), 128))

try:
    import networkx as nx
except ImportError:
    print("Warning: 'networkx' not found. Graph steps will fail.")
    class nx:
        # Graph is plain `object`, so nx.Graph() has none of the graph methods.
        Graph = object
        connected_components = lambda g: [[1]]
        # `seed` is optional so the call works with or without an explicit seed.
        spring_layout = lambda g, k, iterations, seed=None: {}
        best_partition = lambda g: {}

try:
    # Need to import 'community' and alias it to 'community_louvain' for Step 5
    import community as community_louvain
except ImportError:
    print("Warning: 'python-louvain' or 'community' not found. Louvain steps will fail.")
    class community_louvain:
        """Stand-in: best_partition() puts every node of the graph in community 0."""
        @staticmethod
        def best_partition(g, random_state=None):
            # random_state is accepted (and ignored) like the optional keyword of
            # the real function. Objects without number_of_nodes() yield no labels.
            if hasattr(g, 'number_of_nodes'):
                return {i: 0 for i in range(g.number_of_nodes())}
            return {}

try:
    import umap
except ImportError:
    print("Warning: 'umap-learn' not found. UMAP steps will fail.")
    class umap:
        class UMAP:
            """Stand-in reducer: fit_transform() returns an all-zero (n_rows, 2) array."""
            # A nested class (not a function taking `self`) so that
            # umap.UMAP(n_neighbors=..., min_dist=..., random_state=...) is callable.
            def __init__(self, n_neighbors=15, min_dist=0.1, random_state=None): pass
            def fit_transform(self, data): return np.zeros((data.shape[0], 2))

try:
    import plotly.graph_objs as go
except ImportError:
    print("Warning: 'plotly' not found. Visualization steps will fail.")
    class go:
        # Plain `object` placeholders: they carry no plotting behavior.
        Figure = object
        Scatter = object
# ---------------------------------------------------


# --- Text Preprocessing Function ---
def preprocess_text(text):
    """Normalize raw text to lowercase words made of a-z and 0-9, separated by single spaces.

    Args:
        text: Value to clean. Non-string values are converted with str();
            None and float NaN are treated as missing text.

    Returns:
        The cleaned string, or "" for missing text.
    """
    # Missing values would otherwise become the literal words "none" / "nan".
    if text is None or (isinstance(text, float) and text != text):
        return ""
    text = str(text).lower()
    # Hyphens, underscores, slashes and dashes join separate words; turn them into
    # spaces first so "omicron-targeted" stays two tokens instead of "omicrontargeted".
    text = re.sub(r'[-_/\u2013\u2014]+', ' ', text)
    # Remove the remaining special characters, keeping whitespace
    text = re.sub(r'[^a-z0-9\s]', '', text)
    # Collapse runs of whitespace (including newlines and tabs) into single spaces.
    return ' '.join(text.split())


# -------------------------------------------------
# Step 1: Load Dataset, Preprocessing, and Subset
# -------------------------------------------------
# NOTE: Assumes 'News_Category_Dataset_v3.json' is available in the execution environment
try:
    # The file is read as JSON Lines (one JSON record per line).
    data = pd.read_json("News_Category_Dataset_v3.json", lines=True)
except FileNotFoundError:
    # Fall back to a five-row frame with the same three columns so that the
    # remaining steps can still be exercised; results are then not meaningful.
    print("FATAL ERROR: 'News_Category_Dataset_v3.json' not found. Creating dummy data.")
    data = pd.DataFrame({
        'headline': ['dummy news 1', 'another dummy headline 2', 'a third article 3', 'just one more story 4', 'last test article 5'],
        'short_description': ['description 1', 'desc 2', 'desc 3', 'desc 4', 'desc 5'],
        'category': ['TECH', 'POLITICS', 'TECH', 'SPORTS', 'POLITICS']
    })


# Re-introduce subset_size = 5000 for efficient execution
subset_size = 5000
data_subset = data.head(subset_size).copy()

# Combine headline and short_description and apply cleaning.
# The concatenation is column-wise and missing values are replaced by '' first,
# so a record lacking either field no longer raises a TypeError.
data_subset['combined_text'] = (
    data_subset['headline'].fillna('')
    + ' '
    + data_subset['short_description'].fillna('')
).map(preprocess_text)

articles = data_subset['combined_text'].tolist()
# Whitespace tokens; used by the word-level one-hot encoding in Step 2.
tokenized_articles = [article.split() for article in articles]

# Print unique categories
print("--- Unique Categories in Dataset Subset ---")
unique_categories = data_subset['category'].unique()
print(unique_categories)
print(f"Total unique categories: {len(unique_categories)}\n")

# Print 5 preprocessed articles
print("--- 5 Preprocessed Articles ---")
for i, article in enumerate(articles[:5]):
    print(f"Article {i+1}: {article}")
print("\n")


# -------------------------------------------------
# Step 2: Classical Vectorization
# -------------------------------------------------
# Four classical representations are built from the same articles; for each one
# the overall shape and a small 5x5 sample are printed.
print("--- Step 2: Classical Vectorization Results ---")
MAX_FEATURES_LIMIT = 5000  # vocabulary cap shared by the scikit-learn vectorizers

# i. BOW (binary): 1 when a term occurs in an article, regardless of how often
vectorizer_bow_binary = CountVectorizer(max_features=MAX_FEATURES_LIMIT, binary=True)
X_bow_binary = vectorizer_bow_binary.fit_transform(articles)
feature_names = vectorizer_bow_binary.get_feature_names_out()
binary_sample = X_bow_binary[:5, :5].toarray()
print(f"i. BOW (binary) shape: {X_bow_binary.shape}")
print("   Sample Features:", feature_names[:5])
print("   Sample Matrix:\n", binary_sample)


# ii. BOW (frequency): raw term counts
vectorizer_bow_freq = CountVectorizer(max_features=MAX_FEATURES_LIMIT, binary=False)
X_bow_freq = vectorizer_bow_freq.fit_transform(articles)
freq_features = vectorizer_bow_freq.get_feature_names_out()
freq_sample = X_bow_freq[:5, :5].toarray()
print(f"\nii. BOW (frequency) shape: {X_bow_freq.shape}")
print("   Sample Features:", freq_features[:5])
print("   Sample Matrix:\n", freq_sample)


# iii. One-Hot Encoding (word level) - Full vocabulary
# The vocabulary covers every article, but vectors are only materialized for
# the first five articles (the rows that are printed).
vocab = sorted({token for tokens in tokenized_articles for token in tokens})
first_five_token_sets = [set(tokens) for tokens in tokenized_articles[:5]]
one_hot_vectors = [
    [int(term in token_set) for term in vocab]
    for token_set in first_five_token_sets
]
one_hot_array = np.array(one_hot_vectors)

# Print using features from BOW for consistent sampling
vocab_position = {term: position for position, term in enumerate(vocab)}
feature_indices_to_print = [
    vocab_position[name] for name in feature_names[:5] if name in vocab_position
]
sample_one_hot = one_hot_array[:, feature_indices_to_print]

# The printed shape is the conceptual (articles x vocabulary) size, not the
# shape of one_hot_array, which only holds the first five rows.
print(f"\niii. One-Hot Encoding shape: ({len(articles)} x {len(vocab)})")
print("   Sample Features:", [vocab[i] for i in feature_indices_to_print])
print("   Sample Matrix:\n", sample_one_hot)


# iv. TF-IDF
tfidf = TfidfVectorizer(max_features=MAX_FEATURES_LIMIT)
X_tfidf = tfidf.fit_transform(articles)
tfidf_features = tfidf.get_feature_names_out()
tfidf_sample = X_tfidf[:5, :5].toarray()
print(f"\niv. TF-IDF shape: {X_tfidf.shape}")
print("   Sample Features:", tfidf_features[:5])
print("   Sample Matrix:\n", tfidf_sample)
print("\n")

# -------------------------------------------------
# Step 3: Semantic Embeddings (Sentence-BERT)
# -------------------------------------------------
print("--- Step 3: Generating Sentence-BERT Embeddings ---")
# The SentenceTransformer class is now a placeholder if the library is missing.
model = SentenceTransformer('all-MiniLM-L6-v2')
# One embedding row per article.
embeddings = model.encode(articles, show_progress_bar=True)
print(f"Embeddings shape: {embeddings.shape}")


# -------------------------------------------------
# Step 4: Build Similarity Graph
# -------------------------------------------------
print("--- Step 4: Building Similarity Graph ---")
# Note: For large datasets, computing the full cosine similarity matrix can be memory-intensive.
sim_matrix = cosine_similarity(embeddings)
threshold = 0.7 # Only connect nodes with similarity above this value
G = nx.Graph()
# Register every article first so that articles without any edge are still nodes.
G.add_nodes_from((i, {"label": text}) for i, text in enumerate(articles))
# Pick all pairs (i < j) above the threshold in one vectorized pass over the
# strict upper triangle instead of a Python loop over every pair.
edge_rows, edge_cols = np.nonzero(np.triu(sim_matrix > threshold, k=1))
G.add_weighted_edges_from(
    (int(i), int(j), sim_matrix[i, j]) for i, j in zip(edge_rows, edge_cols)
)
print(f"Graph created with {G.number_of_nodes()} nodes and {G.number_of_edges()} edges.")


# -------------------------------------------------
# Step 5: Graph-Based Clustering (Louvain)
# -------------------------------------------------
print("--- Step 5: Graph-Based Clustering (Louvain) ---")
# 'community_louvain' is the alias for the 'community' library
try:
    # Louvain is randomized; a fixed random_state keeps the communities (and the
    # cluster count handed to K-Means below) the same between runs, matching the
    # seeds used for K-Means, UMAP and the spring layout.
    partition = community_louvain.best_partition(G, random_state=42)
except (AttributeError, TypeError):
    # AttributeError: a placeholder object lacks the method that was called.
    # TypeError: a placeholder best_partition that does not accept random_state.
    print("Louvain failed (check networkx/community library). Using placeholder labels.")
    partition = {i: 0 for i in range(G.number_of_nodes())}

# partition maps node id -> community id; list the labels in article order.
graph_labels = [partition[i] for i in range(len(articles))]
num_clusters_louvain = len(set(graph_labels))
print(f"Louvain found {num_clusters_louvain} clusters.")


# -------------------------------------------------
# Step 6: Traditional Clustering (K-Means)
# -------------------------------------------------
print("--- Step 6: Traditional Clustering (K-Means) ---")
# Use the number of clusters found by Louvain for a fair comparison
num_clusters = num_clusters_louvain
# A single cluster, or one cluster per article, is a trivial result (Step 9
# skips scoring for exactly these cases), so both ends trigger the fallback.
if num_clusters < 2 or num_clusters >= len(articles):
    # Fallback for dummy data or trivial Louvain result
    num_clusters = max(2, min(len(articles) // 10, 10))
    print(f"Adjusted K-Means cluster count to {num_clusters} due to Louvain result.")

kmeans = KMeans(n_clusters=num_clusters, random_state=42, n_init=10)
kmeans_labels = kmeans.fit_predict(embeddings)
print(f"K-Means ran with {num_clusters} clusters.")


# -------------------------------------------------
# Step 7: Dimensionality Reduction for Visualization
# -------------------------------------------------
print("--- Step 7: Dimensionality Reduction (UMAP) ---")
# Project the embeddings to two dimensions for plotting.
# The UMAP placeholder returns zeros if the library is missing
reducer = umap.UMAP(random_state=42, n_neighbors=15, min_dist=0.1)
embedding_2d = reducer.fit_transform(embeddings)
print(f"UMAP 2D embedding shape: {embedding_2d.shape}")


# -------------------------------------------------
# Step 8: Plot Interactive Graph-Based Clusters
# -------------------------------------------------
print("--- Step 8: Plotting Graph-Based Clusters (Requires Plotly) ---")
try:
    # One marker per article, colored by its Louvain community; hovering shows the text.
    cluster_scatter_kwargs = dict(
        x=embedding_2d[:, 0],
        y=embedding_2d[:, 1],
        mode='markers',
        marker=dict(color=graph_labels, colorscale='Viridis', size=8),
        text=articles,
        hoverinfo='text',
    )
    fig = go.Figure()
    # The trace is constructed inside the add_trace call so that a placeholder
    # figure fails on the attribute lookup first, which the handler below expects.
    fig.add_trace(go.Scatter(**cluster_scatter_kwargs))
    fig.update_layout(title="Semantic News Clusters (Graph-Based)")
    fig.show()
except AttributeError:
    print("Plotly visualization skipped due to missing library or placeholder execution.")


# -------------------------------------------------
# Step 9: Evaluate Clustering
# -------------------------------------------------
print("\n--- Step 9: Evaluate Clustering ---")
# A labeling is scored only when it has more than one cluster and fewer
# clusters than articles; otherwise the score is the sentinel -999.
article_count = len(articles)

graph_cluster_count = len(set(graph_labels))
if graph_cluster_count <= 1 or graph_cluster_count >= article_count:
    silhouette_graph = -999 # Sentinel value
    print("Silhouette Score - Graph-Based (Louvain): N/A (Trivial clustering result)")
else:
    silhouette_graph = silhouette_score(embeddings, graph_labels)
    print(f"Silhouette Score - Graph-Based (Louvain): {silhouette_graph:.3f}")

kmeans_cluster_count = len(set(kmeans_labels))
if kmeans_cluster_count <= 1 or kmeans_cluster_count >= article_count:
    silhouette_kmeans = -999 # Sentinel value
    print("Silhouette Score - K-Means: N/A (Trivial clustering result)")
else:
    silhouette_kmeans = silhouette_score(embeddings, kmeans_labels)
    print(f"Silhouette Score - K-Means: {silhouette_kmeans:.3f}")


# -------------------------------------------------
# Step 10: Optional: Visualize Network Graph (Interactive)
# -------------------------------------------------
print("\n--- Step 10: Interactive Network Visualization (Requires NetworkX/Plotly) ---")
try:
    # Force-directed layout; the seed keeps node positions deterministic.
    pos = nx.spring_layout(G, k=0.15, iterations=20, seed=42)

    # Edge coordinates: each edge contributes its two endpoints followed by
    # None, which breaks the line before the next edge.
    edge_x, edge_y = [], []
    for source, target in G.edges():
        (x0, y0), (x1, y1) = pos[source], pos[target]
        edge_x.extend([x0, x1, None])
        edge_y.extend([y0, y1, None])

    edge_trace = go.Scatter(
        x=edge_x,
        y=edge_y,
        mode='lines',
        hoverinfo='none',
        line=dict(width=0.5, color='#888'),
    )

    # Node coordinates, community ids and hover labels, all in G.nodes() order.
    node_ids = list(G.nodes())
    node_x = [pos[n][0] for n in node_ids]
    node_y = [pos[n][1] for n in node_ids]
    node_color = [partition[n] for n in node_ids]
    node_labels = [G.nodes[n]['label'] for n in node_ids]

    node_trace = go.Scatter(
        x=node_x,
        y=node_y,
        mode='markers',
        marker=dict(size=8, color=node_color, colorscale='Viridis'),
        text=node_labels,
        hoverinfo='text',
    )

    fig2 = go.Figure(data=[edge_trace, node_trace])
    fig2.update_layout(title='Interactive Graph Visualization of News Clusters', showlegend=False)
    fig2.show()

except Exception as e:
    # Any failure in this optional visualization is reported by exception type
    # only, and the notebook continues.
    print(f"Interactive Network Visualization skipped or failed due to missing libraries or error: {type(e).__name__}.")

FATAL ERROR: 'News_Category_Dataset_v3.json' not found. Creating dummy data.
--- Unique Categories in Dataset Subset ---
['TECH' 'POLITICS' 'SPORTS']
Total unique categories: 3

--- 5 Preprocessed Articles ---
Article 1: dummy news 1 description 1
Article 2: another dummy headline 2 desc 2
Article 3: a third article 3 desc 3
Article 4: just one more story 4 desc 4
Article 5: last test article 5 desc 5


--- Step 2: Classical Vectorization Results ---
i. BOW (binary) shape: (5, 14)
   Sample Features: ['another' 'article' 'desc' 'description' 'dummy']
   Sample Matrix:
 [[0 0 0 1 1]
 [1 0 1 0 1]
 [0 1 1 0 0]
 [0 0 1 0 0]
 [0 1 1 0 0]]

ii. BOW (frequency) shape: (5, 14)
   Sample Features: ['another' 'article' 'desc' 'description' 'dummy']
   Sample Matrix:
 [[0 0 0 1 1]
 [1 0 1 0 1]
 [0 1 1 0 0]
 [0 0 1 0 0]
 [0 1 1 0 0]]

iii. One-Hot Encoding shape: (5 x 20)
   Sample Features: ['another', 'article', 'desc', 'description', 'dummy']
   Sample Matrix:
 [[0 0 0 1 1]
 [1 0 1 0 1]
 [0 1 1

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Embeddings shape: (5, 384)
--- Step 4: Building Similarity Graph ---
Graph created with 5 nodes and 0 edges.
--- Step 5: Graph-Based Clustering (Louvain) ---
Louvain found 5 clusters.
--- Step 6: Traditional Clustering (K-Means) ---
K-Means ran with 5 clusters.
--- Step 7: Dimensionality Reduction (UMAP) ---


  warn(
  warn(


UMAP 2D embedding shape: (5, 2)
--- Step 8: Plotting Graph-Based Clusters (Requires Plotly) ---



--- Step 9: Evaluate Clustering ---
Silhouette Score - Graph-Based (Louvain): N/A (Trivial clustering result)
Silhouette Score - K-Means: N/A (Trivial clustering result)

--- Step 10: Interactive Network Visualization (Requires NetworkX/Plotly) ---


In [None]:
import umap
import plotly.graph_objs as go
import numpy as np # Assuming embeddings is a numpy array
# Assuming 'embeddings' (your Sentence-BERT output) and 'graph_labels' (your Louvain cluster results) are already defined and are numpy arrays/lists.
# For demonstration purposes, let's create some dummy data if you don't have them yet:
# embeddings = np.random.rand(100, 128) # 100 articles, 128-dim embeddings
# graph_labels = np.random.randint(0, 5, 100) # 5 dummy clusters
# articles = [f"Article {i}" for i in range(100)] # Dummy article titles

# -------------------------------------------------
# Step 7: Dimensionality Reduction for Visualization
# -------------------------------------------------
# Relies on `embeddings`, `graph_labels` and `articles` already existing in the
# kernel from an earlier cell.
print("--- Step 7: Dimensionality Reduction (UMAP) ---")
reducer = umap.UMAP(random_state=42, n_neighbors=15, min_dist=0.1)
embedding_2d = reducer.fit_transform(embeddings)
print(f"UMAP 2D embedding shape: {embedding_2d.shape}")


# -------------------------------------------------
# Step 8: Plot Interactive Graph-Based Clusters
# -------------------------------------------------
print("--- Step 8: Plotting Graph-Based Clusters (Requires Plotly) ---")
# One marker per article, colored by its Louvain community; hovering shows the text.
cluster_scatter_kwargs = dict(
    x=embedding_2d[:, 0],
    y=embedding_2d[:, 1],
    mode='markers',
    marker=dict(color=graph_labels, colorscale='Viridis', size=8),
    text=articles, # Make sure 'articles' list is available
    hoverinfo='text',
)
layout_options = dict(
    title="Semantic News Clusters (Graph-Based)",
    xaxis_title="UMAP Dimension 1",
    yaxis_title="UMAP Dimension 2",
)
fig = go.Figure()
fig.add_trace(go.Scatter(**cluster_scatter_kwargs))
fig.update_layout(**layout_options)
fig.show()

# To plot K-Means clusters instead, simply replace `graph_labels` with `kmeans_labels`
# fig_kmeans = go.Figure()
# fig_kmeans.add_trace(go.Scatter(
#     x=embedding_2d[:, 0],
#     y=embedding_2d[:, 1],
#     mode='markers',
#     marker=dict(color=kmeans_labels, colorscale='Viridis', size=8),
#     text=articles,
#     hoverinfo='text'
# ))
# fig_kmeans.update_layout(title="Semantic News Clusters (K-Means)",
#                          xaxis_title="UMAP Dimension 1",
#                          yaxis_title="UMAP Dimension 2")
# fig_kmeans.show()

--- Step 7: Dimensionality Reduction (UMAP) ---
UMAP 2D embedding shape: (5, 2)
--- Step 8: Plotting Graph-Based Clusters (Requires Plotly) ---



n_jobs value 1 overridden to 1 by setting random_state. Use no seed for parallelism.


n_neighbors is larger than the dataset size; truncating to X.shape[0] - 1



In [None]:
import networkx as nx
import plotly.graph_objs as go
import numpy as np

# This code snippet assumes the following variables have been correctly defined by the preceding steps:
# G: The networkx.Graph object created in Step 4.
# partition: The dictionary {node_id: cluster_id} created by community_louvain.best_partition(G) in Step 5.
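# Node attribute 'label': every node in G must carry a 'label' attribute; it is used as the hover text (the `articles` list itself is not read in this cell).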

# You must ensure networkx and plotly are installed for this code to work.

# -------------------------------------------------
# Step 10: Visualize Network Graph (Interactive)
# -------------------------------------------------
# Draws graph G with Plotly: edges as thin grey line segments, nodes as markers
# coloured by their community id from `partition`.
print("\n--- Plotting Louvain Network Graph (Requires NetworkX/Plotly) ---")

# Force-directed (spring) layout; the fixed seed keeps positions reproducible.
pos = nx.spring_layout(G, k=0.15, iterations=20, seed=42)

# Edge trace: all edges share one Scatter. Each edge contributes its two
# endpoints followed by None, which makes Plotly break the line there.
edge_x, edge_y = [], []
for src, dst in G.edges():
    (x0, y0), (x1, y1) = pos[src], pos[dst]
    edge_x.extend([x0, x1, None])
    edge_y.extend([y0, y1, None])

edge_trace = go.Scatter(
    x=edge_x,
    y=edge_y,
    mode='lines',
    hoverinfo='none',
    line={'width': 0.5, 'color': '#888'},
)

# Node trace: coordinates, community colour and hover label per node, all
# derived from the same node ordering.
node_ids = list(G.nodes())
node_x = [pos[n][0] for n in node_ids]
node_y = [pos[n][1] for n in node_ids]
node_color = [partition[n] for n in node_ids]

node_trace = go.Scatter(
    x=node_x,
    y=node_y,
    mode='markers',
    marker={'size': 8, 'color': node_color, 'colorscale': 'Viridis'},
    text=[G.nodes[n]['label'] for n in node_ids],
    hoverinfo='text',
)

# Assemble the figure; grid, zero line and tick labels are hidden on both axes
# because layout coordinates carry no meaning of their own.
hidden_axis = {'showgrid': False, 'zeroline': False, 'showticklabels': False}

fig2 = go.Figure(data=[edge_trace, node_trace])
fig2.update_layout(
    title='Interactive Graph Visualization of News Clusters (Louvain Communities)',
    showlegend=False,
    hovermode='closest',
    xaxis=dict(hidden_axis),
    yaxis=dict(hidden_axis),
)

fig2.show()


--- Plotting Louvain Network Graph (Requires NetworkX/Plotly) ---


In [None]:
# -------------------------------------------------
# Step 9: Evaluate Clustering
# -------------------------------------------------
# Computes a silhouette score for the Louvain and the K-Means labelling.
# Assumes 'embeddings', 'graph_labels' (Louvain), 'kmeans_labels' and
# 'num_clusters_louvain' are available from earlier cells.
print("\n--- Step 9: Evaluate Clustering ---")


def _silhouette_or_na(features, labels, method_name):
    """Print and return the silhouette score for one labelling.

    Args:
        features: Feature matrix passed through to ``silhouette_score``.
        labels: One cluster label per row of ``features``.
        method_name: Name of the clustering method, used in the printed line.

    Returns:
        The silhouette score as a float, or the string ``"N/A (Trivial)"`` when
        the labelling has a single cluster or one cluster per sample.
    """
    n_clusters = len(set(labels))
    # Compare against the number of labelled samples (not an unrelated list):
    # the score is only defined for 2 <= n_clusters <= n_samples - 1.
    n_samples = len(labels)
    if 1 < n_clusters < n_samples:
        score = silhouette_score(features, labels)
        print(f"Silhouette Score - {method_name}: {score:.3f}")
        return score
    print(f"Silhouette Score - {method_name}: N/A (Trivial clustering result)")
    return "N/A (Trivial)"


silhouette_graph = _silhouette_or_na(embeddings, graph_labels, "Graph-Based (Louvain)")
silhouette_kmeans = _silhouette_or_na(embeddings, kmeans_labels, "K-Means")

print(f"\nNumber of Clusters Used (Louvain & K-Means): {num_clusters_louvain}")


--- Step 9: Evaluate Clustering ---
Silhouette Score - Graph-Based (Louvain): N/A (Trivial clustering result)
Silhouette Score - K-Means: N/A (Trivial clustering result)

Number of Clusters Used (Louvain & K-Means): 5


In [None]:
import plotly.graph_objs as go

# --- Silhouette scores are taken from Step 9 ---
# `silhouette_graph` and `silhouette_kmeans` must already exist (run Step 9
# first). Each is either a float or the string "N/A (Trivial)". They are
# deliberately NOT overwritten with example numbers here, so the chart always
# reflects the scores that were actually computed.

# Handle cases where Silhouette Score might be "N/A (Trivial)"
if isinstance(silhouette_graph, str):
    print("Cannot plot Louvain score, it was trivial/N/A.")
    louvain_plottable = None
else:
    louvain_plottable = silhouette_graph

if isinstance(silhouette_kmeans, str):
    print("Cannot plot K-Means score, it was trivial/N/A.")
    kmeans_plottable = None
else:
    kmeans_plottable = silhouette_kmeans


# --- Create the comparison bar chart ---
print("\n--- Generating Comparison Bar Chart of Silhouette Scores (Requires Plotly) ---")

cluster_methods = ['Graph-Based (Louvain)', 'Traditional (K-Means)']
silhouette_scores = [louvain_plottable, kmeans_plottable]

# Colour is keyed by method so a bar keeps its colour even when the other
# method is filtered out.
method_colors = {
    'Graph-Based (Louvain)': 'blue',
    'Traditional (K-Means)': 'orange',
}

# Keep only the methods that produced a numeric score.
plot_methods = []
plot_scores = []
for method, score in zip(cluster_methods, silhouette_scores):
    if score is not None:
        plot_methods.append(method)
        plot_scores.append(score)

if not plot_methods:
    print("No valid silhouette scores to plot.")
else:
    fig_comparison = go.Figure(data=[
        go.Bar(
            x=plot_methods,
            y=plot_scores,
            text=[f'{s:.3f}' for s in plot_scores],  # Display score on bars
            textposition='auto',
            marker_color=[method_colors[m] for m in plot_methods],
        )
    ])

    # Silhouette scores range from -1 to 1. Keep the default lower bound of
    # -0.1 but extend it when a score is more negative, so no bar is clipped.
    y_lower = min(-0.1, min(plot_scores) - 0.1)

    fig_comparison.update_layout(
        title='Comparison of Clustering Performance (Silhouette Score)',
        xaxis_title='Clustering Method',
        yaxis_title='Silhouette Score',
        yaxis_range=[y_lower, 1.0],
        bargap=0.5  # Gap between bars
    )

    fig_comparison.show()


--- Generating Comparison Bar Chart of Silhouette Scores (Requires Plotly) ---


In [None]:
from sklearn.metrics import adjusted_rand_score, normalized_mutual_info_score

# --- New Step: External Evaluation ---
# Compares both clusterings against the dataset's own 'category' column, which
# is treated as ground truth. Requires 'data_subset', 'graph_labels' and
# 'kmeans_labels' from earlier cells.

# factorize() returns (integer codes, unique values); only the codes are needed.
category_codes, category_names = data_subset['category'].factorize()
true_labels = category_codes

# Louvain vs. ground truth: ARI (agreement between two assignments) and
# NMI (shared information between assignments).
ari_graph, nmi_graph = (
    metric(true_labels, graph_labels)
    for metric in (adjusted_rand_score, normalized_mutual_info_score)
)

# K-Means vs. ground truth, same two metrics.
ari_kmeans, nmi_kmeans = (
    metric(true_labels, kmeans_labels)
    for metric in (adjusted_rand_score, normalized_mutual_info_score)
)

report_lines = [
    "\n--- External Evaluation Against Original Categories ---",
    f"Adjusted Rand Index (Louvain vs. True): {ari_graph:.3f}",
    f"Adjusted Rand Index (K-Means vs. True): {ari_kmeans:.3f}",
    f"Normalized Mutual Information (Louvain vs. True): {nmi_graph:.3f}",
    f"Normalized Mutual Information (K-Means vs. True): {nmi_kmeans:.3f}",
]
print("\n".join(report_lines))


--- External Evaluation Against Original Categories ---
Adjusted Rand Index (Louvain vs. True): 0.000
Adjusted Rand Index (K-Means vs. True): 0.000
Normalized Mutual Information (Louvain vs. True): 0.792
Normalized Mutual Information (K-Means vs. True): 0.792


In [None]:
# Static text diagram of a BERT sentiment-classification pipeline followed by a
# list of training settings. Purely informational: nothing is computed here,
# the text is only printed.
architecture_overview = """
==============================
BERT SENTIMENT ANALYSIS ARCHITECTURE
==============================

Input Text
   │
   ▼
BertTokenizer (bert-base-uncased)
   ├─ Converts text → tokens
   ├─ Adds [CLS] and [SEP]
   ├─ Creates:
   │    • input_ids
   │    • attention_mask
   │
   ▼
Pre-trained BERT Encoder
(bert-base-uncased)
   ├─ 12 Transformer Encoder Layers
   ├─ Multi-Head Self Attention
   ├─ Feed Forward Neural Networks
   ├─ Hidden Size = 768
   │
   ▼
[CLS] Token Representation
   ├─ Represents entire sentence
   │
   ▼
Classification Head
   ├─ Linear Layer (768 → 2)
   ├─ Outputs logits for:
   │    • Class 0 → Negative
   │    • Class 1 → Positive
   │
   ▼
Softmax
   ├─ Converts logits → probabilities
   │
   ▼
Predicted Sentiment Label
   ├─ Negative (0)
   ├─ Positive (1)

==============================
TRAINING DETAILS
==============================
Loss Function : CrossEntropyLoss
Optimizer     : AdamW
Batch Size    : 2
Epochs        : 10
Padding       : Enabled
Truncation    : max_length = 128
Device        : CPU / GPU

==============================
"""
print(architecture_overview)



BERT SENTIMENT ANALYSIS ARCHITECTURE

Input Text
   │
   ▼
BertTokenizer (bert-base-uncased)
   ├─ Converts text → tokens
   ├─ Adds [CLS] and [SEP]
   ├─ Creates:
   │    • input_ids
   │    • attention_mask
   │
   ▼
Pre-trained BERT Encoder
(bert-base-uncased)
   ├─ 12 Transformer Encoder Layers
   ├─ Multi-Head Self Attention
   ├─ Feed Forward Neural Networks
   ├─ Hidden Size = 768
   │
   ▼
[CLS] Token Representation
   ├─ Represents entire sentence
   │
   ▼
Classification Head
   ├─ Linear Layer (768 → 2)
   ├─ Outputs logits for:
   │    • Class 0 → Negative
   │    • Class 1 → Positive
   │
   ▼
Softmax
   ├─ Converts logits → probabilities
   │
   ▼
Predicted Sentiment Label
   ├─ Negative (0)
   ├─ Positive (1)

TRAINING DETAILS
Loss Function : CrossEntropyLoss
Optimizer     : AdamW
Batch Size    : 2
Epochs        : 10
Padding       : Enabled
Truncation    : max_length = 128
Device        : CPU / GPU


