In [11]:
# !pip install datasets tqdm -q

# BERTopic Analysis on CCNews Dataset

**Dataset**: Stanford CCNews 2024 - A large-scale news dataset containing articles from Common Crawl, representing global news discourse across multiple languages and regions.

**Methodology**: 
- **Topic Modeling**: BERTopic with multilingual embeddings to discover latent topics
- **Embeddings**: E5-small model for semantic understanding across languages
- **Clustering**: UMAP + HDBSCAN for topic discovery

**Technical Setup**:
- Sample: 500 multilingual articles for computational efficiency
- Tools: BERTopic, sentence-transformers, UMAP, HDBSCAN

In [None]:
import json, re, os, warnings
warnings.filterwarnings("ignore")


import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from umap import UMAP
from hdbscan import HDBSCAN
from sentence_transformers import SentenceTransformer
from bertopic import BERTopic
from bertopic.representation import KeyBERTInspired

from datasets import load_dataset

# Configuration
# Folder that later cells write their CSV and HTML artifacts into.
OUTPUT_DIR = "outputs"
os.makedirs(OUTPUT_DIR, exist_ok=True)

# Model Parameters
# Name handed to SentenceTransformer when the embedding model is loaded.
EMBEDDING_MODEL_NAME = "intfloat/multilingual-e5-small"
MIN_TOPIC_SIZE = 10       # Passed to HDBSCAN(min_cluster_size=...) and BERTopic(min_topic_size=...)
N_NEIGHBORS = 12          # Passed to UMAP(n_neighbors=...)
N_COMPONENTS = 5          # Passed to UMAP(n_components=...)
RANDOM_STATE = 42         # Seed passed to UMAP(random_state=...)

# Open the dataset with streaming=True. Nothing is converted here: the rows are
# turned into a DataFrame later by prepare_ccnews_data(), which iterates
# dataset['train'].
print("Loading CCNews dataset...")
# NOTE(review): this line used to say "Use num_proc=1 to avoid multiprocessing
# issues", but no num_proc argument is passed to load_dataset below.
dataset = load_dataset("stanford-oval/ccnews", name="2024", streaming=True)

Loading CCNews dataset...


In [13]:
# English Stopwords (from M2_NLP_TopicModelBert)
def get_english_stopwords():
    """Return an alphabetically sorted list of English stopwords.

    The result is the union of a standard English stopword set and a
    hand-picked set of filler / news-related terms. The standard set comes
    from scikit-learn when it can be imported; otherwise a short hard-coded
    list is used instead.
    """
    # Hand-picked additions: conversational filler plus news boilerplate.
    extra_terms = {
        "im", "ive", "id", "like", "just", "dont", "know", "feel", "think",
        "people", "really", "want", "time", "would", "get", "one", "even", "go", "going",
        "said", "say", "make", "something", "anything", "everything",
        "news", "report", "reports", "according", "sources", "source", "article",
        "today", "yesterday", "tomorrow", "week", "month", "year", "years",
        "new", "latest", "breaking", "update", "updates"
    }

    try:
        # Preferred source: scikit-learn's built-in English list.
        from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS as standard_terms
    except ImportError:
        # Minimal substitute when scikit-learn is not installed.
        standard_terms = {"a", "an", "and", "the", "in", "is", "it", "of", "for", "on", "with", "as", "by", "that", "this"}

    # Set union removes duplicates; sorted() returns a list in a stable order.
    return sorted(set(standard_terms) | extra_terms)

# Build the stopword list once; a later cell passes it to CountVectorizer(stop_words=...).
ENGLISH_STOPWORDS = get_english_stopwords()
print(f"Generated a list with {len(ENGLISH_STOPWORDS)} English stopwords.")

Generated a list with 354 English stopwords.


In [14]:
# Data Loading and Preparation (adapted from M2_NLP_TopicModelBert)
def prepare_ccnews_data(dataset, sample_size=500):
    """Read up to ``sample_size`` articles from the stream into a DataFrame.

    Args:
        dataset: Mapping whose ``'train'`` entry can be iterated to yield one
            record (dict) per article.
        sample_size: Maximum number of articles to pull from the stream.
            Values <= 0 read nothing and yield an empty DataFrame.

    Returns:
        pandas.DataFrame holding the original article fields plus a
        ``full_text`` column. ``full_text`` is the stripped title and stripped
        body joined by ``" . "`` when both are present, or whichever one is
        non-empty otherwise. Rows where both are empty are dropped and the
        index is reset.
    """
    from itertools import islice

    print(f"Converting streaming dataset to DataFrame with {sample_size} samples from all languages...")

    # islice stops pulling from the stream once the quota is reached and,
    # unlike an append-then-check loop, reads nothing when sample_size <= 0.
    data_list = []
    for count, item in enumerate(islice(dataset['train'], max(sample_size, 0)), start=1):
        # Include all articles regardless of language
        data_list.append(item)
        if count % 100 == 0:
            # No filtering happens, so "found" and "processed" are the same number.
            print(f"Found {count} articles (processed {count} total)...")

    count = len(data_list)
    print(f"Final: Found {count} articles after processing {count} total articles")

    # Create DataFrame
    df = pd.DataFrame(data_list)

    # Guarantee both text columns exist and contain plain strings.
    for column in ("title", "text"):
        if column not in df.columns:
            df[column] = ""
        df[column] = df[column].fillna("").astype(str)

    # Combine title and text. The separator is only inserted when BOTH parts
    # are non-empty; inserting it unconditionally would make every document
    # at least "." long, so the empty-document filter below could never fire.
    titles = df["title"].str.strip()
    bodies = df["text"].str.strip()
    has_both = (titles.str.len() > 0) & (bodies.str.len() > 0)
    df["full_text"] = (titles + " . " + bodies).where(has_both, titles + bodies)

    # Remove empty documents
    df = df[df["full_text"].str.len() > 0].reset_index(drop=True)

    # Show language distribution if language field exists
    if 'language' in df.columns:
        print("\nLanguage distribution:")
        print(df['language'].value_counts().head(10))

    print(f"Prepared {len(df)} documents for analysis")
    return df

# Load and prepare the data.
# `df` is the working DataFrame read by the cleaning, embedding, training and
# visualization cells further down. sample_size=500 matches the sample size
# stated in the notebook introduction.
df = prepare_ccnews_data(dataset, sample_size=500)

Converting streaming dataset to DataFrame with 500 samples from all languages...
Found 100 articles (processed 100 total)...
Found 200 articles (processed 200 total)...
Found 100 articles (processed 100 total)...
Found 200 articles (processed 200 total)...
Found 300 articles (processed 300 total)...
Found 300 articles (processed 300 total)...
Found 400 articles (processed 400 total)...
Found 500 articles (processed 500 total)...
Found 400 articles (processed 400 total)...
Found 500 articles (processed 500 total)...
Final: Found 500 articles after processing 500 total articles

Language distribution:
language
en    182
es    109
ru     45
ar     45
pt     38
fr     23
kn     13
it     12
pl      8
tr      7
Name: count, dtype: int64
Prepared 500 documents for analysis
Final: Found 500 articles after processing 500 total articles

Language distribution:
language
en    182
es    109
ru     45
ar     45
pt     38
fr     23
kn     13
it     12
pl      8
tr      7
Name: count, dtype: int64
P

In [15]:
# Text Cleaning for the Vectorizer (from M2_NLP_TopicModelBert)
# http(s):// links and bare "www." links, each running up to the next whitespace.
CLEAN_RE_URL = re.compile(r"https?://\S+|www\.\S+", flags=re.IGNORECASE)
# Any run of one or more whitespace characters.
CLEAN_RE_WS = re.compile(r"\s+")


def clean_for_vectorizer(s: str) -> str:
    """Normalise text for the CountVectorizer.

    Lowercases ``s``, replaces every URL with a space, collapses each run of
    whitespace to a single space and trims both ends.
    """
    without_urls = CLEAN_RE_URL.sub(" ", s.lower())
    return CLEAN_RE_WS.sub(" ", without_urls).strip()

# Add the vectorizer-ready column, derived from the combined title/body text
df["text_clean"] = df["full_text"].apply(clean_for_vectorizer)

# Preview the first three rows: original title, combined text and cleaned text
print("Sample of prepared data:")
print(df.head(3)[['title', 'full_text', 'text_clean']])

Sample of prepared data:
                                               title  \
0  Anadolu Otoyolu'nun Kocaeli kesimindeki trafik...   
1  Fallece madre de Michelle Obama, Marian Robins...   
2  Nearly 1.9 million Fiji water bottles recalled...   

                                           full_text  \
0  Anadolu Otoyolu'nun Kocaeli kesimindeki trafik...   
1  Fallece madre de Michelle Obama, Marian Robins...   
2  Nearly 1.9 million Fiji water bottles recalled...   

                                          text_clean  
0  anadolu otoyolu'nun kocaeli kesimindeki trafik...  
1  fallece madre de michelle obama, marian robins...  
2  nearly 1.9 million fiji water bottles recalled...  


In [16]:
# Text embeddings (adapted from M2_NLP_TopicModelBert)
print("Loading embedding model...")
embedding_model = SentenceTransformer(EMBEDDING_MODEL_NAME)

# E5 models work best when every input carries the "passage: " prefix.
# The un-lowercased full_text column is embedded, not text_clean.
texts_for_embedding = list(map("passage: {}".format, df["full_text"].tolist()))

print("Generating embeddings... This may take a few minutes.")
# One vector per document, normalised by the encoder
embeddings = embedding_model.encode(
    texts_for_embedding,
    normalize_embeddings=True,
    show_progress_bar=True,
)

print(f"Generated embeddings with shape: {embeddings.shape}")

Loading embedding model...
Generating embeddings... This may take a few minutes.
Generating embeddings... This may take a few minutes.


Batches: 100%|██████████| 16/16 [00:12<00:00,  1.29it/s]

Generated embeddings with shape: (500, 384)





In [17]:
# Configuring the BERTopic Model Components (from M2_NLP_TopicModelBert)

# Dimensionality reduction: cosine-metric UMAP with a fixed seed
umap_model = UMAP(
    metric="cosine",
    n_components=N_COMPONENTS,
    n_neighbors=N_NEIGHBORS,
    random_state=RANDOM_STATE,
)

# Clustering of the reduced vectors
hdbscan_model = HDBSCAN(
    metric="euclidean",
    min_cluster_size=MIN_TOPIC_SIZE,
    cluster_selection_method="eom",
    prediction_data=True,
)

# Keyword extraction over unigrams and bigrams.
# NOTE(review): only English stopwords are filtered although the sampled
# articles span many languages.
vectorizer_model = CountVectorizer(
    ngram_range=(1, 2),
    min_df=2,
    stop_words=ENGLISH_STOPWORDS,
)

# Topic representation (KeyBERTInspired, as in M2)
representation_model = KeyBERTInspired()

print("All model components configured successfully!")

All model components configured successfully!


In [18]:
#| label: representation-ollama
#| eval: false
# To use this block, set `eval: true` in the line above and `eval: false` in the KeyBERT block.

import openai
from bertopic.representation import OpenAI

# Few-shot prompt for LLM topic naming: two worked examples (programming help,
# healthy eating) come before the real request so the model can copy their format.
# [DOCUMENTS] and [KEYWORDS] are placeholder tokens, presumably filled in per
# topic by BERTopic's OpenAI representation (TODO confirm against the BERTopic docs).
prompt = """
I will provide you with sample texts and keywords from a topic. Your task is to create a concise, descriptive name (3-7 words) that accurately captures the topic's essence.
Requirements:

Use clear, specific language
Focus on the core theme, not peripheral details
Use natural phrasing (avoid generic words like "issues" or "topics")
Be descriptive enough that someone unfamiliar with the content would understand the topic

###EXAMPLES###
Topic:
Sample texts from this topic:

I just started learning Python and I'm confused about when to use lists vs dictionaries.
My code keeps throwing a 'KeyError' and I can't figure out why.
What's the best way to learn data structures as a beginner programmer?
Keywords: python, code, programming, error, syntax, function, debug, learn
Topic Name: Beginner Programming and Debugging Help


Topic:
Sample texts from this topic:

I meal prep every Sunday but by Wednesday I'm tired of eating the same thing.
How do you make healthy eating sustainable when you have a busy schedule?
I want to eat better but healthy food is so expensive compared to fast food.
Keywords: food, healthy, diet, meal, eating, nutrition, cook, recipe
Topic Name: Healthy Eating Habits and Meal Planning
###REAL DATA###


Topic:
Sample texts from this topic:
[DOCUMENTS]
Keywords: [KEYWORDS]
!!!Output only the topic name here. No explanations. No preamble. Just the topic name in English:
"""

# Set up the OpenAI client to point to your local Ollama server
# (OpenAI-style endpoint on localhost:11434; requires that server to be running).
client = openai.OpenAI(
    base_url="http://localhost:11434/v1",
    api_key="ollama" # required, but not used
)

# Create the representation model
# NOTE(review): running this cell rebinds `representation_model`, replacing the
# KeyBERTInspired instance configured in the component-configuration cell.
# Whichever assignment ran last is the one the BERTopic constructor receives.
representation_model = OpenAI(
    client,
    model="qwen3:1.7b", # Or your preferred model
    prompt=prompt,
    chat=True,
    #delay_in_seconds=2 # To avoid rate-limiting
)

## Optional: Enhanced Topic Naming with Local LLM

In [19]:
#| label: train-model

# Wire the components configured in earlier cells into one BERTopic pipeline
topic_model = BERTopic(
    # pipeline stages
    embedding_model=embedding_model,
    umap_model=umap_model,
    hdbscan_model=hdbscan_model,
    vectorizer_model=vectorizer_model,
    representation_model=representation_model,
    # topic and output options
    top_n_words=50,
    min_topic_size=MIN_TOPIC_SIZE,
    calculate_probabilities=True,
    verbose=True,
)

# Fit on the cleaned texts, passing in the embeddings computed earlier
topics, probs = topic_model.fit_transform(
    df["text_clean"].tolist(),
    embeddings=embeddings,
)

2025-10-30 22:52:26,470 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2025-10-30 22:52:27,939 - BERTopic - Dimensionality - Completed ✓
2025-10-30 22:52:27,941 - BERTopic - Cluster - Start clustering the reduced embeddings
2025-10-30 22:52:27,939 - BERTopic - Dimensionality - Completed ✓
2025-10-30 22:52:27,941 - BERTopic - Cluster - Start clustering the reduced embeddings
2025-10-30 22:52:28,004 - BERTopic - Cluster - Completed ✓
2025-10-30 22:52:28,004 - BERTopic - Cluster - Completed ✓
2025-10-30 22:52:28,010 - BERTopic - Representation - Fine-tuning topics using representation models.
2025-10-30 22:52:28,010 - BERTopic - Representation - Fine-tuning topics using representation models.
100%|██████████| 14/14 [17:43<00:00, 75.99s/it]

2025-10-30 23:10:12,435 - BERTopic - Representation - Completed ✓
2025-10-30 23:10:12,435 - BERTopic - Representation - Completed ✓


In [20]:
#| label: inspect-results

# Topic-level summary table produced by the fitted model
topic_info = topic_model.get_topic_info()

# Document-level table: a copy of df plus each document's assigned topic and
# the largest value in its row of the probability matrix
results_df = df.assign(
    Topic=topics,
    Topic_Probability=np.max(probs, axis=1),
)

# Look up the generated topic names (e.g., "-1_word1_word2") by topic id
name_map = dict(zip(topic_info["Topic"], topic_info["Name"]))
results_df["Topic_Name"] = results_df["Topic"].map(name_map)

# Write both tables to the output folder
topic_info.to_csv(f"{OUTPUT_DIR}/topic_info.csv", index=False)
results_df.to_csv(f"{OUTPUT_DIR}/posts_with_topics.csv", index=False)

print("Top 10 Found Topics:")
topic_info.head(10)

Top 10 Found Topics:


Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,-1,192,-1_,[],"[inauguración de la estación olleros ., países..."
1,0,62,0_,[],[глава мид норвегии эйде: украина имеет право ...
2,1,36,1_,[],[aparecen restos de chapopote en costas de pla...
3,2,28,2_,[],[empresário abriu conta conjunta com namorada ...
4,3,27,3_,[],"[لولو معاك في العيد.. تردد قناة لولو ""وناسة"" ب..."
5,4,25,4_,[],[biden campaign admits “very delicate moment” ...
6,5,18,5_,[],[biden anuncia propuesta de acuerdo sobre el a...
7,6,17,6_,[],[ricciardo's focus on improving f1 performance...
8,7,17,7_,[],[dakota johnson on set of rom-com materialists...
9,8,17,8_,[],"[rattrapée par ses déficits, la france voit sa..."


In [21]:
#| label: crosstab
# Skipped unless the data has a "label" column with more than one distinct value
if (
    "label" in results_df.columns
    and results_df["label"].nunique() > 1
):
    # Rows are labels, columns are topic names
    crosstab = pd.crosstab(
        index=results_df["label"],
        columns=results_df["Topic_Name"],
    )
    crosstab.to_csv(f"{OUTPUT_DIR}/label_topic_crosstab.csv")
    print("Crosstab of Labels vs. Topics:")
    display(crosstab.head(10))

In [22]:
#| label: visualizations
#| warning: false

print("Generating and saving interactive visualizations...")

# 1) Inter-topic distance map
fig_topics = topic_model.visualize_topics()
fig_topics.write_html(f"{OUTPUT_DIR}/viz_topics.html")

# 2) Keyword scores: 10 words for each of up to 20 topics
fig_barchart = topic_model.visualize_barchart(n_words=10, top_n_topics=20)
fig_barchart.write_html(f"{OUTPUT_DIR}/viz_barchart.html")

# 3) Document projection map, built from the precomputed embeddings
fig_documents = topic_model.visualize_documents(
    docs=df["full_text"].tolist(),
    hide_annotations=True,
    embeddings=embeddings,
)
fig_documents.write_html(f"{OUTPUT_DIR}/viz_documents.html")

# 4) Hierarchical clustering of topics; a ValueError here is reported
#    instead of aborting the cell
try:
    fig_hierarchy = topic_model.visualize_hierarchy()
    fig_hierarchy.write_html(f"{OUTPUT_DIR}/viz_hierarchy.html")
except ValueError:
    print("Could not generate hierarchy plot (not enough topics for hierarchical reduction).")

print(f"Visualizations saved in '{OUTPUT_DIR}/'")

Generating and saving interactive visualizations...


Visualizations saved in 'outputs/'


## Conclusion:

* The dataset was very difficult to work with for two main reasons:
    1. The BERTopic analysis would probably have been a better fit for a sample restricted to articles written in English.
    2. The sample size used for the analysis led to very long cell run times for the BERTopic steps in the notebook.

* The hierarchical clustering and topic visualizations clearly show 4 different groups/clusters. The main differences between the clusters are the dominant keywords, topics, and types of news stories they contain, reflecting different areas of focus in global news coverage. For precise details, we could inspect the top keywords or example articles for each cluster in the results.