In [2]:
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize

df = pd.read_csv('bbc_news.csv')
df.head()

Unnamed: 0,title,pubDate,guid,link,description
0,Ukraine: Angry Zelensky vows to punish Russian...,"Mon, 07 Mar 2022 08:01:56 GMT",https://www.bbc.co.uk/news/world-europe-60638042,https://www.bbc.co.uk/news/world-europe-606380...,The Ukrainian president says the country will ...
1,War in Ukraine: Taking cover in a town under a...,"Sun, 06 Mar 2022 22:49:58 GMT",https://www.bbc.co.uk/news/world-europe-60641873,https://www.bbc.co.uk/news/world-europe-606418...,"Jeremy Bowen was on the frontline in Irpin, as..."
2,Ukraine war 'catastrophic for global food',"Mon, 07 Mar 2022 00:14:42 GMT",https://www.bbc.co.uk/news/business-60623941,https://www.bbc.co.uk/news/business-60623941?a...,One of the world's biggest fertiliser firms sa...
3,Manchester Arena bombing: Saffie Roussos's par...,"Mon, 07 Mar 2022 00:05:40 GMT",https://www.bbc.co.uk/news/uk-60579079,https://www.bbc.co.uk/news/uk-60579079?at_medi...,The parents of the Manchester Arena bombing's ...
4,Ukraine conflict: Oil price soars to highest l...,"Mon, 07 Mar 2022 08:15:53 GMT",https://www.bbc.co.uk/news/business-60642786,https://www.bbc.co.uk/news/business-60642786?a...,Consumers are feeling the impact of higher ene...


In [3]:
nltk.download(['punkt_tab', 'stopwords', 'wordnet', 'names'])
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\Hrishikesh\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Hrishikesh\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Hrishikesh\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package names to
[nltk_data]     C:\Users\Hrishikesh\AppData\Roaming\nltk_data...
[nltk_data]   Package names is already up-to-date!


**Preprocessing Steps**
1. URL removal
2. HTML tag removal
3. Hyphenated word split
4. Word Tokenization & stop word removal
5. Lemmatization(First for verbs & then for nouns)
Final strings <30 are dropped, while others are saved to an external CSV file

In [4]:
news_stopwords = {
    'say', 'says', 'said', 'also', 'like', 'could', 'would', 
    'according', 'year', 'new', 'one', 'two', 'bbc', 'rss'
}
stop_words = stop_words.union(news_stopwords)

def preprocess_text(text):
    if not isinstance(text, str):
        return ""
    text = re.sub(r'https?://\S+|www\.\S+', '', text)
    text = re.sub(r'<.*?>', '', text)
    text = re.sub(r'[^\w\s.-]', ' ', text)
    text = re.sub(r'&amp;', 'and', text)
    text = re.sub(r'(\w+)-(\w+)', r'\1 \2', text)  #split hyphenated words
    text = re.sub(r'\b\d{1,2}/\d{1,2}/\d{2,4}\b', '', text)  #remove dates
    words = word_tokenize(text)
    words = [
        word for word in words 
        if (word not in stop_words) 
        and (len(word) > 2) 
        and (not word.isnumeric())
    ]    
    words = [word.lower() for word in words]
    words = [lemmatizer.lemmatize(word, pos='v') for word in words]  
    words = [lemmatizer.lemmatize(word, pos='n') for word in words]  
    return " ".join(words)

df['combined_text'] = df['title'] + " " + df['description']
df['processed'] = df['combined_text'].apply(preprocess_text)
df = df[df['processed'].str.len() > 30].reset_index(drop=True)
df[['title', 'pubDate', 'processed']].to_csv('preprocessed_bbc_news.csv', index=False)

**Processed Dataframe**

In [5]:
df.head()

Unnamed: 0,title,pubDate,guid,link,description,combined_text,processed
0,Ukraine: Angry Zelensky vows to punish Russian...,"Mon, 07 Mar 2022 08:01:56 GMT",https://www.bbc.co.uk/news/world-europe-60638042,https://www.bbc.co.uk/news/world-europe-606380...,The Ukrainian president says the country will ...,Ukraine: Angry Zelensky vows to punish Russian...,ukraine angry zelensky vow punish russian atro...
1,War in Ukraine: Taking cover in a town under a...,"Sun, 06 Mar 2022 22:49:58 GMT",https://www.bbc.co.uk/news/world-europe-60641873,https://www.bbc.co.uk/news/world-europe-606418...,"Jeremy Bowen was on the frontline in Irpin, as...",War in Ukraine: Taking cover in a town under a...,war ukraine take cover town attack jeremy bowe...
2,Ukraine war 'catastrophic for global food',"Mon, 07 Mar 2022 00:14:42 GMT",https://www.bbc.co.uk/news/business-60623941,https://www.bbc.co.uk/news/business-60623941?a...,One of the world's biggest fertiliser firms sa...,Ukraine war 'catastrophic for global food' One...,ukraine war catastrophic global food one world...
3,Manchester Arena bombing: Saffie Roussos's par...,"Mon, 07 Mar 2022 00:05:40 GMT",https://www.bbc.co.uk/news/uk-60579079,https://www.bbc.co.uk/news/uk-60579079?at_medi...,The parents of the Manchester Arena bombing's ...,Manchester Arena bombing: Saffie Roussos's par...,manchester arena bomb saffie roussos parent he...
4,Ukraine conflict: Oil price soars to highest l...,"Mon, 07 Mar 2022 08:15:53 GMT",https://www.bbc.co.uk/news/business-60642786,https://www.bbc.co.uk/news/business-60642786?a...,Consumers are feeling the impact of higher ene...,Ukraine conflict: Oil price soars to highest l...,ukraine conflict oil price soar highest level ...


In [6]:
from bertopic import BERTopic
from sentence_transformers import SentenceTransformer
from umap import UMAP
from hdbscan import HDBSCAN
from sklearn.feature_extraction.text import CountVectorizer

  from .autonotebook import tqdm as notebook_tqdm



**Sentence Transformer:** Pretrained transformer model that maps each input text into a fixed length vector.  
**Dimensionality Reduction(UMAP):** Projects the high dimensional embeddings to a lower dimensional space(5 here).  
**Clustering using HDBSCAN:** Finds densely connected groups in the dataset and labels low density points as noise. Each formed cluster represents a topic.  
**CountVectorizer for term extraction:** Turns corpus into term-document matrix to identify the most frequent terms per topic.  

**BERTopic**  
It is a framework that ties together the above four. Goal is to produce human readable topics from the corpus without manually specifying the amount of topics in advance.


In [7]:
embedding_model = SentenceTransformer("all-MiniLM-L6-v2")
embeddings = embedding_model.encode(df['processed'].tolist(), 
                                    show_progress_bar=True)

umap_model = UMAP(n_neighbors=15, 
                  n_components=5, 
                  min_dist=0.0, 
                  metric='cosine')

hdbscan_model = HDBSCAN(min_cluster_size=50, metric='euclidean', 
                        cluster_selection_method='eom', 
                        prediction_data=True)

vectorizer_model = CountVectorizer(stop_words="english", 
                                   min_df=2, ngram_range=(1, 2))

topic_model = BERTopic(
    embedding_model=embedding_model,
    umap_model=umap_model,
    hdbscan_model=hdbscan_model,
    vectorizer_model=vectorizer_model,
    top_n_words=10,
    language='english',
    calculate_probabilities=True,
    verbose=True
)

topics, probs = topic_model.fit_transform(df['processed'])

topic_model.save("bertopic_model") #saving the model

topic_info = topic_model.get_topic_info() #gets [topic, count, name, representation]
topic_info.to_csv('topic_info.csv', index=False)

representative_docs = topic_model.get_representative_docs()
pd.DataFrame(representative_docs.items(), 
             columns=['Topic', 'Representative_Docs']).to_csv('representative_docs.csv', index=False) #topic id to its top example(representative) documents

print("topic modeling complete")
print(f"topics created: {len(topic_info)}")


Batches: 100%|██████████| 1316/1316 [05:45<00:00,  3.81it/s]
2025-06-02 01:07:14,146 - BERTopic - Embedding - Transforming documents to embeddings.
Batches: 100%|██████████| 1316/1316 [05:28<00:00,  4.01it/s]
2025-06-02 01:12:43,098 - BERTopic - Embedding - Completed ✓
2025-06-02 01:12:43,098 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2025-06-02 01:13:11,960 - BERTopic - Dimensionality - Completed ✓
2025-06-02 01:13:11,962 - BERTopic - Cluster - Start clustering the reduced embeddings
2025-06-02 01:13:34,562 - BERTopic - Cluster - Completed ✓
2025-06-02 01:13:34,571 - BERTopic - Representation - Fine-tuning topics using representation models.
2025-06-02 01:13:36,091 - BERTopic - Representation - Completed ✓
  self._set_arrayXarray(i, j, x)


topic modeling complete
topics created: 98


In [8]:
topic_info.head()

Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,-1,14281,-1_bbc_england_woman_year,"[bbc, england, woman, year, world, people, ele...",[rishi sunak tax cutter the chancellor want cu...
1,0,2671,0_ukraine_russia_russian_war,"[ukraine, russia, russian, war, ukraine war, u...",[ukraine war grim ukrainian shadow russian att...
2,1,2193,1_manchester_league_unite_city,"[manchester, league, unite, city, liverpool, m...",[real madrid manchester city champion league f...
3,2,2124,2_murder_police_kill_shoot,"[murder, police, kill, shoot, stab, die, death...",[olivia pratt korbel arrest nine year-old murd...
4,3,1234,3_queen_king_prince_charles,"[queen, king, prince, charles, king charles, r...",[king charles see first time since diagnosis p...


**Dominant topics:**  
-1: Outliers
0: ukraine-russia war
1: crime/violence
2: manchester city/ united
3: british monarchy
.....


**Model Evaluation**  
**1. Bar Charts:**  
Bars represents top 5 words/bigrams from each topic.  
Length of each bar is the c-TF-IDF score/weight. Longer bar = term is more unique to that topic.  
  
**Intertopic Distance Map**  
Shows all topics discovered in a plane.  
Each bubble represents one topic, size of each bubble = how many docs belong to the topic.


In [12]:
topic_model.visualize_barchart(top_n_topics=10).show()

topic_model.visualize_topics().show()

In [11]:
from gensim.models import CoherenceModel
from gensim.corpora import Dictionary

# Prepare data
texts = [doc.split() for doc in df['processed']]
dictionary = Dictionary(texts)
corpus = [dictionary.doc2bow(text) for text in texts]

# Get topics from BERTopic
topics = []
for topic_id in topic_info['Topic']:
    if topic_id != -1:  # Skip outlier topic
        topic_words = [word for word, _ in topic_model.get_topic(topic_id)]
        topics.append(topic_words)

# Calculate coherence
coherence_model = CoherenceModel(
    topics=topics,
    texts=texts,
    dictionary=dictionary,
    coherence='c_v'  # Best for human interpretability
)
coherence_score = coherence_model.get_coherence()
print(f"Topic Coherence (c_v): {coherence_score:.4f}")

Topic Coherence (c_v): 0.7604
