In [3]:
# Import libraries
# GoogleNews Documentation: https://pypi.org/project/gnews/
from gnews import GNews
import pandas as pd
import pyLDAvis
import pyLDAvis.gensim_models
pyLDAvis.enable_notebook()
#BERTopic documentation: https://maartengr.github.io/BERTopic/index.html
from bertopic import BERTopic
from sentence_transformers import SentenceTransformer # pip install -U sentence-transformers
from umap import UMAP
from hdbscan import HDBSCAN
from sklearn.feature_extraction.text import CountVectorizer
from bertopic.vectorizers import ClassTfidfTransformer
from bertopic.representation import MaximalMarginalRelevance
from sklearn.metrics.pairwise import cosine_similarity

import warnings
warnings.filterwarnings("ignore")


04/08/2025 01:42:08 PM - NumExpr defaulting to 8 threads.
  from .autonotebook import tqdm as notebook_tqdm


In [4]:
# Initializing
googlenews = GNews()

In [5]:
# Settings
googlenews = GNews(language='en',start_date=(2024, 6, 1), end_date=(2024, 8, 1))

In [6]:
# Search 
searchednews = googlenews.get_news('Spotify')
print(f"Articles found:", len(searchednews))

Articles found: 100


In [7]:
# Results
print(searchednews[1])

{'title': 'Why I quit Spotify - Vox', 'description': 'Why I quit Spotify  Vox', 'published date': 'Thu, 11 Jul 2024 07:00:00 GMT', 'url': 'https://news.google.com/rss/articles/CBMif0FVX3lxTE8wSjQxdWJLbzhOZzc2R3EtRjF4NWZlVWpJSDRodkJQNVpNejBUT3laSFBoTm9LeDFlSFdhTU1FTEx0MjZwVm9OeUhtN2J5emZSaUo1ZXhCRjV3MXozZG1zdTI2clJFMkxiN2NCWkdweWg2cENjeWxGSnhfbTEtSGM?oc=5&hl=en-US&gl=US&ceid=US:en', 'publisher': {'href': 'https://www.vox.com', 'title': 'Vox'}}


In [8]:
# Convert to DataFrame
df = pd.DataFrame(searchednews)
df.tail(5)

Unnamed: 0,title,description,published date,url,publisher
95,Spotify to increase premium pricing in the US ...,Spotify to increase premium pricing in the US ...,"Mon, 03 Jun 2024 07:00:00 GMT",https://news.google.com/rss/articles/CBMioAFBV...,"{'href': 'https://techcrunch.com', 'title': 'T..."
96,Spotify Hits Record Q2 Earnings Amid Double-Di...,Spotify Hits Record Q2 Earnings Amid Double-Di...,"Tue, 23 Jul 2024 07:00:00 GMT",https://news.google.com/rss/articles/CBMioAFBV...,"{'href': 'https://www.pymnts.com', 'title': 'P..."
97,Spotify's Outside Voice evolves into a new pod...,Spotify's Outside Voice evolves into a new pod...,"Fri, 12 Jul 2024 07:00:00 GMT",https://news.google.com/rss/articles/CBMiZkFVX...,"{'href': 'https://podnews.net', 'title': 'Podn..."
98,Spotify gives up on trying to charge for song ...,Spotify gives up on trying to charge for song ...,"Wed, 31 Jul 2024 07:00:00 GMT",https://news.google.com/rss/articles/CBMilAFBV...,"{'href': 'https://www.theverge.com', 'title': ..."
99,Spotify (SPOT) Grows Paid Subscribers for Quar...,Spotify (SPOT) Grows Paid Subscribers for Quar...,"Tue, 23 Jul 2024 07:00:00 GMT",https://news.google.com/rss/articles/CBMisAFBV...,"{'href': 'https://www.bloomberg.com', 'title':..."


In [9]:
# Check one article
df.iloc[43]['description']

"My top 5 tips to make more of your Spotify playlists – and trust me, I'm a playlist obsessive  TechRadar"

In [10]:
#Delete a source from the description and check one article
def split_text(text):
    return text.split('  ')[0]
df['description'] = df['description'].apply(split_text)
df.iloc[43]['description']

"My top 5 tips to make more of your Spotify playlists – and trust me, I'm a playlist obsessive"

In [11]:
# Change date format
df['published date'] = pd.to_datetime(df['published date'], format='%a, %d %b %Y %H:%M:%S GMT')
df['published date'] = df['published date'].dt.strftime('%d-%m-%Y')
df.head(5)

Unnamed: 0,title,description,published date,url,publisher
0,Adjusting Spotify Premium Prices in the US - S...,Adjusting Spotify Premium Prices in the US,03-06-2024,https://news.google.com/rss/articles/CBMiiwFBV...,"{'href': 'https://newsroom.spotify.com', 'titl..."
1,Why I quit Spotify - Vox,Why I quit Spotify,11-07-2024,https://news.google.com/rss/articles/CBMif0FVX...,"{'href': 'https://www.vox.com', 'title': 'Vox'}"
2,Why I Finally Quit Spotify - The New Yorker,Why I Finally Quit Spotify,31-07-2024,https://news.google.com/rss/articles/CBMiggFBV...,"{'href': 'https://www.newyorker.com', 'title':..."
3,Spotify Removes Songs of Pro-War Russian Artis...,Spotify Removes Songs of Pro-War Russian Artists,28-06-2024,https://news.google.com/rss/articles/CBMingFBV...,"{'href': 'https://www.themoscowtimes.com', 'ti..."
4,Spotify hit with US Federal Trade Commission c...,Spotify hit with US Federal Trade Commission c...,12-06-2024,https://news.google.com/rss/articles/CBMiywFBV...,{'href': 'https://www.musicbusinessworldwide.c...


In [12]:
# Break publisher column
df = pd.concat([df.drop(['publisher'], axis=1), df['publisher'].apply(pd.Series)], axis=1)
df.head(5)

Unnamed: 0,title,description,published date,url,href,title.1
0,Adjusting Spotify Premium Prices in the US - S...,Adjusting Spotify Premium Prices in the US,03-06-2024,https://news.google.com/rss/articles/CBMiiwFBV...,https://newsroom.spotify.com,Spotify
1,Why I quit Spotify - Vox,Why I quit Spotify,11-07-2024,https://news.google.com/rss/articles/CBMif0FVX...,https://www.vox.com,Vox
2,Why I Finally Quit Spotify - The New Yorker,Why I Finally Quit Spotify,31-07-2024,https://news.google.com/rss/articles/CBMiggFBV...,https://www.newyorker.com,The New Yorker
3,Spotify Removes Songs of Pro-War Russian Artis...,Spotify Removes Songs of Pro-War Russian Artists,28-06-2024,https://news.google.com/rss/articles/CBMingFBV...,https://www.themoscowtimes.com,The Moscow Times
4,Spotify hit with US Federal Trade Commission c...,Spotify hit with US Federal Trade Commission c...,12-06-2024,https://news.google.com/rss/articles/CBMiywFBV...,https://www.musicbusinessworldwide.com,Music Business Worldwide


In [13]:
# Clean DataFrame
df['media'] = df['title'].iloc[:,-1]
df = df[['description','published date','media','url']]
df.head(10)

Unnamed: 0,description,published date,media,url
0,Adjusting Spotify Premium Prices in the US,03-06-2024,Spotify,https://news.google.com/rss/articles/CBMiiwFBV...
1,Why I quit Spotify,11-07-2024,Vox,https://news.google.com/rss/articles/CBMif0FVX...
2,Why I Finally Quit Spotify,31-07-2024,The New Yorker,https://news.google.com/rss/articles/CBMiggFBV...
3,Spotify Removes Songs of Pro-War Russian Artists,28-06-2024,The Moscow Times,https://news.google.com/rss/articles/CBMingFBV...
4,Spotify hit with US Federal Trade Commission c...,12-06-2024,Music Business Worldwide,https://news.google.com/rss/articles/CBMiywFBV...
5,Spotify is hiking its prices again,03-06-2024,CNN,https://news.google.com/rss/articles/CBMieEFVX...
6,Spotify reports record quarterly earnings,23-07-2024,Reuters,https://news.google.com/rss/articles/CBMizAFBV...
7,"Spotify is no longer just a streaming app, it’...",10-07-2024,TechCrunch,https://news.google.com/rss/articles/CBMingFBV...
8,Comments on Podcasts Gives Creators and Listen...,09-07-2024,Spotify,https://news.google.com/rss/articles/CBMid0FVX...
9,More Choice for U.S. Subscribers,21-06-2024,Spotify,https://news.google.com/rss/articles/CBMie0FVX...


## Topic modelling

In [14]:
# Extract embeddings using a model all-MiniLM-L6-v2
embedding_model = SentenceTransformer("all-MiniLM-L6-v2")
# Pre-compute embeddings
embeddings = embedding_model.encode(df['description'], show_progress_bar=False)

04/08/2025 01:43:01 PM - Use pytorch device_name: mps
04/08/2025 01:43:01 PM - Load pretrained SentenceTransformer: all-MiniLM-L6-v2


In [15]:
# Reduce dimensionality focusing more on a local structure of short texts
umap_model = UMAP(n_neighbors=15, n_components=15, min_dist=0.1, metric='cosine', random_state=42)

In [16]:
# Cluster reduced embeddings
hdbscan_model = HDBSCAN(min_cluster_size=5, metric='euclidean', cluster_selection_method='eom', prediction_data=True, min_samples=1)

In [17]:
# Tokenize
vectorizer_model = CountVectorizer(ngram_range=(1, 2), stop_words="english")

In [18]:
#Create topic representation
ctfidf_model = ClassTfidfTransformer(reduce_frequent_words=True)

In [19]:
# Set λ to 0.5 for the optimal mix of diversity and accuracy in the result set
representation_model = MaximalMarginalRelevance(diversity=0.5, top_n_words=10)

In [20]:
# Run the model
topic_model = BERTopic(
  embedding_model=embedding_model,    # Step 1 - Extract embeddings
  umap_model=umap_model,              # Step 2 - Reduce dimensionality
  hdbscan_model=hdbscan_model,        # Step 3 - Cluster reduced embeddings
  vectorizer_model=vectorizer_model,  # Step 4 - Tokenize topics
  ctfidf_model=ctfidf_model,          # Step 5 - Extract topic words
  representation_model=representation_model, # Step 6 - Diversify topic words         
  nr_topics=None,
  min_topic_size=1, 
  verbose=True,
  top_n_words=10                        
)

In [21]:
# Initialize BERT model
topics, probabilities = topic_model.fit_transform(df['description'], embeddings)

2025-04-08 13:43:10,695 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
OMP: Info #276: omp_set_nested routine deprecated, please use omp_set_max_active_levels instead.
2025-04-08 13:43:12,887 - BERTopic - Dimensionality - Completed ✓
2025-04-08 13:43:12,888 - BERTopic - Cluster - Start clustering the reduced embeddings
2025-04-08 13:43:12,891 - BERTopic - Cluster - Completed ✓
2025-04-08 13:43:12,893 - BERTopic - Representation - Extracting topics from clusters using representation models.
2025-04-08 13:43:15,823 - BERTopic - Representation - Completed ✓


In [22]:
# Reduce outliers and update the topics if such are identified
# Warning refers to a switch to weighted c-TF-IDF embeddings instead of centroid embeddings, i.e. manual update. No reduction is used afterwards.
if -1 not in topics:
    print (topic_model.get_topic_info())
else: 
    topics = topic_model.reduce_outliers(df['description'], topics)
    topic_model.update_topics(df['description'], topics=topics, vectorizer_model=vectorizer_model, top_n_words=30)
    print ("Outliers were identified and topics were updated.")

100%|██████████| 1/1 [00:00<00:00, 171.12it/s]


Outliers were identified and topics were updated.


In [23]:
# Print the topics
freq = topic_model.get_topic_info()
print("Number of topics: {}".format( len(freq)))
freq.head(20)

Number of topics: 8


Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,0,20,0_record_profit_q2_spotify,"[record, profit, q2, spotify, amid, earnings, ...",[Spotify stock jumps after reporting record pr...
1,1,18,1_prices_premium_spotify_plans,"[prices, premium, spotify, plans, subscription...",[Spotify raises US prices of its premium plans...
2,2,13,2_spotify_celebrate_quit spotify_quit,"[spotify, celebrate, quit spotify, quit, liste...",[My top 5 tips to make more of your Spotify pl...
3,3,11,3_lawsuit_publishers_bundle_music,"[lawsuit, publishers, bundle, music, spotify, ...",[‘Car Thing’ Class Action Lawsuit Voluntarily ...
4,4,13,4_video_comments_spotify_new,"[video, comments, spotify, new, video content,...",[Spotify quietly lets all podcasters upload vi...
5,5,9,5_subscription tier_music_tier_subscription,"[subscription tier, music, tier, subscription,...",[Amazon Music Unlimited vs. Spotify Premium: A...
6,6,10,6_artists_got_russian artists_war,"[artists, got, russian artists, war, russian, ...",[New Yorker Writer Decries the ‘Enshittificati...
7,7,6,7_summer_songs summer_heat_picks,"[summer, songs summer, heat, picks, prediction...",[Beat the Heat With These Bingeworthy Beach Re...


In [24]:
# Print the keywords
a_topic = freq.iloc[0]["Topic"]
topic_model.get_topic(a_topic)

[('record', 0.0723752525072841),
 ('profit', 0.0723752525072841),
 ('q2', 0.06587432314463486),
 ('spotify', 0.06369449494940926),
 ('amid', 0.04380712823713019),
 ('earnings', 0.04380712823713019),
 ('record profit', 0.03525562422231481),
 ('posts record', 0.03525562422231481),
 ('stock', 0.03525562422231481),
 ('posts', 0.03525562422231481),
 ('earnings spotify', 0.03525562422231481),
 ('high', 0.03285534617784765),
 ('year', 0.028231852776272083),
 ('record q2', 0.025771103878185254),
 ('q2 earnings', 0.025771103878185254),
 ('growth', 0.025771103878185254),
 ('ad', 0.025771103878185254),
 ('spotify hits', 0.025771103878185254),
 ('spotify reports', 0.025771103878185254),
 ('grows', 0.025771103878185254),
 ('jumps', 0.025771103878185254),
 ('quarterly', 0.025771103878185254),
 ('profitable', 0.025771103878185254),
 ('hits', 0.025771103878185254),
 ('quarter', 0.025771103878185254),
 ('spotify stock', 0.025771103878185254),
 ('q2 spotify', 0.025771103878185254),
 ('reports', 0.023503

In [25]:
#Visualise the topics and their keywords
topic_model.visualize_barchart(n_words=5)

In [26]:
# Visualise clusters of topics
topic_model.visualize_topics()

In [27]:
# Visualise the topic hierarchy
topic_model.visualize_hierarchy(top_n_topics=9)

In [28]:
# Create a new DataFrame with a topic column
df_BERTopics = pd.DataFrame({"description": df['description'], "topic": topics})
df_BERTopics.head(5)

Unnamed: 0,description,topic
0,Adjusting Spotify Premium Prices in the US,1
1,Why I quit Spotify,2
2,Why I Finally Quit Spotify,2
3,Spotify Removes Songs of Pro-War Russian Artists,6
4,Spotify hit with US Federal Trade Commission c...,3


In [29]:
# Merge the two DataFrames on the description column
df_BERTopics = pd.merge(df, df_BERTopics, on='description', how='inner')
df_BERTopics.tail(5)

Unnamed: 0,description,published date,media,url,topic
97,Spotify to increase premium pricing in the US ...,03-06-2024,TechCrunch,https://news.google.com/rss/articles/CBMioAFBV...,1
98,Spotify Hits Record Q2 Earnings Amid Double-Di...,23-07-2024,PYMNTS.com,https://news.google.com/rss/articles/CBMioAFBV...,0
99,Spotify's Outside Voice evolves into a new pod...,12-07-2024,Podnews,https://news.google.com/rss/articles/CBMiZkFVX...,4
100,Spotify gives up on trying to charge for song ...,31-07-2024,The Verge,https://news.google.com/rss/articles/CBMilAFBV...,6
101,Spotify (SPOT) Grows Paid Subscribers for Quar...,23-07-2024,Bloomberg,https://news.google.com/rss/articles/CBMisAFBV...,0


In [30]:
# Count the distribution of topics in articles
df_BERTopics.topic.value_counts()

topic
0    20
1    18
2    13
4    13
3    11
5    11
6    10
7     6
Name: count, dtype: int64