In [36]:
# Import libraries
# GoogleNews Documentation: https://pypi.org/project/gnews/
from gnews import GNews
import pandas as pd
import pyLDAvis
import pyLDAvis.gensim_models
pyLDAvis.enable_notebook()
#BERTopic documentation: https://maartengr.github.io/BERTopic/index.html
from bertopic import BERTopic
from sentence_transformers import SentenceTransformer # pip install -U sentence-transformers
from umap import UMAP
from hdbscan import HDBSCAN
from sklearn.feature_extraction.text import CountVectorizer
from bertopic.vectorizers import ClassTfidfTransformer
from bertopic.representation import MaximalMarginalRelevance
from sklearn.metrics.pairwise import cosine_similarity

import warnings
warnings.filterwarnings("ignore")


In [37]:
# Initializing
# Creates a GNews client with default settings.
# NOTE(review): the name `googlenews` is rebound to a configured client in the following cell,
# so this default instance is never used for a search.
googlenews = GNews()

In [38]:
# Settings: English-language client with start_date 1 June 2024 and end_date 1 August 2024
googlenews = GNews(
    language='en',
    start_date=(2024, 6, 1),
    end_date=(2024, 8, 1),
)

In [39]:
# Search Google News for articles matching the query 'Spotify'
searchednews = googlenews.get_news('Spotify')
# Interpolate the count inside the f-string (the original f-string had no placeholder
# and relied on print's argument separator); the printed text is unchanged.
print(f"Articles found: {len(searchednews)}")

Articles found: 92


In [40]:
# Results: show the raw record of the second article (index 1)
sample_article = searchednews[1]
print(sample_article)

{'title': 'Adjusting Spotify Premium Prices in the US - spotify.com', 'description': 'Adjusting Spotify Premium Prices in the US  spotify.com', 'published date': 'Mon, 03 Jun 2024 07:00:00 GMT', 'url': 'https://news.google.com/rss/articles/CBMiiwFBVV95cUxPd1ZoTmxLSkZaYlVVRDQxT0lfNERNRGVmM1BUZjNWd3Z6VFcwMXYwUmJ6VU1zMmlHVmJhd0VMUXpzOHdNX28xSFc3QmY1RFlZellwdjc3VWoxTElOb2k2RzlEU2VybW4wdGRZeGxjeDFmMktWRzJXQ2M1ZHlXSlktQWNpdHpnQVlsenM4?oc=5&hl=en-US&gl=US&ceid=US:en', 'publisher': {'href': 'https://newsroom.spotify.com', 'title': 'spotify.com'}}


In [41]:
# Build a DataFrame with one row per article record and preview the last five rows
df = pd.DataFrame(data=searchednews)
df.tail()

Unnamed: 0,title,description,published date,url,publisher
87,Spotify pays homage to media planning in Titan...,Spotify pays homage to media planning in Titan...,"Tue, 04 Jun 2024 07:00:00 GMT",https://news.google.com/rss/articles/CBMitgFBV...,"{'href': 'https://www.campaignlive.com', 'titl..."
88,LIV Golf releases first official Spotify playl...,LIV Golf releases first official Spotify playl...,"Wed, 26 Jun 2024 07:00:00 GMT",https://news.google.com/rss/articles/CBMisgFBV...,"{'href': 'https://www.golfdigest.com', 'title'..."
89,Spotify Technology S.A. to Announce Financial ...,Spotify Technology S.A. to Announce Financial ...,"Wed, 26 Jun 2024 07:00:00 GMT",https://news.google.com/rss/articles/CBMilgFBV...,"{'href': 'https://finance.yahoo.com', 'title':..."
90,Spotify Faces FTC Complaint From Music Publish...,Spotify Faces FTC Complaint From Music Publish...,"Thu, 13 Jun 2024 07:00:00 GMT",https://news.google.com/rss/articles/CBMikgFBV...,"{'href': 'https://www.pymnts.com', 'title': 'P..."
91,"Camilo Calls Latest Release, cuatro, ‘An Album...","Camilo Calls Latest Release, cuatro, ‘An Album...","Wed, 19 Jun 2024 07:00:00 GMT",https://news.google.com/rss/articles/CBMiowFBV...,"{'href': 'https://newsroom.spotify.com', 'titl..."


In [42]:
# Check one article: description text of the row at position 43
df['description'].iloc[43]

'Spotify adds Comments; and a new mobile app for podcasters  Podnews'

In [43]:
#Delete a source from the description and check one article
def split_text(text):
    """Return ``text`` without the trailing publisher name.

    The descriptions handled here end with two spaces followed by the
    publisher (e.g. ``'Headline  Podnews'``). Only the LAST double-space
    separator is treated as the boundary, so a headline that itself
    contains a double space is kept intact instead of being cut at its
    first occurrence.

    Parameters
    ----------
    text : str
        Article description, optionally ending in ``'  <publisher>'``.

    Returns
    -------
    str
        The description without the publisher suffix. Text without a
        double-space separator is returned unchanged. Non-string values
        (e.g. NaN for a missing description) are passed through untouched.
    """
    # Missing descriptions would otherwise raise AttributeError on .rsplit
    if not isinstance(text, str):
        return text
    # maxsplit=1 from the right: strip only the final '  <publisher>' part
    return text.rsplit('  ', 1)[0]
# Strip the publisher suffix from every description, then re-check the row at position 43
df['description'] = df['description'].map(split_text)
df['description'].iloc[43]

'Spotify adds Comments; and a new mobile app for podcasters'

In [44]:
# Change date format: parse the 'Mon, 03 Jun 2024 07:00:00 GMT' style timestamps,
# then store them back as day-month-year strings (the column ends up as text, not datetime)
parsed_dates = pd.to_datetime(df['published date'], format='%a, %d %b %Y %H:%M:%S GMT')
df['published date'] = parsed_dates.dt.strftime('%d-%m-%Y')
df.head()

Unnamed: 0,title,description,published date,url,publisher
0,Why I Finally Quit Spotify - The New Yorker,Why I Finally Quit Spotify,31-07-2024,https://news.google.com/rss/articles/CBMiggFBV...,"{'href': 'https://www.newyorker.com', 'title':..."
1,Adjusting Spotify Premium Prices in the US - s...,Adjusting Spotify Premium Prices in the US,03-06-2024,https://news.google.com/rss/articles/CBMiiwFBV...,"{'href': 'https://newsroom.spotify.com', 'titl..."
2,Why I quit Spotify - Vox.com,Why I quit Spotify,11-07-2024,https://news.google.com/rss/articles/CBMif0FVX...,"{'href': 'https://www.vox.com', 'title': 'Vox...."
3,Spotify Has One Big Advantage on Every Other S...,Spotify Has One Big Advantage on Every Other S...,02-06-2024,https://news.google.com/rss/articles/CBMitwFBV...,"{'href': 'https://www.bloomberg.com', 'title':..."
4,Spotify quietly lets all podcasters upload vid...,Spotify quietly lets all podcasters upload vid...,20-06-2024,https://news.google.com/rss/articles/CBMihgFBV...,"{'href': 'https://techcrunch.com', 'title': 'T..."


In [45]:
# Break publisher column: expand each publisher dict into its own columns (href, title)
# and place them next to the remaining article columns.
# The result contains two columns named 'title' (article title and publisher title).
publisher_cols = df['publisher'].apply(pd.Series)
article_cols = df.drop(columns=['publisher'])
df = pd.concat([article_cols, publisher_cols], axis=1)
df.head()

Unnamed: 0,title,description,published date,url,href,title.1
0,Why I Finally Quit Spotify - The New Yorker,Why I Finally Quit Spotify,31-07-2024,https://news.google.com/rss/articles/CBMiggFBV...,https://www.newyorker.com,The New Yorker
1,Adjusting Spotify Premium Prices in the US - s...,Adjusting Spotify Premium Prices in the US,03-06-2024,https://news.google.com/rss/articles/CBMiiwFBV...,https://newsroom.spotify.com,spotify.com
2,Why I quit Spotify - Vox.com,Why I quit Spotify,11-07-2024,https://news.google.com/rss/articles/CBMif0FVX...,https://www.vox.com,Vox.com
3,Spotify Has One Big Advantage on Every Other S...,Spotify Has One Big Advantage on Every Other S...,02-06-2024,https://news.google.com/rss/articles/CBMitwFBV...,https://www.bloomberg.com,Bloomberg
4,Spotify quietly lets all podcasters upload vid...,Spotify quietly lets all podcasters upload vid...,20-06-2024,https://news.google.com/rss/articles/CBMihgFBV...,https://techcrunch.com,TechCrunch


In [46]:
# Clean DataFrame
# df['title'] selects both columns named 'title'; the last of them is the
# publisher title coming from the expanded publisher dict.
title_columns = df['title']
df['media'] = title_columns.iloc[:, -1]
# Keep only the columns used downstream
kept_columns = ['description', 'published date', 'media', 'url']
df = df[kept_columns]
df.head(10)

Unnamed: 0,description,published date,media,url
0,Why I Finally Quit Spotify,31-07-2024,The New Yorker,https://news.google.com/rss/articles/CBMiggFBV...
1,Adjusting Spotify Premium Prices in the US,03-06-2024,spotify.com,https://news.google.com/rss/articles/CBMiiwFBV...
2,Why I quit Spotify,11-07-2024,Vox.com,https://news.google.com/rss/articles/CBMif0FVX...
3,Spotify Has One Big Advantage on Every Other S...,02-06-2024,Bloomberg,https://news.google.com/rss/articles/CBMitwFBV...
4,Spotify quietly lets all podcasters upload vid...,20-06-2024,TechCrunch,https://news.google.com/rss/articles/CBMihgFBV...
5,Sabrina Carpenter and Spotify conspiracy theories,02-07-2024,The Week,https://news.google.com/rss/articles/CBMimgFBV...
6,Spotify passes UMG as the world’s most valuabl...,15-07-2024,hypebot.com,https://news.google.com/rss/articles/CBMiqAFBV...
7,Spotify Just Did Something Entirely Unexpected...,23-06-2024,Inc.,https://news.google.com/rss/articles/CBMivgFBV...
8,Spotify raises US prices of its premium plans ...,03-06-2024,Reuters,https://news.google.com/rss/articles/CBMilAFBV...
9,Spotify’s HiFi add-on could cost an extra $5 p...,11-06-2024,The Verge,https://news.google.com/rss/articles/CBMioAFBV...


## Topic modelling

In [47]:
# Extract embeddings using a model all-MiniLM-L6-v2
embedding_model = SentenceTransformer("all-MiniLM-L6-v2")
# Pre-compute embeddings.
# Pass a plain list of strings rather than the pandas Series: integer lookups on a
# Series are label-based, so positional access inside encode() would only work while
# the index is exactly 0..n-1 (it would break after any row filtering).
embeddings = embedding_model.encode(df['description'].tolist(), show_progress_bar=False)

09/04/2024 04:12:23 PM - Use pytorch device_name: mps
09/04/2024 04:12:23 PM - Load pretrained SentenceTransformer: all-MiniLM-L6-v2


In [48]:
# Reduce dimensionality focusing more on a local structure of short texts
# (cosine metric on the embeddings, fixed random_state)
umap_model = UMAP(
    n_neighbors=15,
    n_components=15,
    min_dist=0.1,
    metric='cosine',
    random_state=42,
)

In [49]:
# Cluster reduced embeddings
hdbscan_model = HDBSCAN(
    min_cluster_size=5,
    min_samples=1,
    metric='euclidean',
    cluster_selection_method='eom',
    prediction_data=True,
)

In [50]:
# Tokenize: unigrams and bigrams, with the built-in English stop-word list
vectorizer_model = CountVectorizer(
    stop_words="english",
    ngram_range=(1, 2),
)

In [51]:
# Create topic representation: class-based TF-IDF with reduce_frequent_words enabled
ctfidf_model = ClassTfidfTransformer(
    reduce_frequent_words=True,
)

In [52]:
# Maximal Marginal Relevance with diversity (λ) set to 0.5, chosen as a mix of
# diversity and accuracy in the result set; keeps the top 10 words per topic
representation_model = MaximalMarginalRelevance(
    diversity=0.5,
    top_n_words=10,
)

In [53]:
# Run the model: assemble the BERTopic pipeline from the components configured above
# NOTE(review): min_topic_size is presumably ignored when a custom hdbscan_model
# is supplied — confirm against the BERTopic documentation.
topic_model = BERTopic(
    # Step 1 - Extract embeddings
    embedding_model=embedding_model,
    # Step 2 - Reduce dimensionality
    umap_model=umap_model,
    # Step 3 - Cluster reduced embeddings
    hdbscan_model=hdbscan_model,
    # Step 4 - Tokenize topics
    vectorizer_model=vectorizer_model,
    # Step 5 - Extract topic words
    ctfidf_model=ctfidf_model,
    # Step 6 - Diversify topic words
    representation_model=representation_model,
    nr_topics=None,
    min_topic_size=1,
    top_n_words=10,
    verbose=True,
)

In [54]:
# Fit the BERTopic model on the descriptions, reusing the pre-computed embeddings.
# The documents are passed as a plain list of strings (the form the BERTopic API
# documents) instead of a pandas Series.
topics, probabilities = topic_model.fit_transform(df['description'].tolist(), embeddings)

2024-09-04 16:12:26,289 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2024-09-04 16:12:27,217 - BERTopic - Dimensionality - Completed ✓
2024-09-04 16:12:27,217 - BERTopic - Cluster - Start clustering the reduced embeddings
2024-09-04 16:12:27,221 - BERTopic - Cluster - Completed ✓
2024-09-04 16:12:27,223 - BERTopic - Representation - Extracting topics from clusters using representation models.
2024-09-04 16:12:27,792 - BERTopic - Representation - Completed ✓


In [55]:
# Reduce outliers and update the topics if such are identified
# Warning refers to a switch to weighted c-TF-IDF embeddings instead of centroid embeddings, i.e. manual update. No reduction is used afterwards.
# Documents are handed to BERTopic as a plain list of strings rather than a pandas Series.
descriptions = df['description'].tolist()
if -1 in topics:
    # Topic -1 is present: reassign those documents, then rebuild the topic
    # representations from the new assignment.
    topics = topic_model.reduce_outliers(descriptions, topics)
    # NOTE(review): only vectorizer_model is forwarded here; presumably the custom
    # ctfidf_model and representation_model are not applied to the updated topics,
    # and top_n_words=30 differs from the 10 used at construction — confirm intent.
    topic_model.update_topics(descriptions, topics=topics, vectorizer_model=vectorizer_model, top_n_words=30)
    print("Outliers were identified and topics were updated.")
else:
    # No outlier topic: just show the topic overview
    print(topic_model.get_topic_info())

100%|██████████| 1/1 [00:00<00:00, 274.05it/s]


Outliers were identified and topics were updated.


In [56]:
# Print the topics: one row per topic in the overview table
freq = topic_model.get_topic_info()
topic_count = freq.shape[0]
print("Number of topics: {}".format(topic_count))
freq.head(20)

Number of topics: 7


Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,0,22,0_spotify_new_subscription_comments,"[spotify, new, subscription, comments, podcast...",[Spotify quietly lets all podcasters upload vi...
1,1,21,1_prices_spotify_premium_price,"[prices, spotify, premium, price, second time,...",[Wall Street praises Spotify price hikes — and...
2,2,13,2_spotify_dj_ai_biggest,"[spotify, dj, ai, biggest, wrapped, celebrate,...",[Billie Eilish and Spotify Bring the Mysteriou...
3,3,11,3_2024_spotify_quarter 2024_second quarter,"[2024, spotify, quarter 2024, second quarter, ...",[Spotify Technology S.A. to Announce Financial...
4,4,10,4_ceo_spotify ceo_spotify_theories,"[ceo, spotify ceo, spotify, theories, remove m...",[Deadmau5 Threatens to Remove His Music From S...
5,5,8,5_publishers_music publishers_complaint_file,"[publishers, music publishers, complaint, file...",[Music Publishers File FTC Complaint Against S...
6,6,7,6_plan_basic plan_audiobooks_basic,"[plan, basic plan, audiobooks, basic, 10 99, 1...",[Spotify Rolls Out Basic Plan In The US Starti...


In [57]:
# Print the keywords (word, score pairs) of the topic in the first row of the overview table
first_row = freq.iloc[0]
a_topic = first_row["Topic"]
topic_model.get_topic(a_topic)

[('spotify', 0.07677863511441815),
 ('new', 0.05992398359779899),
 ('subscription', 0.05036012906640186),
 ('comments', 0.04289748882620913),
 ('podcasts', 0.04289748882620913),
 ('video', 0.04289748882620913),
 ('app', 0.04000352010516657),
 ('spotify adds', 0.04000352010516657),
 ('streaming', 0.04000352010516657),
 ('adds', 0.04000352010516657),
 ('golf', 0.03133144401486719),
 ('adds comments', 0.03133144401486719),
 ('video content', 0.03133144401486719),
 ('gives', 0.03133144401486719),
 ('priced', 0.03133144401486719),
 ('podcasters', 0.03133144401486719),
 ('comments podcasts', 0.03133144401486719),
 ('subscribers', 0.03133144401486719),
 ('social', 0.03133144401486719),
 ('creators', 0.028598325884139422),
 ('music', 0.028388280432489227),
 ('content', 0.02666901340344438),
 ('basic', 0.023969593439119596),
 ('premium', 0.022073477388136187),
 ('plan', 0.022073477388136187),
 ('shows spotify', 0.018014043205564902),
 ('reportedly planning', 0.018014043205564902),
 ('planning h

In [58]:
# Visualise the topics and their keywords (five words per topic)
barchart_fig = topic_model.visualize_barchart(n_words=5)
barchart_fig

In [59]:
# Visualise clusters of topics
topics_fig = topic_model.visualize_topics()
topics_fig

In [60]:
# Visualise the topic hierarchy (top_n_topics set to 9)
hierarchy_fig = topic_model.visualize_hierarchy(top_n_topics=9)
hierarchy_fig

In [61]:
# Create a new DataFrame pairing each description with its assigned topic
df_BERTopics = df[['description']].assign(topic=topics)
df_BERTopics.head()

Unnamed: 0,description,topic
0,Why I Finally Quit Spotify,1
1,Adjusting Spotify Premium Prices in the US,1
2,Why I quit Spotify,1
3,Spotify Has One Big Advantage on Every Other S...,0
4,Spotify quietly lets all podcasters upload vid...,0


In [62]:
# Attach the topic labels to the article metadata by row index.
# Both frames share df's index, so an index join keeps exactly one row per article.
# Merging on the 'description' text instead would multiply rows whenever two
# articles share the same headline, and would add suffixed duplicate columns
# if this cell were run a second time.
df_BERTopics = df.join(df_BERTopics['topic'], how='inner')
# Every article must have received exactly one topic label
assert len(df_BERTopics) == len(df), "topic labels must align one-to-one with articles"
df_BERTopics.tail(5)

Unnamed: 0,description,published date,media,url,topic
87,Spotify pays homage to media planning in Titan...,04-06-2024,Campaign US,https://news.google.com/rss/articles/CBMitgFBV...,2
88,LIV Golf releases first official Spotify playl...,26-06-2024,GolfDigest.com,https://news.google.com/rss/articles/CBMisgFBV...,0
89,Spotify Technology S.A. to Announce Financial ...,26-06-2024,Yahoo Finance,https://news.google.com/rss/articles/CBMilgFBV...,3
90,Spotify Faces FTC Complaint From Music Publishers,13-06-2024,PYMNTS.com,https://news.google.com/rss/articles/CBMikgFBV...,5
91,"Camilo Calls Latest Release, cuatro, ‘An Album...",19-06-2024,spotify.com,https://news.google.com/rss/articles/CBMiowFBV...,3


In [63]:
# Count how many articles were assigned to each topic
df_BERTopics['topic'].value_counts()

topic
0    22
1    21
2    13
3    11
4    10
5     8
6     7
Name: count, dtype: int64