In [64]:
# Import libraries
# GoogleNews Documentation: https://pypi.org/project/gnews/
from gnews import GNews
import pandas as pd
import pyLDAvis
import pyLDAvis.gensim_models
pyLDAvis.enable_notebook()
#BERTopic documentation: https://maartengr.github.io/BERTopic/index.html
from bertopic import BERTopic
from sentence_transformers import SentenceTransformer # pip install -U sentence-transformers
from umap import UMAP
from hdbscan import HDBSCAN
from sklearn.feature_extraction.text import CountVectorizer
from bertopic.vectorizers import ClassTfidfTransformer
from bertopic.representation import MaximalMarginalRelevance
from sklearn.metrics.pairwise import cosine_similarity

import warnings
warnings.filterwarnings("ignore")


In [65]:
# Initializing
googlenews = GNews()

In [66]:
# Settings
googlenews = GNews(language='en',start_date=(2024, 6, 1), end_date=(2024, 8, 1))

In [67]:
# Search 
searchednews = googlenews.get_news('Spotify')
print(f"Articles found:", len(searchednews))

Articles found: 93


In [68]:
# Results
print(searchednews[1])

{'title': 'Adjusting Spotify Premium Prices in the US - spotify.com', 'description': 'Adjusting Spotify Premium Prices in the US  spotify.com', 'published date': 'Mon, 03 Jun 2024 07:00:00 GMT', 'url': 'https://news.google.com/rss/articles/CBMiiwFBVV95cUxPd1ZoTmxLSkZaYlVVRDQxT0lfNERNRGVmM1BUZjNWd3Z6VFcwMXYwUmJ6VU1zMmlHVmJhd0VMUXpzOHdNX28xSFc3QmY1RFlZellwdjc3VWoxTElOb2k2RzlEU2VybW4wdGRZeGxjeDFmMktWRzJXQ2M1ZHlXSlktQWNpdHpnQVlsenM4?oc=5&hl=en-US&gl=US&ceid=US:en', 'publisher': {'href': 'https://newsroom.spotify.com', 'title': 'spotify.com'}}


In [69]:
# Convert to DataFrame
df = pd.DataFrame(searchednews)
df.tail(5)

Unnamed: 0,title,description,published date,url,publisher
88,Spotify increases subscriber prices for second...,Spotify increases subscriber prices for second...,"Mon, 03 Jun 2024 07:00:00 GMT",https://news.google.com/rss/articles/CBMibEFVX...,"{'href': 'https://fortune.com', 'title': 'Fort..."
89,"Spotify launches Spanish language AI DJ ""Livi""...","Spotify launches Spanish language AI DJ ""Livi""...","Wed, 17 Jul 2024 07:00:00 GMT",https://news.google.com/rss/articles/CBMid0FVX...,"{'href': 'https://www.axios.com', 'title': 'Ax..."
90,Spotify to increase premium pricing in the US ...,Spotify to increase premium pricing in the US ...,"Mon, 03 Jun 2024 07:00:00 GMT",https://news.google.com/rss/articles/CBMioAFBV...,"{'href': 'https://techcrunch.com', 'title': 'T..."
91,Spotify raises prices on subscriptions days af...,Spotify raises prices on subscriptions days af...,"Mon, 03 Jun 2024 07:00:00 GMT",https://news.google.com/rss/articles/CBMiggFBV...,"{'href': 'https://www.miamiherald.com', 'title..."
92,Spotify CEO expects profit growth to continue ...,Spotify CEO expects profit growth to continue ...,"Tue, 23 Jul 2024 07:00:00 GMT",https://news.google.com/rss/articles/CBMie0FVX...,"{'href': 'https://www.axios.com', 'title': 'Ax..."


In [70]:
# Check one article
df.iloc[43]['description']

'Spotify trims ad sales team as it focuses on smaller clients and programmatic  Ad Age'

In [71]:
#Delete a source from the description and check one article
def split_text(text):
    return text.split('  ')[0]
df['description'] = df['description'].apply(split_text)
df.iloc[43]['description']

'Spotify trims ad sales team as it focuses on smaller clients and programmatic'

In [72]:
# Change date format
df['published date'] = pd.to_datetime(df['published date'], format='%a, %d %b %Y %H:%M:%S GMT')
df['published date'] = df['published date'].dt.strftime('%d-%m-%Y')
df.head(5)

Unnamed: 0,title,description,published date,url,publisher
0,Why I Finally Quit Spotify - The New Yorker,Why I Finally Quit Spotify,31-07-2024,https://news.google.com/rss/articles/CBMiggFBV...,"{'href': 'https://www.newyorker.com', 'title':..."
1,Adjusting Spotify Premium Prices in the US - s...,Adjusting Spotify Premium Prices in the US,03-06-2024,https://news.google.com/rss/articles/CBMiiwFBV...,"{'href': 'https://newsroom.spotify.com', 'titl..."
2,Why I quit Spotify - Vox.com,Why I quit Spotify,11-07-2024,https://news.google.com/rss/articles/CBMif0FVX...,"{'href': 'https://www.vox.com', 'title': 'Vox...."
3,"Spotify Review: Best Music Streaming Service, ...","Spotify Review: Best Music Streaming Service, ...",06-06-2024,https://news.google.com/rss/articles/CBMipAFBV...,"{'href': 'https://www.cnet.com', 'title': 'CNET'}"
4,Spotify Has One Big Advantage on Every Other S...,Spotify Has One Big Advantage on Every Other S...,02-06-2024,https://news.google.com/rss/articles/CBMitwFBV...,"{'href': 'https://www.bloomberg.com', 'title':..."


In [73]:
# Break publisher column
df = pd.concat([df.drop(['publisher'], axis=1), df['publisher'].apply(pd.Series)], axis=1)
df.head(5)

Unnamed: 0,title,description,published date,url,href,title.1
0,Why I Finally Quit Spotify - The New Yorker,Why I Finally Quit Spotify,31-07-2024,https://news.google.com/rss/articles/CBMiggFBV...,https://www.newyorker.com,The New Yorker
1,Adjusting Spotify Premium Prices in the US - s...,Adjusting Spotify Premium Prices in the US,03-06-2024,https://news.google.com/rss/articles/CBMiiwFBV...,https://newsroom.spotify.com,spotify.com
2,Why I quit Spotify - Vox.com,Why I quit Spotify,11-07-2024,https://news.google.com/rss/articles/CBMif0FVX...,https://www.vox.com,Vox.com
3,"Spotify Review: Best Music Streaming Service, ...","Spotify Review: Best Music Streaming Service, ...",06-06-2024,https://news.google.com/rss/articles/CBMipAFBV...,https://www.cnet.com,CNET
4,Spotify Has One Big Advantage on Every Other S...,Spotify Has One Big Advantage on Every Other S...,02-06-2024,https://news.google.com/rss/articles/CBMitwFBV...,https://www.bloomberg.com,Bloomberg


In [74]:
# Clean DataFrame
df['media'] = df['title'].iloc[:,-1]
df = df[['description','published date','media','url']]
df.head(10)

Unnamed: 0,description,published date,media,url
0,Why I Finally Quit Spotify,31-07-2024,The New Yorker,https://news.google.com/rss/articles/CBMiggFBV...
1,Adjusting Spotify Premium Prices in the US,03-06-2024,spotify.com,https://news.google.com/rss/articles/CBMiiwFBV...
2,Why I quit Spotify,11-07-2024,Vox.com,https://news.google.com/rss/articles/CBMif0FVX...
3,"Spotify Review: Best Music Streaming Service, ...",06-06-2024,CNET,https://news.google.com/rss/articles/CBMipAFBV...
4,Spotify Has One Big Advantage on Every Other S...,02-06-2024,Bloomberg,https://news.google.com/rss/articles/CBMitwFBV...
5,Spotify launches new basic streaming plan in US,21-06-2024,Reuters,https://news.google.com/rss/articles/CBMilgFBV...
6,Spotify quietly lets all podcasters upload vid...,20-06-2024,TechCrunch,https://news.google.com/rss/articles/CBMihgFBV...
7,Sabrina Carpenter and Spotify conspiracy theories,02-07-2024,The Week,https://news.google.com/rss/articles/CBMimgFBV...
8,Spotify passes UMG as the world’s most valuabl...,15-07-2024,hypebot.com,https://news.google.com/rss/articles/CBMiqAFBV...
9,Spotify Just Did Something Entirely Unexpected...,23-06-2024,Inc.,https://news.google.com/rss/articles/CBMivgFBV...


## Topic modelling

In [75]:
# Extract embeddings using a model all-MiniLM-L6-v2
embedding_model = SentenceTransformer("all-MiniLM-L6-v2")
# Pre-compute embeddings
embeddings = embedding_model.encode(df['description'], show_progress_bar=False)

09/04/2024 07:42:45 PM - Use pytorch device_name: mps
09/04/2024 07:42:45 PM - Load pretrained SentenceTransformer: all-MiniLM-L6-v2


In [76]:
# Reduce dimensionality focusing more on a local structure of short texts
umap_model = UMAP(n_neighbors=15, n_components=15, min_dist=0.1, metric='cosine', random_state=42)

In [77]:
# Cluster reduced embeddings
hdbscan_model = HDBSCAN(min_cluster_size=5, metric='euclidean', cluster_selection_method='eom', prediction_data=True, min_samples=1)

In [78]:
# Tokenize
vectorizer_model = CountVectorizer(ngram_range=(1, 2), stop_words="english")

In [79]:
#Create topic representation
ctfidf_model = ClassTfidfTransformer(reduce_frequent_words=True)

In [80]:
# Set λ to 0.5 for the optimal mix of diversity and accuracy in the result set
representation_model = MaximalMarginalRelevance(diversity=0.5, top_n_words=10)

In [81]:
# Run the model
topic_model = BERTopic(
  embedding_model=embedding_model,    # Step 1 - Extract embeddings
  umap_model=umap_model,              # Step 2 - Reduce dimensionality
  hdbscan_model=hdbscan_model,        # Step 3 - Cluster reduced embeddings
  vectorizer_model=vectorizer_model,  # Step 4 - Tokenize topics
  ctfidf_model=ctfidf_model,          # Step 5 - Extract topic words
  representation_model=representation_model, # Step 6 - Diversify topic words         
  nr_topics=None,
  min_topic_size=1, 
  verbose=True,
  top_n_words=10                        
)

In [82]:
# Initialize BERT model
topics, probabilities = topic_model.fit_transform(df['description'], embeddings)

2024-09-04 19:42:48,246 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2024-09-04 19:42:48,941 - BERTopic - Dimensionality - Completed ✓
2024-09-04 19:42:48,941 - BERTopic - Cluster - Start clustering the reduced embeddings
2024-09-04 19:42:48,946 - BERTopic - Cluster - Completed ✓
2024-09-04 19:42:48,948 - BERTopic - Representation - Extracting topics from clusters using representation models.
2024-09-04 19:42:49,794 - BERTopic - Representation - Completed ✓


In [83]:
# Reduce outliers and update the topics if such are identified
# Warning refers to a switch to weighted c-TF-IDF embeddings instead of centroid embeddings, i.e. manual update. No reduction is used afterwards.
if -1 not in topics:
    print (topic_model.get_topic_info())
else: 
    topics = topic_model.reduce_outliers(df['description'], topics)
    topic_model.update_topics(df['description'], topics=topics, vectorizer_model=vectorizer_model, top_n_words=30)
    print ("Outliers were identified and topics were updated.")

100%|██████████| 1/1 [00:00<00:00, 290.69it/s]


Outliers were identified and topics were updated.


In [84]:
# Print the topics
freq = topic_model.get_topic_info()
print("Number of topics: {}".format( len(freq)))
freq.head(20)

Number of topics: 8


Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,0,20,0_prices_premium_spotify_prices premium,"[prices, premium, spotify, prices premium, tim...",[Wall Street praises Spotify price hikes — and...
1,1,14,1_spotify_video_podcasts_new,"[spotify, video, podcasts, new, streaming, bas...",[Partnering With Industry To Create a Safer On...
2,2,15,2_spotify_2024_artists_cannes,"[spotify, 2024, artists, cannes, cannes lions,...",[Billie Eilish and Spotify Bring the Mysteriou...
3,3,14,3_spotify_price_spotify spotify_price hikes,"[spotify, price, spotify spotify, price hikes,...",[Spotify CEO expects profit growth to continue...
4,4,7,4_music_streaming service_review_service,"[music, streaming service, review, service, sp...",[Spotify passes UMG as the world’s most valuab...
5,5,9,5_spotify ceo_ceo_spotify_spotify conspiracy,"[spotify ceo, ceo, spotify, spotify conspiracy...",[deadmau5 Responds to Controversial Remarks by...
6,6,7,6_basic plan_plan_audiobooks_basic,"[basic plan, plan, audiobooks, basic, 10 99, 1...",[Spotify Rolls Out Basic Plan In The US Starti...
7,7,7,7_file_music publishers_complaint_publishers file,"[file, music publishers, complaint, publishers...",[Democrat and Republican lawmakers unite to qu...


In [85]:
# Print the keywords
a_topic = freq.iloc[0]["Topic"]
topic_model.get_topic(a_topic)

[('prices', 0.10379286485060735),
 ('premium', 0.08303624126097867),
 ('spotify', 0.07069328212552531),
 ('prices premium', 0.052420907535798776),
 ('time', 0.052420907535798776),
 ('second time', 0.052420907535798776),
 ('year', 0.049391688948622704),
 ('second', 0.049391688948622704),
 ('time year', 0.042261859942521435),
 ('subscribers spotify', 0.042261859942521435),
 ('subscribers', 0.042261859942521435),
 ('spotify raises', 0.042261859942521435),
 ('spotify increases', 0.042261859942521435),
 ('increases', 0.042261859942521435),
 ('raises prices', 0.042261859942521435),
 ('raises', 0.042261859942521435),
 ('prices second', 0.042261859942521435),
 ('plans', 0.042261859942521435),
 ('hikes', 0.0351982142507297),
 ('subscription', 0.0351982142507297),
 ('price', 0.03364678562580001),
 ('spotify hikes', 0.030959242205507432),
 ('priced', 0.030959242205507432),
 ('premium plans', 0.030959242205507432),
 ('year spotify', 0.030959242205507432),
 ('audio', 0.02817457329501429),
 ('hifi',

In [86]:
#Visualise the topics and their keywords
topic_model.visualize_barchart(n_words=5)

In [87]:
# Visualise clusters of topics
topic_model.visualize_topics()

In [88]:
# Visualise the topic hierarchy
topic_model.visualize_hierarchy(top_n_topics=9)

In [89]:
# Create a new DataFrame with a topic column
df_BERTopics = pd.DataFrame({"description": df['description'], "topic": topics})
df_BERTopics.head(5)

Unnamed: 0,description,topic
0,Why I Finally Quit Spotify,3
1,Adjusting Spotify Premium Prices in the US,0
2,Why I quit Spotify,3
3,"Spotify Review: Best Music Streaming Service, ...",4
4,Spotify Has One Big Advantage on Every Other S...,4


In [90]:
# Merge the two DataFrames on the description column
df_BERTopics = pd.merge(df, df_BERTopics, on='description', how='inner')
df_BERTopics.tail(5)

Unnamed: 0,description,published date,media,url,topic
88,Spotify increases subscriber prices for second...,03-06-2024,Fortune,https://news.google.com/rss/articles/CBMibEFVX...,0
89,"Spotify launches Spanish language AI DJ ""Livi""...",17-07-2024,Axios,https://news.google.com/rss/articles/CBMid0FVX...,6
90,Spotify to increase premium pricing in the US ...,03-06-2024,TechCrunch,https://news.google.com/rss/articles/CBMioAFBV...,0
91,Spotify raises prices on subscriptions days af...,03-06-2024,Miami Herald,https://news.google.com/rss/articles/CBMiggFBV...,0
92,Spotify CEO expects profit growth to continue ...,23-07-2024,Axios,https://news.google.com/rss/articles/CBMie0FVX...,3


In [91]:
# Count the distribution of topics in articles
df_BERTopics.topic.value_counts()

topic
0    20
2    15
3    14
1    14
5     9
4     7
7     7
6     7
Name: count, dtype: int64