In [502]:
# Import libraries
# GoogleNews Documentation: https://pypi.org/project/gnews/
from gnews import GNews
import pandas as pd
import pyLDAvis
import pyLDAvis.gensim_models
pyLDAvis.enable_notebook()
#BERTopic documentation: https://maartengr.github.io/BERTopic/index.html
from bertopic import BERTopic
from sentence_transformers import SentenceTransformer # pip install -U sentence-transformers
from umap import UMAP
from hdbscan import HDBSCAN
from sklearn.feature_extraction.text import CountVectorizer
from bertopic.vectorizers import ClassTfidfTransformer
from bertopic.representation import MaximalMarginalRelevance
from sklearn.metrics.pairwise import cosine_similarity

import warnings
warnings.filterwarnings("ignore")


In [503]:
# Initializing: create a GNews client with the library's default settings.
# NOTE(review): this instance is replaced by the configured client built in the
# next cell, so this cell has no lasting effect on the analysis.
googlenews = GNews()

In [504]:
# Settings: English-language client limited to a fixed date window,
# with both bounds given as (year, month, day) tuples.
googlenews = GNews(
    language='en',
    start_date=(2024, 6, 1),
    end_date=(2024, 8, 1),
)

In [505]:
# Search: query Google News for the keyword and report how many articles came back.
searchednews = googlenews.get_news('Spotify')
# Single real f-string (the original used an f-prefix with no placeholder plus a
# second positional argument); the printed text is unchanged.
print(f"Articles found: {len(searchednews)}")

Articles found: 97


In [506]:
# Results: print one raw record (index 1, i.e. the second article returned)
# to inspect which keys each article dict carries.
print(searchednews[1])

{'title': 'Why I quit Spotify - Vox.com', 'description': 'Why I quit Spotify  Vox.com', 'published date': 'Thu, 11 Jul 2024 07:00:00 GMT', 'url': 'https://news.google.com/rss/articles/CBMif0FVX3lxTE8wSjQxdWJLbzhOZzc2R3EtRjF4NWZlVWpJSDRodkJQNVpNejBUT3laSFBoTm9LeDFlSFdhTU1FTEx0MjZwVm9OeUhtN2J5emZSaUo1ZXhCRjV3MXozZG1zdTI2clJFMkxiN2NCWkdweWg2cENjeWxGSnhfbTEtSGM?oc=5&hl=en-US&gl=US&ceid=US:en', 'publisher': {'href': 'https://www.vox.com', 'title': 'Vox.com'}}


In [507]:
# Convert the list of article dicts to a DataFrame (one row per article)
# and preview the last five rows.
df = pd.DataFrame(data=searchednews)
df.tail(n=5)

Unnamed: 0,title,description,published date,url,publisher
92,"Spotify posts record gross margin, profit and ...","Spotify posts record gross margin, profit and ...","Tue, 23 Jul 2024 07:00:00 GMT",https://news.google.com/rss/articles/CBMickFVX...,"{'href': 'https://www.axios.com', 'title': 'Ax..."
93,Spotify is raising its prices again starting n...,Spotify is raising its prices again starting n...,"Mon, 03 Jun 2024 07:00:00 GMT",https://news.google.com/rss/articles/CBMikgFBV...,"{'href': 'https://www.weau.com', 'title': 'WEAU'}"
94,Spotify lyrics are back for free users - Engadget,Spotify lyrics are back for free users Engadget,"Tue, 30 Jul 2024 07:00:00 GMT",https://news.google.com/rss/articles/CBMihAFBV...,"{'href': 'https://www.engadget.com', 'title': ..."
95,Spotify is going to let you leave comments on ...,Spotify is going to let you leave comments on ...,"Tue, 09 Jul 2024 07:00:00 GMT",https://news.google.com/rss/articles/CBMidkFVX...,"{'href': 'https://www.theverge.com', 'title': ..."
96,Music Publishers File FTC Complaint Against Sp...,Music Publishers File FTC Complaint Against Sp...,"Wed, 12 Jun 2024 07:00:00 GMT",https://news.google.com/rss/articles/CBMirgFBV...,"{'href': 'https://www.rollingstone.com', 'titl..."


In [508]:
# Check one article: show the raw description stored at position 43.
df['description'].iloc[43]

'Spotify CEO confirms a ‘deluxe’ version with hi-fi audio is coming soon  The Verge'

In [509]:
#Delete a source from the description and check one article
def split_text(text):
    """Return the part of ``text`` that precedes its first double space.

    If ``text`` contains no double space, it is returned unchanged.
    """
    head, _sep, _tail = text.partition('  ')
    return head
# Strip the trailing source from every description, then re-check the same
# article that was inspected before the clean-up.
df['description'] = df['description'].map(split_text)
df['description'].iloc[43]

'Spotify CEO confirms a ‘deluxe’ version with hi-fi audio is coming soon'

In [510]:
# Change date format: parse timestamps such as 'Thu, 11 Jul 2024 07:00:00 GMT'
# and keep only the calendar day as a 'DD-MM-YYYY' string.
# NOTE: the column ends up holding strings, so this cell cannot be re-run on
# its own output (the strings no longer match the parse format).
parsed_dates = pd.to_datetime(df['published date'], format='%a, %d %b %Y %H:%M:%S GMT')
df['published date'] = parsed_dates.dt.strftime('%d-%m-%Y')
df.head()

Unnamed: 0,title,description,published date,url,publisher
0,Why I Finally Quit Spotify - The New Yorker,Why I Finally Quit Spotify,31-07-2024,https://news.google.com/rss/articles/CBMiggFBV...,"{'href': 'https://www.newyorker.com', 'title':..."
1,Why I quit Spotify - Vox.com,Why I quit Spotify,11-07-2024,https://news.google.com/rss/articles/CBMif0FVX...,"{'href': 'https://www.vox.com', 'title': 'Vox...."
2,Adjusting Spotify Premium Prices in the US - s...,Adjusting Spotify Premium Prices in the US,03-06-2024,https://news.google.com/rss/articles/CBMiiwFBV...,"{'href': 'https://newsroom.spotify.com', 'titl..."
3,Spotify Increases Prices for Second Time in a ...,Spotify Increases Prices for Second Time in a ...,03-06-2024,https://news.google.com/rss/articles/CBMioAFBV...,"{'href': 'https://www.cnet.com', 'title': 'CNET'}"
4,Spotify Has One Big Advantage on Every Other S...,Spotify Has One Big Advantage on Every Other S...,02-06-2024,https://news.google.com/rss/articles/CBMitwFBV...,"{'href': 'https://www.bloomberg.com', 'title':..."


In [511]:
# Break publisher column: expand each publisher dict into its own columns
# (the records carry the keys 'href' and 'title').
# The frame is built once from the list of dicts instead of calling
# `apply(pd.Series)`, which constructs a Series per row and is needlessly slow;
# the resulting columns and index are the same.
# NOTE: the expanded 'title' column shares its label with the article 'title'
# column, so the frame has two 'title' columns after this cell.
publisher_cols = pd.DataFrame(df['publisher'].tolist(), index=df.index)
df = pd.concat([df.drop(columns=['publisher']), publisher_cols], axis=1)
df.head(5)

Unnamed: 0,title,description,published date,url,href,title.1
0,Why I Finally Quit Spotify - The New Yorker,Why I Finally Quit Spotify,31-07-2024,https://news.google.com/rss/articles/CBMiggFBV...,https://www.newyorker.com,The New Yorker
1,Why I quit Spotify - Vox.com,Why I quit Spotify,11-07-2024,https://news.google.com/rss/articles/CBMif0FVX...,https://www.vox.com,Vox.com
2,Adjusting Spotify Premium Prices in the US - s...,Adjusting Spotify Premium Prices in the US,03-06-2024,https://news.google.com/rss/articles/CBMiiwFBV...,https://newsroom.spotify.com,spotify.com
3,Spotify Increases Prices for Second Time in a ...,Spotify Increases Prices for Second Time in a ...,03-06-2024,https://news.google.com/rss/articles/CBMioAFBV...,https://www.cnet.com,CNET
4,Spotify Has One Big Advantage on Every Other S...,Spotify Has One Big Advantage on Every Other S...,02-06-2024,https://news.google.com/rss/articles/CBMitwFBV...,https://www.bloomberg.com,Bloomberg


In [512]:
# Clean DataFrame: keep only the columns needed downstream.
# The frame has two columns labelled 'title' at this point, so selecting
# 'title' yields a two-column frame; its last column is the publisher name.
# NOTE: not re-runnable as-is, because 'title' is dropped by the selection below.
title_columns = df.loc[:, 'title']
df['media'] = title_columns.iloc[:, -1]
kept_columns = ['description', 'published date', 'media', 'url']
df = df[kept_columns]
df.head(10)

Unnamed: 0,description,published date,media,url
0,Why I Finally Quit Spotify,31-07-2024,The New Yorker,https://news.google.com/rss/articles/CBMiggFBV...
1,Why I quit Spotify,11-07-2024,Vox.com,https://news.google.com/rss/articles/CBMif0FVX...
2,Adjusting Spotify Premium Prices in the US,03-06-2024,spotify.com,https://news.google.com/rss/articles/CBMiiwFBV...
3,Spotify Increases Prices for Second Time in a ...,03-06-2024,CNET,https://news.google.com/rss/articles/CBMioAFBV...
4,Spotify Has One Big Advantage on Every Other S...,02-06-2024,Bloomberg,https://news.google.com/rss/articles/CBMitwFBV...
5,Spotify Royalty Drama Casts Shadow Over Songwr...,25-06-2024,Bloomberg Law,https://news.google.com/rss/articles/CBMingFBV...
6,Spotify raises US prices of its premium plans ...,03-06-2024,Reuters,https://news.google.com/rss/articles/CBMilAFBV...
7,Spotify Reportedly Planning Higher-Priced Prem...,11-06-2024,PYMNTS.com,https://news.google.com/rss/articles/CBMiswFBV...
8,You Can Use Spotify Entirely From the Command ...,10-07-2024,Lifehacker,https://news.google.com/rss/articles/CBMiZkFVX...
9,Spotify's Biggest Campaign Since 'Wrapped' Is ...,11-06-2024,Adweek,https://news.google.com/rss/articles/CBMiqwFBV...


## Topic modelling

In [513]:
# Extract embeddings using the all-MiniLM-L6-v2 sentence-transformer model
embedding_model = SentenceTransformer("all-MiniLM-L6-v2")
# Pre-compute embeddings once so they can be reused when fitting the topic model.
# The descriptions are passed as a plain list of strings: a pandas Series would
# be indexed by label inside `encode`, which only works with a default 0..n-1 index.
embeddings = embedding_model.encode(df['description'].tolist(), show_progress_bar=False)

08/08/2024 02:09:33 PM - Use pytorch device_name: mps
08/08/2024 02:09:33 PM - Load pretrained SentenceTransformer: all-MiniLM-L6-v2


In [514]:
# Reduce dimensionality focusing more on a local structure of short texts.
# random_state is fixed so repeated runs give the same projection.
umap_model = UMAP(
    n_neighbors=15,
    n_components=15,
    min_dist=0.1,
    metric='cosine',
    random_state=42,
)

In [515]:
# Cluster reduced embeddings with HDBSCAN (clusters of at least 5 documents)
hdbscan_model = HDBSCAN(
    min_cluster_size=5,
    min_samples=1,
    metric='euclidean',
    cluster_selection_method='eom',
    prediction_data=True,
)

In [516]:
# Tokenize: count unigrams and bigrams, ignoring English stop words
vectorizer_model = CountVectorizer(
    stop_words="english",
    ngram_range=(1, 2),
)

In [517]:
# Create topic representation: class-based TF-IDF weighting of the tokens in each topic.
# reduce_frequent_words=True is, per its name, meant to down-weight very
# frequent words — see the BERTopic c-TF-IDF documentation for the exact formula.
ctfidf_model = ClassTfidfTransformer(reduce_frequent_words=True)

In [518]:
# Maximal Marginal Relevance re-ranks topic keywords to reduce redundancy.
# The diversity parameter (λ) is set to 0.5 as the author's chosen mix of
# diversity and accuracy in the result set; top_n_words=10 keywords per topic.
representation_model = MaximalMarginalRelevance(diversity=0.5, top_n_words=10)

In [519]:
# Configure the BERTopic pipeline; nothing is fitted yet (fitting happens in
# the following cell).
# NOTE(review): BERTopic documents `min_topic_size` as unused when a custom
# `hdbscan_model` is supplied — confirm it has any effect here.
topic_model = BERTopic(
    # Pipeline components
    embedding_model=embedding_model,            # Step 1 - Extract embeddings
    umap_model=umap_model,                      # Step 2 - Reduce dimensionality
    hdbscan_model=hdbscan_model,                # Step 3 - Cluster reduced embeddings
    vectorizer_model=vectorizer_model,          # Step 4 - Tokenize topics
    ctfidf_model=ctfidf_model,                  # Step 5 - Extract topic words
    representation_model=representation_model,  # Step 6 - Diversify topic words
    # General settings
    nr_topics=None,
    min_topic_size=1,
    top_n_words=10,
    verbose=True,
)

In [520]:
# Fit the topic model on the descriptions, reusing the pre-computed embeddings,
# and get one topic label (plus probabilities) per document.
topics, probabilities = topic_model.fit_transform(
    df['description'],
    embeddings=embeddings,
)

2024-08-08 14:09:35,707 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2024-08-08 14:09:36,665 - BERTopic - Dimensionality - Completed ✓
2024-08-08 14:09:36,666 - BERTopic - Cluster - Start clustering the reduced embeddings
2024-08-08 14:09:36,671 - BERTopic - Cluster - Completed ✓
2024-08-08 14:09:36,672 - BERTopic - Representation - Extracting topics from clusters using representation models.
2024-08-08 14:09:37,413 - BERTopic - Representation - Completed ✓


In [521]:
# Reduce outliers: reassign outlier documents to one of the discovered topics.
# The original call passed no `strategy`, so BERTopic's default "distributions"
# strategy is what runs here — not the "embeddings" strategy the previous comment
# claimed. The default is now spelled out; results are unchanged.
# To switch to the embeddings strategy, pass
# strategy="embeddings", embeddings=embeddings instead.
new_topics = topic_model.reduce_outliers(
    df['description'],
    topics,
    strategy="distributions",
)

100%|██████████| 1/1 [00:00<00:00, 162.73it/s]


In [522]:
# Update topic representation so the keywords reflect the reassigned topics,
# now keeping up to 30 words per topic.
# NOTE(review): ctfidf_model and representation_model are not passed here —
# confirm whether the models configured earlier still apply after this update.
topic_model.update_topics(
    df['description'],
    topics=new_topics,
    top_n_words=30,
    vectorizer_model=vectorizer_model,
)



In [523]:
# Print the topics: one row per topic with its size, name and keywords
freq = topic_model.get_topic_info()
print(f"Number of topics: {len(freq)}")
freq.head(20)

Number of topics: 9


Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,0,14,0_premium_subscription_prices premium_plans,"[premium, subscription, prices premium, plans,...",[Spotify raises prices of premium subscription...
1,1,19,1_prices_spotify_raising_spotify raising,"[prices, spotify, raising, spotify raising, se...",[Spotify Increases Prices for Second Time in a...
2,2,13,2_streaming_spotify_launches_spotify launches,"[streaming, spotify, launches, spotify launche...","[Spotify is no longer just a streaming app, it..."
3,3,11,3_celebrate_spotify_campaign wrapped_wrapped,"[celebrate, spotify, campaign wrapped, wrapped...",[7 Things to Know About Syncing Spotify to Gal...
4,4,10,4_video_content_video content_cannes,"[video, content, video content, cannes, spotif...",[Digiday Podcast at Cannes: What Spotify’s pus...
5,5,6,5_songwriters_quit_quit spotify_royalty,"[songwriters, quit, quit spotify, royalty, spo...","[Spotify, Songwriters Want You to Succeed. Why..."
6,6,7,6_music_music publishers_publishers_complaint,"[music, music publishers, publishers, complain...",[Music Publishers File FTC Complaint Against S...
7,7,7,7_price_price hikes_hikes_shares,"[price, price hikes, hikes, shares, spotify st...",[Spotify co-founder Martin Lorentzon sells $81...
8,8,10,8_hifi_bring_audio coming_spotify hifi,"[hifi, bring, audio coming, spotify hifi, comi...",[Spotify Just Did Something Entirely Unexpecte...


In [524]:
# Print the keywords: take the topic id from the first row of the topic table
# and list that topic's (word, weight) pairs.
a_topic = freq.iloc[0]["Topic"]
topic_model.get_topic(a_topic)

[('premium', 0.12282074130344385),
 ('subscription', 0.08258184145139215),
 ('prices premium', 0.08082056918893604),
 ('plans', 0.07603133911194335),
 ('spotify', 0.06880400313575359),
 ('prices', 0.0664071156700092),
 ('premium plans', 0.0652763587913622),
 ('11', 0.04792586453958552),
 ('subscribers', 0.04792586453958552),
 ('spotify raises', 0.04792586453958552),
 ('premium subscription', 0.04792586453958552),
 ('margin', 0.04792586453958552),
 ('subscribers spotify', 0.04792586453958552),
 ('priced', 0.04792586453958552),
 ('spotify adds', 0.04792586453958552),
 ('adds', 0.04792586453958552),
 ('raises prices', 0.04792586453958552),
 ('raises', 0.04792586453958552),
 ('spotify premium', 0.0435175725275748),
 ('new', 0.03801566955597167),
 ('plan', 0.03443850700521164),
 ('vs', 0.02775624597152639),
 ('premium pricing', 0.02775624597152639),
 ('pricing 11', 0.02775624597152639),
 ('pricing', 0.02775624597152639),
 ('profit cash', 0.02775624597152639),
 ('plans spotify', 0.0277562459

In [525]:
# Visualise the topics and their keywords: bar chart of the top 5 words per topic
topic_model.visualize_barchart(n_words=5)

In [526]:
# Visualise clusters of topics (BERTopic's built-in topic map)
topic_model.visualize_topics()

In [527]:
# Visualise the topic hierarchy.
# top_n_topics=9 is hard-coded to match the 9 topics found in this run;
# adjust it if the number of topics changes.
topic_model.visualize_hierarchy(top_n_topics=9)

In [528]:
# Create a new DataFrame pairing each description with its final topic label
# (the labels obtained after outlier reduction).
df_BERTopics = df[['description']].assign(topic=new_topics)
df_BERTopics.head()

Unnamed: 0,description,topic
0,Why I Finally Quit Spotify,5
1,Why I quit Spotify,5
2,Adjusting Spotify Premium Prices in the US,0
3,Spotify Increases Prices for Second Time in a ...,1
4,Spotify Has One Big Advantage on Every Other S...,2


In [529]:
# Attach the topic labels to the article metadata by row index rather than by
# merging on 'description'. Headlines are not guaranteed to be unique, and a
# merge on a duplicated text key multiplies the matching rows (and can pair an
# article with another article's topic). Both frames share the same index, so
# an index-aligned join keeps exactly one row per article and is safe to re-run.
df_BERTopics = df.join(df_BERTopics['topic'])
df_BERTopics.tail(5)

Unnamed: 0,description,published date,media,url,topic
92,"Spotify posts record gross margin, profit and ...",23-07-2024,Axios,https://news.google.com/rss/articles/CBMickFVX...,0
93,Spotify is raising its prices again starting n...,03-06-2024,WEAU,https://news.google.com/rss/articles/CBMikgFBV...,1
94,Spotify lyrics are back for free users,30-07-2024,Engadget,https://news.google.com/rss/articles/CBMihAFBV...,1
95,Spotify is going to let you leave comments on ...,09-07-2024,The Verge,https://news.google.com/rss/articles/CBMidkFVX...,4
96,Music Publishers File FTC Complaint Against Sp...,12-06-2024,Rolling Stone,https://news.google.com/rss/articles/CBMirgFBV...,6


In [530]:
# Count the distribution of topics in articles (number of articles per topic)
df_BERTopics['topic'].value_counts()

topic
1    19
0    14
2    13
3    11
4    10
8    10
6     7
7     7
5     6
Name: count, dtype: int64