In [101]:
# Import libraries
# GoogleNews Documentation: https://pypi.org/project/gnews/
from gnews import GNews
import pandas as pd
import pyLDAvis
import pyLDAvis.gensim_models
pyLDAvis.enable_notebook()
#BERTopic documentation: https://maartengr.github.io/BERTopic/index.html
from bertopic import BERTopic
from sentence_transformers import SentenceTransformer # pip install -U sentence-transformers
from umap import UMAP
from hdbscan import HDBSCAN
from sklearn.feature_extraction.text import CountVectorizer
from bertopic.vectorizers import ClassTfidfTransformer
from bertopic.representation import MaximalMarginalRelevance
from sklearn.metrics.pairwise import cosine_similarity

import warnings
warnings.filterwarnings("ignore")


In [102]:
# Initializing
googlenews = GNews()

In [103]:
# Settings
googlenews = GNews(language='en',start_date=(2024, 6, 1), end_date=(2024, 8, 1))

In [104]:
# Search 
searchednews = googlenews.get_news('Spotify')
print(f"Articles found:", len(searchednews))

Articles found: 92


In [105]:
# Results
print(searchednews[1])

{'title': 'Adjusting Spotify Premium Prices in the US - spotify.com', 'description': 'Adjusting Spotify Premium Prices in the US  spotify.com', 'published date': 'Mon, 03 Jun 2024 07:00:00 GMT', 'url': 'https://news.google.com/rss/articles/CBMiiwFBVV95cUxPd1ZoTmxLSkZaYlVVRDQxT0lfNERNRGVmM1BUZjNWd3Z6VFcwMXYwUmJ6VU1zMmlHVmJhd0VMUXpzOHdNX28xSFc3QmY1RFlZellwdjc3VWoxTElOb2k2RzlEU2VybW4wdGRZeGxjeDFmMktWRzJXQ2M1ZHlXSlktQWNpdHpnQVlsenM4?oc=5&hl=en-US&gl=US&ceid=US:en', 'publisher': {'href': 'https://newsroom.spotify.com', 'title': 'spotify.com'}}


In [106]:
# Convert to DataFrame
df = pd.DataFrame(searchednews)
df.tail(5)

Unnamed: 0,title,description,published date,url,publisher
87,Who Owns Spotify Now? Largest Shareholders - T...,Who Owns Spotify Now? Largest Shareholders Th...,"Wed, 03 Jul 2024 07:00:00 GMT",https://news.google.com/rss/articles/CBMihgFBV...,"{'href': 'https://www.fool.com', 'title': 'The..."
88,Spotify Under Fire: NMPA Exposes Alleged Decep...,Spotify Under Fire: NMPA Exposes Alleged Decep...,"Thu, 13 Jun 2024 07:00:00 GMT",https://news.google.com/rss/articles/CBMi4wFBV...,"{'href': 'https://subscriptioninsider.com', 't..."
89,Spotify co-founder Martin Lorentzon sells $81m...,Spotify co-founder Martin Lorentzon sells $81m...,"Mon, 10 Jun 2024 07:00:00 GMT",https://news.google.com/rss/articles/CBMipwFBV...,{'href': 'https://www.musicbusinessworldwide.c...
90,Spotify takes the music industry market cap cr...,Spotify takes the music industry market cap cr...,"Wed, 10 Jul 2024 07:00:00 GMT",https://news.google.com/rss/articles/CBMi3gFBV...,"{'href': 'https://omdia.tech.informa.com', 'ti..."
91,Spotify revamps self-serve ad platform on heel...,Spotify revamps self-serve ad platform on heel...,"Thu, 25 Jul 2024 07:00:00 GMT",https://news.google.com/rss/articles/CBMimgFBV...,"{'href': 'https://www.marketingdive.com', 'tit..."


In [107]:
# Check one article
df.iloc[43]['description']

'Spotify Launches Basic Plan in U.S. That Excludes Audiobooks for $10.99 per Month — a Discount of One Whole Dollar  Variety'

In [108]:
#Delete a source from the description and check one article
def split_text(text):
    return text.split('  ')[0]
df['description'] = df['description'].apply(split_text)
df.iloc[43]['description']

'Spotify Launches Basic Plan in U.S. That Excludes Audiobooks for $10.99 per Month — a Discount of One Whole Dollar'

In [109]:
# Change date format
df['published date'] = pd.to_datetime(df['published date'], format='%a, %d %b %Y %H:%M:%S GMT')
df['published date'] = df['published date'].dt.strftime('%d-%m-%Y')
df.head(5)

Unnamed: 0,title,description,published date,url,publisher
0,Why I Finally Quit Spotify - The New Yorker,Why I Finally Quit Spotify,31-07-2024,https://news.google.com/rss/articles/CBMiggFBV...,"{'href': 'https://www.newyorker.com', 'title':..."
1,Adjusting Spotify Premium Prices in the US - s...,Adjusting Spotify Premium Prices in the US,03-06-2024,https://news.google.com/rss/articles/CBMiiwFBV...,"{'href': 'https://newsroom.spotify.com', 'titl..."
2,Why I quit Spotify - Vox.com,Why I quit Spotify,11-07-2024,https://news.google.com/rss/articles/CBMif0FVX...,"{'href': 'https://www.vox.com', 'title': 'Vox...."
3,Spotify Has One Big Advantage on Every Other S...,Spotify Has One Big Advantage on Every Other S...,02-06-2024,https://news.google.com/rss/articles/CBMitwFBV...,"{'href': 'https://www.bloomberg.com', 'title':..."
4,Spotify launches new basic streaming plan in U...,Spotify launches new basic streaming plan in US,21-06-2024,https://news.google.com/rss/articles/CBMilgFBV...,"{'href': 'https://www.reuters.com', 'title': '..."


In [110]:
# Break publisher column
df = pd.concat([df.drop(['publisher'], axis=1), df['publisher'].apply(pd.Series)], axis=1)
df.head(5)

Unnamed: 0,title,description,published date,url,href,title.1
0,Why I Finally Quit Spotify - The New Yorker,Why I Finally Quit Spotify,31-07-2024,https://news.google.com/rss/articles/CBMiggFBV...,https://www.newyorker.com,The New Yorker
1,Adjusting Spotify Premium Prices in the US - s...,Adjusting Spotify Premium Prices in the US,03-06-2024,https://news.google.com/rss/articles/CBMiiwFBV...,https://newsroom.spotify.com,spotify.com
2,Why I quit Spotify - Vox.com,Why I quit Spotify,11-07-2024,https://news.google.com/rss/articles/CBMif0FVX...,https://www.vox.com,Vox.com
3,Spotify Has One Big Advantage on Every Other S...,Spotify Has One Big Advantage on Every Other S...,02-06-2024,https://news.google.com/rss/articles/CBMitwFBV...,https://www.bloomberg.com,Bloomberg
4,Spotify launches new basic streaming plan in U...,Spotify launches new basic streaming plan in US,21-06-2024,https://news.google.com/rss/articles/CBMilgFBV...,https://www.reuters.com,Reuters


In [111]:
# Clean DataFrame
df['media'] = df['title'].iloc[:,-1]
df = df[['description','published date','media','url']]
df.head(10)

Unnamed: 0,description,published date,media,url
0,Why I Finally Quit Spotify,31-07-2024,The New Yorker,https://news.google.com/rss/articles/CBMiggFBV...
1,Adjusting Spotify Premium Prices in the US,03-06-2024,spotify.com,https://news.google.com/rss/articles/CBMiiwFBV...
2,Why I quit Spotify,11-07-2024,Vox.com,https://news.google.com/rss/articles/CBMif0FVX...
3,Spotify Has One Big Advantage on Every Other S...,02-06-2024,Bloomberg,https://news.google.com/rss/articles/CBMitwFBV...
4,Spotify launches new basic streaming plan in US,21-06-2024,Reuters,https://news.google.com/rss/articles/CBMilgFBV...
5,Spotify quietly lets all podcasters upload vid...,20-06-2024,TechCrunch,https://news.google.com/rss/articles/CBMihgFBV...
6,Sabrina Carpenter and Spotify conspiracy theories,02-07-2024,The Week,https://news.google.com/rss/articles/CBMimgFBV...
7,Spotify passes UMG as the world’s most valuabl...,15-07-2024,hypebot.com,https://news.google.com/rss/articles/CBMiqAFBV...
8,Spotify Just Did Something Entirely Unexpected...,23-06-2024,Inc.,https://news.google.com/rss/articles/CBMivgFBV...
9,Spotify’s HiFi add-on could cost an extra $5 p...,11-06-2024,The Verge,https://news.google.com/rss/articles/CBMioAFBV...


## Topic modelling

In [112]:
# Extract embeddings using a model all-MiniLM-L6-v2
embedding_model = SentenceTransformer("all-MiniLM-L6-v2")
# Pre-compute embeddings
embeddings = embedding_model.encode(df['description'], show_progress_bar=False)

09/02/2024 11:32:50 AM - Use pytorch device_name: mps
09/02/2024 11:32:50 AM - Load pretrained SentenceTransformer: all-MiniLM-L6-v2


In [113]:
# Reduce dimensionality focusing more on a local structure of short texts
umap_model = UMAP(n_neighbors=15, n_components=15, min_dist=0.1, metric='cosine', random_state=42)

In [114]:
# Cluster reduced embeddings
hdbscan_model = HDBSCAN(min_cluster_size=5, metric='euclidean', cluster_selection_method='eom', prediction_data=True, min_samples=1)

In [115]:
# Tokenize
vectorizer_model = CountVectorizer(ngram_range=(1, 2), stop_words="english")

In [116]:
#Create topic representation
ctfidf_model = ClassTfidfTransformer(reduce_frequent_words=True)

In [117]:
# Set λ to 0.5 for the optimal mix of diversity and accuracy in the result set
representation_model = MaximalMarginalRelevance(diversity=0.5, top_n_words=10)

In [118]:
# Run the model
topic_model = BERTopic(
  embedding_model=embedding_model,    # Step 1 - Extract embeddings
  umap_model=umap_model,              # Step 2 - Reduce dimensionality
  hdbscan_model=hdbscan_model,        # Step 3 - Cluster reduced embeddings
  vectorizer_model=vectorizer_model,  # Step 4 - Tokenize topics
  ctfidf_model=ctfidf_model,          # Step 5 - Extract topic words
  representation_model=representation_model, # Step 6 - Diversify topic words         
  nr_topics=None,
  min_topic_size=1, 
  verbose=True,
  top_n_words=10                        
)

In [119]:
# Initialize BERT model
topics, probabilities = topic_model.fit_transform(df['description'], embeddings)

2024-09-02 11:32:53,233 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2024-09-02 11:32:53,853 - BERTopic - Dimensionality - Completed ✓
2024-09-02 11:32:53,853 - BERTopic - Cluster - Start clustering the reduced embeddings
2024-09-02 11:32:53,858 - BERTopic - Cluster - Completed ✓
2024-09-02 11:32:53,859 - BERTopic - Representation - Extracting topics from clusters using representation models.
2024-09-02 11:32:54,471 - BERTopic - Representation - Completed ✓


In [120]:
#if has_outliers(topics):
    # Reduce outliers using the `embeddings` strategy
def process_topics_with_outliers(topics, df, topic_model, vectorizer_model, threshold=5):
    topic_counts = Counter(topics)
    
    # Identify topics with fewer than `threshold` documents
    outlier_topics = [topic for topic, count in topic_counts.items() if count < threshold]
    
    # If outliers are detected, reduce them and update topics
    if outlier_topics: 
        new_topics = topic_model.reduce_outliers(df['description'], topics)
    # Update topic representation
        topic_model.update_topics(df['description'], topics=new_topics, vectorizer_model=vectorizer_model, top_n_words=30)
        return new_topics
    else:
        # If no outliers, return the original topics
        return topics

In [121]:
# Print the topics
freq = topic_model.get_topic_info()
print("Number of topics: {}".format( len(freq)))
freq.head(20)

Number of topics: 10


Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,-1,14,-1_audiobooks_basic plan_10 99_cost,"[audiobooks, basic plan, 10 99, cost, app, red...",[Spotify Launches Basic Plan in U.S. That Excl...
1,0,22,0_hikes_price hikes_prices second_raising,"[hikes, price hikes, prices second, raising, t...",[Wall Street praises Spotify price hikes — and...
2,1,10,1_music publishers_practices_complaint spotify...,"[music publishers, practices, complaint spotif...",[Music Publishers File FTC Complaint Against S...
3,2,9,2_sabrina carpenter_playlists spotify_music ta...,"[sabrina carpenter, playlists spotify, music t...",[7 Things to Know About Syncing Spotify to Gal...
4,3,8,3_video content_comments podcasts_adds comment...,"[video content, comments podcasts, adds commen...",[Nebula Strikes Deal With Spotify to Stream Vi...
5,4,7,4_quit spotify_drives waste_hell happened_mout...,"[quit spotify, drives waste, hell happened, mo...",[Essay: Spotify’s CEO just made me throw up in...
6,5,7,5_ai dj_infused youtube_voiceover ads_songs su...,"[ai dj, infused youtube, voiceover ads, songs ...",[Spotify Brings AI DJ to Spanish-Speaking Musi...
7,6,5,6_new basic_streaming plan_launches new_alerts...,"[new basic, streaming plan, launches new, aler...","[Spotify tests emergency alerts in Sweden, Spo..."
8,7,5,7_2024_earnings spotify_spotify reports_second...,"[2024, earnings spotify, spotify reports, seco...",[Spotify Technology S.A. to Announce Financial...
9,8,5,8_audio coming_meaning golf_superpremium_tier ...,"[audio coming, meaning golf, superpremium, tie...",[Spotify CEO confirms a ‘deluxe’ version with ...


In [122]:
# Print the keywords
a_topic = freq.iloc[0]["Topic"]
topic_model.get_topic(a_topic)

[('audiobooks', 0.43501484281285646),
 ('basic plan', 0.39193694479527663),
 ('10 99', 0.39193694479527663),
 ('cost', 0.39193694479527663),
 ('app', 0.3293005310583562),
 ('reduces', 0.3218895872331584),
 ('republican lawmakers', 0.3218895872331584),
 ('question spotify', 0.3218895872331584),
 ('platform heels', 0.3218895872331584),
 ('record q2', 0.3218895872331584)]

In [123]:
#Visualise the topics and their keywords
topic_model.visualize_barchart(n_words=5)

In [124]:
# Visualise clusters of topics
topic_model.visualize_topics()

In [125]:
# Visualise the topic hierarchy
topic_model.visualize_hierarchy(top_n_topics=9)

In [126]:
# Create a new DataFrame with a topic column
df_BERTopics = pd.DataFrame({"description": df['description'], "topic": new_topics})
df_BERTopics.head(5)

ValueError: array length 91 does not match index length 92

In [None]:
# Merge the two DataFrames on the description column
df_BERTopics = pd.merge(df, df_BERTopics, on='description', how='inner')
df_BERTopics.tail(5)

Unnamed: 0,description,published date,media,url,topic
87,Who Owns Spotify Now? Largest Shareholders,03-07-2024,The Motley Fool,https://news.google.com/rss/articles/CBMihgFBV...,0
88,Spotify Under Fire: NMPA Exposes Alleged Decep...,13-06-2024,Subscription Insider,https://news.google.com/rss/articles/CBMi4wFBV...,1
89,Spotify co-founder Martin Lorentzon sells $81m...,10-06-2024,Music Business Worldwide,https://news.google.com/rss/articles/CBMipwFBV...,0
90,Spotify takes the music industry market cap cr...,10-07-2024,Informa PLC,https://news.google.com/rss/articles/CBMi3gFBV...,0
91,Spotify revamps self-serve ad platform on heel...,25-07-2024,Marketing Dive,https://news.google.com/rss/articles/CBMimgFBV...,7


In [None]:
# Count the distribution of topics in articles
df_BERTopics.topic.value_counts()

topic
0    23
6    11
1    11
3    10
2    10
4     7
8     7
5     7
7     6
Name: count, dtype: int64