In [1]:
import pandas as pd
from bertopic import BERTopic
import re
from sentence_transformers import SentenceTransformer

In [2]:
df = pd.read_csv("df_sisal_processed.csv")

In [3]:
df.head()

Unnamed: 0,url,Dominio,published,published_date,published_time,title,content,text,sentiment,Brand,engagement,reach,Tipo_di_media,post_type,Genere,Paese,Citt√†
0,https://www.corrieredellosport.it/news/scommes...,http://corrieredellosport.it/,2025-10-09 20:30:00,2025-10-09,20:30:00,"Pronostico Germania-Lussemburgo, tanti gol a S...",Tutto esaurito alla PreZero Arena di Sinsheim ...,"Pronostico Germania-Lussemburgo, tanti gol a S...",0,"bet365,Snai",0,3863234,"ONLINENEWS,ONLINENEWS_OTHER",TEXT,UNKNOWN,Italy,Rome
1,https://ilcentrotirreno.it/sito/sport/228401-m...,http://ilcentrotirreno.it/,2025-08-14 13:34:00,2025-08-14,13:34:00,"MotoGP, in Austria Marc Marquez favorito",questo tab√π e a consolidare il suo primato vis...,"MotoGP, in Austria Marc Marquez favorito. ques...",0,Sisal,0,14992,"ONLINENEWS,ONLINENEWS_OTHER",TEXT,MALE,Italy,Catanzaro
2,https://www.infobetting.com/calcio/quote/svezi...,http://infobetting.com/,2025-08-11 12:36:55,2025-08-11,12:36:55,Hammarby - GAIS | quote scommesse 1X2 + under/...,Hammarby - GAIS quote scommesse 1X2 + UNDER/O...,Hammarby - GAIS | quote scommesse 1X2 + under/...,0,bet365,0,94439,"ONLINENEWS,ONLINENEWS_OTHER",TEXT,UNKNOWN,Italy,Rome
3,https://www.infobetting.com/calcio/quote/giapp...,http://infobetting.com/,2025-09-19 21:22:44,2025-09-19,21:22:44,Cerezo Osaka - Kashiwa Reysol,Cerezo Osaka - Kashiwa Reysol Cerezo Osaka - ...,Cerezo Osaka - Kashiwa Reysol. Cerezo Osaka - ...,0,"bet365,Snai",0,94439,"ONLINENEWS,ONLINENEWS_OTHER",TEXT,UNKNOWN,Italy,Rome
4,https://www.sportitalia.it/2025/08/12/supercop...,http://sportitalia.it/,2025-08-12 15:22:19,2025-08-12,15:22:19,"Supercoppa UEFA, PSG-Tottenham: Les Parisiens ...",Si fa sempre pi√π interessante il calcio d‚Äôagos...,"Supercoppa UEFA, PSG-Tottenham: Les Parisiens ...",0,Planetwin365,0,852123,"ONLINENEWS,ONLINENEWS_OTHER",TEXT,UNKNOWN,Italy,Rome


In [4]:
#merge title and content into a single text field
def safe_str(x):
    return "" if pd.isna(x) else str(x)

df["text"] = df.apply(
    lambda r: safe_str(r.get("title", "")) + ". " + safe_str(r.get("content", "")),
    axis=1
)

In [4]:
print(df["text"].iloc[0])

Pronostico Germania-Lussemburgo, tanti gol a Sinsheim? Le quote del match. Tutto esaurito alla PreZero Arena di Sinsheim per Germania-Lussemburgo . La sfida, valevole per la 3¬™ giornata del gruppo A delle qualificazioni Mondiali , vede i tedeschi favoriti per distacco contro una nazionale che ha zero punti dopo le prime due...


In [5]:
#select only the text column and timestamps for BERTopic
docs = df["text"].tolist()
timestamps = df["published"].tolist()

# Analisi Topic: ***modello base***

Remove stopwords to improve **default representation**
Use count vectorizer to remove stopwords + have 2 words together ***(ngram_range)***

In [6]:
from sklearn.feature_extraction.text import CountVectorizer

custom_stopwords = [
    "di", "a", "da", "in", "con", "su", "per", "tra", "fra",
    "il", "lo", "la", "i", "gli", "le",
    "un", "uno", "una",
    "che", "come", "dove", "quando",
    "e", "o", "ma", "anche", "https", "http"
]
vectorizer_model = CountVectorizer(stop_words=custom_stopwords, min_df=2, ngram_range=(1, 2))

We need to load env for openAI API to get topics name automatically with chatgpt

In [7]:
import os
from dotenv import load_dotenv

# This will read .env from the current working directory
load_dotenv()   

True

Now i'll use openAI to define the labels

In [8]:
import os
import openai
from bertopic.representation import OpenAI as OpenAIRep

# Create OpenAI client (better from environment variable)
client = openai.OpenAI(api_key=os.environ["OPENAI_API_KEY"])  # set this in your OS / .env

# Prompt suggested in BERTopic best practices
prompt = """
I have a topic that contains the following documents:
[DOCUMENTS]
The topic is described by the following keywords: [KEYWORDS]
Based on the information above, extract a short but highly descriptive topic label of at most 5 words. Make sure it is in the following format:
topic: <topic label>
"""

openai_model = OpenAIRep(
    client,
    model="gpt-4o-mini",       # or another model you prefer
    exponential_backoff=True,
    prompt=prompt,
)

# This dict name will be passed into BERTopic
representation_model = {
    "OpenAI": openai_model
}


In [9]:
# use an Italian-capable sentence-transformers model ("paraphrase-multilingual-MiniLM-L12-v2") for embeddings and cap topics to 50

topic_model = BERTopic(
    language="multilingual",
    nr_topics="auto", 
    min_topic_size=200, 
    vectorizer_model=vectorizer_model, # use custom vectorizer
    representation_model=representation_model, # use OpenAI for topic labeling
    verbose=True,
)
topics, probs = topic_model.fit_transform(docs)

2026-02-13 12:25:51,776 - BERTopic - Embedding - Transforming documents to embeddings.


tokenizer_config.json:   0%|          | 0.00/526 [00:00<?, ?B/s]

Batches:   0%|          | 0/3071 [00:00<?, ?it/s]

2026-02-13 13:51:54,104 - BERTopic - Embedding - Completed ‚úì
2026-02-13 13:51:54,105 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2026-02-13 13:53:43,231 - BERTopic - Dimensionality - Completed ‚úì
2026-02-13 13:53:43,235 - BERTopic - Cluster - Start clustering the reduced embeddings
2026-02-13 13:54:03,785 - BERTopic - Cluster - Completed ‚úì
2026-02-13 13:54:03,786 - BERTopic - Representation - Extracting topics using c-TF-IDF for topic reduction.
2026-02-13 13:54:13,652 - BERTopic - Representation - Completed ‚úì
2026-02-13 13:54:13,667 - BERTopic - Topic reduction - Reducing number of topics
2026-02-13 13:54:13,714 - BERTopic - Representation - Fine-tuning topics using representation models.
100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 21/21 [00:18<00:00,  1.14it/s]
2026-02-13 13:54:42,554 - BERTopic - Representation - Completed ‚úì
2026-02-13 13:54:42,583 - BERTopic - Topic reduction - Reduced number of topics from 59 to 21


In [18]:
topic_model.save("sisal+competitors_bertopic_model-openai")


Changing the sparsity structure of a csr_matrix is expensive. lil_matrix is more efficient.



In [None]:
#load topic model already trained
topic_model = BERTopic.load("sisal+competitors_bertopic_model-openai")

In [56]:
topic_model.get_topic_info()
#topic_info = topic_model.get_topic_info()
#topic_info.to_csv("topic_info_openAI.csv", index=False)
#print("Saved topic_info_allbrand.csv")

Saved topic_info_allbrand.csv


In [20]:
n_topics = topic_model.get_topic_info().shape[0]
n_topics

21

In [43]:
# Use the "c-TF-IDF" strategy with a threshold
new_topics = topic_model.reduce_outliers(docs, topics , strategy="c-tf-idf", threshold=0.1)

In [60]:
topic_model.get_topic(2)

[('snai', 0.03763046142537823),
 ('sisal', 0.030651879496701995),
 ('co', 0.025508650130221123),
 ('pokerstars', 0.02269233585795326),
 ('calcio', 0.022345768127887005),
 ('goldbet', 0.022094464875771526),
 ('eurobet', 0.021635636238825605),
 ('seriea', 0.01992986462290225),
 ('non', 0.018814787466565647),
 ('scommesse', 0.01815706667334375)]

In [31]:
topic_model.get_representative_docs(3)

['Aree interne, dalla Regione 37 milioni per il ripopolamento e lo sviluppo economico. Schifani: ¬´Intervento dal forte impatto sociale¬ª. Un intervento da 37,2 milioni di euro per favorire la rinascita demografica dei borghi rurali delle aree interne della Sicilia. L‚Äôassessorato della Famiglia e delle politiche sociali ha pubblicato sul sito istituzionale della Regione  l‚Äôavviso per la...',
 'Aree interne, dalla Regione 37 milioni per il ripopolamento e lo sviluppo economico. Un intervento da 37,2 milioni di euro per favorire la rinascita demografica dei borghi rurali delle aree interne della Sicilia. L‚Äôassessorato della Famiglia e delle politiche sociali ha pubblicato sul sito istituzionale della Regione l‚Äôavviso per la ...',
 'Dalla Regione un milione e mezzo per le imprese delle aree interne; 200mila euro alla Valle Bormida. AREE INTERNE La Giunta regionale ha approvato un ulteriore stanziamento di 1,5 milioni di euro per il sostegno alle imprese delle aree interne. La misu

Create a dataframe for further analysis

In [50]:
df_topics = pd.DataFrame({"text": docs, "Topic": topics, "Timestamp": timestamps, "Probability": probs})
df_topics.head(20)

Unnamed: 0,text,Topic,Timestamp,Probability
0,"Pronostico Germania-Lussemburgo, tanti gol a S...",-1,2025-10-09 20:30:00,0.0
1,"MotoGP, in Austria Marc Marquez favorito. ques...",-1,2025-08-14 13:34:00,0.0
2,Hammarby - GAIS | quote scommesse 1X2 + under/...,0,2025-08-11 12:36:55,1.0
3,Cerezo Osaka - Kashiwa Reysol. Cerezo Osaka - ...,0,2025-09-19 21:22:44,1.0
4,"Supercoppa UEFA, PSG-Tottenham: Les Parisiens ...",-1,2025-08-12 15:22:19,0.0
5,". In Italia, il plinko √® diventato popolare so...",8,2025-08-19 17:53:04,1.0
6,Premio Free Spin In assenza di Deposito Codice...,-1,2025-09-20 19:13:43,0.0
7,". üê∫üêÇ La #Roma a caccia del tris vincente, il #...",0,2025-09-12 12:30:00,0.856362
8,Pronostico Muhdar | AL KHALEEJ | 6 ottobre 202...,9,2025-10-05 17:00:00,0.581185
9,Statistiche basket - USK Praha scontri diretti...,-1,2025-08-22 00:24:53,0.0


In [23]:
# Intertopic distance map (this is what you want)
fig_map = topic_model.visualize_topics()
fig_map.write_html("topics_map.html")
fig_map.show()

In [24]:
# count topics and show them in descending order (uses existing `topics` and `topic_model`)
counts = pd.Series(topics).value_counts().rename_axis('Topic').reset_index(name='Count')
topic_info = topic_model.get_topic_info()[['Topic', 'Name']]
counts = counts.merge(topic_info, on='Topic', how='left')
counts['Name'] = counts['Name'].fillna('Outliers')
counts = counts.sort_values('Count', ascending=False).reset_index(drop=True)
counts


Unnamed: 0,Topic,Count,Name
0,-1,37880,-1_slot_bonus_del_online
1,0,30640,0_quote_quote scommesse_scommesse_serie
2,1,14820,1_online_slot_migliori_2025
3,2,3756,2_snai_sisal_co_pokerstars
4,3,1945,3_aree_interne_aree interne_del
5,4,1507,4_premier_premier league_league_pronostico
6,5,1507,5_milano_borse_borsa_piazza affari
7,6,1012,6_lottomatica_bentornata pagina_lottomatica be...
8,7,729,7_mooney_xs2092610141_sisal pay_mooney ex
9,8,606,8_plinko_gioco plinko_gioco_plinko nei


In [25]:
counts.to_csv("topic_counts_allmarket.csv", index=False)
print("Saved topic counts.csv")

Saved topic counts.csv


In [28]:
#define custom labels using OpenAI
chatgpt_topic_labels = {topic: " | ".join(list(zip(*values))[0]) for topic, values in topic_model.topic_aspects_["OpenAI"].items()}
chatgpt_topic_labels[-1] = "Outlier Topic"

topic_model.set_topic_labels(chatgpt_topic_labels)
#change name that chatGPT got unclearly
topic_model.set_topic_labels({30:"Casin√≤ online",
                              19:"Scommesse",
                              7:"Discussione forum Mooney",
                             25: "Pronostici serie A", 
                             15: "Superenalotto generale", 
                             28:"Estrazioni lotteria",
                             2:"Formula 1",
                             29:"Tornei Pokerstars",
                             12:"Vittoria Sinner Master Cincinnati"})

'topic_model.set_topic_labels({30:"Casin√≤ online",\n                              19:"Scommesse",\n                              7:"Discussione forum Mooney",\n                             25: "Pronostici serie A", \n                             15: "Superenalotto generale", \n                             28:"Estrazioni lotteria",\n                             2:"Formula 1",\n                             29:"Tornei Pokerstars",\n                             12:"Vittoria Sinner Master Cincinnati"})'

In [57]:
# Order topics by descending count, excluding outliers topic (-1) so the 11th fills any gap
ordered = counts[counts['Topic'] != -1].reset_index(drop=True)
ordered_topics = ordered['Topic'].tolist()[:10]
ordered_labels = ordered['Name'].tolist()[:10]

fig = topic_model.visualize_barchart(width=320, title="Top 10 Topics", topics=ordered_topics, custom_labels=True)
fig

In [58]:
fig.write_html("top_10_topics.html")

In [59]:
# select documents assigned to topic 
n_topic = 2
mask = df_topics["Topic"] == n_topic
df_topicx = df_topics[mask].reset_index(drop=True)

print(f"Found {len(df_topicx)} documents for topic {n_topic}")
df_topicx.head(50)
#df_topicx.to_csv("topic_27_documents.csv", index=False)

Found 3756 documents for topic 2


Unnamed: 0,text,Topic,Timestamp,Probability,Labels
0,. OGGI TORNA LA BARI!ü§ç‚ù§Ô∏èüêî Questa sera torniamo...,2,2025-08-17 09:38:25,1.0,Betting and Gambling Content
1,". üëâ ...LA PI√ô BELLA DEL WEEKEND! VINTI: 2.563,...",2,2025-09-02 14:53:28,1.0,Betting and Gambling Content
2,. üî• Serata di grande calcio da Eurobet Cenacol...,2,2025-10-18 17:30:52,1.0,Betting and Gambling Content
3,Guadagnare Con Scommesse ‚Äì Ora Come Allora. Gu...,2,2025-10-18 21:37:23,1.0,Betting and Gambling Content
4,. Stefano mvp . . . . . . . supersantos Supers...,2,2025-09-22 12:05:21,1.0,Betting and Gambling Content
5,. Lascia un FOLLOW se ti piacciono questi cont...,2,2025-10-04 17:30:42,0.848067,Betting and Gambling Content
6,. @AndreaVenanzoni La SNAI non quota questa po...,2,2025-10-01 22:52:24,1.0,Betting and Gambling Content
7,". üí∞R$100,00 0,25un ü™ô - Link direto da aposta (...",2,2025-08-07 23:46:09,1.0,Betting and Gambling Content
8,. #snai per üí™ü•∑üöÄüá∏üáØüá®üá≤üá©üáØüá®üáøüá®üáæüá™üá≠üáÆüá®üá¨üá∑üá¨üá≤üá¨üá´üá´üá¥dds.,2,2025-09-14 12:45:23,1.0,Betting and Gambling Content
9,. I regali di bet365 https://t.co/QUqMSzuMHD,2,2025-09-05 22:54:27,1.0,Betting and Gambling Content


In [None]:
chatgpt_topic_labels = {topic: " | ".join(list(zip(*values))[0]) for topic, values in topic_model.topic_aspects_["OpenAI"].items()}
chatgpt_topic_labels[-1] = "Outlier Topic"
topic_model.set_topic_labels(chatgpt_topic_labels)

In [61]:
df_topics = pd.DataFrame({
    "text": docs,
    "Topic": topics,
    "Timestamp": timestamps,
    "Probability": probs,
    "Labels": [chatgpt_topic_labels.get(topic, "Unknown") for topic in topics]
})

df_topics.head(20)

Unnamed: 0,text,Topic,Timestamp,Probability,Labels
0,"Pronostico Germania-Lussemburgo, tanti gol a S...",-1,2025-10-09 20:30:00,0.0,Outlier Topic
1,"MotoGP, in Austria Marc Marquez favorito. ques...",-1,2025-08-14 13:34:00,0.0,Outlier Topic
2,Hammarby - GAIS | quote scommesse 1X2 + under/...,0,2025-08-11 12:36:55,1.0,Scommesse e Quote Calcio
3,Cerezo Osaka - Kashiwa Reysol. Cerezo Osaka - ...,0,2025-09-19 21:22:44,1.0,Scommesse e Quote Calcio
4,"Supercoppa UEFA, PSG-Tottenham: Les Parisiens ...",-1,2025-08-12 15:22:19,0.0,Outlier Topic
5,". In Italia, il plinko √® diventato popolare so...",8,2025-08-19 17:53:04,1.0,Gioco Plinko nei Casin√≤ Online
6,Premio Free Spin In assenza di Deposito Codice...,-1,2025-09-20 19:13:43,0.0,Outlier Topic
7,". üê∫üêÇ La #Roma a caccia del tris vincente, il #...",0,2025-09-12 12:30:00,0.856362,Scommesse e Quote Calcio
8,Pronostico Muhdar | AL KHALEEJ | 6 ottobre 202...,9,2025-10-05 17:00:00,0.581185,Coppa d'Africa 2025 Pronostici
9,Statistiche basket - USK Praha scontri diretti...,-1,2025-08-22 00:24:53,0.0,Outlier Topic


In [62]:
df_topics.to_csv("df_topics.csv", index=False)

# Dynimic topic modeling

How  the topics behave over time

In [64]:
topics_over_time = topic_model.topics_over_time(docs, timestamps, nr_bins=20)

20it [02:09,  6.46s/it]


In [65]:
topic_model.visualize_topics_over_time(topics_over_time, top_n_topics=10)
# filter to August (8) through december (12)
mask_aug_dec = topics_over_time["Timestamp"].dt.month.between(8, 12)
topics_aug_dec = topics_over_time[mask_aug_dec].reset_index(drop=True)

fig = topic_model.visualize_topics_over_time(topics_aug_dec, top_n_topics=10)
fig.write_html("topics_over_time_Aug_Dec.html")
fig.show()