In [9]:
import pandas as pd
import re

In [10]:
# Load the first of the two CSV files into its own frame (combined with df2 further down).
df1 = pd.read_csv("Sisal&competitors-1.csv")

In [11]:
# Load the second of the two CSV files into its own frame (combined with df1 further down).
df2 = pd.read_csv("Sisal&competitors-2.csv")

In [12]:
# Row count of the first file.
len(df1)

48696

In [13]:
# Row count of the second file.
len(df2)

49567

In [14]:
# Stack both frames row-wise; ignore_index=True gives the result a fresh 0..n-1 index.
df = pd.concat(objs=[df1, df2], axis=0, ignore_index=True)

In [15]:
# Row count of the combined frame (should equal the sum of the two files).
len(df)

98263

In [23]:
# List the column names of the combined frame.
df.columns

Index(['url', 'domain_url', 'published', 'title', 'content', 'sentiment',
       'matched_profile', 'engagement', 'reach', 'source_type', 'post_type',
       'extra_author_attributes.gender',
       'extra_source_attributes.world_data.country',
       'extra_source_attributes.world_data.city'],
      dtype='object')

# Preprocessing

In [28]:
# Replace the raw export column names with the short Italian labels used
# from here on; columns not listed in the mapping keep their original name.
column_labels = {
    "domain_url": "Dominio",
    "matched_profile": "Brand",
    "source_type": "Tipo_di_media",
    "extra_author_attributes.gender": "Genere",
    "extra_source_attributes.world_data.country": "Paese",
    "extra_source_attributes.world_data.city": "Città",
}
df = df.rename(columns=column_labels)


In [29]:
# Show the column names again, now after the rename.
df.columns

Index(['url', 'Dominio', 'published', 'title', 'content', 'sentiment', 'Brand',
       'engagement', 'reach', 'Tipo_di_media', 'post_type', 'Genere', 'Paese',
       'Città'],
      dtype='object')

Now let's process the columns whose values are not yet in a workable form.

In [30]:
# Preview the first five rows. The call parentheses are required: a bare
# `df.head` evaluates to the bound method, whose repr prints the whole frame.
df.head()

Unnamed: 0,url,Dominio,published,title,content,sentiment,Brand,engagement,reach,Tipo_di_media,post_type,Genere,Paese,Città
0,https://www.corrieredellosport.it/news/scommes...,http://corrieredellosport.it/,09/10/25 20:30:00,"Pronostico Germania-Lussemburgo, tanti gol a S...",Tutto esaurito alla PreZero Arena di Sinsheim ...,0,"SEARCH / Società di Gambling / bet365,SEARCH /...",0,3863234,"ONLINENEWS,ONLINENEWS_OTHER",TEXT,UNKNOWN,Italy,Rome
1,https://ilcentrotirreno.it/sito/sport/228401-m...,http://ilcentrotirreno.it/,14/08/25 13:34:00,"MotoGP, in Austria Marc Marquez favorito",questo tabù e a consolidare il suo primato vis...,0,SEARCH / Società di Gambling / Sisal,0,14992,"ONLINENEWS,ONLINENEWS_OTHER",TEXT,MALE,Italy,Catanzaro
2,https://www.infobetting.com/calcio/quote/svezi...,http://infobetting.com/,11/08/25 12:36:55,Hammarby - GAIS | quote scommesse 1X2 + under/...,Hammarby - GAIS quote scommesse 1X2 + UNDER/O...,0,SEARCH / Società di Gambling / bet365,0,94439,"ONLINENEWS,ONLINENEWS_OTHER",TEXT,UNKNOWN,Italy,Rome
3,https://www.infobetting.com/calcio/quote/giapp...,http://infobetting.com/,19/09/25 21:22:44,Cerezo Osaka - Kashiwa Reysol,Cerezo Osaka - Kashiwa Reysol Cerezo Osaka - ...,0,"SEARCH / Società di Gambling / bet365,SEARCH /...",0,94439,"ONLINENEWS,ONLINENEWS_OTHER",TEXT,UNKNOWN,Italy,Rome
4,https://www.sportitalia.it/2025/08/12/supercop...,http://sportitalia.it/,12/08/25 15:22:19,"Supercoppa UEFA, PSG-Tottenham: Les Parisiens ...",Si fa sempre più interessante il calcio d’agos...,0,SEARCH / Società di Gambling / Planetwin365,0,852123,"ONLINENEWS,ONLINENEWS_OTHER",TEXT,UNKNOWN,Italy,Rome


In [31]:
# Remove repeated prefix from Brand column
# Every comma-separated brand entry carries the same "SEARCH / ... / " prefix,
# so the pattern is removed wherever it occurs, not only at the start.
prefix_pattern = r"(?:SEARCH / Società di Gambling / )+"
brand_raw = df["Brand"]
df["Brand"] = (
    brand_raw
    .astype(str)
    .str.replace(prefix_pattern, "", regex=True)
    .str.strip()
    # astype(str) turns a missing value into the literal text "nan"; restore
    # the real missing marker so no fake "nan" brand ends up in the data.
    .where(brand_raw.notna())
)

df.head()

Unnamed: 0,url,Dominio,published,title,content,sentiment,Brand,engagement,reach,Tipo_di_media,post_type,Genere,Paese,Città
0,https://www.corrieredellosport.it/news/scommes...,http://corrieredellosport.it/,09/10/25 20:30:00,"Pronostico Germania-Lussemburgo, tanti gol a S...",Tutto esaurito alla PreZero Arena di Sinsheim ...,0,"bet365,Snai",0,3863234,"ONLINENEWS,ONLINENEWS_OTHER",TEXT,UNKNOWN,Italy,Rome
1,https://ilcentrotirreno.it/sito/sport/228401-m...,http://ilcentrotirreno.it/,14/08/25 13:34:00,"MotoGP, in Austria Marc Marquez favorito",questo tabù e a consolidare il suo primato vis...,0,Sisal,0,14992,"ONLINENEWS,ONLINENEWS_OTHER",TEXT,MALE,Italy,Catanzaro
2,https://www.infobetting.com/calcio/quote/svezi...,http://infobetting.com/,11/08/25 12:36:55,Hammarby - GAIS | quote scommesse 1X2 + under/...,Hammarby - GAIS quote scommesse 1X2 + UNDER/O...,0,bet365,0,94439,"ONLINENEWS,ONLINENEWS_OTHER",TEXT,UNKNOWN,Italy,Rome
3,https://www.infobetting.com/calcio/quote/giapp...,http://infobetting.com/,19/09/25 21:22:44,Cerezo Osaka - Kashiwa Reysol,Cerezo Osaka - Kashiwa Reysol Cerezo Osaka - ...,0,"bet365,Snai",0,94439,"ONLINENEWS,ONLINENEWS_OTHER",TEXT,UNKNOWN,Italy,Rome
4,https://www.sportitalia.it/2025/08/12/supercop...,http://sportitalia.it/,12/08/25 15:22:19,"Supercoppa UEFA, PSG-Tottenham: Les Parisiens ...",Si fa sempre più interessante il calcio d’agos...,0,Planetwin365,0,852123,"ONLINENEWS,ONLINENEWS_OTHER",TEXT,UNKNOWN,Italy,Rome


In [34]:
# Rescale the sentiment scores: 5 becomes 1 and -5 becomes -1.
# Any other value (e.g. 0) is left untouched.
df["sentiment"] = df["sentiment"].replace(to_replace=[5, -5], value=[1, -1])

In [35]:
# Count rows per sentiment value to check the result of the remapping above.
df["sentiment"].value_counts()

sentiment
 0    75257
 1    22405
-1      601
Name: count, dtype: int64

In [37]:
# Parse "published" into datetimes; the text is day/month/two-digit-year
# followed by a 24-hour time, so the format is given explicitly.
published_format = "%d/%m/%y %H:%M:%S"
df["published"] = pd.to_datetime(df["published"], format=published_format)

In [38]:
# Derive a calendar-date column and a clock-time column from the parsed
# "published" timestamp, then preview the result.
published_parts = df["published"].dt
df["published_date"] = published_parts.date
df["published_time"] = published_parts.time

df.head()

Unnamed: 0,url,Dominio,published,title,content,sentiment,Brand,engagement,reach,Tipo_di_media,post_type,Genere,Paese,Città,published_date,published_time
0,https://www.corrieredellosport.it/news/scommes...,http://corrieredellosport.it/,2025-10-09 20:30:00,"Pronostico Germania-Lussemburgo, tanti gol a S...",Tutto esaurito alla PreZero Arena di Sinsheim ...,0,"bet365,Snai",0,3863234,"ONLINENEWS,ONLINENEWS_OTHER",TEXT,UNKNOWN,Italy,Rome,2025-10-09,20:30:00
1,https://ilcentrotirreno.it/sito/sport/228401-m...,http://ilcentrotirreno.it/,2025-08-14 13:34:00,"MotoGP, in Austria Marc Marquez favorito",questo tabù e a consolidare il suo primato vis...,0,Sisal,0,14992,"ONLINENEWS,ONLINENEWS_OTHER",TEXT,MALE,Italy,Catanzaro,2025-08-14,13:34:00
2,https://www.infobetting.com/calcio/quote/svezi...,http://infobetting.com/,2025-08-11 12:36:55,Hammarby - GAIS | quote scommesse 1X2 + under/...,Hammarby - GAIS quote scommesse 1X2 + UNDER/O...,0,bet365,0,94439,"ONLINENEWS,ONLINENEWS_OTHER",TEXT,UNKNOWN,Italy,Rome,2025-08-11,12:36:55
3,https://www.infobetting.com/calcio/quote/giapp...,http://infobetting.com/,2025-09-19 21:22:44,Cerezo Osaka - Kashiwa Reysol,Cerezo Osaka - Kashiwa Reysol Cerezo Osaka - ...,0,"bet365,Snai",0,94439,"ONLINENEWS,ONLINENEWS_OTHER",TEXT,UNKNOWN,Italy,Rome,2025-09-19,21:22:44
4,https://www.sportitalia.it/2025/08/12/supercop...,http://sportitalia.it/,2025-08-12 15:22:19,"Supercoppa UEFA, PSG-Tottenham: Les Parisiens ...",Si fa sempre più interessante il calcio d’agos...,0,Planetwin365,0,852123,"ONLINENEWS,ONLINENEWS_OTHER",TEXT,UNKNOWN,Italy,Rome,2025-08-12,15:22:19


In [None]:
# Move published_date and published_time so they sit directly after "published".
cols = list(df.columns)
for derived in ("published_date", "published_time"):
    cols.remove(derived)
# Slot just after "published" in the list without the two derived columns.
published_idx = cols.index("published") + 1
cols = cols[:published_idx] + ["published_date", "published_time"] + cols[published_idx:]
df = df[cols]

Now join the title and the content snippet into a single text field for BERTopic.

In [43]:
#merge title and content into a single text field
def safe_str(x):
    """Return ``str(x)``, or an empty string when ``x`` is missing per ``pd.isna``."""
    return "" if pd.isna(x) else str(x)

# Build the text column-wise rather than with a row-wise df.apply(axis=1):
# only the two needed columns are touched, no per-row Series is created, and
# the result is still a Series when the frame is empty.
# The pieces are joined as "<title>. <content>".
df["text"] = df["title"].map(safe_str) + ". " + df["content"].map(safe_str)

In [44]:
# Preview the first rows, now including the merged "text" column.
df.head()

Unnamed: 0,url,Dominio,published,published_date,published_time,title,content,sentiment,Brand,engagement,reach,Tipo_di_media,post_type,Genere,Paese,Città,text
0,https://www.corrieredellosport.it/news/scommes...,http://corrieredellosport.it/,2025-10-09 20:30:00,2025-10-09,20:30:00,"Pronostico Germania-Lussemburgo, tanti gol a S...",Tutto esaurito alla PreZero Arena di Sinsheim ...,0,"bet365,Snai",0,3863234,"ONLINENEWS,ONLINENEWS_OTHER",TEXT,UNKNOWN,Italy,Rome,"Pronostico Germania-Lussemburgo, tanti gol a S..."
1,https://ilcentrotirreno.it/sito/sport/228401-m...,http://ilcentrotirreno.it/,2025-08-14 13:34:00,2025-08-14,13:34:00,"MotoGP, in Austria Marc Marquez favorito",questo tabù e a consolidare il suo primato vis...,0,Sisal,0,14992,"ONLINENEWS,ONLINENEWS_OTHER",TEXT,MALE,Italy,Catanzaro,"MotoGP, in Austria Marc Marquez favorito. ques..."
2,https://www.infobetting.com/calcio/quote/svezi...,http://infobetting.com/,2025-08-11 12:36:55,2025-08-11,12:36:55,Hammarby - GAIS | quote scommesse 1X2 + under/...,Hammarby - GAIS quote scommesse 1X2 + UNDER/O...,0,bet365,0,94439,"ONLINENEWS,ONLINENEWS_OTHER",TEXT,UNKNOWN,Italy,Rome,Hammarby - GAIS | quote scommesse 1X2 + under/...
3,https://www.infobetting.com/calcio/quote/giapp...,http://infobetting.com/,2025-09-19 21:22:44,2025-09-19,21:22:44,Cerezo Osaka - Kashiwa Reysol,Cerezo Osaka - Kashiwa Reysol Cerezo Osaka - ...,0,"bet365,Snai",0,94439,"ONLINENEWS,ONLINENEWS_OTHER",TEXT,UNKNOWN,Italy,Rome,Cerezo Osaka - Kashiwa Reysol. Cerezo Osaka - ...
4,https://www.sportitalia.it/2025/08/12/supercop...,http://sportitalia.it/,2025-08-12 15:22:19,2025-08-12,15:22:19,"Supercoppa UEFA, PSG-Tottenham: Les Parisiens ...",Si fa sempre più interessante il calcio d’agos...,0,Planetwin365,0,852123,"ONLINENEWS,ONLINENEWS_OTHER",TEXT,UNKNOWN,Italy,Rome,"Supercoppa UEFA, PSG-Tottenham: Les Parisiens ..."


In [48]:
# Reorder columns so that the merged "text" column sits right after "content".
cols = df.columns.tolist()
cols.remove("text")
content_idx = cols.index("content") + 1  # insertion point: the slot just after "content"
cols.insert(content_idx, "text")
df = df[cols]

In [49]:
# Preview the first rows to check the new column order.
df.head()

Unnamed: 0,url,Dominio,published,published_date,published_time,title,content,text,sentiment,Brand,engagement,reach,Tipo_di_media,post_type,Genere,Paese,Città
0,https://www.corrieredellosport.it/news/scommes...,http://corrieredellosport.it/,2025-10-09 20:30:00,2025-10-09,20:30:00,"Pronostico Germania-Lussemburgo, tanti gol a S...",Tutto esaurito alla PreZero Arena di Sinsheim ...,"Pronostico Germania-Lussemburgo, tanti gol a S...",0,"bet365,Snai",0,3863234,"ONLINENEWS,ONLINENEWS_OTHER",TEXT,UNKNOWN,Italy,Rome
1,https://ilcentrotirreno.it/sito/sport/228401-m...,http://ilcentrotirreno.it/,2025-08-14 13:34:00,2025-08-14,13:34:00,"MotoGP, in Austria Marc Marquez favorito",questo tabù e a consolidare il suo primato vis...,"MotoGP, in Austria Marc Marquez favorito. ques...",0,Sisal,0,14992,"ONLINENEWS,ONLINENEWS_OTHER",TEXT,MALE,Italy,Catanzaro
2,https://www.infobetting.com/calcio/quote/svezi...,http://infobetting.com/,2025-08-11 12:36:55,2025-08-11,12:36:55,Hammarby - GAIS | quote scommesse 1X2 + under/...,Hammarby - GAIS quote scommesse 1X2 + UNDER/O...,Hammarby - GAIS | quote scommesse 1X2 + under/...,0,bet365,0,94439,"ONLINENEWS,ONLINENEWS_OTHER",TEXT,UNKNOWN,Italy,Rome
3,https://www.infobetting.com/calcio/quote/giapp...,http://infobetting.com/,2025-09-19 21:22:44,2025-09-19,21:22:44,Cerezo Osaka - Kashiwa Reysol,Cerezo Osaka - Kashiwa Reysol Cerezo Osaka - ...,Cerezo Osaka - Kashiwa Reysol. Cerezo Osaka - ...,0,"bet365,Snai",0,94439,"ONLINENEWS,ONLINENEWS_OTHER",TEXT,UNKNOWN,Italy,Rome
4,https://www.sportitalia.it/2025/08/12/supercop...,http://sportitalia.it/,2025-08-12 15:22:19,2025-08-12,15:22:19,"Supercoppa UEFA, PSG-Tottenham: Les Parisiens ...",Si fa sempre più interessante il calcio d’agos...,"Supercoppa UEFA, PSG-Tottenham: Les Parisiens ...",0,Planetwin365,0,852123,"ONLINENEWS,ONLINENEWS_OTHER",TEXT,UNKNOWN,Italy,Rome


In [None]:
# Write the processed frame to CSV; index=False keeps the row index out of the file.
df.to_csv("df_sisal_processed.csv", index=False)