In [5]:
import torch
import pandas as pd
import numpy as np

In [20]:
# Define a dictionary with file paths and metadata
politicians_data = {
    'FdI_Meloni': {'file': 'politicians_data/GiorgiaMeloni_tweets_combined.csv', 'politician': 'Meloni', 'party': 'FdI'},
    'FdI_LaRussa': {'file': 'politicians_data/IgnazioLaRussa_tweets_combined.csv', 'politician': 'LaRussa', 'party': 'FdI'},
    'FI_Berlusconi': {'file': 'politicians_data/SilvioBerlusconi_tweets_combined.csv', 'politician': 'Berlusconi', 'party': 'FI'},
    'FI_Tajani': {'file': 'politicians_data/AntonioTajani_tweets_combined.csv', 'politician': 'Tajani', 'party': 'FI'},
    'Lega_Salvini': {'file': 'politicians_data/MatteoSalvini_tweets_combined.csv', 'politician': 'Salvini', 'party': 'Lega'},
    'M5S_DiMaio': {'file': 'politicians_data/luigidimaio_tweets_combined.csv', 'politician': 'Di Maio', 'party': 'M5S'},
    'M5S_Conte': {'file': 'politicians_data/GiuseppeConte_tweets_combined.csv', 'politician': 'Conte', 'party': 'M5S'},
    'Az_Calenda': {'file': 'politicians_data/CarloCalenda_tweets_combined.csv', 'politician': 'Calenda', 'party': 'Azione'},
    'IV_Renzi': {'file': 'politicians_data/MatteoRenzi_tweets_combined.csv', 'politician': 'Renzi', 'party': 'IV'},
    'PEeur_Bonino': {'file': 'politicians_data/emmabonino_tweets_combined.csv', 'politician': 'Bonino', 'party': 'PEeur'},
    'PD_Shlein': {'file': 'politicians_data/EllySchlein_tweets_combined.csv', 'politician': 'Schlein', 'party': 'PD'},
    'PD_Letta': {'file': 'politicians_data/EnricoLetta_tweets_combined.csv', 'politician': 'Letta', 'party': 'PD'},
    'EV_Fratoianni': {'file': 'politicians_data/NicolaFratoianni_tweets_combined.csv', 'politician': 'Fratoianni', 'party': 'EV'},
    'NcI_Lupi': {'file': 'politicians_data/MaurizioLupi_tweets_combined.csv', 'politician': 'Lupi', 'party': 'NcI'}
}

# Read and process each file
politicians_list = []
for key, data in politicians_data.items():
    df = pd.read_csv(data['file'])
    df['politician'] = data['politician']
    df['party'] = data['party']
    politicians_list.append(df)

# Concatenate all DataFrames into one
politicians = pd.concat(politicians_list, ignore_index=True)

# Create a csv file with the combined data
politicians.to_csv('politicians_combined.csv', index=False)

In [21]:
politicians.head()

Unnamed: 0,Date,ID,URL,Content,Likes,Retweets,politician,party
0,2022-12-30 15:30:11+00:00,1608847928625434625,https://twitter.com/GiorgiaMeloni/status/16088...,Ecco l’ultimo appuntamento con #gliappuntidiGi...,3385,0,Meloni,FdI
1,2022-12-29 20:55:30+00:00,1608567407500754944,https://twitter.com/GiorgiaMeloni/status/16085...,Congratulations to @netanyahu on the formation...,6021,0,Meloni,FdI
2,2022-12-29 19:48:50+00:00,1608550629005488130,https://twitter.com/GiorgiaMeloni/status/16085...,Grazie al suo estro e alla sua classe è riusci...,32003,0,Meloni,FdI
3,2022-12-29 17:31:05+00:00,1608515965041651712,https://twitter.com/GiorgiaMeloni/status/16085...,Nel corso della conferenza stampa di fine anno...,4146,0,Meloni,FdI
4,2022-12-29 10:38:45+00:00,1608412198182830081,https://twitter.com/GiorgiaMeloni/status/16084...,Conferenza stampa di fine anno. Seguitemi in d...,3523,0,Meloni,FdI


### RoBERTa pre-trained on Twitter

In [16]:
from transformers import AutoModelForSequenceClassification
from transformers import TFAutoModelForSequenceClassification
from transformers import AutoTokenizer
import numpy as np
from scipy.special import softmax
import csv
import urllib.request


In [17]:
MODEL = "cardiffnlp/twitter-roberta-base-sentiment"
tokenizer = AutoTokenizer.from_pretrained(MODEL)
model = AutoModelForSequenceClassification.from_pretrained(MODEL)
model.eval()  # Set to evaluation mode

# Define labels
labels = ['negative', 'neutral', 'positive']

In [26]:
def get_sentiment(text):
    # Preprocess text
    encoded_input = tokenizer(text, return_tensors='pt', truncation=True)
    with torch.no_grad():
        output = model(**encoded_input)
    scores = output.logits[0].numpy()
    scores = torch.nn.functional.softmax(torch.tensor(scores), dim=0)
    return labels[scores.argmax()], scores.numpy()


### Try it on a sample

In [25]:
sampled_tweets = politicians.sample(n=1000, random_state=42)

sentiment_sample = sampled_tweets['Content'].apply(get_sentiment)

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


In [26]:
# Create a DataFrame with the results keeping the index
sentiment_df = pd.DataFrame(sentiment_sample.tolist(), index=sampled_tweets.index, columns=['sentiment', 'scores'])
sentiment_df['scores'] = sentiment_df['scores'].apply(lambda x: x.tolist())
# create a new column with each score
sentiment_df[['negative', 'neutral', 'positive']] = pd.DataFrame(sentiment_df['scores'].tolist(), index=sentiment_df.index)
sentiment_df. drop(columns=['scores'], inplace=True)
sentiment_df

Unnamed: 0,sentiment,negative,neutral,positive
6111,neutral,0.073046,0.867966,0.058988
14778,neutral,0.169538,0.645370,0.185092
5554,neutral,0.090288,0.862527,0.047185
19101,neutral,0.104139,0.834537,0.061324
14322,neutral,0.058969,0.853090,0.087941
...,...,...,...,...
25766,neutral,0.084699,0.858908,0.056393
21142,neutral,0.118631,0.827843,0.053527
5804,neutral,0.069216,0.769665,0.161119
7525,positive,0.010527,0.369351,0.620122


In [27]:
# add the columns of sentiment_df to sampled_tweets
sampled_tweets = sampled_tweets.join(sentiment_df)
sampled_tweets

Unnamed: 0,Date,ID,URL,Content,Likes,Retweets,politician,party,sentiment,negative,neutral,positive
6111,2022-12-17 13:06:53+00:00,1604100821754462209,https://twitter.com/matteosalvini/status/16041...,Era una delle nostre priorità e non abbiamo pe...,163,0,Salvini,Lega,neutral,0.073046,0.867966,0.058988
14778,2022-08-12 08:41:00+00:00,1558010653302030340,https://twitter.com/CarloCalenda/status/155801...,Senza pudore,756,0,Calenda,Azione,neutral,0.169538,0.645370,0.185092
5554,2021-05-03 14:53:08+00:00,1389231557207023622,https://twitter.com/Antonio_Tajani/status/1389...,Bene il #governo che decide di stanziare 2mld ...,43,0,Tajani,FI,neutral,0.090288,0.862527,0.047185
19101,2021-08-01 15:45:26+00:00,1421859627558395904,https://twitter.com/CarloCalenda/status/142185...,@francetomm @marketingpmi Credo che questa pos...,0,0,Calenda,Azione,neutral,0.104139,0.834537,0.061324
14322,2022-09-05 18:28:21+00:00,1566855773283573762,https://twitter.com/CarloCalenda/status/156685...,Anche per SWG arriva il sorpasso su ⁦@forza_it...,2106,0,Calenda,Azione,neutral,0.058969,0.853090,0.087941
...,...,...,...,...,...,...,...,...,...,...,...,...
25766,2022-08-18 15:44:23+00:00,1560291528941109251,https://twitter.com/NFratoianni/status/1560291...,Una mia intervista a #RaiParlamento\n\n#Elezio...,30,0,Fratoianni,EV,neutral,0.084699,0.858908,0.056393
21142,2021-03-11 19:31:15+00:00,1370094988873375748,https://twitter.com/CarloCalenda/status/137009...,@paolo23949716 @amodio_enzo @Azione_it Le mie ...,5,0,Calenda,Azione,neutral,0.118631,0.827843,0.053527
5804,2021-03-04 15:34:40+00:00,1367498739221475330,https://twitter.com/Antonio_Tajani/status/1367...,Iscriviti a @forza_italia.\nEntra a far parte ...,98,0,Tajani,FI,neutral,0.069216,0.769665,0.161119
7525,2022-06-04 20:24:50+00:00,1533183015266000897,https://twitter.com/matteosalvini/status/15331...,Trionfo della Pro Recco nella massima competiz...,133,0,Salvini,Lega,positive,0.010527,0.369351,0.620122


### Apply on entire dataset

In [28]:
# Apply sentiment analysis
sentiment_results = politicians['Content'].apply(get_sentiment)

In [29]:
# Create a DataFrame with the results keeping the index
roberta_df = pd.DataFrame(sentiment_results.tolist(), index=politicians.index, columns=['roberta_sentiment', 'scores'])
roberta_df['scores'] = roberta_df['scores'].apply(lambda x: x.tolist())
# create a new column with each score
roberta_df[['negative', 'neutral', 'positive']] = pd.DataFrame(roberta_df['scores'].tolist(), index=roberta_df.index)
roberta_df. drop(columns=['scores'], inplace=True)
roberta_df

Unnamed: 0,roberta_sentiment,negative,neutral,positive
0,neutral,0.066206,0.845798,0.087996
1,positive,0.001067,0.027091,0.971842
2,neutral,0.093626,0.834294,0.072081
3,neutral,0.074911,0.866046,0.059043
4,neutral,0.053506,0.870213,0.076281
...,...,...,...,...
27860,neutral,0.096552,0.851842,0.051606
27861,neutral,0.210613,0.751147,0.038240
27862,neutral,0.208396,0.742678,0.048926
27863,neutral,0.224726,0.724392,0.050883


In [30]:
politicians_roberta = politicians.join(roberta_df)
politicians_roberta

Unnamed: 0,Date,ID,URL,Content,Likes,Retweets,politician,party,roberta_sentiment,negative,neutral,positive
0,2022-12-30 15:30:11+00:00,1608847928625434625,https://twitter.com/GiorgiaMeloni/status/16088...,Ecco l’ultimo appuntamento con #gliappuntidiGi...,3385,0,Meloni,FdI,neutral,0.066206,0.845798,0.087996
1,2022-12-29 20:55:30+00:00,1608567407500754944,https://twitter.com/GiorgiaMeloni/status/16085...,Congratulations to @netanyahu on the formation...,6021,0,Meloni,FdI,positive,0.001067,0.027091,0.971842
2,2022-12-29 19:48:50+00:00,1608550629005488130,https://twitter.com/GiorgiaMeloni/status/16085...,Grazie al suo estro e alla sua classe è riusci...,32003,0,Meloni,FdI,neutral,0.093626,0.834294,0.072081
3,2022-12-29 17:31:05+00:00,1608515965041651712,https://twitter.com/GiorgiaMeloni/status/16085...,Nel corso della conferenza stampa di fine anno...,4146,0,Meloni,FdI,neutral,0.074911,0.866046,0.059043
4,2022-12-29 10:38:45+00:00,1608412198182830081,https://twitter.com/GiorgiaMeloni/status/16084...,Conferenza stampa di fine anno. Seguitemi in d...,3523,0,Meloni,FdI,neutral,0.053506,0.870213,0.076281
...,...,...,...,...,...,...,...,...,...,...,...,...
27860,2021-01-14 13:11:21+00:00,1349705663224623104,https://twitter.com/Maurizio_Lupi/status/13497...,Conte o va al Quirinale o viene in aula subito...,30,0,Lupi,NcI,neutral,0.096552,0.851842,0.051606
27861,2021-01-13 14:40:00+00:00,1349365587122782209,https://twitter.com/Maurizio_Lupi/status/13493...,"Oltre al dannato Covid, oggi c’è un altro viru...",15,0,Lupi,NcI,neutral,0.210613,0.751147,0.038240
27862,2021-01-12 11:36:03+00:00,1348956904836517889,https://twitter.com/Maurizio_Lupi/status/13489...,"L'improvvisazione, le liti, l'incertezza di qu...",57,0,Lupi,NcI,neutral,0.208396,0.742678,0.048926
27863,2021-01-11 12:31:27+00:00,1348608460108275712,https://twitter.com/Maurizio_Lupi/status/13486...,La situazione sta diventando insostenibile e i...,43,0,Lupi,NcI,neutral,0.224726,0.724392,0.050883


In [31]:
# add a column in politicians with the column 'roberta_sentiment' in politicians_roberta
politicians['roberta_sentiment'] = politicians_roberta['roberta_sentiment']

In [32]:
# count the number of tweets for each sentiment
roberta_counts = politicians['roberta_sentiment'].value_counts()
roberta_counts

roberta_sentiment
neutral     27117
positive      649
negative       99
Name: count, dtype: int64

In [53]:
neu_sample_rob = politicians[politicians['roberta_sentiment'] == 'neutral']

## Balancing classes

In [45]:
# check agreement between the two methods
agreement_df = politicians[['roberta_sentiment', 'vader_sentiment']].copy()
agreement_df['agreement'] = agreement_df['roberta_sentiment'] == agreement_df['vader_sentiment']
agreement_df['agreement'].value_counts()



agreement
True     21469
False     6396
Name: count, dtype: int64

### Undersampling majority class (neutral)

In [49]:
from sklearn.utils import resample

# Separate classes
neutral = politicians[politicians['vader_sentiment'] == 'neutral']
positive = politicians[politicians['vader_sentiment'] == 'positive']
negative = politicians[politicians['vader_sentiment'] == 'negative']

# Downsample neutral to match the smaller class
neutral_downsampled = resample(neutral, 
                               replace=False, 
                               n_samples=min(len(positive), len(negative)), 
                               random_state=42)

# Combine
balanced_df = pd.concat([neutral_downsampled, positive, negative])
balanced_df['vader_sentiment'].value_counts()



(8947, 10)

---

## Roberta on cleaned dataset

In [10]:
import pandas as pd
import numpy as np
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import gensim
from gensim import corpora
from gensim.models.ldamodel import LdaModel
from nltk.corpus import stopwords
from langdetect import detect
import re
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from langdetect import detect
import spacy
from tqdm import tqdm
import pyLDAvis.gensim_models as gensimvis
import pickle 
import pyLDAvis
import os

In [6]:
politicians_cleaned = pd.read_csv('politicians_data/politicians_classified.csv')

In [27]:
# Download NLTK resources
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')

# Load stopwords
stop_words_italian = set(stopwords.words('italian'))
stop_words_english = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

# Load spaCy models for Italian and English
# You need to install these models first using:
nlp_it = spacy.load('it_core_news_sm')
nlp_en = spacy.load('en_core_web_sm')

def preprocess_text(text):
    text = re.sub(r'[^a-zA-Z\s]', '', text)  # Remove special characters
    text = text.lower()  # Convert to lowercase
    return text

# Apply preprocessing to the tweets
politicians_tweets = politicians_cleaned['Content']

# Method 1: Using tqdm with apply (simpler)
tqdm.pandas(desc="Processing tweets")
politicians_cleaned['processed_tweet'] = politicians_tweets.progress_apply(lambda x: preprocess_text(x))


[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/martinaserandrei/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     /Users/martinaserandrei/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/martinaserandrei/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
Processing tweets: 100%|██████████| 17245/17245 [00:00<00:00, 215416.97it/s]


In [28]:
sentiment_results2 = politicians_cleaned['processed_tweet'].apply(get_sentiment)

In [29]:
# Create a DataFrame with the results keeping the index
roberta_df2 = pd.DataFrame(sentiment_results2.tolist(), index=politicians_cleaned.index, columns=['roberta_sentiment', 'scores'])
roberta_df2['scores'] = roberta_df2['scores'].apply(lambda x: x.tolist())
# create a new column with each score
roberta_df2[['negative', 'neutral', 'positive']] = pd.DataFrame(roberta_df2['scores'].tolist(), index=roberta_df2.index)
roberta_df2. drop(columns=['scores'], inplace=True)
roberta_df2

Unnamed: 0,roberta_sentiment,negative,neutral,positive
0,positive,0.001159,0.025998,0.972843
1,neutral,0.063336,0.862081,0.074583
2,neutral,0.071610,0.885653,0.042737
3,neutral,0.088932,0.837134,0.073934
4,neutral,0.094576,0.865722,0.039702
...,...,...,...,...
17240,neutral,0.122503,0.824101,0.053396
17241,neutral,0.205459,0.757824,0.036717
17242,neutral,0.163984,0.791811,0.044205
17243,neutral,0.142545,0.806770,0.050686


In [30]:
politicians_cleaned['roberta_sentiment'] = roberta_df2['roberta_sentiment']

In [31]:
# count the number of tweets for each sentiment
politicians_cleaned['roberta_sentiment'].value_counts()

roberta_sentiment
neutral     17070
positive      135
negative       40
Name: count, dtype: int64

## Trying another model

In [7]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from scipy.special import softmax
import torch

# Carica il modello italiano
model_name = "MilaNLProc/feel-it-italian-sentiment"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name)

# Funzione di analisi
def italian_sentiment(text):
    encoded_input = tokenizer(text, return_tensors='pt', truncation=True)
    with torch.no_grad():
        output = model(**encoded_input)
    scores = output.logits[0].detach().numpy()
    scores = softmax(scores)
    labels = ['negative', 'neutral', 'positive']
    return labels[scores.argmax()], dict(zip(labels, scores.round(3)))


tokenizer_config.json:   0%|          | 0.00/414 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/847 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/794k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.68M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/299 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/443M [00:00<?, ?B/s]

In [8]:
sampled_tweets = politicians_cleaned.sample(n=1000, random_state=42)
# Apply Italian sentiment analysis
ita_sample = sampled_tweets['Content'].apply(italian_sentiment)

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


model.safetensors:   0%|          | 0.00/443M [00:00<?, ?B/s]

In [9]:
# Create a DataFrame with the results keeping the index
ita_df = pd.DataFrame(ita_sample.tolist(), index=sampled_tweets.index, columns=['ita_sentiment', 'scores'])
# create a new column with each score
ita_df[['negative', 'neutral', 'positive']] = pd.DataFrame(ita_df['scores'].apply(lambda x: [x['neg'], x['neu'], x['pos'], x['compound']]).tolist(), index=ita_df.index)
ita_df.drop(columns=['scores'], inplace=True)
ita_df

Unnamed: 0,ita_sentiment,scores
101,neutral,"{'negative': 0.001, 'neutral': 0.999}"
7501,negative,"{'negative': 1.0, 'neutral': 0.0}"
8437,negative,"{'negative': 1.0, 'neutral': 0.0}"
217,negative,"{'negative': 1.0, 'neutral': 0.0}"
14866,negative,"{'negative': 0.999, 'neutral': 0.001}"
...,...,...
9798,negative,"{'negative': 1.0, 'neutral': 0.0}"
3421,negative,"{'negative': 1.0, 'neutral': 0.0}"
14328,neutral,"{'negative': 0.141, 'neutral': 0.859}"
5133,negative,"{'negative': 1.0, 'neutral': 0.0}"


In [None]:
sentiment_ita = politicians_cleaned['Content'].apply(italian_sentiment)

zero-shot learning / few-shot 

In [33]:
from transformers import pipeline

# Load zero-shot classifier
classifier = pipeline("zero-shot-classification", model="facebook/bart-large-mnli")

# Define your custom labels
labels = ["positive", "neutral", "negative"]

# Example tweet
text = "Non sono contento di come sta andando la situazione politica"

# Apply zero-shot
result = classifier(text, candidate_labels=labels)
print(result)


Device set to use mps:0


{'sequence': 'Non sono contento di come sta andando la situazione politica', 'labels': ['negative', 'neutral', 'positive'], 'scores': [0.9891007542610168, 0.005554403178393841, 0.005344857927411795]}


In [39]:
sampled_tweets = politicians_cleaned.sample(n=1000, random_state=42)
# Apply zero-shot classification
zero_shot_sample = sampled_tweets['Content'].apply(lambda x: classifier(x, candidate_labels=labels))

In [50]:
zero_shot_sample= zero_shot_sample.to_dict()

  right=ast.Str(s=sentinel),
  return Constant(*args, **kwargs)
  right=ast.Str(s=sentinel),
  return Constant(*args, **kwargs)


AttributeError: 'dict' object has no attribute 'to_dict'

In [58]:
# Create a DataFrame with the results keeping the index
zero_shot_df = pd.DataFrame.from_dict(zero_shot_sample, orient='index')
zero_shot_df['tweets'] = sampled_tweets['Content']
zero_shot_df.drop(columns=['tweets', 'scores'], inplace=True)
zero_shot_df['labels'] = zero_shot_df['labels'].apply(lambda x: x[0])
zero_shot_df

Unnamed: 0,sequence,labels
101,Alla fine di questa avventura a me interesserà...,positive
7501,#Salvini: Ho ricordato all’avvocato Conte che ...,negative
8437,Dobbiamo spingere sulla ripartenza. Dobbiamo c...,positive
217,"Si mettano l’animo in pace: il #25settembre, s...",negative
14866,L’#Ucraina nella #UE? Per le scelte e la stori...,negative
...,...,...
9798,Zanda si ricordi che ancora tre giorni fa ripr...,negative
3421,.@berlusconi assolto a Siena nel processo Ruby...,negative
14328,A due anni dall'uscita della sentenza della Co...,negative
5133,Mentre in Italia la “giustizia” mette in galer...,negative


In [60]:
# using the zero-shot classifier
zero_shot = politicians_cleaned['Content'].apply(lambda x: classifier(x, candidate_labels=labels))

zero_shot_df1= zero_shot.to_dict()

In [62]:
# Create a DataFrame with the results keeping the index
zero_shot_df1 = pd.DataFrame.from_dict(zero_shot_df1, orient='index')
zero_shot_df1.drop(columns=['scores'], inplace=True)


zero_shot_df1['labels'] = zero_shot_df1['labels'].apply(lambda x: x[0])
zero_shot_df1

Unnamed: 0,sequence,labels
0,Congratulations to @netanyahu on the formation...,positive
1,Vogliamo restituire a questa Nazione l’ottimis...,negative
2,Cordiale conversazione telefonica con @Zelensk...,positive
3,Franco Frattini era un uomo garbato e intellig...,negative
4,Proteggere la libertà religiosa è un obiettivo...,positive
...,...,...
17240,Conte o va al Quirinale o viene in aula subito...,negative
17241,"Oltre al dannato Covid, oggi c’è un altro viru...",negative
17242,"L'improvvisazione, le liti, l'incertezza di qu...",negative
17243,La situazione sta diventando insostenibile e i...,negative


In [63]:
zero_shot_df1['labels'].value_counts()

labels
negative    11888
positive     5149
neutral       208
Name: count, dtype: int64

In [64]:
# using the zero-shot classifier
zero_shot = politicians_cleaned['processed_tweet'].apply(lambda x: classifier(x, candidate_labels=labels))

zero_shot_df1= zero_shot.to_dict()

In [65]:
# Create a DataFrame with the results keeping the index
zero_shot_df1 = pd.DataFrame.from_dict(zero_shot_df1, orient='index')
zero_shot_df1.drop(columns=['scores'], inplace=True)


zero_shot_df1['labels'] = zero_shot_df1['labels'].apply(lambda x: x[0])
zero_shot_df1

Unnamed: 0,sequence,labels
0,congratulations to netanyahu on the formation ...,positive
1,vogliamo restituire a questa nazione lottimism...,negative
2,cordiale conversazione telefonica con zelensky...,negative
3,franco frattini era un uomo garbato e intellig...,positive
4,proteggere la libert religiosa un obiettivo m...,positive
...,...,...
17240,conte o va al quirinale o viene in aula subito...,negative
17241,oltre al dannato covid oggi c un altro virus c...,negative
17242,limprovvisazione le liti lincertezza di questo...,negative
17243,la situazione sta diventando insostenibile e i...,negative


In [66]:
zero_shot_df1['labels'].value_counts()

labels
negative    12743
positive     4213
neutral       289
Name: count, dtype: int64

## Vader on translated tweets

In [67]:
from langdetect import detect
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
from deep_translator import GoogleTranslator
import pandas as pd

analyzer = SentimentIntensityAnalyzer()

def analyze_multilingual_sentiment(text):
    try:
        lang = detect(text)
        if lang == 'en':
            text_en = text
        elif lang == 'it':
            text_en = GoogleTranslator(source='it', target='en').translate(text)
        else:
            return 'unsupported', {"neg": 0, "neu": 0, "pos": 0, "compound": 0}
        
        scores = analyzer.polarity_scores(text_en)
        compound = scores['compound']
        if compound >= 0.05:
            sentiment = 'positive'
        elif compound <= -0.05:
            sentiment = 'negative'
        else:
            sentiment = 'neutral'
        return sentiment, scores
    except Exception as e:
        return "error", {"neg": 0, "neu": 0, "pos": 0, "compound": 0}


In [68]:
sampled = politicians_cleaned.sample(n=1000, random_state=42)


In [69]:

results = sampled['Content'].apply(analyze_multilingual_sentiment)


In [70]:

sampled['vader_sentiment'], vader_scores = zip(*results)
vader_df = pd.DataFrame(vader_scores)
sampled = pd.concat([sampled.reset_index(drop=True), vader_df.reset_index(drop=True)], axis=1)

print(sampled[['Content', 'vader_sentiment', 'neg', 'neu', 'pos', 'compound']].head())

                                             Content vader_sentiment    neg  \
0  Alla fine di questa avventura a me interesserà...        positive  0.000   
1  #Salvini: Ho ricordato all’avvocato Conte che ...        positive  0.078   
2  Dobbiamo spingere sulla ripartenza. Dobbiamo c...        positive  0.000   
3  Si mettano l’animo in pace: il #25settembre, s...        positive  0.000   
4  L’#Ucraina nella #UE? Per le scelte e la stori...         neutral  0.000   

     neu    pos  compound  
0  0.786  0.214    0.8805  
1  0.811  0.111    0.2960  
2  0.754  0.246    0.7351  
3  0.758  0.242    0.5859  
4  1.000  0.000    0.0000  


In [73]:
results = politicians_cleaned['processed_tweet'].apply(analyze_multilingual_sentiment)

KeyboardInterrupt: 

In [None]:
politicians_cleaned['vader_sentiment'], vader_scores = zip(*results)
vader_df = pd.DataFrame(vader_scores)
politicians_cleaned = pd.concat([politicians_cleaned.reset_index(drop=True), vader_df.reset_index(drop=True)], axis=1)

print(politicians_cleaned[['Content', 'vader_sentiment']].head())

In [None]:
politicians_cleaned['vader_sentiment'].value_counts()