In [1]:
import torch
import pandas as pd
import numpy as np
from transformers import AutoModelForSequenceClassification
from transformers import TFAutoModelForSequenceClassification
from transformers import AutoTokenizer
import numpy as np
from scipy.special import softmax
import csv
import urllib.request
import pandas as pd
import numpy as np
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import gensim
from gensim import corpora
from gensim.models.ldamodel import LdaModel
from nltk.corpus import stopwords
from langdetect import detect
import re
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from langdetect import detect
import spacy
from tqdm import tqdm
import pyLDAvis.gensim_models as gensimvis
import pickle 
import pyLDAvis
import os

### RoBERTa pre-trained on Twitter

In [2]:
MODEL = "cardiffnlp/twitter-roberta-base-sentiment"
tokenizer = AutoTokenizer.from_pretrained(MODEL)
model = AutoModelForSequenceClassification.from_pretrained(MODEL)
model.eval()  # Set to evaluation mode

# Define labels
labels = ['negative', 'neutral', 'positive']

In [3]:
def get_sentiment(text):
    # Preprocess text
    encoded_input = tokenizer(text, return_tensors='pt', truncation=True)
    with torch.no_grad():
        output = model(**encoded_input)
    scores = output.logits[0].numpy()
    scores = torch.nn.functional.softmax(torch.tensor(scores), dim=0)
    return labels[scores.argmax()], scores.numpy()


---

## Roberta on cleaned dataset

In [4]:
politicians_cleaned = pd.read_csv('politicians_data/politicians_classified.csv')

In [5]:
# Download NLTK resources
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')

# Load stopwords
stop_words_italian = set(stopwords.words('italian'))
stop_words_english = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

# Load spaCy models for Italian and English
# You need to install these models first using:
nlp_it = spacy.load('it_core_news_sm')
nlp_en = spacy.load('en_core_web_sm')

def preprocess_text(text):
    text = re.sub(r'[^a-zA-Z\s]', '', text)  # Remove special characters
    text = text.lower()  # Convert to lowercase
    return text

# Apply preprocessing to the tweets
politicians_tweets = politicians_cleaned['Content']

# Method 1: Using tqdm with apply (simpler)
tqdm.pandas(desc="Processing tweets")
politicians_cleaned['processed_tweet'] = politicians_tweets.progress_apply(lambda x: preprocess_text(x))


[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/martinaserandrei/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     /Users/martinaserandrei/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/martinaserandrei/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
Processing tweets: 100%|██████████| 17245/17245 [00:00<00:00, 242350.68it/s]


In [28]:
sentiment_results2 = politicians_cleaned['processed_tweet'].apply(get_sentiment)

In [29]:
# Create a DataFrame with the results keeping the index
roberta_df2 = pd.DataFrame(sentiment_results2.tolist(), index=politicians_cleaned.index, columns=['roberta_sentiment', 'scores'])
roberta_df2['scores'] = roberta_df2['scores'].apply(lambda x: x.tolist())
# create a new column with each score
roberta_df2[['negative', 'neutral', 'positive']] = pd.DataFrame(roberta_df2['scores'].tolist(), index=roberta_df2.index)
roberta_df2. drop(columns=['scores'], inplace=True)
roberta_df2

Unnamed: 0,roberta_sentiment,negative,neutral,positive
0,positive,0.001159,0.025998,0.972843
1,neutral,0.063336,0.862081,0.074583
2,neutral,0.071610,0.885653,0.042737
3,neutral,0.088932,0.837134,0.073934
4,neutral,0.094576,0.865722,0.039702
...,...,...,...,...
17240,neutral,0.122503,0.824101,0.053396
17241,neutral,0.205459,0.757824,0.036717
17242,neutral,0.163984,0.791811,0.044205
17243,neutral,0.142545,0.806770,0.050686


In [30]:
politicians_cleaned['roberta_sentiment'] = roberta_df2['roberta_sentiment']

In [31]:
# count the number of tweets for each sentiment
politicians_cleaned['roberta_sentiment'].value_counts()

roberta_sentiment
neutral     17070
positive      135
negative       40
Name: count, dtype: int64

## Trying another model for italian tweets

In [6]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from scipy.special import softmax
import torch

# Carica il modello italiano
model_name = "MilaNLProc/feel-it-italian-sentiment"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name)

# Funzione di analisi
def italian_sentiment(text):
    encoded_input = tokenizer(text, return_tensors='pt', truncation=True)
    with torch.no_grad():
        output = model(**encoded_input)
    scores = output.logits[0].detach().numpy()
    scores = softmax(scores)
    labels = ['negative', 'neutral', 'positive']
    return labels[scores.argmax()], dict(zip(labels, scores.round(3)))


In [8]:
sampled_tweets = politicians_cleaned.sample(n=1000, random_state=42)
# Apply Italian sentiment analysis
ita_sample = sampled_tweets['Content'].apply(italian_sentiment)

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


model.safetensors:   0%|          | 0.00/443M [00:00<?, ?B/s]

In [9]:
# Create a DataFrame with the results keeping the index
ita_df = pd.DataFrame(ita_sample.tolist(), index=sampled_tweets.index, columns=['ita_sentiment', 'scores'])
# create a new column with each score
ita_df[['negative', 'neutral', 'positive']] = pd.DataFrame(ita_df['scores'].apply(lambda x: [x['neg'], x['neu'], x['pos'], x['compound']]).tolist(), index=ita_df.index)
ita_df.drop(columns=['scores'], inplace=True)
ita_df

Unnamed: 0,ita_sentiment,scores
101,neutral,"{'negative': 0.001, 'neutral': 0.999}"
7501,negative,"{'negative': 1.0, 'neutral': 0.0}"
8437,negative,"{'negative': 1.0, 'neutral': 0.0}"
217,negative,"{'negative': 1.0, 'neutral': 0.0}"
14866,negative,"{'negative': 0.999, 'neutral': 0.001}"
...,...,...
9798,negative,"{'negative': 1.0, 'neutral': 0.0}"
3421,negative,"{'negative': 1.0, 'neutral': 0.0}"
14328,neutral,"{'negative': 0.141, 'neutral': 0.859}"
5133,negative,"{'negative': 1.0, 'neutral': 0.0}"


In [None]:
sentiment_ita = politicians_cleaned['Content'].apply(italian_sentiment)

---

## Zero-shot learning

In [8]:
from transformers import pipeline

# Load zero-shot classifier
classifier = pipeline("zero-shot-classification", model="facebook/bart-large-mnli")

# Define your custom labels
labels = ["positive", "neutral", "negative"]

# Example tweet
text = "Non sono contento di come sta andando la situazione politica"

# Apply zero-shot
result = classifier(text, candidate_labels=labels)
print(result)


Device set to use mps:0


{'sequence': 'Non sono contento di come sta andando la situazione politica', 'labels': ['negative', 'neutral', 'positive'], 'scores': [0.9891007542610168, 0.005554403178393841, 0.005344857927411795]}


In [39]:
sampled_tweets = politicians_cleaned.sample(n=1000, random_state=42)
# Apply zero-shot classification
zero_shot_sample = sampled_tweets['Content'].apply(lambda x: classifier(x, candidate_labels=labels))

In [None]:
zero_shot_sample= zero_shot_sample.to_dict()

In [58]:
# Create a DataFrame with the results keeping the index
zero_shot_df = pd.DataFrame.from_dict(zero_shot_sample, orient='index')
zero_shot_df['tweets'] = sampled_tweets['Content']
zero_shot_df.drop(columns=['tweets', 'scores'], inplace=True)
zero_shot_df['labels'] = zero_shot_df['labels'].apply(lambda x: x[0])
zero_shot_df

Unnamed: 0,sequence,labels
101,Alla fine di questa avventura a me interesserà...,positive
7501,#Salvini: Ho ricordato all’avvocato Conte che ...,negative
8437,Dobbiamo spingere sulla ripartenza. Dobbiamo c...,positive
217,"Si mettano l’animo in pace: il #25settembre, s...",negative
14866,L’#Ucraina nella #UE? Per le scelte e la stori...,negative
...,...,...
9798,Zanda si ricordi che ancora tre giorni fa ripr...,negative
3421,.@berlusconi assolto a Siena nel processo Ruby...,negative
14328,A due anni dall'uscita della sentenza della Co...,negative
5133,Mentre in Italia la “giustizia” mette in galer...,negative


In [9]:
# using the zero-shot classifier
zero_shot = politicians_cleaned['Content'].apply(lambda x: classifier(x, candidate_labels=labels))

zero_shot_df1= zero_shot.to_dict()

In [10]:
# Create a DataFrame with the results keeping the index
zero_shot_df1 = pd.DataFrame.from_dict(zero_shot_df1, orient='index')
zero_shot_df1.drop(columns=['scores'], inplace=True)


zero_shot_df1['labels'] = zero_shot_df1['labels'].apply(lambda x: x[0])
zero_shot_df1

Unnamed: 0,sequence,labels
0,Congratulations to @netanyahu on the formation...,positive
1,Vogliamo restituire a questa Nazione l’ottimis...,negative
2,Cordiale conversazione telefonica con @Zelensk...,positive
3,Franco Frattini era un uomo garbato e intellig...,negative
4,Proteggere la libertà religiosa è un obiettivo...,positive
...,...,...
17240,Conte o va al Quirinale o viene in aula subito...,negative
17241,"Oltre al dannato Covid, oggi c’è un altro viru...",negative
17242,"L'improvvisazione, le liti, l'incertezza di qu...",negative
17243,La situazione sta diventando insostenibile e i...,negative


In [11]:
zero_shot_df1['labels'].value_counts()

labels
negative    11888
positive     5149
neutral       208
Name: count, dtype: int64

In [64]:
# using the zero-shot classifier
zero_shot = politicians_cleaned['processed_tweet'].apply(lambda x: classifier(x, candidate_labels=labels))

zero_shot_df1= zero_shot.to_dict()

In [65]:
# Create a DataFrame with the results keeping the index
zero_shot_df1 = pd.DataFrame.from_dict(zero_shot_df1, orient='index')
zero_shot_df1.drop(columns=['scores'], inplace=True)


zero_shot_df1['labels'] = zero_shot_df1['labels'].apply(lambda x: x[0])
zero_shot_df1

Unnamed: 0,sequence,labels
0,congratulations to netanyahu on the formation ...,positive
1,vogliamo restituire a questa nazione lottimism...,negative
2,cordiale conversazione telefonica con zelensky...,negative
3,franco frattini era un uomo garbato e intellig...,positive
4,proteggere la libert religiosa un obiettivo m...,positive
...,...,...
17240,conte o va al quirinale o viene in aula subito...,negative
17241,oltre al dannato covid oggi c un altro virus c...,negative
17242,limprovvisazione le liti lincertezza di questo...,negative
17243,la situazione sta diventando insostenibile e i...,negative


In [66]:
zero_shot_df1['labels'].value_counts()

labels
negative    12743
positive     4213
neutral       289
Name: count, dtype: int64

---

## Vader on translated tweets

In [12]:
from langdetect import detect
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
from deep_translator import GoogleTranslator
import pandas as pd

analyzer = SentimentIntensityAnalyzer()

def analyze_multilingual_sentiment(text):
    try:
        lang = detect(text)
        if lang == 'en':
            text_en = text
        elif lang == 'it':
            text_en = GoogleTranslator(source='it', target='en').translate(text)
        else:
            return 'unsupported', {"neg": 0, "neu": 0, "pos": 0, "compound": 0}
        
        scores = analyzer.polarity_scores(text_en)
        compound = scores['compound']
        if compound >= 0.05:
            sentiment = 'positive'
        elif compound <= -0.05:
            sentiment = 'negative'
        else:
            sentiment = 'neutral'
        return sentiment, scores
    except Exception as e:
        return "error", {"neg": 0, "neu": 0, "pos": 0, "compound": 0}


In [13]:
sampled = politicians_cleaned.sample(n=1000, random_state=42)


In [14]:

results = sampled['Content'].apply(analyze_multilingual_sentiment)


In [16]:

sampled['vader_sentiment'], vader_scores = zip(*results)
vader_df = pd.DataFrame(vader_scores)
sampled = pd.concat([sampled.reset_index(drop=True), vader_df.reset_index(drop=True)], axis=1)

final = sampled[['Content', 'vader_sentiment']]

In [None]:
results = politicians_cleaned['processed_tweet'].apply(analyze_multilingual_sentiment)

In [None]:
politicians_cleaned['vader_sentiment'], vader_scores = zip(*results)
vader_df = pd.DataFrame(vader_scores)
politicians_cleaned = pd.concat([politicians_cleaned.reset_index(drop=True), vader_df.reset_index(drop=True)], axis=1)

print(politicians_cleaned[['Content', 'vader_sentiment']].head())

In [None]:
politicians_cleaned['vader_sentiment'].value_counts()