## **Data Import**

In [1]:
import torch
import pandas as pd
import numpy as np
from transformers import AutoModelForSequenceClassification
from transformers import TFAutoModelForSequenceClassification
from transformers import AutoTokenizer
from transformers import XLMRobertaTokenizer
from scipy.special import softmax
import csv
import urllib.request
import pandas as pd
import numpy as np
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import gensim
from gensim import corpora
from gensim.models.ldamodel import LdaModel
from nltk.corpus import stopwords
from langdetect import detect
import re
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from langdetect import detect
import spacy
from tqdm import tqdm
import pyLDAvis.gensim_models as gensimvis
import pickle 
import pyLDAvis
import os

import pandas as pd
import torch
import re
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from torch.nn.functional import softmax
from tqdm import tqdm
import torch.nn.functional as F


### RoBERTa pre-trained on Twitter

As a first try, we want to use a pre-trained model called XLM-Roberta-base that is trained on ~198M multilingual tweets (among which Italian) and we implement the version for sentiment analysis.

Source: https://arxiv.org/abs/2104.12250
https://huggingface.co/cardiffnlp/twitter-xlm-roberta-base-sentiment

In [4]:
MODEL = "cardiffnlp/twitter-xlm-roberta-base-sentiment"

tokenizer = XLMRobertaTokenizer.from_pretrained(MODEL)
model = AutoModelForSequenceClassification.from_pretrained(MODEL)
model.eval()
# Define labels
labels = ['negative', 'neutral', 'positive']

pytorch_model.bin:  44%|####4     | 493M/1.11G [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.11G [00:00<?, ?B/s]

In [5]:
politicians_cleaned = pd.read_csv('../politicians_data/politicians_classified.csv')

In [6]:
# Twitter-specific preprocessing
def clean_tweet(text):
    text = str(text)
    text = re.sub(r"http\S+|www.\S+", "", text)      # Remove URLs
    text = re.sub(r"@\w+", "@user", text)            # Replace mentions
    text = re.sub(r"#", "", text)                    # Remove hashtag symbols
    text = re.sub(r"\s+", " ", text).strip()         # Remove extra spaces
    return text.lower()

In [7]:
# Sentiment analysis function (CPU version)
def get_sentiment(text):
    text = clean_tweet(text)
    encoded_input = tokenizer(text, return_tensors='pt', truncation=True, max_length=128)
    with torch.no_grad():
        output = model(**encoded_input)
        scores = F.softmax(output.logits, dim=1)[0]

    label = labels[torch.argmax(scores).item()]
    return {
        'label': label,
        'scores': {labels[i]: float(scores[i]) for i in range(len(labels))}
    }




In [8]:
# Apply sentiment analysis with progress bar
tqdm.pandas(desc="Analyzing sentiment")
politicians_cleaned['sentiment'] = politicians_cleaned['Content'].progress_apply(get_sentiment)


Analyzing sentiment: 100%|██████████| 17245/17245 [12:32<00:00, 22.91it/s]


In [9]:

# Optionally, split into label and confidence columns
politicians_cleaned['sentiment_label'] = politicians_cleaned['sentiment'].apply(lambda x: x['label'])
politicians_cleaned['sentiment_scores'] = politicians_cleaned['sentiment'].apply(lambda x: x['scores'])


In [11]:
politicians_cleaned = politicians_cleaned.drop(columns=['sentiment'])

In [13]:
politicians_cleaned['sentiment_label'].value_counts()

sentiment_label
negative    10333
positive     5144
neutral      1768
Name: count, dtype: int64

## Trying another model for italian tweets

As a second shot, we apply a pre-trained model based on UmBERTo model which has been fine-tuned precisely on Italian tweets and there is one in particular made for sentiment analysis.

In [6]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from scipy.special import softmax
import torch

# Carica il modello italiano
model_name = "MilaNLProc/feel-it-italian-sentiment"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name)

# Funzione di analisi
def italian_sentiment(text):
    encoded_input = tokenizer(text, return_tensors='pt', truncation=True)
    with torch.no_grad():
        output = model(**encoded_input)
    scores = output.logits[0].detach().numpy()
    scores = softmax(scores)
    labels = ['negative', 'neutral', 'positive']
    return labels[scores.argmax()], dict(zip(labels, scores.round(3)))


In [8]:
sampled_tweets = politicians_cleaned.sample(n=1000, random_state=42)
# Apply Italian sentiment analysis
ita_sample = sampled_tweets['Content'].apply(italian_sentiment)

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


model.safetensors:   0%|          | 0.00/443M [00:00<?, ?B/s]

In [9]:
# Create a DataFrame with the results keeping the index
ita_df = pd.DataFrame(ita_sample.tolist(), index=sampled_tweets.index, columns=['ita_sentiment', 'scores'])
# create a new column with each score
ita_df[['negative', 'neutral', 'positive']] = pd.DataFrame(ita_df['scores'].apply(lambda x: [x['neg'], x['neu'], x['pos'], x['compound']]).tolist(), index=ita_df.index)
ita_df.drop(columns=['scores'], inplace=True)
ita_df

Unnamed: 0,ita_sentiment,scores
101,neutral,"{'negative': 0.001, 'neutral': 0.999}"
7501,negative,"{'negative': 1.0, 'neutral': 0.0}"
8437,negative,"{'negative': 1.0, 'neutral': 0.0}"
217,negative,"{'negative': 1.0, 'neutral': 0.0}"
14866,negative,"{'negative': 0.999, 'neutral': 0.001}"
...,...,...
9798,negative,"{'negative': 1.0, 'neutral': 0.0}"
3421,negative,"{'negative': 1.0, 'neutral': 0.0}"
14328,neutral,"{'negative': 0.141, 'neutral': 0.859}"
5133,negative,"{'negative': 1.0, 'neutral': 0.0}"


In [None]:
sentiment_ita = politicians_cleaned['Content'].apply(italian_sentiment)

---

## Zero-shot learning

In [8]:
from transformers import pipeline

# Load zero-shot classifier
classifier = pipeline("zero-shot-classification", model="facebook/bart-large-mnli")

# Define your custom labels
labels = ["positive", "neutral", "negative"]

# Example tweet
text = "Non sono contento di come sta andando la situazione politica"

# Apply zero-shot
result = classifier(text, candidate_labels=labels)
print(result)


Device set to use mps:0


{'sequence': 'Non sono contento di come sta andando la situazione politica', 'labels': ['negative', 'neutral', 'positive'], 'scores': [0.9891007542610168, 0.005554403178393841, 0.005344857927411795]}


In [39]:
sampled_tweets = politicians_cleaned.sample(n=1000, random_state=42)
# Apply zero-shot classification
zero_shot_sample = sampled_tweets['Content'].apply(lambda x: classifier(x, candidate_labels=labels))

In [None]:
zero_shot_sample= zero_shot_sample.to_dict()

In [58]:
# Create a DataFrame with the results keeping the index
zero_shot_df = pd.DataFrame.from_dict(zero_shot_sample, orient='index')
zero_shot_df['tweets'] = sampled_tweets['Content']
zero_shot_df.drop(columns=['tweets', 'scores'], inplace=True)
zero_shot_df['labels'] = zero_shot_df['labels'].apply(lambda x: x[0])
zero_shot_df

Unnamed: 0,sequence,labels
101,Alla fine di questa avventura a me interesserà...,positive
7501,#Salvini: Ho ricordato all’avvocato Conte che ...,negative
8437,Dobbiamo spingere sulla ripartenza. Dobbiamo c...,positive
217,"Si mettano l’animo in pace: il #25settembre, s...",negative
14866,L’#Ucraina nella #UE? Per le scelte e la stori...,negative
...,...,...
9798,Zanda si ricordi che ancora tre giorni fa ripr...,negative
3421,.@berlusconi assolto a Siena nel processo Ruby...,negative
14328,A due anni dall'uscita della sentenza della Co...,negative
5133,Mentre in Italia la “giustizia” mette in galer...,negative


In [9]:
# using the zero-shot classifier
zero_shot = politicians_cleaned['Content'].apply(lambda x: classifier(x, candidate_labels=labels))

zero_shot_df1= zero_shot.to_dict()

In [10]:
# Create a DataFrame with the results keeping the index
zero_shot_df1 = pd.DataFrame.from_dict(zero_shot_df1, orient='index')
zero_shot_df1.drop(columns=['scores'], inplace=True)


zero_shot_df1['labels'] = zero_shot_df1['labels'].apply(lambda x: x[0])
zero_shot_df1

Unnamed: 0,sequence,labels
0,Congratulations to @netanyahu on the formation...,positive
1,Vogliamo restituire a questa Nazione l’ottimis...,negative
2,Cordiale conversazione telefonica con @Zelensk...,positive
3,Franco Frattini era un uomo garbato e intellig...,negative
4,Proteggere la libertà religiosa è un obiettivo...,positive
...,...,...
17240,Conte o va al Quirinale o viene in aula subito...,negative
17241,"Oltre al dannato Covid, oggi c’è un altro viru...",negative
17242,"L'improvvisazione, le liti, l'incertezza di qu...",negative
17243,La situazione sta diventando insostenibile e i...,negative


In [11]:
zero_shot_df1['labels'].value_counts()

labels
negative    11888
positive     5149
neutral       208
Name: count, dtype: int64

In [64]:
# using the zero-shot classifier
zero_shot = politicians_cleaned['processed_tweet'].apply(lambda x: classifier(x, candidate_labels=labels))

zero_shot_df1= zero_shot.to_dict()

In [65]:
# Create a DataFrame with the results keeping the index
zero_shot_df1 = pd.DataFrame.from_dict(zero_shot_df1, orient='index')
zero_shot_df1.drop(columns=['scores'], inplace=True)


zero_shot_df1['labels'] = zero_shot_df1['labels'].apply(lambda x: x[0])
zero_shot_df1

Unnamed: 0,sequence,labels
0,congratulations to netanyahu on the formation ...,positive
1,vogliamo restituire a questa nazione lottimism...,negative
2,cordiale conversazione telefonica con zelensky...,negative
3,franco frattini era un uomo garbato e intellig...,positive
4,proteggere la libert religiosa un obiettivo m...,positive
...,...,...
17240,conte o va al quirinale o viene in aula subito...,negative
17241,oltre al dannato covid oggi c un altro virus c...,negative
17242,limprovvisazione le liti lincertezza di questo...,negative
17243,la situazione sta diventando insostenibile e i...,negative


In [66]:
zero_shot_df1['labels'].value_counts()

labels
negative    12743
positive     4213
neutral       289
Name: count, dtype: int64