In [35]:
import re
import pandas as pd

import spacy
import nltk
from nltk.corpus import stopwords

from nrclex import NRCLex

from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

from textblob import TextBlob
from textblob.sentiments import NaiveBayesAnalyzer
from textblob.sentiments import PatternAnalyzer

import eng_spacysentiment

import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from torch.nn.functional import softmax

In [None]:
# --- Preprocessing resources: NLTK stop words and the spaCy English pipeline ---
nltk.download('stopwords')
stop_words = {word for word in stopwords.words('english')}

nlp = spacy.load("en_core_web_sm")

# --- VADER: lexicon/rule-based sentiment scorer ---
analyzer = SentimentIntensityAnalyzer()

# --- spaCy sentiment pipeline (eng_spacysentiment) ---
nlp2 = eng_spacysentiment.load()

# --- BERT (5-class sentiment) ---
# Useful because it should also work on Italian text.
MODEL_NAME = "nlptown/bert-base-multilingual-uncased-sentiment"
tokenizer, model = (
    AutoTokenizer.from_pretrained(MODEL_NAME),
    AutoModelForSequenceClassification.from_pretrained(MODEL_NAME),
)

# --- DistilBERT (binary SST-2 sentiment) ---
MODEL_NAME2 = "distilbert-base-uncased-finetuned-sst-2-english"
tokenizer2, model2 = (
    AutoTokenizer.from_pretrained(MODEL_NAME2),
    AutoModelForSequenceClassification.from_pretrained(MODEL_NAME2),
)

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/sylcherry/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
Device set to use cpu


In [None]:
def preprocess(text):
    """Normalize a text chunk and return its lemmas without stop words.

    The text is lower-cased, every run of non-word characters is collapsed to
    a single space, and the result is parsed with the module-level spaCy
    pipeline ``nlp``.

    Args:
        text: The raw chunk. Non-string values (e.g. NaN from an empty CSV
            cell) are treated as having no tokens.

    Returns:
        list[str]: Lemmas of the tokens that are not in ``stop_words``, not
        punctuation and not whitespace.
    """
    # A missing cell arrives as a float NaN; `.lower()` would raise on it.
    if not isinstance(text, str):
        return []

    # Strip after substitution: a chunk that starts with punctuation would
    # otherwise begin with a space, which spaCy emits as its own token.
    text = re.sub(r'\W+', ' ', text.lower()).strip()
    doc = nlp(text)
    return [
        token.lemma_
        for token in doc
        if token.text not in stop_words
        and not token.is_punct
        and not token.is_space  # guard against any remaining whitespace tokens
    ]

# Load the chunk table and attach the lemma list of every chunk as `tokens`.
df = pd.read_csv('./csv_chunks_en_filtered.csv').assign(
    tokens=lambda frame: frame['chunk'].apply(preprocess)
)

# Maps the affect labels reported by NRCLex to the column names used in this
# notebook. 'anticip' is folded into 'anticipation'; every other label maps
# to itself.
emotion_mapping = {
    'anger': 'anger',
    'anticipation': 'anticipation',
    'disgust': 'disgust',
    'fear': 'fear',
    'joy': 'joy',
    'sadness': 'sadness',
    'surprise': 'surprise',
    'trust': 'trust',
    'anticip': 'anticipation',
    'positive': 'positive',
    'negative': 'negative'
}

# De-duplicated target labels. Sorted so the column order (and therefore the
# tie-breaking of `idxmax` over these columns) is identical on every run;
# iterating a bare set of strings changes order between interpreter sessions.
final_emotions = sorted(set(emotion_mapping.values()))

def sentiment_NCR(tokens):
    """Compute a normalized NRC emotion profile for a list of tokens.

    Each token is scored separately with ``NRCLex``. Tokens whose affect
    frequencies sum to zero are ignored. For the remaining tokens the
    frequencies are normalized to sum to one, renamed through
    ``emotion_mapping`` and accumulated; the accumulated scores are averaged
    over the contributing tokens and finally rescaled so the profile sums to
    one (rounded to 3 decimals).

    Args:
        tokens: Iterable of token strings.

    Returns:
        dict: One entry per label in ``final_emotions``. All values are 0
        when no token carries any affect.
    """
    total_emotions = {emotion: 0 for emotion in final_emotions}
    emotion_count = 0

    for token in tokens:
        frequencies = NRCLex(token).affect_frequencies
        total_score = sum(frequencies.values())
        if total_score <= 0:
            continue

        for raw_label, raw_score in frequencies.items():
            label = emotion_mapping.get(raw_label)
            if not label:
                continue
            # Accumulate rather than assign: 'anticip' and 'anticipation'
            # map to the same label, and building a dict keyed by the mapped
            # label would keep only whichever of the two is iterated last.
            total_emotions[label] += raw_score / total_score

        emotion_count += 1

    average_emotions = {
        label: (score / emotion_count) if emotion_count > 0 else 0
        for label, score in total_emotions.items()
    }

    total_sum = sum(average_emotions.values())
    if total_sum > 0:
        average_emotions = {
            label: round(value / total_sum, 3)
            for label, value in average_emotions.items()
        }

    return average_emotions

# Score every token list and build the result with explicit column labels, so
# each score lands in the column of the same name (assigning a DataFrame to a
# list of columns is positional, not label-aligned).
emotion_scores = pd.DataFrame(
    df['tokens'].apply(sentiment_NCR).tolist(),
    index=df.index,
    columns=final_emotions,
)
df[final_emotions] = emotion_scores

# `idxmax` returns the first column for a row of all zeros, which would label
# chunks without any detected affect with an arbitrary emotion. Leave those
# rows missing instead.
has_emotion = df[final_emotions].sum(axis=1) > 0
df['strongest_emotion'] = df[final_emotions].idxmax(axis=1).where(has_emotion)

# Aggregate positive-leaning and negative-leaning scores per chunk.
df['tot_pos'] = df[['joy', 'trust', 'positive', 'surprise', 'anticipation']].sum(axis=1)
df['tot_neg'] = df[['sadness', 'disgust', 'fear', 'anger', 'negative']].sum(axis=1)

df

In [None]:
def sentiment_VADER(testo):
    """Score a text with the module-level VADER ``analyzer``.

    Args:
        testo: The text to score.

    Returns:
        tuple: The ``pos``, ``neg``, ``neu`` and ``compound`` entries of
        ``analyzer.polarity_scores(testo)``, in that order.
    """
    scores = analyzer.polarity_scores(testo)
    return tuple(scores[key] for key in ('pos', 'neg', 'neu', 'compound'))

# Reload the chunk table so this cell starts from the unscored data.
df = pd.read_csv('csv_chunks_en_filtered.csv')

# One row of four VADER scores per chunk, stored under the target column names.
vader_scores = df['chunk'].apply(lambda chunk: pd.Series(sentiment_VADER(chunk)))
df[['pos', 'neg', 'neu', 'polarità']] = vader_scores

df

In [None]:
def sentiment_textblob_def(testo):
    """Score a text with TextBlob's ``PatternAnalyzer``.

    Args:
        testo: The text to score.

    Returns:
        tuple: ``(polarity, subjectivity)`` as reported by the blob's
        ``sentiment`` result.
    """
    risultato = TextBlob(testo, analyzer=PatternAnalyzer()).sentiment
    return risultato.polarity, risultato.subjectivity

# Marker printed when this cell runs. It sits between the two function
# definitions, so it is emitted before any TextBlob scoring in this cell.
print('Done :)')

# Shared NaiveBayesAnalyzer, created on first use (see _get_bayes_analyzer).
_BAYES_ANALYZER = None


def _get_bayes_analyzer():
    """Return the single shared ``NaiveBayesAnalyzer``, creating it on first use."""
    global _BAYES_ANALYZER
    if _BAYES_ANALYZER is None:
        _BAYES_ANALYZER = NaiveBayesAnalyzer()
    return _BAYES_ANALYZER


def sentiment_textblob_bayes(testo):
    """Classify a text with TextBlob's ``NaiveBayesAnalyzer``.

    The analyzer instance is shared across calls. Per the TextBlob
    documentation the Naive Bayes analyzer trains its classifier when it is
    first used, so constructing a new one for every text repeats that
    training for each row.

    Args:
        testo: The text to classify.

    Returns:
        tuple: ``(classification, p_pos, p_neg)`` from the blob's
        ``sentiment`` result.
    """
    risultato = TextBlob(testo, analyzer=_get_bayes_analyzer()).sentiment
    return risultato.classification, risultato.p_pos, risultato.p_neg

# Reload the chunk table so this cell starts from the unscored data.
df = pd.read_csv('csv_chunks_en_filtered.csv')

# PatternAnalyzer scores: polarity and subjectivity per chunk.
pattern_scores = df['chunk'].apply(lambda chunk: pd.Series(sentiment_textblob_def(chunk)))
df[['polarità', 'soggettività']] = pattern_scores

# NaiveBayesAnalyzer scores: predicted class plus both class probabilities.
bayes_scores = df['chunk'].apply(lambda chunk: pd.Series(sentiment_textblob_bayes(chunk)))
df[['classificazione', 'p_pos', 'p_neg']] = bayes_scores

df

In [None]:
def sentiment_spacy(text):
    """Run the module-level ``nlp2`` pipeline on a text and return its ``cats``.

    Args:
        text: The text to score.

    Returns:
        The ``cats`` attribute of the processed document.
    """
    return nlp2(text).cats

# Reload the chunk table so this cell starts from the unscored data.
df = pd.read_csv('csv_chunks_en_filtered.csv')

# Build the score table from the per-chunk `cats` dicts and pick the columns
# by label. Assigning a DataFrame to a list of columns is positional, so
# relying on the key order of `doc.cats` could silently store a score under
# the wrong name; selecting by label raises a KeyError if a label is absent.
spacy_labels = ['positive', 'negative', 'neutral']
spacy_scores = pd.DataFrame(df['chunk'].apply(sentiment_spacy).tolist(), index=df.index)
df[spacy_labels] = spacy_scores[spacy_labels]

df

In [None]:
def sentiment_bert(text):
    """Score a text with the module-level ``tokenizer`` / ``model`` pair.

    The text is tokenized (truncated to at most 512 tokens), passed through
    the model without gradient tracking, and the logits are turned into
    probabilities with a softmax over the last dimension.

    Args:
        text: The text to score.

    Returns:
        dict: Probabilities keyed ``very_negative``, ``negative``,
        ``neutral``, ``positive`` and ``very_positive``, taken from output
        positions 0 through 4 respectively.
    """
    encoded = tokenizer(text, return_tensors="pt", truncation=True, padding=True, max_length=512)
    with torch.no_grad():
        logits = model(**encoded).logits
    probabilities = softmax(logits, dim=-1).squeeze().tolist()

    labels = ("very_negative", "negative", "neutral", "positive", "very_positive")
    return {label: probabilities[position] for position, label in enumerate(labels)}

# Reload the chunk table so this cell starts from the unscored data.
df = pd.read_csv('csv_chunks_en_filtered.csv')

# Expand the per-chunk score dicts into columns and attach them by index.
bert_scores = df['chunk'].apply(sentiment_bert).apply(pd.Series)
df = df.join(bert_scores)

df

In [None]:
def sentiment_distilbert(text):
    """Score a text with the module-level ``tokenizer2`` / ``model2`` pair.

    The text is tokenized (truncated to at most 512 tokens), passed through
    the model without gradient tracking, and the logits are turned into
    probabilities with a softmax over the last dimension.

    Args:
        text: The text to score.

    Returns:
        dict: Probabilities keyed ``negative`` (output position 0) and
        ``positive`` (output position 1).
    """
    encoded = tokenizer2(text, return_tensors="pt", truncation=True, padding=True, max_length=512)
    with torch.no_grad():
        logits = model2(**encoded).logits
    probabilities = softmax(logits, dim=-1).squeeze().tolist()

    labels = ("negative", "positive")
    return {label: probabilities[position] for position, label in enumerate(labels)}

# Reload the chunk table so this cell starts from the unscored data.
df = pd.read_csv('csv_chunks_en_filtered.csv')

# Expand the per-chunk score dicts into columns and attach them by index.
distilbert_scores = df['chunk'].apply(sentiment_distilbert).apply(pd.Series)
df = df.join(distilbert_scores)

df