In [None]:
import pandas as pd
import numpy as np
import torch
from transformers import BertTokenizer, BertForSequenceClassification, pipeline
from tqdm import tqdm 

# Kaggle input/output locations and the Hugging Face checkpoint to load.
INPUT_FILE = '/kaggle/input/spy-news/SPY_news_only.csv'
OUTPUT_FILE = '/kaggle/working/spy_news_sentiment.csv'
MODEL_NAME = "ProsusAI/finbert"

# Device index handed to the transformers pipeline further down:
# 0 selects the first CUDA GPU, -1 selects the CPU. The name is assigned on
# both branches so the later pipeline(..., device=device) call does not fail
# with NameError on a machine without CUDA.
if torch.cuda.is_available():
    device = 0
    print(f" GPU Attiva: {torch.cuda.get_device_name(0)}")
else:
    device = -1
    print("GPU non disponibile: uso la CPU")

# Load the news CSV, discard rows without a headline, and collect the
# remaining headlines in row order.
df = pd.read_csv(INPUT_FILE).dropna(subset=['Article_title'])
titles = df['Article_title'].tolist()
print(f"Totale titoli da analizzare: {len(titles)}")

# Tokenizer and classification head both come from the same checkpoint.
tokenizer = BertTokenizer.from_pretrained(MODEL_NAME)
model = BertForSequenceClassification.from_pretrained(MODEL_NAME)

# return_all_scores=True asks the pipeline for the score of every label
# rather than only the top one; the code below reads all three by name.
nlp = pipeline(
    "sentiment-analysis",
    model=model,
    tokenizer=tokenizer,
    device=device,
    return_all_scores=True,
)

print("Inizio analisi del sentiment")

results = []
batch_size = 32

# Score the headlines slice by slice so tqdm can report progress.
for i in tqdm(range(0, len(titles), batch_size)):
    batch = titles[i:i+batch_size]
    # batch_size must be passed explicitly: handing the pipeline a list does
    # not by itself make it batch the forward pass (its default is 1).
    # truncation/max_length keep an unusually long headline from exceeding
    # the model's 512-position limit and aborting the whole run.
    batch_results = nlp(
        batch,
        batch_size=batch_size,
        truncation=True,
        max_length=512,
    )
    results.extend(batch_results)


# Turn each per-headline list of {'label', 'score'} dicts into a
# label -> score mapping, then pull out one list per label (0 if missing).
score_maps = [
    {entry['label']: entry['score'] for entry in res}
    for res in results
]
positive_scores = [by_label.get('positive', 0) for by_label in score_maps]
negative_scores = [by_label.get('negative', 0) for by_label in score_maps]
neutral_scores = [by_label.get('neutral', 0) for by_label in score_maps]

df['sentiment_positive'] = positive_scores
df['sentiment_negative'] = negative_scores
df['sentiment_neutral'] = neutral_scores

# The predicted label is the name of the highest-scoring column with the
# 'sentiment_' prefix removed.
score_columns = ['sentiment_positive', 'sentiment_negative', 'sentiment_neutral']
winning_column = df[score_columns].idxmax(axis=1)
df['sentiment_label'] = winning_column.str.replace('sentiment_', '')

df.to_csv(OUTPUT_FILE, index=False)

preview_columns = ['Article_title', 'sentiment_label', 'sentiment_positive', 'sentiment_negative']
print(df[preview_columns].head())

In [None]:
import pandas as pd
import numpy as np
import torch
from transformers import BertTokenizer, BertForSequenceClassification, pipeline
from tqdm import tqdm 

# Kaggle input/output locations and the Hugging Face checkpoint to load.
INPUT_FILE = '/kaggle/input/spy-news/SPY_news_only.csv'
OUTPUT_FILE = '/kaggle/working/spy_news_sentiment_lexrank_summary.csv'
MODEL_NAME = "ProsusAI/finbert"

# Device index handed to the transformers pipeline further down:
# 0 selects the first CUDA GPU, -1 selects the CPU. The name is assigned on
# both branches so the later pipeline(..., device=device) call does not fail
# with NameError on a machine without CUDA.
if torch.cuda.is_available():
    device = 0
    print(f"GPU Attiva: {torch.cuda.get_device_name(0)}")
else:
    device = -1
    print("GPU non disponibile: uso la CPU")
# Column whose text is scored in this cell.
col = 'Lexrank_summary'

# Load the news CSV, discard rows where that column is missing, and collect
# the remaining texts in row order.
df = pd.read_csv(INPUT_FILE).dropna(subset=[col])
titles = df[col].tolist()

print(f"Totale istanze da analizzare: {len(titles)}")

# Tokenizer and classification head both come from the same checkpoint.
tokenizer = BertTokenizer.from_pretrained(MODEL_NAME)
model = BertForSequenceClassification.from_pretrained(MODEL_NAME)

# return_all_scores=True asks the pipeline for the score of every label
# rather than only the top one.
nlp = pipeline("sentiment-analysis", model=model, tokenizer=tokenizer, device=device, return_all_scores=True)

def _fallback_scores():
    """Return a fresh, near-uniform score list used when no score is available."""
    return [
        {'label': 'positive', 'score': 0.33},
        {'label': 'negative', 'score': 0.33},
        {'label': 'neutral', 'score': 0.34},
    ]


def analyze_long_text(text, max_length=500):
    """Score one text with the module-level ``nlp`` pipeline, windowing long inputs.

    The text is tokenized (without special tokens) with the module-level
    ``tokenizer``. If it has at most ``max_length`` tokens it is sent to
    ``nlp`` in one call. Otherwise it is cut into windows of ``max_length``
    tokens that start every ``max_length - max_length // 8`` tokens, each
    window is decoded back to text and scored, and the per-label scores are
    averaged with equal weight per window.

    NOTE: the last window can be short and fully contained in the previous
    one; it still counts as one full vote in the average.

    Args:
        text: Text to score. NaN or non-string values are treated as "".
        max_length: Window size in tokens; must be at least 1.

    Returns:
        A list of ``{'label': ..., 'score': ...}`` dicts. On the windowed
        path the labels are 'positive', 'negative', 'neutral' in that order.
        The near-uniform fallback (0.33 / 0.33 / 0.34) is returned when
        scoring fails or there is nothing to score.

    Raises:
        ValueError: If ``max_length`` is smaller than 1.
    """
    if max_length < 1:
        # A zero window would otherwise surface as an opaque range() error.
        raise ValueError(f"max_length must be >= 1, got {max_length}")

    if pd.isna(text) or not isinstance(text, str):
        text = ""

    tokens = tokenizer.encode(text, add_special_tokens=False)

    if len(tokens) <= max_length:
        try:
            return nlp(text)[0]
        except Exception as e:
            # Exception (not a bare except) so KeyboardInterrupt/SystemExit
            # still stop a long run instead of being turned into a score.
            print(f"Errore nell'analisi del testo: {e}")
            return _fallback_scores()

    overlap = max_length // 8
    stride = max_length - overlap

    chunks = []
    for start in range(0, len(tokens), stride):
        chunk_text = tokenizer.decode(
            tokens[start:start + max_length],
            skip_special_tokens=True,
        )
        # Windows that decode to nothing but whitespace carry no signal.
        if chunk_text.strip():
            chunks.append(chunk_text)

    if not chunks:
        return _fallback_scores()

    all_scores = []
    for chunk in chunks:
        try:
            all_scores.append(nlp(chunk)[0])
        except Exception as e:
            # Best effort: a failing window is reported and left out.
            print(f"Errore nell'analisi di un chunk: {e}")

    if not all_scores:
        return _fallback_scores()

    averaged = []
    for label in ('positive', 'negative', 'neutral'):
        scores_for_label = [
            next((item['score'] for item in scores if item['label'] == label), 0)
            for scores in all_scores
        ]
        # float() keeps the score type a plain Python float on every path.
        averaged.append({'label': label, 'score': float(np.mean(scores_for_label))})

    return averaged

print("Inizio analisi del sentiment con sliding window")

# One score list per text, in the same order as the rows of df.
results = []

for text in tqdm(titles, desc="Analisi sentiment"):
    results.append(analyze_long_text(text))

# Turn each list of {'label', 'score'} dicts into a label -> score mapping,
# then pull out one list per label (0 if a label is missing).
score_maps = [{item['label']: item['score'] for item in res} for res in results]
positive_scores = [by_label.get('positive', 0) for by_label in score_maps]
negative_scores = [by_label.get('negative', 0) for by_label in score_maps]
neutral_scores = [by_label.get('neutral', 0) for by_label in score_maps]

df['sentiment_positive'] = positive_scores
df['sentiment_negative'] = negative_scores
df['sentiment_neutral'] = neutral_scores
# Predicted label = name of the highest-scoring column minus its prefix.
df['sentiment_label'] = df[['sentiment_positive', 'sentiment_negative', 'sentiment_neutral']].idxmax(axis=1).str.replace('sentiment_', '')

# Token count of each text, without special tokens.
df['text_length'] = df[col].apply(lambda x: len(tokenizer.encode(str(x), add_special_tokens=False)) if pd.notna(x) else 0)

# Number of windows analyze_long_text generates with its default settings:
# a single call up to 500 tokens, otherwise one 500-token window every
# 500 - 500 // 8 tokens. (Windows that fail or decode to blank text are
# skipped by the function, so the number actually scored can be lower.)
window_size = 500
window_stride = window_size - window_size // 8
df['num_chunks'] = df['text_length'].apply(
    lambda n: 1 if n <= window_size else int(np.ceil(n / window_stride))
)

df.to_csv(OUTPUT_FILE, index=False)

print("\n=== ANALISI COMPLETATA ===")
print(f"File salvato in: {OUTPUT_FILE}")
print("\n=== STATISTICHE ===")
print(f"Testi analizzati: {len(df)}")
print("\n=== DISTRIBUZIONE SENTIMENT ===")
print(df['sentiment_label'].value_counts())
print("\n=== ESEMPI ===")
# Preview the column that was actually scored (col), not a different summary.
print(df[[col, 'sentiment_label', 'sentiment_positive', 'sentiment_negative', 'text_length', 'num_chunks']].head(10))