# Sentiment Analysis with FinBERT-PT-BR

This notebook implements sentiment analysis of InfoMoney news using the **FinBERT-PT-BR** model (state-of-the-art for finance in Portuguese).

## Objectives:
1. Load and test the FinBERT-PT-BR model
2. Process all 11,504 news articles
3. Perform exploratory analysis of sentiments
4. Aggregate sentiments by date

---

## 1. Setup & Imports

In [None]:
import sys
import os

# Intended as a fix for Unicode errors on Windows.
# NOTE(review): Python reads PYTHONUTF8 at interpreter startup, so assigning it
# here presumably only affects child processes, not this running kernel — confirm.
os.environ["PYTHONUTF8"] = "1"

# Add the parent directory to the import path so the `src` package resolves.
# The path is resolved against the current working directory.
sys.path.append(os.path.abspath('..'))

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime

# Project modules
from src.sentiment.sentiment_analyzer import (
    setup_sentiment_model,
    predict_sentiment,
    predict_batch,
    analyze_news_file
)
from src.sentiment.daily_aggregation import aggregate_daily_sentiment

# Visualization settings: matplotlib style sheet, seaborn palette, inline figures
plt.style.use('seaborn-v0_8-darkgrid')
sns.set_palette('husl')
%matplotlib inline

print("Imports complete!")

## 2. GPU Verification

In [None]:
import torch

# Report the PyTorch build and whether a CUDA device can be used.
cuda_ok = torch.cuda.is_available()
print(f"PyTorch Version: {torch.__version__}")
print(f"CUDA Available: {cuda_ok}")

if not cuda_ok:
    print("GPU not detected. Processing will be on CPU (slower).")
else:
    # Describe device 0: name, CUDA version of the build, memory in GB (bytes / 1e9).
    print(f"GPU Detected: {torch.cuda.get_device_name(0)}")
    print(f"CUDA Version: {torch.version.cuda}")
    total_gb = torch.cuda.get_device_properties(0).total_memory / 1e9
    print(f"Total Memory: {total_gb:.2f} GB")

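## 2.1 Manual Validation (TCC Requirement)
Compares the model's predictions against manually assigned labels to measure how accurate the model is on this dataset. (TCC: *Trabalho de Conclusão de Curso*, the final undergraduate thesis.)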

In [None]:
# --- MANUAL MODEL VALIDATION ---
# Scores the manually labelled titles with the model, compares the predicted
# class against the manual label (accuracy, classification report, confusion
# matrix) and writes the per-row results to a CSV next to the input file.

# Load manual validation dataset
manual_labels_path = r'../src/dataset/sentiment/news_with_sentiment_manual_labeling.csv'
df_manual = pd.read_csv(manual_labels_path)

print(f"Loaded validation dataset with {len(df_manual)} samples.")

# Convert manual sentiment to model format if necessary (-1, 0, 1) -> (negative, neutral, positive)
# Assuming file has 'sentiment' column with numeric or mappable string values.
# Values that are not keys of this dict become NaN in `.map` below.
label_map = {
    -1: 'negative',
    0: 'neutral',
    1: 'positive',
    '-1': 'negative',
    '0': 'neutral',
    '1': 'positive'
}

if 'sentiment' in df_manual.columns:
    # Ensure we have comparable labels
    df_manual['manual_label'] = df_manual['sentiment'].map(label_map)

    # Load model
    # NOTE(review): confirm this unpacking order matches the order in which
    # setup_sentiment_model actually returns its values.
    model, tokenizer, device = setup_sentiment_model()

    # Make predictions on validation dataset
    print("Making predictions on validation dataset...")
    # predict_batch returns a DataFrame with columns: prob_neg, prob_neu, prob_pos, sentiment_score
    # NOTE(review): every title is passed as-is; assumes the 'title' column has no missing values — confirm.
    df_results = predict_batch(df_manual['title'].tolist(), model, tokenizer, device)

    # Extract predicted label from probabilities
    # Map columns to labels: prob_neg -> negative, prob_neu -> neutral, prob_pos -> positive
    prob_cols = ['prob_neg', 'prob_neu', 'prob_pos']
    col_to_label = {'prob_neg': 'negative', 'prob_neu': 'neutral', 'prob_pos': 'positive'}

    # Find the column with max probability for each row and map to label.
    # These assignments align on index; assumes df_results shares df_manual's
    # index (row order preserved by predict_batch) — TODO confirm.
    df_manual['predicted_label'] = df_results[prob_cols].idxmax(axis=1).map(col_to_label)
    df_manual['predicted_score'] = df_results['sentiment_score']

    # Calculate metrics
    from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

    # Remove NaN if any (in case some manual labels were not mapped)
    df_valid = df_manual.dropna(subset=['manual_label', 'predicted_label'])

    print("\n" + "="*50)
    print("MANUAL VALIDATION REPORT")
    print("="*50)
    print(f"Samples used: {len(df_valid)}")

    print("\n--- General Metrics ---")
    acc = accuracy_score(df_valid['manual_label'], df_valid['predicted_label'])
    print(f"Accuracy: {acc:.4f}")

    print("\n--- Classification Report ---")
    print(classification_report(df_valid['manual_label'], df_valid['predicted_label']))

    print("\n--- Confusion Matrix ---")
    # Fixed label order so the matrix rows/columns match the heatmap tick labels.
    labels = ['negative', 'neutral', 'positive']
    cm = confusion_matrix(df_valid['manual_label'], df_valid['predicted_label'], labels=labels)

    # Plot matrix (rows: manual label, columns: predicted label)
    plt.figure(figsize=(8, 6))
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', xticklabels=labels, yticklabels=labels)
    plt.xlabel('Predicted')
    plt.ylabel('Actual (Manual)')
    plt.title('Confusion Matrix: FinBERT vs Manual Classification')
    plt.show()

    # Save validation results: the full df_manual (including rows excluded from
    # the metrics) goes to "<input name>_results.csv".
    output_val_path = manual_labels_path.replace('.csv', '_results.csv')
    df_manual.to_csv(output_val_path, index=False)
    print(f"\nDetailed results saved to: {output_val_path}")
else:
    # Without the 'sentiment' column nothing is validated and no model is loaded.
    print("ERROR: 'sentiment' column not found in manual validation file.")

## 4. Full Dataset Processing

Load the model:


In [None]:
# Load the FinBERT-PT-BR model through the project helper.
print("Loading FinBERT-PT-BR model...\n")

# Unpack in the same order as the manual-validation cell, which hands these
# objects to predict_batch(texts, model, tokenizer, device). The previous
# `tokenizer, model, device` order swapped the two global names relative to
# that cell.
# NOTE(review): confirm against setup_sentiment_model's actual return order.
model, tokenizer, device = setup_sentiment_model()

print("\nModel ready for use!")

Now let's process the InfoMoney news.

In [5]:
# File paths: consolidated scraper output (input) and scored news (output)
input_path = '../src/dataset/scraper/consolidated_news_20260209.csv'
output_path = '../src/dataset/sentiment/news_with_sentiment.csv'

# Upper bound applied to the news `date` column before scoring
END_DATE = datetime(2025, 12, 31)

for message in (
    f" Arquivo de entrada: {input_path}",
    f" Arquivo de saída: {output_path}",
    "\n Processamento iniciado... (isso pode levar alguns minutos)\n",
):
    print(message)


 Arquivo de entrada: ../src/dataset/scraper/consolidated_news_20260209.csv
 Arquivo de saída: ../src/dataset/sentiment/news_with_sentiment.csv

 Processamento iniciado... (isso pode levar alguns minutos)



In [None]:
# Load the consolidated news and parse the `date` column into timestamps.
df_input = pd.read_csv(input_path)
df_input['date'] = pd.to_datetime(df_input['date'])

# Keep everything up to and including END_DATE's calendar day. A plain
# `date <= END_DATE` compares against midnight and would silently drop any
# article stamped later on that final day, so use an exclusive bound at the
# start of the following day instead. For date-only values the result is
# unchanged.
end_exclusive = pd.Timestamp(END_DATE).normalize() + pd.Timedelta(days=1)
df_input = df_input[df_input['date'] < end_exclusive]

# analyze_news_file takes a CSV path, so write the filtered subset next to the input.
input_path_filtered = input_path.replace('.csv', '_filtered.csv')
df_input.to_csv(input_path_filtered, index=False)

# Process all news
# NOTE: With GPU, should take ~5-10 minutes. With CPU, ~20-30 minutes.

df_with_sentiment = analyze_news_file(
    input_csv_path=input_path_filtered,
    output_csv_path=output_path,
    text_column='title',
    batch_size=32  # Adjust to 64 or 128 if you have a powerful GPU
)

print("\nProcessing complete!")


## 5. Exploratory Data Analysis

In [None]:
# Re-read the scored news from disk so this section also runs after a kernel
# restart, then convert the `date` column to timestamps.
df_with_sentiment = pd.read_csv(output_path)
df_with_sentiment = df_with_sentiment.assign(
    date=pd.to_datetime(df_with_sentiment['date'])
)

first_seen = df_with_sentiment['date'].min()
last_seen = df_with_sentiment['date'].max()

print(f"Total news processed: {len(df_with_sentiment):,}")
print(f"Period: {first_seen} to {last_seen}")
print("\nFirst rows:")
df_with_sentiment.head()

### 5.1 Descriptive Statistics

In [None]:
# Summary statistics for the sentiment score and the three class probabilities.
stats_rule = "=" * 60
probability_columns = ['prob_neg', 'prob_neu', 'prob_pos']

print(stats_rule)
print("SENTIMENT STATISTICS")
print(stats_rule)

score_summary = df_with_sentiment['sentiment_score'].describe()
print("\nSentiment Score:")
print(score_summary)

probability_summary = df_with_sentiment[probability_columns].describe()
print("\nProbabilities:")
print(probability_summary)

### 5.2 Sentiment Distribution

In [None]:
# Bucket each score into a class. pd.cut intervals are right-closed by default,
# so a score of exactly -0.2 is 'Negative' and exactly 0.2 is 'Neutral'.
score_edges = [-np.inf, -0.2, 0.2, np.inf]
class_names = ['Negative', 'Neutral', 'Positive']
df_with_sentiment['sentiment_class'] = pd.cut(
    df_with_sentiment['sentiment_score'],
    bins=score_edges,
    labels=class_names,
)

# Class counts and their share of all rows
sentiment_counts = df_with_sentiment['sentiment_class'].value_counts()
sentiment_share = sentiment_counts / len(df_with_sentiment) * 100

print("\nSentiment Distribution:")
print(sentiment_counts)
print("\nPercentages:")
print(sentiment_share)

In [None]:
# 2x2 overview: score histogram, class pie chart, probability boxplots and
# monthly mean score.
fig, axes = plt.subplots(2, 2, figsize=(15, 10))

# 1. Score histogram
axes[0, 0].hist(df_with_sentiment['sentiment_score'], bins=50, edgecolor='black', alpha=0.7)
axes[0, 0].axvline(x=0, color='red', linestyle='--', linewidth=2, label='Neutral')
axes[0, 0].set_title('Sentiment Score Distribution', fontsize=14, fontweight='bold')
axes[0, 0].set_xlabel('Sentiment Score')
axes[0, 0].set_ylabel('Frequency')
axes[0, 0].legend()

# 2. Classification pie chart
# value_counts() orders classes by frequency, not Negative/Neutral/Positive, so
# a fixed colour list can paint a wedge with another class's colour. Look the
# colour up by label so red/grey/green always mean negative/neutral/positive.
class_colors = {'Negative': '#e74c3c', 'Neutral': '#95a5a6', 'Positive': '#2ecc71'}
colors_pie = [class_colors[str(label)] for label in sentiment_counts.index]
sentiment_counts.plot(kind='pie', ax=axes[0, 1], autopct='%1.1f%%', colors=colors_pie, startangle=90)
axes[0, 1].set_title('Sentiment Classification', fontsize=14, fontweight='bold')
axes[0, 1].set_ylabel('')

# 3. Probability boxplot
df_with_sentiment[['prob_neg', 'prob_neu', 'prob_pos']].boxplot(ax=axes[1, 0])
axes[1, 0].set_title('Probability Distribution', fontsize=14, fontweight='bold')
axes[1, 0].set_ylabel('Probability')
axes[1, 0].set_xticklabels(['Negative', 'Neutral', 'Positive'])

# 4. Monthly time evolution: mean score per month-end ('ME') bucket
df_monthly = df_with_sentiment.set_index('date').resample('ME')['sentiment_score'].mean()
df_monthly.plot(ax=axes[1, 1], marker='o', linewidth=2)
axes[1, 1].axhline(y=0, color='red', linestyle='--', linewidth=1)
axes[1, 1].set_title('Monthly Average Sentiment', fontsize=14, fontweight='bold')
axes[1, 1].set_xlabel('Date')
axes[1, 1].set_ylabel('Average Sentiment Score')
axes[1, 1].grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

### 5.3 Top Most Positive and Negative News

In [None]:
def _print_ranked_news(ranked):
    """Print score, date and title for every row of ``ranked``, each followed by a blank line."""
    for _, item in ranked.iterrows():
        print(f"Score: {item['sentiment_score']:+.3f} | {item['date'].date()}")
        print(f"  '{item['title']}'")
        print()


shown_columns = ['date', 'title', 'sentiment_score']
banner = "=" * 80

# Ten highest-scoring titles
print(banner)
print("TOP 10 MOST POSITIVE NEWS")
print(banner + "\n")
top_positive = df_with_sentiment.nlargest(10, 'sentiment_score')[shown_columns]
_print_ranked_news(top_positive)

# Ten lowest-scoring titles
print("\n" + banner)
print("TOP 10 MOST NEGATIVE NEWS")
print(banner + "\n")
top_negative = df_with_sentiment.nsmallest(10, 'sentiment_score')[shown_columns]
_print_ranked_news(top_negative)

## 6. Daily Aggregation

Group sentiments by date to create the time series that will be correlated with BOVA11 returns.

In [None]:
# Collapse the per-article scores into one row per date via the project helper.
daily_sentiment = aggregate_daily_sentiment(df_with_sentiment)

print(f"Total days with news: {len(daily_sentiment)}")
first_day = daily_sentiment['date'].min().date()
last_day = daily_sentiment['date'].max().date()
print(f"Period: {first_day} to {last_day}")
print("\nFirst rows:")
daily_sentiment.head(10)

### 6.1 Daily Statistics

In [None]:
# Describe three columns of the daily table, each under its own heading.
daily_rule = "=" * 60
print(daily_rule)
print("DAILY STATISTICS")
print(daily_rule + "\n")

for heading, column in (
    ("News per day:", 'news_count'),
    ("\nDaily average sentiment:", 'sentiment_mean'),
    ("\nSentiment momentum:", 'sentiment_momentum'),
):
    print(heading)
    print(daily_sentiment[column].describe())

In [None]:
# Three stacked panels over the daily table: mean score, article volume and
# per-class article counts.
fig, axes = plt.subplots(3, 1, figsize=(15, 12))
ax_mean, ax_volume, ax_stack = axes

days = daily_sentiment['date']
mean_score = daily_sentiment['sentiment_mean']

# 1. Daily average sentiment, shaded green above zero and red at or below it
ax_mean.plot(days, mean_score, linewidth=1.5, alpha=0.7)
ax_mean.axhline(y=0, color='red', linestyle='--', linewidth=1)
for mask, shade, name in (
    (mean_score > 0, 'green', 'Positive'),
    (mean_score <= 0, 'red', 'Negative'),
):
    ax_mean.fill_between(days, mean_score, 0, where=mask, alpha=0.3, color=shade, label=name)
ax_mean.set_title('Daily Average Sentiment', fontsize=14, fontweight='bold')
ax_mean.set_ylabel('Sentiment Score')
ax_mean.legend()
ax_mean.grid(True, alpha=0.3)

# 2. News volume
ax_volume.bar(days, daily_sentiment['news_count'], alpha=0.7)
ax_volume.set_title('News Volume per Day', fontsize=14, fontweight='bold')
ax_volume.set_ylabel('Number of News')
ax_volume.grid(True, alpha=0.3, axis='y')

# 3. Stacked sentiment distribution: each class sits on top of the running total
stack_base = 0
for column, name, shade in (
    ('count_negative', 'Negative', '#e74c3c'),
    ('count_neutral', 'Neutral', '#95a5a6'),
    ('count_positive', 'Positive', '#2ecc71'),
):
    ax_stack.bar(days, daily_sentiment[column], bottom=stack_base, label=name, color=shade)
    stack_base = stack_base + daily_sentiment[column]
ax_stack.set_title('Daily Sentiment Distribution', fontsize=14, fontweight='bold')
ax_stack.set_xlabel('Date')
ax_stack.set_ylabel('Number of News')
ax_stack.legend()
ax_stack.grid(True, alpha=0.3, axis='y')

plt.tight_layout()
plt.show()

### 6.2 Save Aggregated Data

In [None]:
# Write the daily table to CSV (without the index) and report what was saved.
daily_output_path = '../src/dataset/sentiment/daily_sentiment.csv'
daily_sentiment.to_csv(daily_output_path, index=False)

saved_columns = list(daily_sentiment.columns)
print(f"Daily data saved to: {daily_output_path}")
print(f"   Total days: {len(daily_sentiment)}")
print(f"   Columns: {saved_columns}")

## 7. Final Summary

In [None]:
# Closing recap: data volume, overall statistics, class shares and output files.
summary_rule = "=" * 80
total_news = len(df_with_sentiment)
scores = df_with_sentiment['sentiment_score']
first_day = df_with_sentiment['date'].min().date()
last_day = df_with_sentiment['date'].max().date()

print(summary_rule)
print("SENTIMENT ANALYSIS SUMMARY")
print(summary_rule)

print("\nProcessed Data:")
print(f"   - Total news: {total_news:,}")
print(f"   - Period: {first_day} to {last_day}")
print(f"   - Days with news: {len(daily_sentiment)}")

print("\nGeneral Statistics:")
print(f"   - Overall average sentiment: {scores.mean():+.4f}")
print(f"   - Standard deviation: {scores.std():.4f}")
for class_name in ('Positive', 'Neutral', 'Negative'):
    class_total = sentiment_counts[class_name]
    print(f"   - {class_name} news: {class_total:,} ({class_total/total_news*100:.1f}%)")

print("\nGenerated Files:")
print(f"   1. {output_path}")
print("      All news with sentiment score")
print(f"   2. {daily_output_path}")
print("      Daily sentiment aggregation")

print("\nNext step: Run notebook 03_sentiment_market_merge.ipynb")
print("   to merge with BOVA11 data!")
print("\n" + summary_rule)