# Merge: Sentiment + BOVA11 Returns

This notebook merges daily sentiment data with logarithmic returns from BOVA11.

## Objectives:
1. Load sentiment and returns data
2. Perform date-based merge
3. Create lagged variables
4. Preliminary correlation analysis
5. Save final dataset for statistical analysis

---

## 1. Setup & Imports

In [None]:
import sys
import os

# Add root directory to path
sys.path.append(os.path.abspath('..'))

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime

# Project modules
from src.sentiment.daily_aggregation import (
    merge_with_market_data,
    create_lagged_features,
    calculate_sentiment_correlation
)

# Visualization settings
# Apply the matplotlib style sheet first, then the seaborn palette.
# NOTE(review): presumably this order matters (a style sheet applied later may
# override the color cycle set by seaborn) - confirm before reordering.
# NOTE(review): the 'seaborn-v0_8-*' style names only exist in recent
# matplotlib releases - confirm against the project's pinned version.
plt.style.use('seaborn-v0_8-darkgrid')
sns.set_palette('husl')
# IPython magic (not plain Python): render figures inline in the notebook.
%matplotlib inline

print("Imports complete!")

## 2. Data Loading

### 2.1 Daily Sentiment Data

In [None]:
# Daily sentiment CSV written by the previous notebook in the pipeline
sentiment_path = '../src/dataset/sentiment/daily_sentiment.csv'

print(f"Loading daily sentiment: {sentiment_path}")

# Read the file and convert the 'date' column to datetime in one chained step
df_sentiment = pd.read_csv(sentiment_path).assign(
    date=lambda frame: pd.to_datetime(frame['date'])
)

# Report size and date coverage of what was loaded
sentiment_dates = df_sentiment['date']
print(f"   Total days: {len(df_sentiment)}")
print(f"   Period: {sentiment_dates.min().date()} to {sentiment_dates.max().date()}")
print(f"\nFirst rows:")
df_sentiment.head()

### 2.2 BOVA11 Returns Data

In [None]:
import glob

# Locate the BOVA11 log-return exports
market_dir = '../src/dataset/market_data/'
log_return_files = glob.glob(os.path.join(market_dir, 'BOVA11_log_returns_*.csv'))

# Fail fast: printing a message and continuing would leave `df_market`
# undefined (or stale from an earlier run), and the merge cell would then fail
# with an unrelated-looking error - or silently use old data.
if not log_return_files:
    raise FileNotFoundError(
        f"No log returns files found in {market_dir!r}. "
        "Run first: src/cotation/calculate_log_returns.py"
    )

# Pick the lexicographically greatest file name.
# NOTE(review): this is "the most recent" only if the suffix after
# 'BOVA11_log_returns_' sorts chronologically (e.g. YYYYMMDD) - confirm.
market_path = max(log_return_files)

print(f"Loading BOVA11 returns: {market_path}")
df_market = pd.read_csv(market_path)
df_market['Date'] = pd.to_datetime(df_market['Date'])

# Standardize the date column name so it matches the sentiment frame.
# A new frame is assigned instead of using inplace=True.
df_market = df_market.rename(columns={'Date': 'date'})

print(f"   Total trading days: {len(df_market)}")
print(f"   Period: {df_market['date'].min().date()} to {df_market['date'].max().date()}")
print(f"\nFirst rows:")
df_market.head()

## 3. Data Merge

In [None]:
# Last date (inclusive) kept in the analysis sample
END_DATE = datetime(2025, 12, 31)

print("Performing merge...\n")

# Inner join on date: keep only days that have both news sentiment AND a
# trading session.
df_merged = merge_with_market_data(
    sentiment_df=df_sentiment,
    market_df=df_market,
    date_column='date',
    how='inner'
)

# Apply the cutoff BEFORE reporting, so that the day count, period and
# missing-value check below describe the rows that are actually carried
# forward (previously the report was printed for the unfiltered frame).
df_merged = df_merged[df_merged['date'] <= END_DATE]

print(f"Merge complete!")
print(f"   Days in final dataset: {len(df_merged)}")
print(f"   Period: {df_merged['date'].min().date()} to {df_merged['date'].max().date()}")

# Check for missing values
missing = df_merged.isnull().sum()
if missing.sum() > 0:
    print(f"\nMissing values:")
    print(missing[missing > 0])
else:
    print(f"\nNo missing values!")

df_merged.head(10)


## 4. Creating Lagged Variables

We create lags to test different hypotheses:
- **Lag 0**: Sentiment at t vs Return at t (contemporaneous correlation)
- **Lag 1**: Sentiment at t vs Return at t+1 (sentiment precedes return)
- **Lag 2, 3**: Longer-term effects

In [None]:
# Sentiment features for which previous-day versions are generated
sentiment_columns = [
    'sentiment_mean',
    'sentiment_std',
    'sentiment_momentum',
    'news_count',
    'count_positive',
    'count_negative',
]

# Build the lag-1, lag-2 and lag-3 versions of every selected column
df_with_lags = create_lagged_features(
    df=df_merged,
    columns_to_lag=sentiment_columns,
    lags=[1, 2, 3],
    date_column='date',
)

# Every column whose name contains 'lag'
lag_cols = list(filter(lambda name: 'lag' in name, df_with_lags.columns))

print(f"Lagged variables created!")
print(f"   Total columns: {df_with_lags.shape[1]}")
print(f"\nNew columns created:")
print(lag_cols[:10])  # only the first 10, to keep the output short

# Drop every row that has a NaN in ANY column (this includes the first days,
# whose lags are incomplete).
df_final = df_with_lags.dropna()
print(f"\nAfter removing NaN from lags: {len(df_final)} days")

## 5. Preliminary Correlation Analysis

### 5.1 Contemporaneous Correlation (t vs t)

In [None]:
# Pearson correlation matrix of same-day sentiment and return
corr_t0 = df_final[['sentiment_mean', 'Log_Return']].corr()
same_day_corr = corr_t0.iloc[0, 1]  # off-diagonal entry: sentiment vs return

print("Contemporary Correlation (t vs t):")
print(f"\nsentiment_mean vs Log_Return: {same_day_corr:.4f}")

# Scatter plot with zero reference lines on both axes
fig, ax = plt.subplots(figsize=(8, 6))
ax.scatter(df_final['sentiment_mean'], df_final['Log_Return'], alpha=0.5)
ax.set_xlabel('Average Sentiment Score')
ax.set_ylabel('Log Return (BOVA11)')
ax.set_title(f'Contemporary Correlation: {same_day_corr:.4f}')
ax.axhline(y=0, color='red', linestyle='--', linewidth=1)
ax.axvline(x=0, color='red', linestyle='--', linewidth=1)
ax.grid(True, alpha=0.3)
plt.show()

### 5.2 Correlation with Different Lags

In [None]:
# Correlation of sentiment with the return at each lag (project helper)
correlation_results = calculate_sentiment_correlation(
    df=df_final,
    sentiment_col='sentiment_mean',
    return_col='Log_Return',
    max_lag=5,
)

print("Correlation between Sentiment(t) and Return(t+lag):")
print(correlation_results)

# Bar chart: one bar per lag, annotated with the correlation value
lag_values = correlation_results['lag']
corr_values = correlation_results['correlation']

fig, ax = plt.subplots(figsize=(10, 6))
ax.bar(lag_values, corr_values, alpha=0.7)
ax.axhline(y=0, color='red', linestyle='--', linewidth=1)
ax.set_xlabel('Lag (days)')
ax.set_ylabel('Pearson Correlation')
ax.set_title('Correlation: Sentiment(t) vs Return(t+lag)')
ax.grid(True, alpha=0.3, axis='y')
ax.set_xticks(lag_values)

# Put the label above positive bars and below the others
for lag_value, corr_value in zip(lag_values, corr_values):
    anchor = 'bottom' if corr_value > 0 else 'top'
    ax.text(lag_value, corr_value, f"{corr_value:.3f}", ha='center', va=anchor)

plt.tight_layout()
plt.show()

### 5.3 Correlation Matrix (All Variables)

In [None]:
# Variables included in the correlation matrix
correlation_cols = [
    'Log_Return',
    'sentiment_mean',
    'sentiment_momentum',
    'news_count',
    'count_positive',
    'count_negative',
    'sentiment_mean_lag1',
    'sentiment_mean_lag2',
]

# Pairwise correlations between the selected columns
corr_matrix = df_final[correlation_cols].corr()

# Annotated heatmap, diverging colormap centered at zero
heatmap_options = dict(
    annot=True,
    fmt='.3f',
    cmap='coolwarm',
    center=0,
    square=True,
    linewidths=1,
    cbar_kws={"shrink": 0.8},
)
fig, ax = plt.subplots(figsize=(12, 10))
sns.heatmap(corr_matrix, ax=ax, **heatmap_options)
ax.set_title('Correlation Matrix: Sentiment vs Returns', fontsize=14, fontweight='bold')
plt.tight_layout()
plt.show()

# Correlation of every other variable with Log_Return, highest first
print("\nCorrelations with Log_Return (sorted):")
log_return_corr = (
    corr_matrix['Log_Return']
    .drop('Log_Return')
    .sort_values(ascending=False)
)
print(log_return_corr)

## 6. Combined Time Analysis

In [None]:
def _shade_by_sign(ax, dates, values):
    """Shade between `values` and zero: green where values > 0, red where values <= 0."""
    ax.fill_between(dates, values, 0, where=(values > 0), alpha=0.3, color='green')
    ax.fill_between(dates, values, 0, where=(values <= 0), alpha=0.3, color='red')


# Two stacked panels sharing the date axis
fig, axes = plt.subplots(2, 1, figsize=(15, 10), sharex=True)
ax_returns, ax_sentiment = axes
dates = df_final['date']

# Panel 1: BOVA11 log returns
ax_returns.plot(dates, df_final['Log_Return'], linewidth=1, alpha=0.7, label='Log Return')
_shade_by_sign(ax_returns, dates, df_final['Log_Return'])
ax_returns.axhline(y=0, color='black', linestyle='-', linewidth=0.5)
ax_returns.set_title('Logarithmic Returns - BOVA11', fontsize=14, fontweight='bold')
ax_returns.set_ylabel('Log Return')
ax_returns.grid(True, alpha=0.3)
ax_returns.legend()

# Panel 2: daily average sentiment
ax_sentiment.plot(
    dates,
    df_final['sentiment_mean'],
    linewidth=1.5,
    alpha=0.7,
    color='purple',
    label='Sentiment Mean',
)
_shade_by_sign(ax_sentiment, dates, df_final['sentiment_mean'])
ax_sentiment.axhline(y=0, color='black', linestyle='-', linewidth=0.5)
ax_sentiment.set_title('Daily Average Sentiment', fontsize=14, fontweight='bold')
ax_sentiment.set_xlabel('Date')
ax_sentiment.set_ylabel('Sentiment Score')
ax_sentiment.grid(True, alpha=0.3)
ax_sentiment.legend()

plt.tight_layout()
plt.show()

print("Observe visually if there are periods where sentiment and return move together!")

## 7. Save Final Dataset

In [None]:
# Final output path
final_output_path = '../src/dataset/result/sentiment_returns_merged.csv'

# DataFrame.to_csv does not create missing parent folders, so make sure the
# result directory exists (no-op when it is already there).
os.makedirs(os.path.dirname(final_output_path), exist_ok=True)

# Save without the positional index; 'date' is a regular column
df_final.to_csv(final_output_path, index=False)

print(f"Final dataset saved to: {final_output_path}")
print(f"   Total observations: {len(df_final)}")
print(f"   Total variables: {len(df_final.columns)}")
print(f"   Period: {df_final['date'].min().date()} to {df_final['date'].max().date()}")

## 7. Linear Regression Analysis (OLS)

We use OLS regression (`statsmodels`) to quantify the statistical relationship between news sentiment and BOVA11 returns.

---

In [None]:
import statsmodels.api as sm
from scipy import stats

# Regression sample: an independent copy of df_final with any NaN rows dropped
df_reg = df_final.dropna().copy()

reg_first_day = df_reg['date'].min().date()
reg_last_day = df_reg['date'].max().date()
print(f"Observations available for regression: {len(df_reg)}")
print(f"Period: {reg_first_day} to {reg_last_day}")

# List the regressors used by the models below
print("\nAvailable independent variables:")
for description in (
    "sentiment_mean (contemporary sentiment)",
    "sentiment_mean_lag1",
    "sentiment_mean_lag2",
):
    print(f"  - {description}")


### 7.1 Simple Linear Regression: Sentiment(t) → Return(t)

We test the hypothesis that news sentiment on day `t` explains BOVA11 returns on the same day.

In [None]:
# --- Simple Regression: sentiment_mean -> Log_Return ---
# Response: same-day BOVA11 log return
y = df_reg['Log_Return']
# Design matrix: intercept column plus same-day sentiment
X_simple = sm.add_constant(df_reg['sentiment_mean'])

model_simple = sm.OLS(endog=y, exog=X_simple).fit()
print(model_simple.summary())


### 7.2 Visualization: Simple Regression

In [None]:
fig, axes = plt.subplots(1, 2, figsize=(16, 6))
ax_fit, ax_resid = axes

# Fitted line evaluated on 100 evenly spaced points across the observed
# sentiment range
sentiment_values = df_reg['sentiment_mean']
intercept = model_simple.params['const']
slope = model_simple.params['sentiment_mean']
x_line = np.linspace(sentiment_values.min(), sentiment_values.max(), 100)
y_line = intercept + slope * x_line

# Left panel: observations with the OLS line on top
ax_fit.scatter(sentiment_values, df_reg['Log_Return'], alpha=0.5, s=20, label='Observations')
ax_fit.plot(x_line, y_line, color='red', linewidth=2, label='OLS Regression')
ax_fit.set(
    xlabel='Daily Average Sentiment',
    ylabel='Logarithmic Return (BOVA11)',
    title='Regression: Sentiment(t) vs Return(t)',
)
ax_fit.legend()
ax_fit.grid(True, alpha=0.3)

# Right panel: residuals against fitted values, with a zero reference line
ax_resid.scatter(model_simple.fittedvalues, model_simple.resid, alpha=0.5, s=20)
ax_resid.axhline(y=0, color='red', linestyle='--', linewidth=1)
ax_resid.set(
    xlabel='Fitted Values',
    ylabel='Residuals',
    title='Residuals Plot',
)
ax_resid.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()


### 7.3 Multiple Regression: Sentiment(t) + Lags → Return(t)

We include lagged variables (sentiment_mean_lag1, sentiment_mean_lag2) to check if sentiment from previous days has predictive power over the current return.

In [None]:
# --- Multiple Regression ---
# Regressors: same-day sentiment plus its first two lags
features = ['sentiment_mean', 'sentiment_mean_lag1', 'sentiment_mean_lag2']

# Response and design matrix (intercept column added to the regressors)
y = df_reg['Log_Return']
X_multi = sm.add_constant(df_reg[features])

model_multi = sm.OLS(endog=y, exog=X_multi).fit()
print(model_multi.summary())


### 7.4 Pearson Correlation Test (with p-value)

Formal statistical test to verify the significance of the correlation.

In [None]:
print('=' * 60)
print('PEARSON CORRELATION TEST')
print('=' * 60)

# One loop instead of three copy-pasted stanzas: the column list is the single
# place to edit, and the 30-character label padding is applied by the format
# spec rather than hand-typed spaces. The printed output is unchanged.
for column in ('sentiment_mean', 'sentiment_mean_lag1', 'sentiment_mean_lag2'):
    corr, pval = stats.pearsonr(df_reg[column], df_reg['Log_Return'])
    # Significance stars; thresholds match the legend printed below
    sig = '***' if pval < 0.01 else ('**' if pval < 0.05 else ('*' if pval < 0.1 else ''))
    print(f"{column:<30} -> r = {corr:+.4f}, p = {pval:.4f} {sig}")

print('\n--- Legend: *** p<0.01, ** p<0.05, * p<0.10 ---')


### 7.5 Regression Results Summary

Consolidated statistical metrics for the thesis.

In [None]:
# --- Comparative Summary ---
rule = '=' * 70

# (row label in the comparison table, heading of the coefficient section, fitted model)
fitted_models = [
    ('Simple (sentiment_mean)', 'SIMPLE MODEL COEFFICIENTS', model_simple),
    ('Multiple (sentiment + lags)', 'MULTIPLE MODEL COEFFICIENTS', model_multi),
]

print(rule)
print('REGRESSION MODELS SUMMARY')
print(rule)

# Goodness-of-fit table: one row per model
print(f"\n{'Model':<40} {'R²':>8} {'Adj R²':>8} {'F-stat':>10} {'p(F)':>12}")
print('-' * 78)
for row_label, heading, fitted in fitted_models:
    print(f"{row_label:<40} {fitted.rsquared:>8.4f} {fitted.rsquared_adj:>8.4f} {fitted.fvalue:>10.4f} {fitted.f_pvalue:>12.4e}")

# Coefficient listing: one section per model, one line per parameter
for row_label, heading, fitted in fitted_models:
    print('\n' + rule)
    print(heading)
    print(rule)
    for param_name, coef in fitted.params.items():
        pval = fitted.pvalues[param_name]
        # Significance stars; thresholds match the legend printed below
        if pval < 0.01:
            sig = '***'
        elif pval < 0.05:
            sig = '**'
        elif pval < 0.1:
            sig = '*'
        else:
            sig = ''
        print(f"  {param_name:<30} coef = {coef:+.6f}  p = {pval:.4f} {sig}")

print('\n--- Legend: *** p<0.01, ** p<0.05, * p<0.10 ---')


### 7.6 Stationarity Test (Augmented Dickey-Fuller)

Before trusting correlation and regression results with time series, we must verify whether the series are **stationary**.
If they are not, the correlation found may be **spurious** (false).

- **H0**: The series has a unit root (not stationary)
- **H1**: The series is stationary
- **Criterion**: If `p-value < 0.05`, we reject H0 → series is stationary

In [None]:
from statsmodels.tsa.stattools import adfuller

def run_adf_test(series, name):
    """Run the Augmented Dickey-Fuller test on `series` and print a report.

    NaN values are dropped before testing and the lag order is selected with
    autolag='AIC'.

    Args:
        series: object exposing ``dropna()`` (e.g. a pandas Series) holding
            the observations to test.
        name: label shown in the report header.

    Returns:
        True when the p-value is below 0.05 (reported as STATIONARY),
        otherwise False.
    """
    # The first five entries of the result are, in order: test statistic,
    # p-value, lags used, number of observations and critical values.
    adf_stat, p_value, used_lag, n_obs, critical_values = adfuller(
        series.dropna(), autolag='AIC'
    )[:5]

    is_stationary = p_value < 0.05
    status = 'STATIONARY' if is_stationary else 'NOT STATIONARY'
    rule = '=' * 60

    print(f"\n{rule}")
    print(f"ADF Test: {name}")
    print(f"{rule}")
    print(f"  ADF Statistic   : {adf_stat:.6f}")
    print(f"  p-value         : {p_value:.6f}")
    print(f"  Lags used       : {used_lag}")
    print(f"  Observations    : {n_obs}")
    print(f"  Critical Values:")
    for level, threshold in critical_values.items():
        print(f"    {level}: {threshold:.6f}")
    print(f"\n  -> Result: {status}")

    return is_stationary

# Test the main series
print('STATIONARITY TESTS (ADF)')
print('Criterion: p-value < 0.05 -> stationary series')

is_return_stationary = run_adf_test(df_reg['Log_Return'], 'Log_Return (BOVA11 Return)')
is_sentiment_stationary = run_adf_test(df_reg['sentiment_mean'], 'sentiment_mean (Daily Sentiment)')

# The return series is checked explicitly: previously the "both series are
# stationary" message was printed whenever sentiment was stationary, even if
# the return series had failed the test.
if not is_return_stationary:
    print('\n' + '!'*60)
    print('WARNING: Log_Return is NOT stationary!')
    print('Interpret correlation/regression results with caution.')
    print('!'*60)

# If sentiment is not stationary, create and test the first difference
if not is_sentiment_stationary:
    print('\n' + '!'*60)
    print('WARNING: sentiment_mean is NOT stationary!')
    print('Creating differenced series: d_sentiment = sentiment(t) - sentiment(t-1)')
    print('!'*60)

    # Row-to-row difference; the first row becomes NaN.
    # NOTE(review): rows are presumably not all consecutive calendar days
    # (inner merge + dropna), so "t-1" means the previous available row.
    df_reg['sentiment_diff'] = df_reg['sentiment_mean'].diff()
    is_diff_stationary = run_adf_test(df_reg['sentiment_diff'].dropna(), 'd_sentiment_mean (Differenced Sentiment)')

    if is_diff_stationary:
        # The models above were fitted on the level series, so a stationary
        # difference does not make those results reliable.
        print('\nThe differenced series IS stationary. Re-estimate the correlation/regression '
              'using sentiment_diff; results based on the level series may be spurious.')
    else:
        print('\nEven the differenced series is not stationary. Interpret results with caution.')

if is_return_stationary and is_sentiment_stationary:
    print('\nBoth series are stationary. Correlation/regression results are reliable.')


### 7.7 Granger Causality Test

The Granger test evaluates whether one time series **helps predict** another.
We test in **both directions**:

1. **Sentiment → Return**: Does news sentiment anticipate market returns?
2. **Return → Sentiment**: Do market returns influence news sentiment?

- **H0**: Series X does **not** Granger-cause Y
- **Criterion**: If `p-value < 0.05`, we reject H0 → X Granger-causes Y

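> **Important**: If we reject H0 for Sentiment → Return (p < 0.05), this supports the central hypothesis of the thesis. Note that Granger causality indicates predictive precedence, not proof of a true causal mechanism.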

In [None]:
from statsmodels.tsa.stattools import grangercausalitytests

max_lags = 5  # lag orders 1..5 are tested

# Two-column frame without NaN, as input for both test directions
df_granger = df_reg[['Log_Return', 'sentiment_mean']].dropna()


def _print_granger_banner(title, null_hypothesis):
    """Print the framed header shown before each Granger test run."""
    frame = '=' * 70
    print(frame)
    print(title)
    print(null_hypothesis)
    print(frame)


# --- Test 1: Sentiment -> Return ---
# Column order is [Y, X]: the test asks whether the second column helps
# predict the first.
_print_granger_banner(
    'GRANGER TEST: Sentiment -> Return',
    'H0: sentiment_mean does NOT Granger-cause Log_Return',
)
sent_to_ret_input = df_granger[['Log_Return', 'sentiment_mean']]
gc_sent_to_ret = grangercausalitytests(sent_to_ret_input, maxlag=max_lags, verbose=True)

print('\n')

# --- Test 2: Return -> Sentiment ---
# Same data with the columns swapped, i.e. the reverse direction.
_print_granger_banner(
    'GRANGER TEST: Return -> Sentiment',
    'H0: Log_Return does NOT Granger-cause sentiment_mean',
)
ret_to_sent_input = df_granger[['sentiment_mean', 'Log_Return']]
gc_ret_to_sent = grangercausalitytests(ret_to_sent_input, maxlag=max_lags, verbose=True)


In [None]:
# --- Granger Test Summary ---
print('\n' + '=' * 70)
print('SUMMARY: GRANGER CAUSALITY')
print('=' * 70)
print(f"{'Direction':<35} {'Lag':>4} {'F-stat':>10} {'p-value':>10} {'Sig':>5}")
print('-' * 70)

# (label, results) for each tested direction; results are indexed by lag order
granger_directions = (
    ('Sentiment -> Return', gc_sent_to_ret),
    ('Return -> Sentiment', gc_ret_to_sent),
)

for position, (direction, results) in enumerate(granger_directions):
    # Separator line between the two direction groups
    if position > 0:
        print('-' * 70)
    for lag in range(1, max_lags + 1):
        # The first two entries of 'ssr_ftest' are the F statistic and its p-value
        f_stat, p_val = results[lag][0]['ssr_ftest'][:2]
        # Significance stars; thresholds match the legend printed below
        if p_val < 0.01:
            stars = '***'
        elif p_val < 0.05:
            stars = '**'
        elif p_val < 0.1:
            stars = '*'
        else:
            stars = ''
        print(f"{direction:<35} {lag:>4} {f_stat:>10.4f} {p_val:>10.4f} {stars:>5}")

print('\n--- Legend: *** p<0.01, ** p<0.05, * p<0.10 ---')


### 7.8 Rolling Correlation

The annual average correlation may hide important variations over time.
During crisis periods, the correlation tends to be stronger.

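We calculate the correlation in **30-observation rolling windows** (30 rows of the merged dataset, i.e. days that have both news and a trading session, not 30 calendar days) to visualize how the relationship between sentiment and return varies throughout the year.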

In [None]:
# --- Rolling Correlation (Window = 30 days) ---
# The window counts rows (observations), not calendar days.
window = 30

df_rolling = df_reg[['date', 'sentiment_mean', 'Log_Return']].copy()
df_rolling = df_rolling.set_index('date').sort_index()

# Rolling Pearson correlation; a value is produced once at least 10
# observations are available inside the window.
rolling_corr = df_rolling['sentiment_mean'].rolling(window=window, min_periods=10).corr(df_rolling['Log_Return'])

# Visualization: three stacked panels sharing the date axis
fig, axes = plt.subplots(3, 1, figsize=(14, 12), sharex=True)

# 1. Sentiment over time
ax1 = axes[0]
ax1.plot(df_rolling.index, df_rolling['sentiment_mean'], color='steelblue', linewidth=1, alpha=0.7)
ax1.fill_between(df_rolling.index, df_rolling['sentiment_mean'], alpha=0.3, color='steelblue')
ax1.set_ylabel('Average Sentiment')
ax1.set_title('Daily Sentiment')
ax1.axhline(y=0, color='gray', linestyle='--', linewidth=0.8)
ax1.grid(True, alpha=0.3)

# 2. Return over time (green for non-negative days, red for negative days)
ax2 = axes[1]
colors = ['green' if v >= 0 else 'red' for v in df_rolling['Log_Return']]
ax2.bar(df_rolling.index, df_rolling['Log_Return'], color=colors, alpha=0.6, width=1)
ax2.set_ylabel('Log Return')
ax2.set_title('BOVA11 Daily Return')
ax2.grid(True, alpha=0.3)

# 3. Rolling correlation with its mean as a dotted reference line
ax3 = axes[2]
ax3.plot(rolling_corr.index, rolling_corr.values, color='purple', linewidth=2)
ax3.fill_between(rolling_corr.index, rolling_corr.values, alpha=0.2, color='purple')
ax3.axhline(y=0, color='gray', linestyle='--', linewidth=1)
ax3.axhline(y=rolling_corr.mean(), color='red', linestyle=':', linewidth=1.5, label=f'Mean: {rolling_corr.mean():.3f}')
ax3.set_ylabel(f'Correlation ({window}d)')
ax3.set_xlabel('Date')
ax3.set_title(f'Rolling Correlation (Window = {window} days)')
ax3.legend(loc='upper right')
ax3.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

# Share of windows with a positive correlation, among windows that have a value
share_positive = (rolling_corr.dropna() > 0).mean() * 100

# Rolling correlation statistics
print(f'\nRolling Correlation Statistics ({window} days):')
print(f'  Mean     : {rolling_corr.mean():+.4f}')
print(f'  Median   : {rolling_corr.median():+.4f}')
print(f'  Min      : {rolling_corr.min():+.4f}')
print(f'  Max      : {rolling_corr.max():+.4f}')
print(f'  Std      : {rolling_corr.std():.4f}')
# '%%' is only an escape in %-style formatting; inside an f-string it printed
# a literal '%%', so a single '%' is used here.
print(f'  % days>0 : {share_positive:.1f}%')
