# S&P Data
Convert S&P data into weekly log returns

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.dates import date2num
import seaborn as sns

In [None]:
# Relative path to the S&P (SPX) price CSV that the next cell loads.
Data_path = "Data/SPX_07-14.csv"

In [None]:
# Read the raw SPX price file into a DataFrame.
spx = pd.read_csv(Data_path)

In [None]:
# Peek at the first rows to check the columns as loaded.
spx.head()

In [None]:
# Parse the 'Date' column into pandas datetimes so it can be sorted
# chronologically and later used as the resampling index.
spx['Date'] = pd.to_datetime(spx['Date'])

In [None]:
# Sort chronologically so that "last row of the week" in the weekly
# resample below is the latest date in that week.
spx = spx.sort_values(by='Date')

In [None]:
# Preview again after date parsing and sorting.
spx.head()

In [None]:
# Resample to weekly bins anchored on Sunday ('W-SUN') and keep the last
# available value of each column in every bin. Each row is labelled with the
# Sunday that closes the week; the value kept is the last observation on or
# before that Sunday (presumably Friday's close in a normal trading week —
# TODO confirm against the data).
weekly_spx = spx.set_index('Date').resample('W-SUN').last()

In [None]:
# Weekly log return: ln(Close_t / Close_{t-1}).
# Computed with vectorised NumPy so the column stays float64 instead of the
# object dtype a per-element lambda returning pd.NA would produce. The first
# week has no previous close, so its return is NaN (dropped further down).
weekly_close = weekly_spx['Close']
weekly_spx['log_return'] = np.log(weekly_close / weekly_close.shift(1))

In [None]:
# Inspect the weekly frame, including the new 'log_return' column.
weekly_spx.head()

In [None]:
# Move 'Date' back from the index to an ordinary column so that later steps
# can join on it.
weekly_spx = weekly_spx.reset_index()

In [None]:
# Drop every row that has a missing value in any column. This removes the
# first week (its log return has no previous close to compare against) and
# any weekly bin for which resampling produced no data.
# NOTE(review): no `subset=` is given, so a gap in any other column also
# drops the row — confirm that is intended.
weekly_spx.dropna(inplace=True)

In [None]:
# Persist the weekly return table for later steps.
# NOTE(review): the DataFrame index is written as an extra unnamed column
# (no index=False), and the output name says "06-14" while the input file is
# "SPX_07-14". Confirm both before changing; downstream readers may rely on
# the current file name and layout.
weekly_spx.to_csv('Data/SPX_Weekly_06-14.csv')

# NEWS

In [None]:
# Relative path to the news sentiment scores CSV used throughout this section.
news_path = 'Data/news_sentiment_scores.csv'

In [None]:
# Load the news sentiment file into a DataFrame.
file_news = pd.read_csv(news_path)

In [None]:
# Step 1: Prepare the News Data
# Parse 'Date' into pandas datetimes; the weekly resample below needs a
# datetime index.
file_news['Date'] = pd.to_datetime(file_news['Date'])

In [None]:
# Preview the news rows after date parsing.
file_news.head()

In [None]:
# Step 2: One boolean column per sentiment class, so that summing a column
# over a week yields the number of articles of that class.
# Labels are compared case-insensitively.
labels_lower = file_news['Sentiment Label'].str.lower()
for sentiment_kind in ('positive', 'negative', 'neutral'):
    file_news[f'sentiment_{sentiment_kind}'] = labels_lower == sentiment_kind

In [None]:
# Step 3: Aggregate articles into weekly bins by publication date.
# Per week: mean and standard deviation of the sentiment score, number of
# articles in each sentiment class, and total article count.
news_weekly_spec = {
    'Sentiment Score': ['mean', 'std'],
    'sentiment_positive': 'sum',
    'sentiment_negative': 'sum',
    'sentiment_neutral': 'sum',
    'Article': 'count',
}

# 'Date' becomes the index in place, which resample() requires.
# NOTE(review): the bins here end on Friday ('W-FRI'), whereas the SPX
# series above is resampled with 'W-SUN' — confirm before joining the two.
file_news.set_index('Date', inplace=True)
weekly_news = file_news.resample('W-FRI').agg(news_weekly_spec)

In [None]:
# Step 4: Collapse the two-level (column, statistic) header produced by agg()
# into single strings such as 'Sentiment Score_mean', then turn the weekly
# 'Date' index back into a regular column.
weekly_news.columns = weekly_news.columns.map('_'.join).str.strip()
weekly_news = weekly_news.reset_index()

In [None]:
# Replace the auto-generated '<column>_<statistic>' names with descriptive ones.
weekly_news_names = {
    'Sentiment Score_mean': 'mean_news_sentiment',
    'Sentiment Score_std': 'std_news_sentiment',
    'sentiment_positive_sum': 'num_positive',
    'sentiment_negative_sum': 'num_negative',
    'sentiment_neutral_sum': 'num_neutral',
    'Article_count': 'num_articles',
}
weekly_news.rename(columns=weekly_news_names, inplace=True)

In [None]:
# Inspect the weekly aggregates after flattening and renaming.
weekly_news.head()

In [None]:
# Weekly article volume over time (uncapped data); saved to disk and shown.
fig, ax = plt.subplots(figsize=(12, 6))
ax.plot(
    weekly_news['Date'],
    weekly_news['num_articles'],
    label='Number of Articles per Week',
    color='navy',
)
ax.set_title('Number of News Articles per Week')
ax.set_xlabel('Date')
ax.set_ylabel('Article Count')
ax.grid(True)
ax.legend()
fig.tight_layout()
fig.savefig("Data/10_day_run/News_EDA_article_count.png", dpi=300, bbox_inches="tight")
plt.show()

In [None]:
# Weekly number of positive / negative / neutral articles (uncapped data).
fig, ax = plt.subplots(figsize=(12, 6))
sentiment_lines = (
    ('num_positive', 'Positive', 'green'),
    ('num_negative', 'Negative', 'red'),
    ('num_neutral', 'Neutral', 'gray'),
)
for count_column, line_label, line_colour in sentiment_lines:
    ax.plot(weekly_news['Date'], weekly_news[count_column], label=line_label, color=line_colour)
ax.set_title('Weekly News Sentiment Count Distribution')
ax.set_xlabel('Date')
ax.set_ylabel('Article Count')
ax.grid(True)
ax.legend()
fig.tight_layout()
fig.savefig("Data/10_day_run/News_EDA_weekly_sentiment_dist.png", dpi=300, bbox_inches="tight")
plt.show()

Plot 2: Weekly Sentiment Type Counts
Insight 1: Neutral Articles Dominate
For most weeks, neutral sentiment articles outnumber both positive and negative ones.

This aligns with expectations in financial reporting: many articles are factual and cautious.

Insight 2: Negative Sentiment is Episodic but Spiky
We see intermittent spikes in negative articles.

These likely align with market downturns, earnings shocks, or crises (e.g., 2008 financial crash).

Insight 3: Positive Sentiment Shows Sustained Runs
Periods with consistent high positive sentiment suggest phases of bullish outlook (e.g., 2009–2010 post-recovery optimism).

May help predict return direction if used in aggregation.

In [None]:
# Weekly mean sentiment with a shaded band of one standard deviation on
# either side (uncapped data).
week_dates = weekly_news['Date']
week_mean = weekly_news['mean_news_sentiment']
week_std = weekly_news['std_news_sentiment']

fig, ax = plt.subplots(figsize=(12, 6))
ax.plot(week_dates, week_mean, label='Mean Sentiment Score', color='purple')
ax.fill_between(
    week_dates,
    week_mean - week_std,
    week_mean + week_std,
    color='purple',
    alpha=0.2,
    label='±1 Std Dev',
)
ax.set_title('Weekly Mean News Sentiment Score')
ax.set_xlabel('Date')
ax.set_ylabel('Sentiment Score')
ax.grid(True)
ax.legend()
fig.tight_layout()
fig.savefig("Data/10_day_run/News_EDA_weekly_mean_score.png", dpi=300, bbox_inches="tight")
plt.show()

Plot 3: Mean Sentiment Score + Variability

Insight 1: Average Sentiment is Mildly Positive
The overall mean = ~0.12, suggesting a slight upward bias in financial news tone.

This supports findings in prior research that financial news leans optimistic unless triggered otherwise.

Insight 2: Standard Deviation Ranges from 0.46 to 0.56+
High std dev weeks = news with both extremes (mixed or uncertain narrative).

Peaks in variability may signal volatility in markets or company events.

Insight 3: Signal Quality Varies
Some weeks have high mean sentiment and low std dev → strong consistent signal.

Others have mean ~0 but high std dev → noise, disagreement, or uncertain interpretation.

# Removing Bias

In “Using Financial News Sentiment for Stock Price Direction Prediction”, they mention:

“To reduce potential bias caused by excessive coverage on certain days, we randomly select a maximum of 10 news articles per day.”

This means:

Max 10 articles per day, → ~50/week if equally distributed

Random sampling to prevent domination by high-frequency companies or topics


“To control for disproportionate news coverage and maintain feature consistency across time, we cap the number of articles per week to 50, randomly sampled. This follows the precedent set in prior literature (e.g., [Paper 2]), which found excessive repetition and skewed coverage can bias sentiment aggregation.”


In [None]:
# Reload the raw news file: the earlier weekly resample moved 'Date' into the
# index in place, so start again from a fresh frame with 'Date' as a column.
file_news = pd.read_csv(news_path)

In [None]:
# Parse publication dates again on the freshly loaded frame.
file_news['Date'] = pd.to_datetime(file_news['Date'])

# Tag each article with the end timestamp of the weekly period it falls in.
# The vectorised `.dt.end_time` accessor replaces a per-row Python lambda.
# NOTE(review): the end timestamp is the last instant of the period, not
# midnight, so it presumably needs normalising before any join against the
# weekly SPX dates — confirm downstream.
file_news['Week'] = file_news['Date'].dt.to_period('W').dt.end_time

In [None]:
# Cap coverage at 50 articles per week: weeks with more than 50 articles are
# randomly down-sampled to 50, smaller weeks are kept whole.
def _cap_week(week_rows):
    """Return at most 50 rows of one week's articles (fixed seed for reproducibility)."""
    if len(week_rows) > 50:
        return week_rows.sample(n=50, random_state=42)
    return week_rows


# group_keys=False keeps the week label out of the index; the index is then
# renumbered from zero.
# NOTE(review): recent pandas releases appear to warn when apply() operates
# on the grouping column — check against the pinned pandas version.
capped_by_week = file_news.groupby('Week', group_keys=False).apply(_cap_week)
sampled_news = capped_by_week.reset_index(drop=True)

In [None]:
# One boolean column per sentiment class on the capped sample, so that a
# weekly sum gives the article count of that class.
# Labels are compared case-insensitively.
sampled_labels_lower = sampled_news['Sentiment Label'].str.lower()
for sentiment_kind in ('positive', 'negative', 'neutral'):
    sampled_news[f'sentiment_{sentiment_kind}'] = sampled_labels_lower == sentiment_kind

In [None]:
# Weekly aggregates of the capped sample: mean/std of the sentiment score,
# per-class article counts, and the total number of articles kept.
sampled_weekly_spec = {
    'Sentiment Score': ['mean', 'std'],
    'sentiment_positive': 'sum',
    'sentiment_negative': 'sum',
    'sentiment_neutral': 'sum',
    'Article': 'count',
}
weekly_sampled_news = sampled_news.groupby('Week').agg(sampled_weekly_spec)

In [None]:
# Collapse the two-level (column, statistic) header from agg() into single
# strings such as 'Sentiment Score_mean', then turn the 'Week' index back
# into a regular column.
weekly_sampled_news.columns = weekly_sampled_news.columns.map('_'.join).str.strip()
weekly_sampled_news = weekly_sampled_news.reset_index()

In [None]:
# Use the same descriptive column names as the uncapped weekly table; the
# week label becomes 'Date'.
sampled_weekly_names = {
    'Week': 'Date',
    'Sentiment Score_mean': 'mean_news_sentiment',
    'Sentiment Score_std': 'std_news_sentiment',
    'sentiment_positive_sum': 'num_positive',
    'sentiment_negative_sum': 'num_negative',
    'sentiment_neutral_sum': 'num_neutral',
    'Article_count': 'num_articles',
}
weekly_sampled_news.rename(columns=sampled_weekly_names, inplace=True)

In [None]:
# Inspect the capped weekly aggregates.
weekly_sampled_news.head()

In [None]:
# weekly_sampled_news.to_csv('Data/Weekly_Sampled_News_50.csv')

In [None]:
# Weekly article counts after capping each week at 50 articles.
plt.figure(figsize=(12, 6))
plt.plot(
    weekly_sampled_news['Date'],
    weekly_sampled_news['num_articles'],
    label='Articles per Week (Capped at 50)',
    color='teal',
)
plt.title('Weekly News Article Counts (After Capping at 50)')
plt.xlabel('Date')
plt.ylabel('Article Count')
plt.grid(True)
plt.legend()
# Must be called: a bare `plt.tight_layout` only references the function and
# leaves the layout untouched.
plt.tight_layout()
plt.savefig("Data/10_day_run/News_EDA_weekly_article_count_capped.png", dpi=300, bbox_inches="tight")
plt.show()

In [None]:
# Weekly number of positive / negative / neutral articles in the capped sample.
fig, ax = plt.subplots(figsize=(12, 6))
capped_sentiment_lines = (
    ('num_positive', 'Positive', 'green'),
    ('num_negative', 'Negative', 'red'),
    ('num_neutral', 'Neutral', 'gray'),
)
for count_column, line_label, line_colour in capped_sentiment_lines:
    ax.plot(
        weekly_sampled_news['Date'],
        weekly_sampled_news[count_column],
        label=line_label,
        color=line_colour,
    )
ax.set_title('Weekly Sentiment Distribution (Capped at 50 Articles)')
ax.set_xlabel('Date')
ax.set_ylabel('Count')
ax.grid(True)
ax.legend()
fig.tight_layout()
fig.savefig("Data/10_day_run/News_EDA_weekly_sentiment_dist_capped.png", dpi=300, bbox_inches="tight")
plt.show()


Plot 2: Weekly Sentiment Counts (Positive, Negative, Neutral)
Neutral-Dominant Weeks
Neutral sentiment prevails most weeks — typical for routine reporting or balanced coverage.

Spikes in Negative Sentiment
Late 2007: Rise in negative articles coincides with the collapse of New Century Financial (April ’07) and early subprime crisis signs 

Sept–Oct 2008: Peaks align with the Lehman Brothers collapse (Sept 15, 2008), federal bailouts, and emergency legislation (source: The Guardian).

May 2010: Another spike during the Flash Crash event (source: Council on Foreign Relations).

Bursts of Positive Sentiment
2009–2010: Sustained positive coverage during recovery and policy reforms like TARP and Dodd-Frank 

Insight: Strong alignment between sentiment spikes and major financial events — negative coverage during downturns, positive during recovery phases. This underscores the potential predictive signal in sentiment trends.

In [None]:
# Build plain float arrays for the mean ± std band plot of the capped data.
# Weeks missing either the mean or the std are left out so the band has no
# undefined edges.
plot_data = weekly_sampled_news.dropna(
    subset=['mean_news_sentiment', 'std_news_sentiment']
).copy()
# Optional zoom on a single year (disabled):
# plot_data = plot_data[
#     (plot_data['Date'] >= '2008-01-01') &
#     (plot_data['Date'] <  '2009-01-01')
# ]

# Matplotlib's numeric date representation of each week label.
plot_data['x'] = date2num(plot_data['Date'])

y_mean = plot_data['mean_news_sentiment'].to_numpy(dtype='float64')
band_half_width = plot_data['std_news_sentiment'].to_numpy(dtype='float64')
y1 = y_mean - band_half_width  # lower edge of the band
y2 = y_mean + band_half_width  # upper edge of the band
x = plot_data['x'].to_numpy(dtype='float64')

In [None]:
# plot_data

In [None]:
# Weekly mean sentiment of the capped sample with a ±1 standard deviation band.
# `x` holds Matplotlib date numbers, so the x-axis is switched to date
# formatting with `xaxis_date()` and the line is drawn with plain `plot()`.
# This replaces the deprecated `plt.plot_date`, which did the same two steps.
fig, ax = plt.subplots(figsize=(12, 6))
ax.xaxis_date()
ax.plot(x, y_mean, '-', label='Mean Sentiment Score', color='purple')
ax.fill_between(x, y1, y2, color='purple', alpha=0.2, label='±1 Std Dev')
ax.set_title('Weekly Mean News Sentiment Score (Capped @ 50) + Variability')
ax.set_xlabel('Date')
ax.set_ylabel('Sentiment Score')
ax.grid(True)
ax.legend()
fig.tight_layout()
fig.savefig("Data/10_day_run/News_EDA_weekly_mean_capped.png", dpi=300, bbox_inches="tight")
plt.show()

Plot 3: Mean Sentiment Score with Variability (±1 Std Dev Band)
Persistent Mildly Positive Tone
Mean sentiment hovers slightly positive (~0.1–0.2), even as standard deviation widens, reflecting cautious optimism amid uncertainty.

Weeks with High Sentiment Volatility
Late 2007 – Early 2009: Broad bands indicating media disagreement and market turmoil during subprime crisis and recession 

May 2010: Greater standard deviation reflects chaotic news coverage during the Flash Crash 

Low Variability Periods
Post-2010: Narrower bands suggest calmer, more predictable sentiment after recovery from crises and as reforms stabilize markets 

Insight: Weeks of elevated emotional dispersion (high std dev) often coincide with high-stress events and sharp market turns, highlighting volatility as a potential signal worth modeling.

“Our analysis reveals that while the average news tone remains slightly positive, significant peaks in negative sentiment and volatility are closely aligned with financial turning points: the subprime crisis onset (2007–2008), the Lehman collapse (Sept 2008), and the May 2010 Flash Crash. Conversely, periods of low sentiment variability are seen during market recovery and stabilization phases (post-2010). These patterns suggest that both the direction and dispersion of news sentiment capture meaningful signals tied to market regime shifts, underscoring their value in forecasting strategies.”


# ADDITIONAL EDA

In [None]:
# From here on `weekly_news` refers to an independent copy of the capped
# weekly aggregates (it replaces the uncapped table built earlier).
weekly_news  = weekly_sampled_news.copy()
# Unused alternative week labelling, kept for reference:
# news_df['Week'] = news_df['DateTime'].dt.to_period('W').apply(lambda r: r.start_time)

In [None]:
# Number of weeks with strictly fewer than 10 and strictly fewer than 50
# articles.
articles_per_week = weekly_news['num_articles']
count_10 = int((articles_per_week < 10).sum())
count_50 = int((articles_per_week < 50).sum())

In [None]:
# Plot 1: weekly mean sentiment as a time series, one marker per week.
fig, ax = plt.subplots(figsize=(10, 4))
ax.plot(
    weekly_news['Date'],
    weekly_news['mean_news_sentiment'],
    marker='o',
    linestyle='-',
)
ax.set_title('Weekly Mean News Sentiment Over Time')
ax.set_xlabel('Week')
ax.set_ylabel('Mean Sentiment Score')
plt.xticks(rotation=45)
fig.tight_layout()
plt.show()

In [None]:
# Plot 2: histogram of the weekly mean sentiment values (30 bins).
fig, ax = plt.subplots(figsize=(6, 4))
ax.hist(weekly_news['mean_news_sentiment'], bins=30, edgecolor='k')
ax.set_title('Distribution of Weekly Mean News Sentiment')
ax.set_xlabel('Mean Sentiment Score')
ax.set_ylabel('Frequency')
fig.tight_layout()
fig.savefig("Data/10_day_run/News_EDA_weekly_mean_dist_capped.png", dpi=300, bbox_inches="tight")
plt.show()

In [None]:
# Plot 3: weekly article count against weekly mean sentiment.
fig, ax = plt.subplots(figsize=(6, 4))
ax.scatter(
    weekly_news['num_articles'],
    weekly_news['mean_news_sentiment'],
    alpha=0.7,
)
ax.set_title('Weekly Article Count vs. Mean Sentiment')
ax.set_xlabel('Number of Articles')
ax.set_ylabel('Mean Sentiment Score')
fig.tight_layout()
plt.show()

Weekly Time‐Series of Mean Sentiment

We see a lot of noise, but a few very positive spikes (late 2008/early 2009) and deep negative dips around the Financial Crisis. After 2010 the series oscillates more tightly around zero, suggesting sentiment “regimes” that our model might learn.

Distribution of Weekly Mean Sentiment
The histogram is tightly centered around 0–0.2, with a long negative tail (down to –0.8) and a few extreme positives (up to +1). That tells us most weeks have mildly positive aggregate sentiment, punctuated by occasional extreme sentiment events.

Scatter of Article Count vs. Mean Sentiment

Weeks with fewer articles (left) show the widest range of mean sentiment; i.e., with only 1–10 articles, a single strongly positive/negative headline dominates the average. Weeks capped at 50 articles (right) cluster tighter around mild positive values.

Sentiment Volatility: Weeks with few articles are extremely noisy. Once we cap at 50/week, the aggregated score stabilizes but still carries crisis‐era outliers. We may want to record num_articles as a feature (or weight our score by count) so our model “knows” when sentiment is less reliable.


### Aggregate news can be noisy because 131 weeks have fewer than 10 articles

In [None]:
# Report the total number of weekly rows and how many fall below the
# 10- and 50-article thresholds.
print(f'Total - {weekly_news.shape[0]}, less than 10 - {count_10}, less than 50 - {count_50}')

Since over a third of our weeks (131/359) have fewer than 10 articles, we handle sparse weeks as follows:

1. Treat **<10-article weeks as “low coverage”**:

   * **Flag** them in our feature set (e.g. `low_coverage_week = 1` if num\_articles < 10).
   * **Include** `num_articles` (or its log) so our model “knows” how reliable the sentiment is.

2. **Smooth or back-fill sparse weeks** rather than trust a tiny sample:

   * For any week with `< 10` articles, replace its raw mean sentiment with a **2-week rolling average**:

   * This borrows information from the neighboring week, cutting the wild swings we saw in the scatter.

In [None]:
# 1. Flag low coverage weeks (<10 articles)
# Boolean column: True when the week's article count is strictly below 10.
weekly_news['low_coverage_week'] = weekly_news['num_articles'] < 10

In [None]:
# 2. Smooth sentiment for low-coverage weeks only.
# Flagged weeks get the mean of their own raw score and the previous row's
# raw score (a 2-row rolling mean; with min_periods=1 the first row falls
# back to its own value). All other weeks keep their raw weekly mean.
# NOTE(review): the window is row-based, so "previous" means the previous
# row of the table — presumably the preceding calendar week, unless some
# weeks are absent from the table.
mask = weekly_news['low_coverage_week']
raw_weekly_mean = weekly_news['mean_news_sentiment']
two_row_average = raw_weekly_mean.rolling(2, min_periods=1).mean()
weekly_news['smoothed_sentiment'] = raw_weekly_mean.where(~mask, two_row_average)

In [None]:
# Plot A: raw weekly mean (faded) overlaid with the smoothed series.
fig, ax = plt.subplots(figsize=(10, 4))
ax.plot(
    weekly_news['Date'],
    weekly_news['mean_news_sentiment'],
    label='Raw Weekly Mean',
    alpha=0.5,
)
ax.plot(
    weekly_news['Date'],
    weekly_news['smoothed_sentiment'],
    label='Smoothed Sentiment',
    linewidth=2,
)
ax.set_title('Weekly News Sentiment: Raw vs. Smoothed')
ax.set_xlabel('Week')
ax.set_ylabel('Sentiment Score')
ax.legend()
plt.xticks(rotation=45)
fig.tight_layout()
fig.savefig("Data/10_day_run/News_EDA_weekly_mean_smoothed.png", dpi=300, bbox_inches="tight")
plt.show()

In [None]:
# Plot B: histogram of the smoothed weekly sentiment (30 bins).
fig, ax = plt.subplots(figsize=(6, 4))
ax.hist(weekly_news['smoothed_sentiment'], bins=30, edgecolor='k')
ax.set_title('Distribution of Smoothed Weekly Sentiment')
ax.set_xlabel('Smoothed Sentiment Score')
ax.set_ylabel('Frequency')
fig.tight_layout()
fig.savefig("Data/10_day_run/News_EDA_weekly_mean_dist_smoothed.png", dpi=300, bbox_inches="tight")
plt.show()

In [None]:
# Plot C: weekly article count against the smoothed sentiment score.
fig, ax = plt.subplots(figsize=(6, 4))
ax.scatter(
    weekly_news['num_articles'],
    weekly_news['smoothed_sentiment'],
    alpha=0.7,
)
ax.set_title('Articles per Week vs. Smoothed Sentiment')
ax.set_xlabel('Number of Articles')
ax.set_ylabel('Smoothed Sentiment Score')
fig.tight_layout()
plt.show()

In [None]:
# Display the weekly table, now including the low-coverage flag and the
# smoothed sentiment column.
weekly_news

In [None]:
# 50th / 75th / 90th / 95th percentiles of the weekly article count, as a
# {quantile: value} dict, printed rounded to whole articles.
pct = weekly_news['num_articles'].quantile([.5, .75, .9, .95]).to_dict()
print(f"50th: {pct[0.5]:.0f}, 75th: {pct[0.75]:.0f}, 90th: {pct[0.9]:.0f}, 95th: {pct[0.95]:.0f}")

## 1. Time-Series Comparison

### **Before (Raw Weekly Mean)**

* **Wild jumps** almost every week that had fewer than 10 articles.
* Those tiny‐sample weeks often produced sentiment scores as high as +0.8 or as low as –0.5, even in “normal” periods.
* This made the entire series look extremely jagged, obscuring any broader trends.

### **After (Smoothed Sentiment)**

* **Low-coverage weeks (<10 articles)** have now been **replaced** by a two-week rolling average.
* In “normal” non‐crisis times, those previously extreme spikes/dips are now gently pulled back toward the prior week’s value.
* The only places where we still see big swings are during genuinely high‐volatility events (late 2008, early 2009), which is exactly what we want our model to pick up as true market signals, not random noise.

**Bottom line:** we’ve knocked down the “random” outliers caused by having just 1–5 headlines in a week, while **retaining** the real, big sentiment moves around known crises.


## 2. Scatter Plot Comparison

### **Before (Raw vs. # of Articles)**

* Weeks with 1–9 articles sat all over the place, from –0.8 to +0.8, making it impossible to know if a +0.6 score came from genuine optimism or just a single glowing headline.
* Weeks capped at 50 clustered tighter, but the “tail” of tiny‐sample weeks overwhelmed any pattern.

### **After (Smoothed vs. # of Articles)**

* Weeks with few articles now sit much closer to the 0–0.3 range, because we’ve overwritten their raw mean with a blend of that week and the prior one.
* We still see more dispersion for 10–50‐article weeks (they legitimately vary), but the **tiny**‐article weeks are no longer dominating the extremes.

**Bottom line:** the smoothed scatter shows that, once we account for data sparsity, nearly all our weeks — whether 12 articles or 50 — produce reasonable sentiment estimates. We’ve removed the artificial “extreme outliers” without throwing away the genuine signals.


### Why This Matters for Modeling

* **Raw sentiment** would teach our model that “1-article weeks = huge swings.” That’s not real market behavior → it’d likely over-fit those noise points.
* **Smoothed sentiment** now better reflects the underlying, multi-headline mood of the market each week. Our model can learn true market-level sentiment patterns, not the whims of a lone headline.

In [None]:
# Save the weekly news features (including the coverage flag and smoothed
# sentiment) without writing the DataFrame index.
weekly_news.to_csv('Data/weekly_smoothed_news.csv', index=False)

## Lag structure: ACF and PACF up to lag 6 on smoothed series

In [None]:
from statsmodels.graphics.tsaplots import plot_acf, plot_pacf

# Autocorrelation diagnostics on the smoothed weekly sentiment, lags 1–6
# (lag 0 is omitted). Missing values are removed first.
sm_series = weekly_news["smoothed_sentiment"].dropna()

acf_path = 'Data/10_day_run/eda_news_acf_smoothed.png'
pacf_path = 'Data/10_day_run/eda_news_pacf_smoothed.png'

# The axes are passed explicitly via `ax=`. Without it, statsmodels draws on
# a figure of its own, which leaves the (6, 4) figure created here empty and
# its size unused.

# ACF
fig1, ax1 = plt.subplots(figsize=(6, 4))
plot_acf(sm_series, lags=6, zero=False, ax=ax1)
ax1.set_title("ACF of smoothed weekly news sentiment (lags 1–6)")
fig1.tight_layout()
fig1.savefig(acf_path, bbox_inches="tight")

# PACF
fig2, ax2 = plt.subplots(figsize=(6, 4))
plot_pacf(sm_series, lags=6, zero=False, method="ywm", ax=ax2)
ax2.set_title("PACF of smoothed weekly news sentiment (lags 1–6)")
fig2.tight_layout()
fig2.savefig(pacf_path, bbox_inches="tight")