# 05 â€” Trend Analysis

**Objective:** Analyze time series sentiment trends, correlate with tech events, detect anomalies, and measure engagement-sentiment relationships.

In [1]:
import sys
sys.path.insert(0, '..')

import pandas as pd
import numpy as np
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
from scipy import stats

# Load data
df = pd.read_parquet('../data/processed/posts_final.parquet')

# Load events timeline
events = pd.read_csv('../data/external/tech_events_timeline.csv')

print(f"Loaded {len(df):,} posts and {len(events)} tech events")

Loaded 294,704 posts and 19 tech events


## 5.1 Sentiment Trend with Event Annotations

In [2]:
# Weekly sentiment with event overlays
df['week_start'] = pd.to_datetime(df['created_utc']).dt.tz_localize(None).dt.to_period('W').apply(lambda x: x.start_time)
weekly = df.groupby('week_start').agg(
    avg_sentiment=('vader_compound', 'mean'),
    post_count=('id', 'count'),
    avg_score=('score', 'mean'),
).reset_index()

fig = make_subplots(rows=2, cols=1, shared_xaxes=True, row_heights=[0.7, 0.3],
                    subplot_titles=['Weekly Sentiment with Tech Events', 'Post Volume'])

# Sentiment line
fig.add_trace(go.Scatter(x=weekly['week_start'], y=weekly['avg_sentiment'],
                         mode='lines', name='Avg Sentiment',
                         line=dict(color='#3498db', width=2)), row=1, col=1)
fig.add_hline(y=0, line_dash='dash', line_color='gray', row=1, col=1)

# Event annotations
for _, event in events.iterrows():
    try:
        event_date = pd.to_datetime(event['date'])
        if weekly['week_start'].min() <= event_date <= weekly['week_start'].max():
            fig.add_vline(x=event_date, line_dash='dot', line_color='rgba(255,0,0,0.3)', row=1, col=1)
            fig.add_annotation(x=event_date, y=weekly['avg_sentiment'].max(),
                             text=event['event'][:30], showarrow=True,
                             arrowhead=2, font=dict(size=8), row=1, col=1)
    except:
        pass

# Volume bars
fig.add_trace(go.Bar(x=weekly['week_start'], y=weekly['post_count'],
                     name='Post Volume', marker_color='#95a5a6', opacity=0.5), row=2, col=1)

fig.update_layout(height=600, showlegend=True)
fig.show()

## 5.2 Subreddit-Level Trends

In [3]:
# Sentiment per subreddit over time
df['month_str'] = pd.to_datetime(df['created_utc']).dt.tz_localize(None).dt.to_period('M').astype(str)
monthly_sub = df.groupby(['month_str', 'subreddit'])['vader_compound'].mean().reset_index()

fig = px.line(monthly_sub, x='month_str', y='vader_compound', color='subreddit',
              markers=True, title='Monthly Sentiment Trend by Subreddit')
fig.add_hline(y=0, line_dash='dash', line_color='gray')
fig.update_layout(height=450, xaxis_title='Month', yaxis_title='Avg VADER Compound',
                  xaxis_tickangle=45)
fig.show()

## 5.3 Rolling Sentiment & Anomaly Detection

In [4]:
# 7-day rolling sentiment with anomaly detection
daily = df.set_index(pd.to_datetime(df['created_utc'])).resample('D')['vader_compound'].mean().dropna()
rolling_mean = daily.rolling(7).mean()
rolling_std = daily.rolling(7).std()

# Z-score based anomalies
z_scores = (daily - rolling_mean) / rolling_std
anomalies = daily[z_scores.abs() > 2]

fig = go.Figure()
fig.add_trace(go.Scatter(x=daily.index, y=daily.values, mode='lines',
                         name='Daily Sentiment', line=dict(color='#bdc3c7', width=1)))
fig.add_trace(go.Scatter(x=rolling_mean.index, y=rolling_mean.values, mode='lines',
                         name='7-Day Rolling Mean', line=dict(color='#3498db', width=2)))
fig.add_trace(go.Scatter(x=anomalies.index, y=anomalies.values, mode='markers',
                         name='Anomalies (|z|>2)', marker=dict(color='#e74c3c', size=8)))
fig.update_layout(height=400, title='Daily Sentiment with Anomaly Detection')
fig.show()

n_anomalies = len(anomalies)
print(f"Detected {n_anomalies} anomalous days")
print(anomalies.sort_values().head(10))

Detected 49 anomalous days
created_utc
2014-01-11 00:00:00+00:00   -0.867000
2010-09-22 00:00:00+00:00   -0.790600
2012-07-15 00:00:00+00:00   -0.781600
2011-07-14 00:00:00+00:00   -0.645100
2012-04-13 00:00:00+00:00   -0.551550
2014-02-16 00:00:00+00:00   -0.438800
2021-04-10 00:00:00+00:00   -0.214667
2011-12-11 00:00:00+00:00    0.000000
2014-07-12 00:00:00+00:00    0.000000
2014-06-10 00:00:00+00:00    0.000000
Name: vader_compound, dtype: float64


## 5.4 Engagement-Sentiment Correlation

In [5]:
# Does negative content get more engagement?
corr_score = stats.pearsonr(df['vader_compound'], np.log1p(df['score']))
corr_comments = stats.pearsonr(df['vader_compound'], np.log1p(df['num_comments']))

print(f"Sentiment â†” Score correlation:    r={corr_score[0]:+.4f}, p={corr_score[1]:.4e}")
print(f"Sentiment â†” Comments correlation: r={corr_comments[0]:+.4f}, p={corr_comments[1]:.4e}")

# Interpret direction
score_dir = 'positive' if corr_score[0] > 0 else 'negative'
comment_dir = 'positive' if corr_comments[0] > 0 else 'negative'
print(f"\nInterpretation:")
print(f"  Score: {score_dir} correlation â€” {'more positive posts tend to score higher' if corr_score[0] > 0 else 'more negative posts tend to score higher'}")
print(f"  Comments: {comment_dir} correlation â€” {'more positive posts get more comments' if corr_comments[0] > 0 else 'more negative posts get more comments'}")

fig = px.scatter(df.sample(2000, random_state=42),
                 x='vader_compound', y='log_score', color='subreddit',
                 title='Sentiment vs Engagement (2K sample)',
                 labels={'vader_compound': 'VADER Compound', 'log_score': 'Log(Score+1)'},
                 opacity=0.5)
fig.update_layout(height=400)
fig.show()

Sentiment â†” Score correlation:    r=+0.0109, p=3.0099e-09
Sentiment â†” Comments correlation: r=+0.1765, p=0.0000e+00

Interpretation:
  Score: positive correlation â€” more positive posts tend to score higher
  Comments: positive correlation â€” more positive posts get more comments


In [6]:
# Engagement by sentiment bucket
df['sent_bucket'] = pd.cut(df['vader_compound'], bins=[-1, -0.5, -0.05, 0.05, 0.5, 1],
                           labels=['Very Negative', 'Negative', 'Neutral', 'Positive', 'Very Positive'])

bucket_eng = df.groupby('sent_bucket', observed=True).agg(
    n=('id', 'count'),
    avg_score=('score', 'mean'),
    avg_comments=('num_comments', 'mean'),
    median_score=('score', 'median'),
).round(1)

print("Engagement by Sentiment Bucket:")
print(bucket_eng.to_string())

Engagement by Sentiment Bucket:
                    n  avg_score  avg_comments  median_score
sent_bucket                                                 
Very Negative   11096        6.0           8.1           1.0
Negative        18896        6.1           4.9           1.0
Neutral        105815        5.3           2.3           1.0
Positive        52975        4.8           3.5           1.0
Very Positive  105922        5.1           6.7           1.0


## Summary

- Sentiment follows identifiable patterns over time with clear event-driven shifts
- Z-score anomaly detection identified significant sentiment shift days (see count above)
- Engagement-sentiment correlation measured empirically â€” direction and strength reported from actual data
- Subreddit-level trends show distinct sentiment trajectories across communities
- Bucketed analysis reveals how engagement varies across sentiment levels

**Next:** 06_insights_and_findings.ipynb