# 06 — Insights & Business Findings

**Objective:** Synthesize all analyses into actionable business insights and key takeaways for stakeholders.

In [1]:
import sys
sys.path.insert(0, '..')

import pandas as pd
import numpy as np
from scipy import stats

# Read the final analyzed posts and report basic size / coverage figures.
df = pd.read_parquet('../data/processed/posts_final.parquet')

post_count = len(df)
print(f"Dataset: {post_count:,} posts")

subreddit_count = df['subreddit'].nunique()
print(f"Subreddits: {subreddit_count}")

# Parse the timestamp column once and reuse it for both ends of the range.
created_at = pd.to_datetime(df['created_utc'])
first_day = created_at.min().date()
last_day = created_at.max().date()
print(f"Date range: {first_day} to {last_day}")

# The transformer comparison sample is optional: later cells check
# `has_transformer` before touching `df_sample`.
try:
    df_sample = pd.read_parquet('../data/processed/sentiment_sample_10k.parquet')
except FileNotFoundError:
    has_transformer = False
    print("Transformer sample not found ‚Äî VADER-only metrics")
else:
    has_transformer = True
    print(f"Transformer sample: {len(df_sample):,} posts")

Dataset: 294,704 posts
Subreddits: 6
Date range: 2008-06-23 to 2022-05-08
Transformer sample: 9,996 posts


## Executive Summary

In [2]:
# Compute all metrics from data — nothing hardcoded


def _signed_log1p(values):
    """Return sign(x) * log1p(|x|) element-wise as a float array.

    For non-negative input this equals ``np.log1p(x)``. Unlike ``np.log1p`` it
    stays finite for negative values, where ``np.log1p`` yields -inf (x == -1)
    or NaN (x < -1).
    """
    arr = np.asarray(values, dtype=float)
    return np.sign(arr) * np.log1p(np.abs(arr))


def _sentiment_engagement_corr(frame, engagement_col, sentiment_col='vader_compound'):
    """Pearson correlation between sentiment and a log-scaled engagement column.

    Rows where either value is missing are dropped before the test so that
    missing data does not propagate into the statistic.

    Returns the ``scipy.stats.pearsonr`` result (index 0 is r, index 1 is p).
    """
    pairs = frame[[sentiment_col, engagement_col]].dropna()
    return stats.pearsonr(
        pairs[sentiment_col].to_numpy(dtype=float),
        _signed_log1p(pairs[engagement_col]),
    )


print("=" * 70)
print("KEY METRICS (computed from data)")
print("=" * 70)

# Overall sentiment
avg_sent = df['vader_compound'].mean()
pct_pos = (df['vader_label'] == 'positive').mean()
pct_neg = (df['vader_label'] == 'negative').mean()
pct_neu = (df['vader_label'] == 'neutral').mean()
print(f"\nüìä Overall Sentiment:")
print(f"   Avg VADER compound: {avg_sent:+.4f}")
print(f"   Positive: {pct_pos:.1%} | Neutral: {pct_neu:.1%} | Negative: {pct_neg:.1%}")

# Most positive / negative subreddits (ascending: least positive first)
sub_sent = df.groupby('subreddit')['vader_compound'].mean().sort_values()
print(f"\nüèÜ Subreddit Ranking (by avg sentiment):")
for sub, score in sub_sent.items():
    print(f"   {sub:<25} {score:+.4f}")

# Most active subreddit — count once, read both the label and the size from it
subreddit_counts = df['subreddit'].value_counts()
most_active = subreddit_counts.idxmax()
most_active_count = subreddit_counts.max()
print(f"\nüî• Most Active: r/{most_active} ({most_active_count:,} posts)")

# Engagement-sentiment correlation (engagement is log-scaled)
corr_score = _sentiment_engagement_corr(df, 'score')
corr_comments = _sentiment_engagement_corr(df, 'num_comments')
print(f"\nüìà Engagement Correlations:")
print(f"   Sentiment ‚Üî Score:    r={corr_score[0]:+.4f} (p={corr_score[1]:.2e})")
print(f"   Sentiment ‚Üî Comments: r={corr_comments[0]:+.4f} (p={corr_comments[1]:.2e})")

# Topic distribution
if 'topic_name' in df.columns:
    print(f"\nüìå Top Topics:")
    top_topics = df['topic_name'].value_counts().head(5)
    # One grouped pass instead of re-filtering the full frame per topic.
    topic_sentiment = df.groupby('topic_name', observed=True)['vader_compound'].mean()
    for topic, count in top_topics.items():
        # .get keeps the NaN result the per-topic filter gave for a topic with no rows.
        topic_sent = topic_sentiment.get(topic, np.nan)
        print(f"   {topic:<30} {count:>6,} posts (sentiment: {topic_sent:+.3f})")

# Transformer comparison (if available)
if has_transformer:
    # Use same binary mapping as NB03 for consistency.
    # Anything that is not 'positive' (including 'neutral') is treated as
    # 'negative', so the agreement rate depends on this folding choice.
    vader_binary = pd.Series(
        np.where(df_sample['vader_label'] == 'positive', 'positive', 'negative'),
        index=df_sample.index,
    )
    agreement = (vader_binary == df_sample['transformer_label']).mean()
    print(f"\nü§ñ VADER ‚Üî DistilBERT Agreement: {agreement:.1%}")
    print(f"   (Low agreement expected ‚Äî SST-2 model trained on movie reviews, not tech text)")

# Posting patterns — parse timestamps once and derive both features from them
created_at = pd.to_datetime(df['created_utc'])
df['hour'] = created_at.dt.hour
df['dow'] = created_at.dt.day_name()
# NOTE(review): these are independent modes (busiest hour overall, busiest
# weekday overall), not the busiest (weekday, hour) combination.
peak_hour = df['hour'].mode()[0]
peak_day = df['dow'].mode()[0]
print(f"\n‚è∞ Peak Activity: {peak_day}s at {peak_hour}:00 UTC")

KEY METRICS (computed from data)

üìä Overall Sentiment:
   Avg VADER compound: +0.3038
   Positive: 53.9% | Neutral: 35.9% | Negative: 10.2%

üèÜ Subreddit Ranking (by avg sentiment):
   MachineLearning           +0.2352
   artificial                +0.2711
   analytics                 +0.3230
   computerscience           +0.3282
   datascience               +0.3953
   dataengineering           +0.4586

üî• Most Active: r/MachineLearning (120,765 posts)

üìà Engagement Correlations:
   Sentiment ‚Üî Score:    r=+0.0109 (p=3.01e-09)
   Sentiment ‚Üî Comments: r=+0.1765 (p=0.00e+00)

üìå Top Topics:
   Help / Would                   40,549 posts (sentiment: +0.482)
   Computer / Science / Year      38,285 posts (sentiment: +0.484)
   Learning / Machine / Intelligence 31,162 posts (sentiment: +0.165)
   Work / Time / People           29,909 posts (sentiment: +0.367)
   Function / Algorithm / Problem 25,534 posts (sentiment: +0.211)

ü§ñ VADER ‚Üî DistilBERT Agreement: 49.8%
   (Low

## Methodology Comparison

In [3]:
# Render the methodology comparison as plain text: a fixed-width
# VADER vs DistilBERT table followed by a short LDA summary.
banner = "=" * 70
rule = "-" * 50

print(banner)
print("METHODOLOGY COMPARISON")
print(banner)

print("\nSentiment Analysis: VADER vs DistilBERT")
print(rule)

# One tuple per table row: (row label, VADER cell, DistilBERT cell).
comparison_rows = (
    ('Metric', 'VADER', 'DistilBERT'),
    ('Scale', 'Full dataset', '10K sample'),
    ('Output classes', '3 (pos/neu/neg)', '2 (pos/neg)'),
    ('Training domain', 'Rule-based', 'SST-2 (movies)'),
    ('Handles sarcasm', 'Limited', 'Better (in-domain)'),
    ('Requires GPU', 'No', 'Optional'),
    ('Interpretability', 'High', 'Medium'),
)
for row_label, vader_cell, bert_cell in comparison_rows:
    print(f"{row_label:<25} {vader_cell:<20} {bert_cell:<20}")

# `agreement` only exists if an earlier cell computed it, so check for the
# name before formatting it.
if has_transformer and 'agreement' in globals():
    print(f"{'Agreement rate':<25} {agreement:.1%}")

for note_line in (
    "\nNote: DistilBERT's SST-2 checkpoint was not fine-tuned for tech forum text.",
    "Low agreement reflects domain mismatch, not a methodology failure.",
    "Recommendation: Fine-tune on ~1K labeled tech posts for production use.",
):
    print(note_line)

print("\n\nTopic Modeling: LDA")
print(rule)
if 'topic_name' in df.columns:
    n_topics = df['topic_name'].nunique()
    # The confidence column is optional; fall back to "N/A" when it is absent.
    if 'lda_topic_prob' in df.columns:
        avg_conf = df['lda_topic_prob'].mean()
    else:
        avg_conf = None
    confidence_text = "N/A" if avg_conf is None else f"{float(avg_conf):.3f}"

    print(f"  Topics discovered: {n_topics}")
    print(f"  Avg assignment confidence: {confidence_text}")
    print("  Coherence evaluated across k=5,10,15,20 ‚Äî best model selected by c_v")
    print("  Note: BERTopic not run (requires additional compute). Would likely")
    print("  produce more coherent topics on short Reddit text.")

METHODOLOGY COMPARISON

Sentiment Analysis: VADER vs DistilBERT
--------------------------------------------------
Metric                    VADER                DistilBERT          
Scale                     Full dataset         10K sample          
Output classes            3 (pos/neu/neg)      2 (pos/neg)         
Training domain           Rule-based           SST-2 (movies)      
Handles sarcasm           Limited              Better (in-domain)  
Requires GPU              No                   Optional            
Interpretability          High                 Medium              
Agreement rate            49.8%

Note: DistilBERT's SST-2 checkpoint was not fine-tuned for tech forum text.
Low agreement reflects domain mismatch, not a methodology failure.
Recommendation: Fine-tune on ~1K labeled tech posts for production use.


Topic Modeling: LDA
--------------------------------------------------
  Topics discovered: 15
  Avg assignment confidence: 0.483
  Coherence evaluated across 

## Limitations & Future Work

In [4]:
# Static summary text: each entry is one output line, printed in order.
limitation_lines = [
    "  ‚Ä¢ Data sourced from Kaggle datasets (2008‚Äì2022), not live Reddit API",
    "  ‚Ä¢ VADER struggles with tech-specific sarcasm (e.g., 'great, another AI tool')",
    "  ‚Ä¢ DistilBERT SST-2 trained on movie reviews ‚Äî domain mismatch with tech text",
    "    causes systematic negative bias (neutral tech posts ‚Üí negative classification)",
    "  ‚Ä¢ DistilBERT only produces binary (pos/neg) ‚Äî no neutral class",
    "  ‚Ä¢ LDA topic quality depends on preprocessing; short posts are challenging",
    "  ‚Ä¢ No comment-level analysis (post titles and body text only)",
    "  ‚Ä¢ DistilBERT ran on 10K sample, not full dataset",
]
future_work_lines = [
    "  ‚Ä¢ Fine-tune DistilBERT on tech-specific labeled data (~1K labeled posts)",
    "  ‚Ä¢ Add real-time Reddit streaming for live sentiment monitoring",
    "  ‚Ä¢ Run BERTopic for comparison with LDA",
    "  ‚Ä¢ Correlate sentiment with actual stock price movements (lag analysis)",
    "  ‚Ä¢ Deploy to Streamlit Cloud for public access",
    "  ‚Ä¢ Expand to HackerNews for cross-platform comparison",
]

print("LIMITATIONS:")
for text_line in limitation_lines:
    print(text_line)
print()
print("FUTURE WORK:")
for text_line in future_work_lines:
    print(text_line)

LIMITATIONS:
  ‚Ä¢ Data sourced from Kaggle datasets (2008‚Äì2022), not live Reddit API
  ‚Ä¢ VADER struggles with tech-specific sarcasm (e.g., 'great, another AI tool')
  ‚Ä¢ DistilBERT SST-2 trained on movie reviews ‚Äî domain mismatch with tech text
    causes systematic negative bias (neutral tech posts ‚Üí negative classification)
  ‚Ä¢ DistilBERT only produces binary (pos/neg) ‚Äî no neutral class
  ‚Ä¢ LDA topic quality depends on preprocessing; short posts are challenging
  ‚Ä¢ No comment-level analysis (post titles and body text only)
  ‚Ä¢ DistilBERT ran on 10K sample, not full dataset

FUTURE WORK:
  ‚Ä¢ Fine-tune DistilBERT on tech-specific labeled data (~1K labeled posts)
  ‚Ä¢ Add real-time Reddit streaming for live sentiment monitoring
  ‚Ä¢ Run BERTopic for comparison with LDA
  ‚Ä¢ Correlate sentiment with actual stock price movements (lag analysis)
  ‚Ä¢ Deploy to Streamlit Cloud for public access
  ‚Ä¢ Expand to HackerNews for cross-platform comparison


## Skills Demonstrated

- **NLP:** Text preprocessing, sentiment analysis (VADER + DistilBERT transformer), LDA topic modeling
- **Data Engineering:** Kaggle dataset integration, SQLite storage, parquet ETL pipeline
- **Machine Learning:** Model comparison, coherence-based hyperparameter selection, ensemble scoring
- **Statistical Analysis:** Correlation analysis, z-score anomaly detection, time series trends
- **Visualization:** Interactive dashboards (Streamlit + Plotly), publication-quality charts
- **Software Engineering:** Modular architecture, unit testing (pytest), YAML configuration, documentation

---

*Project by Shril Patel — [GitHub](https://github.com/shrilpatel) | [LinkedIn](https://linkedin.com/in/shrilpatel)*