# Reddit Sentiment Analysis
## Analyze sentiment and insights from Reddit posts on a given topic

This notebook provides comprehensive sentiment analysis of Reddit posts including:
- Data collection from the Reddit API
- Sentiment analysis using TextBlob and VADER
- Visualizations and insights
- Export capabilities

## 1. Setup and Imports

In [None]:
import os
import praw
import pandas as pd
import numpy as np
from datetime import datetime, timedelta
from textblob import TextBlob
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
import matplotlib.pyplot as plt
import seaborn as sns
from wordcloud import WordCloud
from collections import Counter
import warnings
from dotenv import load_dotenv

# Notebook-wide runtime setup.
# Silence every warning category so library notices do not clutter cell output.
warnings.simplefilter('ignore')

# Load environment variables (the Reddit credentials are read with os.getenv
# when the API client is created).
load_dotenv()

# Plot defaults: seaborn grid style and a wide default figure size.
sns.set_style("whitegrid")
plt.rcParams.update({'figure.figsize': (12, 6)})

print("✓ All libraries imported successfully!")

## 2. Initialize Reddit API Connection

In [None]:
# Initialize Reddit API
# Credentials are taken from the environment; only the user agent has a fallback.
reddit_settings = dict(
    client_id=os.getenv('REDDIT_CLIENT_ID'),
    client_secret=os.getenv('REDDIT_CLIENT_SECRET'),
    user_agent=os.getenv('REDDIT_USER_AGENT', 'SentimentAnalysis/1.0'),
)
reddit = praw.Reddit(**reddit_settings)

# Initialize VADER sentiment analyzer (shared by the sentiment-analysis cell)
vader = SentimentIntensityAnalyzer()

print("✓ Connected to Reddit API")
print("✓ Read-only mode: {}".format(reddit.read_only))

## 3. Configuration - Set Your Analysis Parameters

In [None]:
# ===== CONFIGURE YOUR ANALYSIS HERE =====
TOPIC = "artificial intelligence"  # Search query passed to Reddit; edit to taste
SUBREDDIT = "all"  # A single subreddit name (e.g., 'technology'), or 'all'
LIMIT = 100  # Upper bound on posts requested (Reddit API limits still apply)
TIME_FILTER = "week"  # One of: 'hour', 'day', 'week', 'month', 'year', 'all'
# ========================================

# Echo the active settings so each run documents its own parameters.
print("Analysis Configuration:")
for _label, _value in (
    ("Topic", TOPIC),
    ("Subreddit", f"r/{SUBREDDIT}"),
    ("Posts to collect", LIMIT),
    ("Time filter", TIME_FILTER),
):
    print(f"  {_label}: {_value}")

## 4. Collect Reddit Posts

In [None]:
print(f"Collecting {LIMIT} posts about '{TOPIC}' from r/{SUBREDDIT}...")

# Explicit column list: keeps the DataFrame schema stable even when the
# search returns no posts (an empty list would otherwise yield no columns).
post_columns = [
    'id', 'title', 'text', 'score', 'upvote_ratio', 'num_comments',
    'created_utc', 'subreddit', 'author', 'url', 'permalink',
]

posts_data = []
subreddit_obj = reddit.subreddit(SUBREDDIT)

for post in subreddit_obj.search(TOPIC, limit=LIMIT, time_filter=TIME_FILTER):
    posts_data.append({
        'id': post.id,
        'title': post.title,
        'text': post.selftext,
        'score': post.score,
        'upvote_ratio': post.upvote_ratio,
        'num_comments': post.num_comments,
        # created_utc is an epoch timestamp. pd.to_datetime(unit='s') keeps it
        # in UTC (timezone-naive), whereas datetime.fromtimestamp() would
        # silently shift it into the machine's local timezone.
        'created_utc': pd.to_datetime(post.created_utc, unit='s'),
        'subreddit': post.subreddit.display_name,
        # str() so a missing author object is stored as text rather than None
        'author': str(post.author),
        'url': post.url,
        'permalink': f"https://reddit.com{post.permalink}"
    })

df = pd.DataFrame(posts_data, columns=post_columns)
print(f"✓ Collected {len(df)} posts")

# Stop here with an actionable message: every later cell needs at least one post.
if df.empty:
    raise ValueError(
        f"No posts found for '{TOPIC}' in r/{SUBREDDIT} "
        f"(time filter: {TIME_FILTER}). Try a broader topic, subreddit, or time filter."
    )

print(f"\nDate range: {df['created_utc'].min()} to {df['created_utc'].max()}")

# Display first few posts
df.head()

## 5. Perform Sentiment Analysis

In [None]:
print("Analyzing sentiment...")

# Combine title and text for analysis. Missing values become '' so they are
# not later stringified to the literal word "nan" and scored as text.
df['full_text'] = df['title'].fillna('') + ' ' + df['text'].fillna('')

# TextBlob sentiment: parse each document once and read both scores from the
# same result instead of building a second TextBlob per row.
blob_sentiments = [TextBlob(str(text)).sentiment for text in df['full_text']]
df['textblob_polarity'] = [result.polarity for result in blob_sentiments]
df['textblob_subjectivity'] = [result.subjectivity for result in blob_sentiments]

# VADER sentiment: one score dict per row, then one column per component.
vader_scores = df['full_text'].apply(
    lambda text: vader.polarity_scores(str(text))
)
for component in ('compound', 'pos', 'neu', 'neg'):
    # key=component binds the current value; the lambda is applied immediately.
    df[f'vader_{component}'] = vader_scores.apply(
        lambda scores, key=component: scores[key]
    )

# Classify sentiment from the compound score using +/-0.05 cut-offs:
# >= 0.05 is Positive, <= -0.05 is Negative, everything else is Neutral.
df['sentiment_label'] = np.select(
    [df['vader_compound'] >= 0.05, df['vader_compound'] <= -0.05],
    ['Positive', 'Negative'],
    default='Neutral',
)

print("✓ Sentiment analysis complete!")

# Display sample results
df[['title', 'vader_compound', 'sentiment_label', 'score']].head(10)

## 6. Summary Statistics

In [None]:
# Headline numbers for the collected posts, printed as a plain-text report.
banner = "=" * 60
print(banner)
print("SUMMARY STATISTICS")
print(banner)

# Engagement averages over all posts.
print("\nTotal posts analyzed: {}".format(len(df)))
print("Average score: {:.2f}".format(df['score'].mean()))
print("Average comments: {:.2f}".format(df['num_comments'].mean()))
print("Average upvote ratio: {:.2%}".format(df['upvote_ratio'].mean()))

# Count and percentage share of each sentiment label.
print("\nSentiment Distribution:")
sentiment_dist = df['sentiment_label'].value_counts()
for label, n_posts in sentiment_dist.items():
    share = (n_posts / len(df)) * 100
    print("  {}: {} ({:.1f}%)".format(label, n_posts, share))

# Mean polarity according to each of the two analyzers.
print("\nAverage VADER Compound Score: {:.3f}".format(df['vader_compound'].mean()))
print("Average TextBlob Polarity: {:.3f}".format(df['textblob_polarity'].mean()))

# Five subreddits contributing the most posts.
print("\nMost Active Subreddits:")
top_subreddits = df['subreddit'].value_counts().head(5)
for name, n_posts in top_subreddits.items():
    print("  r/{}: {} posts".format(name, n_posts))

## 7. Visualizations

### 7.1 Sentiment Distribution

In [None]:
# One colour per sentiment label, looked up by label in every panel.
# Passing a positional colour list would pair colours with whatever order the
# data happens to produce (frequency order for value_counts, alphabetical for
# the unstacked columns), e.g. painting Negative green and Positive red.
sentiment_colors = {'Positive': '#2ecc71', 'Neutral': '#95a5a6', 'Negative': '#e74c3c'}

fig, axes = plt.subplots(2, 2, figsize=(15, 10))

# Sentiment label distribution (Pie chart)
sentiment_counts = df['sentiment_label'].value_counts()
axes[0, 0].pie(sentiment_counts.values, labels=sentiment_counts.index,
               autopct='%1.1f%%', startangle=90,
               colors=[sentiment_colors[label] for label in sentiment_counts.index])
axes[0, 0].set_title('Sentiment Distribution', fontsize=14, fontweight='bold')

# VADER compound score distribution
mean_compound = df['vader_compound'].mean()
axes[0, 1].hist(df['vader_compound'], bins=30, color='#3498db', alpha=0.7, edgecolor='black')
axes[0, 1].axvline(mean_compound, color='red',
                   linestyle='--', label=f"Mean: {mean_compound:.3f}")
axes[0, 1].set_xlabel('VADER Compound Score')
axes[0, 1].set_ylabel('Frequency')
axes[0, 1].set_title('VADER Sentiment Score Distribution', fontsize=14, fontweight='bold')
axes[0, 1].legend()

# Sentiment over time: posts per calendar day, one column per sentiment label
time_sentiment = df.groupby([df['created_utc'].dt.date, 'sentiment_label']).size().unstack(fill_value=0)
# Put the columns in Positive/Neutral/Negative order, keeping only labels that
# actually occur, so the stacking order and colours are predictable.
time_sentiment = time_sentiment[[label for label in sentiment_colors
                                 if label in time_sentiment.columns]]
time_sentiment.plot(kind='area', stacked=True, ax=axes[1, 0],
                    color=[sentiment_colors[label] for label in time_sentiment.columns],
                    alpha=0.7)
axes[1, 0].set_xlabel('Date')
axes[1, 0].set_ylabel('Number of Posts')
axes[1, 0].set_title('Sentiment Trends Over Time', fontsize=14, fontweight='bold')
axes[1, 0].legend(title='Sentiment')

# Score vs Sentiment: one scatter series per label that is present in the data
for sentiment, color in sentiment_colors.items():
    data_subset = df[df['sentiment_label'] == sentiment]
    if data_subset.empty:
        continue  # avoid an empty legend entry for a label with no posts
    axes[1, 1].scatter(data_subset['vader_compound'], data_subset['score'],
                       alpha=0.5, label=sentiment, color=color)
axes[1, 1].set_xlabel('VADER Compound Score')
axes[1, 1].set_ylabel('Post Score')
axes[1, 1].set_title('Post Score vs Sentiment', fontsize=14, fontweight='bold')
axes[1, 1].legend()

plt.tight_layout()
plt.show()

### 7.2 Word Clouds

In [None]:
# Word cloud for all posts: every post's combined text joined into one corpus.
text_all = df['full_text'].astype(str).str.cat(sep=' ')
cloud_options = dict(width=800, height=400, background_color='white',
                     colormap='viridis', max_words=100)
wordcloud_all = WordCloud(**cloud_options).generate(text_all)

# Render the cloud as an image on a single frameless axes.
fig, ax = plt.subplots(figsize=(15, 7))
ax.imshow(wordcloud_all, interpolation='bilinear')
ax.axis('off')
ax.set_title('Word Cloud - All Posts', fontsize=16, fontweight='bold')
fig.tight_layout()
plt.show()

In [None]:
# Word clouds by sentiment: one panel per label, each with its own colormap.
fig, axes = plt.subplots(1, 3, figsize=(18, 5))

sentiments = ['Positive', 'Neutral', 'Negative']
colors = ['Greens', 'Greys', 'Reds']

for idx, (sentiment, cmap) in enumerate(zip(sentiments, colors)):
    data_filtered = df[df['sentiment_label'] == sentiment]
    if len(data_filtered) > 0:
        text = ' '.join(data_filtered['full_text'].astype(str))
        wordcloud = WordCloud(width=600, height=400, background_color='white',
                              colormap=cmap, max_words=50).generate(text)
        axes[idx].imshow(wordcloud, interpolation='bilinear')
    else:
        # No posts with this label: show a placeholder instead of leaving an
        # untitled, empty set of axes with tick marks in the figure.
        axes[idx].text(0.5, 0.5, 'No posts', ha='center', va='center',
                       transform=axes[idx].transAxes)
    # Frame off and title on for every panel, populated or not.
    axes[idx].axis('off')
    axes[idx].set_title(f'{sentiment} Posts ({len(data_filtered)})',
                        fontsize=12, fontweight='bold')

plt.tight_layout()
plt.show()

### 7.3 Engagement Analysis

In [None]:
fig, axes = plt.subplots(1, 2, figsize=(15, 5))

# Colour is looked up per label. The series below are sorted by value, so a
# positional colour list would give green to whichever sentiment ranks first
# rather than to Positive.
engagement_colors = {'Positive': '#2ecc71', 'Neutral': '#95a5a6', 'Negative': '#e74c3c'}

# Mean score and mean comment count per sentiment label, highest first
sentiment_score = df.groupby('sentiment_label')['score'].mean().sort_values(ascending=False)
sentiment_comments = df.groupby('sentiment_label')['num_comments'].mean().sort_values(ascending=False)

# Both panels share the same layout; only the data, title and y-label differ.
engagement_panels = [
    (axes[0], sentiment_score, 'Average Post Score by Sentiment', 'Average Score'),
    (axes[1], sentiment_comments, 'Average Comments by Sentiment', 'Average Comments'),
]
for panel_ax, averages, panel_title, y_label in engagement_panels:
    averages.plot(kind='bar', ax=panel_ax,
                  color=[engagement_colors[label] for label in averages.index])
    panel_ax.set_title(panel_title, fontsize=12, fontweight='bold')
    panel_ax.set_xlabel('Sentiment')
    panel_ax.set_ylabel(y_label)
    panel_ax.tick_params(axis='x', rotation=0)

plt.tight_layout()
plt.show()

## 8. Top Posts Analysis

In [None]:
# Ten highest-scoring posts, with their sentiment and a link back to Reddit.
print("TOP 10 POSTS BY SCORE")
print("=" * 60)
top_post_columns = ['title', 'score', 'num_comments',
                    'sentiment_label', 'vader_compound', 'permalink']
top_posts = df.nlargest(n=10, columns='score').loc[:, top_post_columns]
display(top_posts)

In [None]:
# Ten posts with the highest VADER compound score.
print("TOP 10 MOST POSITIVE POSTS")
print("=" * 60)
positive_columns = ['title', 'vader_compound', 'score', 'num_comments', 'permalink']
most_positive = df.nlargest(n=10, columns='vader_compound').loc[:, positive_columns]
display(most_positive)

In [None]:
# Ten posts with the lowest VADER compound score.
print("TOP 10 MOST NEGATIVE POSTS")
print("=" * 60)
negative_columns = ['title', 'vader_compound', 'score', 'num_comments', 'permalink']
most_negative = df.nsmallest(n=10, columns='vader_compound').loc[:, negative_columns]
display(most_negative)

## 9. Export Results

In [None]:
# Take one timestamp for the whole export so the CSV, the summary file and the
# "Date:" line inside the summary all agree (separate datetime.now() calls can
# straddle a second boundary and produce mismatched suffixes).
export_time = datetime.now()
file_stamp = export_time.strftime("%Y%m%d_%H%M%S")

# Build a filename-safe version of the topic: anything other than letters,
# digits, '-' and '_' (spaces, slashes, quotes, ...) becomes '_', so a topic
# such as "AI/ML" cannot be interpreted as a directory path.
topic_slug = ''.join(ch if ch.isalnum() or ch in '-_' else '_' for ch in TOPIC)

# Export to CSV
output_filename = f'reddit_analysis_{topic_slug}_{file_stamp}.csv'
df.to_csv(output_filename, index=False)
print(f"✓ Results exported to '{output_filename}'")

# Export summary statistics
summary_filename = f'reddit_summary_{topic_slug}_{file_stamp}.txt'
# Explicit UTF-8 so a non-ASCII topic does not depend on the platform's
# default text encoding.
with open(summary_filename, 'w', encoding='utf-8') as f:
    f.write("Reddit Sentiment Analysis Summary\n")
    f.write(f"Topic: {TOPIC}\n")
    f.write(f"Date: {export_time.strftime('%Y-%m-%d %H:%M:%S')}\n")
    f.write(f"\nTotal posts: {len(df)}\n")
    f.write(f"Average score: {df['score'].mean():.2f}\n")
    f.write(f"Average comments: {df['num_comments'].mean():.2f}\n")
    f.write("\nSentiment Distribution:\n")
    for sentiment, count in df['sentiment_label'].value_counts().items():
        f.write(f"  {sentiment}: {count} ({count/len(df)*100:.1f}%)\n")
    f.write(f"\nAverage VADER Score: {df['vader_compound'].mean():.3f}\n")

print(f"✓ Summary exported to '{summary_filename}'")

## 10. Conclusion

This notebook has provided a comprehensive sentiment analysis of Reddit posts on your chosen topic. Key insights include:

- Overall sentiment distribution across posts
- Temporal trends in sentiment
- Engagement metrics by sentiment
- Most frequent words in the posts (via word clouds)
- Top performing posts

You can modify the configuration parameters in Section 3 to analyze different topics, subreddits, or time periods.