In [6]:
import os  # File path handling
import sqlite3  # Database connection
import pandas as pd  # Data analysis
import numpy as np  # Numerical computations
from lets_plot import *  # Data visualization

# âœ… Enable Lets-Plot for Jupyter
LetsPlot.setup_html()

# ðŸ“¥ Load Data from SQLite
DB_PATH = "/files/ds105a-2024-alternative-summative-ajchan03/data/reddit_data.db"
conn = sqlite3.connect(DB_PATH)

# âœ… Load posts & comments data with JOIN to include 'subreddit'
df_comments = pd.read_sql_query("""
    SELECT comments.*, posts.subreddit
    FROM comments
    JOIN posts ON comments.post_id = posts.id;
""", conn)

# âœ… Close database connection
conn.close()

# âœ… Add Comment Length Column (Word Count)
df_comments["comment_length"] = df_comments["body"].apply(lambda x: len(x.split()))

# âœ… Categorize Sentiment
df_comments["sentiment_category"] = pd.cut(
    df_comments["comment_sentiment"],
    bins=[-1, -0.05, 0.05, 1],
    labels=["Negative", "Neutral", "Positive"]
)

# âœ… Display Filtered Data
print("\nðŸ“Š Sample Trump-Related Comments:")
display(df_comments.head())

# ðŸ“Š Sentiment Distribution by Subreddit
print("\nðŸ“Š Generating Sentiment Distribution Plot...")
p1 = ggplot(df_comments, aes(x="subreddit", y="comment_sentiment", fill="subreddit")) + \
    geom_violin(alpha=0.6) + \
    geom_boxplot(width=0.2, outlier_alpha=0.3) + \
    ggtitle("ðŸ”´ Sentiment Distribution of Trump-Related Comments") + \
    xlab("Subreddit") + ylab("Sentiment Score (-1 to 1)") + \
    theme_minimal()
display(p1)

# âœ… Filter for only 'politics' and 'Conservative' subreddits
df_comments = df_comments[df_comments["subreddit"].isin(["politics", "Conservative"])]

# ðŸ“Š Comment Length Distribution by Subreddit
print("\nðŸ“Š Generating Comment Length Distribution Plot...")
p2 = ggplot(df_comments, aes(x="comment_length", fill="subreddit")) + \
    geom_histogram(bins=30, alpha=0.6, position="identity") + \
    ggtitle("ðŸ”µ Comment Length Distribution") + \
    xlab("Comment Length (Words)") + ylab("Frequency") + \
    xlim(0, 400) + \
    theme_minimal()
display(p2)



ðŸ“Š Sample Trump-Related Comments:


Unnamed: 0,post_id,comment_id,body,score,created_utc,subreddit,comment_sentiment,subreddit.1,comment_length,sentiment_category
0,1d4emcb,l6drhd7,"You talking about twice impeached, convicted f...",1246,2024-05-30 21:16:40,politics,0.0,politics,10,Neutral
1,1d4emcb,l6dycuu,Kinda crazy to think that Donald Trump had com...,39,2024-05-30 21:56:35,politics,-0.5434,politics,19,Negative
2,1d4emcb,l6dwst2,"We can now officially add ""convicted felon"" to...",36,2024-05-30 21:47:25,politics,0.0,politics,17,Neutral
3,1d4emcb,l6dueig,"Wooo a 34/34, perfect 100%! Highest grade Trum...",72,2024-05-30 21:33:33,politics,0.6114,politics,13,Positive
4,1d4emcb,l6dvis0,The first line of his wikipedia page (not by m...,74,2024-05-30 21:39:59,politics,0.4215,politics,42,Positive



ðŸ“Š Generating Sentiment Distribution Plot...


AttributeError: 'DataFrame' object has no attribute 'dtype'