In [None]:
from collections import Counter
from datetime import datetime
import random
import re
from gensim import corpora, models
import matplotlib.pyplot as plt
import nltk
from nltk.sentiment.vader import SentimentIntensityAnalyzer
import numpy as np
import pandas as pd
import pyLDAvis
import pyLDAvis.gensim_models as gensimvis
import requests
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import classification_report, accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.svm import LinearSVC
from wordcloud import WordCloud
import os
import praw
import io
from dotenv import load_dotenv
load_dotenv()

Method 1: Scrape Reddit using Reddit's public API endpoint

In [None]:
%%script false
# Scrape the "hot" listing of every subreddit in the reference CSV through
# Reddit's unauthenticated JSON endpoint and append the posts to data/posts.csv.
url = 'https://raw.githubusercontent.com/chapmanjacobd/reddit_mining/main/top_text_subreddits.csv'
response = requests.get(url, timeout=30)
response.raise_for_status()
subreddits_df = pd.read_csv(io.StringIO(response.content.decode('utf-8')))
count = 0
save_to = 'data/posts.csv'
os.makedirs('data', exist_ok=True)

# The timestamp column is named 'created_utc' because the cleaning cell further
# down reads posts_df['created_utc'].
columns = ['author', 'title', 'selftext', 'created_utc', 'subreddit']
headers = {'User-agent': 'Mozilla/5.0'}
limit, max_pages = 100, 5

for index, row in subreddits_df.iterrows():
    subreddit = row['subreddit']
    after, posts_data = None, []
    try:
        for _ in range(max_pages):
            listing_url = f'https://www.reddit.com/r/{subreddit}/hot.json'
            # NOTE(review): 't' presumably only affects top/controversial
            # listings, not 'hot' - confirm against the Reddit API docs.
            params = {'limit': limit, 'after': after, 't': 'year'}
            listing_response = requests.get(listing_url, headers=headers, params=params, timeout=30)
            listing_response.raise_for_status()
            # response.json() replaces json.loads(): the json module is never
            # imported in this notebook, so the old call raised NameError.
            data = listing_response.json()['data']
            posts = data['children']
            after = data['after']
            posts_data.extend(
                [post['data']['author'], post['data']['title'], post['data']['selftext'], post['data']['created_utc'], subreddit]
                for post in posts
            )
            # A null cursor marks the last page; requesting again with
            # after=None would return the first page and duplicate posts.
            if after is None:
                break
    except (requests.RequestException, KeyError, TypeError, ValueError) as exc:
        # Best effort: skip this subreddit, but do not swallow KeyboardInterrupt.
        print(f'Error collecting posts from {subreddit}: {exc}')
        continue
    # Count only posts that are actually written to disk.
    new_count = len(posts_data)
    count += new_count
    print(f'{count} total posts collected, {new_count} posts collected from {subreddit}')
    posts_df = pd.DataFrame(posts_data, columns=columns)
    # Write the header once so pd.read_csv() does not treat the first post as column names.
    write_header = not os.path.exists(save_to) or os.path.getsize(save_to) == 0
    posts_df.to_csv(save_to, mode='a', header=write_header, index=False)

Method 2: Scrape Reddit using PRAW

In [None]:
%%script false
# Scrape the newest posts of every listed subreddit through PRAW and append
# them to data/posts.csv, resuming after the last subreddit already on disk.
client_id = os.getenv('CLIENT_ID')
client_secret = os.getenv('CLIENT_SECRET')
user_agent = os.getenv('USER_AGENT')
reddit = praw.Reddit(
    client_id=client_id,
    client_secret=client_secret,
    user_agent=user_agent
)

url = 'https://raw.githubusercontent.com/chapmanjacobd/reddit_mining/main/top_text_subreddits.csv'
response = requests.get(url, timeout=30)
response.raise_for_status()
subreddits_df = pd.read_csv(io.StringIO(response.content.decode('utf-8')))
count, added_count, limit = 0, 0, None
columns = ['author', 'title', 'selftext', 'created_utc', 'subreddit', 'upvote_ratio', 'score', 'num_comments', 'over_18']
posts_parsed, new_posts, posts_df = [], [], pd.DataFrame(columns=columns)
save_to = 'data/posts.csv'
os.makedirs('data', exist_ok=True)

# Resume support: find the subreddit of the last saved row and start after it.
continue_index = 0
if os.path.exists(save_to) and os.path.getsize(save_to) > 0:
    try:
        posts_df = pd.read_csv(save_to)
    except (pd.errors.ParserError, pd.errors.EmptyDataError, UnicodeDecodeError) as exc:
        # The original called os.exit(0), which does not exist; raise a clear
        # error so the cell stops instead of failing with AttributeError.
        raise RuntimeError('Error reading posts.csv') from exc
    if 'subreddit' not in posts_df.columns:
        raise RuntimeError("Error reading posts.csv: no 'subreddit' column (was it written without a header?)")
    if not posts_df.empty:  # a header-only file means there is nothing to resume from
        last_subreddit = posts_df.iloc[-1]['subreddit']
        # NOTE(review): assumes the saved subreddit name matches the list's
        # spelling and letter case exactly - confirm.
        matches = subreddits_df.index[subreddits_df['subreddit'] == last_subreddit]
        if len(matches) == 0:
            raise RuntimeError(f'Error reading posts.csv: subreddit {last_subreddit!r} is not in the subreddit list')
        continue_index = matches[0] + 1
        print(f'Previously scraped posts found, continuing from index {continue_index} in subreddit list')
        count = len(posts_df)

for index, row in subreddits_df.iterrows():
    if index < continue_index:
        continue
    # Kept separately so the error message never uses a stale or unbound object.
    subreddit_name = row['subreddit']
    try:
        subreddit = reddit.subreddit(subreddit_name)
        new_posts = list(subreddit.new(limit=limit))
        posts_parsed = [[post.author, post.title, post.selftext, post.created_utc, post.subreddit, post.upvote_ratio, post.score, post.num_comments, post.over_18] for post in new_posts]
        posts_df = pd.DataFrame(posts_parsed, columns=columns)
        write_header = not os.path.exists(save_to) or os.path.getsize(save_to) == 0
        posts_df.to_csv(save_to, mode='a', header=write_header, index=False)
    except Exception as exc:
        # Best effort per subreddit. `Exception` (not a bare except) lets
        # KeyboardInterrupt stop the scrape.
        print(f'Error collecting posts from {subreddit_name}: {exc}')
        continue
    # Count only after the rows have been written successfully.
    added_count = len(new_posts)
    count += added_count
    print(f'{count} total posts collected, {added_count} new posts collected from {subreddit_name}')

Define sample size, merge title and selftext, and filter by text length

In [None]:
# Load the scraped posts, optionally subsample them, build a combined 'text'
# column and drop removed, deleted or very short posts.
# The scraping cells write to data/posts.csv; fall back to ./posts.csv so an
# existing file in the working directory keeps working.
posts_path = 'data/posts.csv' if os.path.exists('data/posts.csv') else 'posts.csv'
posts_df = pd.read_csv(posts_path)
sample_size = -1  # any value <= 0 keeps the full dataset
if sample_size > 0:
    # Cap at the number of rows: DataFrame.sample raises when n exceeds it.
    posts_df = posts_df.sample(min(sample_size, len(posts_df)), random_state=42)
posts_df['selftext'] = posts_df['selftext'].fillna('').astype(str)
posts_df['title'] = posts_df['title'].fillna('').astype(str)
# Join with a space so the last word of the body and the first word of the
# title are not fused into a single token.
posts_df['text'] = (posts_df['selftext'] + ' ' + posts_df['title']).str.strip()
minimum_text_length = 20

before_filtering = posts_df.shape[0]
placeholders = ['[removed]', '[deleted]']
is_placeholder = posts_df['selftext'].isin(placeholders) | posts_df['title'].isin(placeholders)
is_too_short = posts_df['text'].str.len() < minimum_text_length
# .copy() gives later cells an independent frame to add columns to.
posts_df = posts_df[~(is_placeholder | is_too_short)].copy()
after_filtering = posts_df.shape[0]
print(f'Filtered {before_filtering - after_filtering} posts')
print(posts_df.shape)
# display() returns None, so it must not be wrapped in print().
display(posts_df.head())

Remove links and every character that is not a letter, underscore, or whitespace (digits are removed too), collapse repeated whitespace, and lowercase all text

In [None]:
# Normalise the combined post text and drop posts that end up empty.
_LINK_PATTERN = re.compile(r'http\S+|www\.\S+')  # dot escaped so only a literal "www." prefix matches
_NON_LETTER_PATTERN = re.compile(r'[^a-zA-Z_\s]')
_WHITESPACE_PATTERN = re.compile(r'\s+')


def clean_text(text):
    """Strip links and non-letter characters, collapse whitespace and lowercase.

    Letters, underscores and whitespace are kept; everything else (digits,
    punctuation, emoji, ...) is deleted without inserting a separator.
    """
    without_links = _LINK_PATTERN.sub('', text)
    letters_only = _NON_LETTER_PATTERN.sub('', without_links)
    return _WHITESPACE_PATTERN.sub(' ', letters_only).strip().lower()


posts_df['cleaned-text'] = posts_df['text'].apply(clean_text)
before_filtering = posts_df.shape[0]
posts_df = posts_df[posts_df['cleaned-text'].str.strip() != '']
posts_df = posts_df.dropna(subset=['cleaned-text']).copy()
after_filtering = posts_df.shape[0]
print(f'Filtered {before_filtering - after_filtering} posts')
# display() returns None, so it must not be wrapped in print().
display(posts_df[['text', 'cleaned-text']].head())

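Remove stopwords (using the Kaggle and Princeton stopword lists) and words shorter than five characters, convert the post timestamps to datetimes, and save the cleaned data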

In [None]:
# Remove stopwords and short words from the cleaned text, convert the epoch
# timestamps to datetimes and save the result to data/posts_cleaned.csv.
def _fetch_word_list(list_url):
    """Download a newline-separated word list and return it as a set.

    Returns an empty set (after printing a warning) when the download fails,
    so an HTTP error page is never mistaken for a list of words.
    """
    try:
        list_response = requests.get(list_url, timeout=30)
        list_response.raise_for_status()
    except requests.RequestException as exc:
        print(f'Warning: could not download word list ({exc}); continuing without it')
        return set()
    return {line.strip() for line in list_response.text.splitlines() if line.strip()}


# NOTE(review): this is a signed URL carrying X-Goog-Date=20240331 and
# X-Goog-Expires=259200, so it has presumably expired - replace it with a stable source.
url1 = "https://storage.googleapis.com/kagglesdsdata/datasets/1003424/1692967/stopwords.txt?X-Goog-Algorithm=GOOG4-RSA-SHA256&X-Goog-Credential=gcp-kaggle-com%40kaggle-161607.iam.gserviceaccount.com%2F20240331%2Fauto%2Fstorage%2Fgoog4_request&X-Goog-Date=20240331T232544Z&X-Goog-Expires=259200&X-Goog-SignedHeaders=host&X-Goog-Signature=1dbfcfe70fdc7772a2c63eaf9ded9ed6f841f982e31615e846a0c4a935920c54aff228819773d9eb48d5758e2a283d99929cf2449bd43cccf4d4deeeb62445ddcfd1ffa7465afa2dc6dffe074dd0a5f3f442d745c9c7a32dfe71be4be8c678319282033a5334dbf73ed423319dc441b7066cf7c3051e63c5720c5678fbda846c810e69db9bd2378a9103827c21a114dad6aa103e783b05e58cb8206213106708cf4107fa0f1de75b9571c3a8534d4bdb986e9167ee11f04cb537cbbb2ccef67a75e710782c23195e6f68a84c8f0c5cb930d93680b7dd67b1f7093f37ea841319e1554a519e6ee841c5965c8ab8afdd38a7b219714fc3414d0aeeebc56180d8c2"
stopwords1 = _fetch_word_list(url1)
url2 = "https://algs4.cs.princeton.edu/35applications/stopwords.txt"
stopwords2 = _fetch_word_list(url2)
# One set gives constant-time membership tests instead of scanning two lists per word.
stopwords = stopwords1 | stopwords2

minimum_word_length = 5
posts_df['cleaned-text'] = posts_df['cleaned-text'].apply(
    lambda sentence: ' '.join(
        word for word in sentence.split()
        if len(word) >= minimum_word_length and word not in stopwords
    )
)
posts_df = posts_df[posts_df['cleaned-text'].str.strip() != '']
posts_df = posts_df.dropna(subset=['cleaned-text']).copy()
# Epoch seconds -> naive UTC timestamps; replaces the deprecated datetime.utcfromtimestamp.
posts_df['datetime'] = pd.to_datetime(posts_df['created_utc'], unit='s')

columns_to_drop = ['selftext', 'title', 'text', 'created_utc', 'author']
posts_df = posts_df.drop(columns=columns_to_drop)
os.makedirs('data', exist_ok=True)
posts_df.to_csv('data/posts_cleaned.csv', index=False)
# display() returns None, so it must not be wrapped in print().
display(posts_df[['cleaned-text', 'datetime']].head())

Create time of day vs post frequency bar graph

In [None]:
# Bar chart of how many posts were created in each hour of the day (UTC).
hour_series = posts_df['datetime'].map(lambda stamp: stamp.hour)
activity_by_hour = hour_series.value_counts().sort_index()

fig, ax = plt.subplots(figsize=(10, 6))
ax.bar(activity_by_hour.index, activity_by_hour.values)
ax.set_xlabel('Hour of the Day (UTC)')
ax.set_ylabel('Number of Posts')
ax.set_title('Number of Posts by Hour of the Day')
ax.set_xticks(range(0, 24))  # one tick per hour, including hours with no posts
plt.show()

Create word clouds of the most common words and the most common bad words

In [None]:
# Word cloud of all cleaned words, followed by a word cloud restricted to
# words that appear in the LDNOOBW bad-word list.
text = ' '.join(posts_df['cleaned-text'])
wordcloud = WordCloud(width=800, height=400).generate(text)
plt.figure(figsize=(10, 6))
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis('off')
plt.show()

url = "https://raw.githubusercontent.com/LDNOOBW/List-of-Dirty-Naughty-Obscene-and-Otherwise-Bad-Words/master/en"
try:
    response = requests.get(url, timeout=30)
    response.raise_for_status()
    # A set gives constant-time membership tests for the filter below.
    bad_words = set(response.text.splitlines())
except requests.RequestException as exc:
    # Do not treat an HTTP error page as a word list.
    print(f'Warning: could not download the bad-word list ({exc})')
    bad_words = set()
words = text.split()  # reuse the joined text instead of joining the column again
word_counts = Counter(words)
bad_word_counts = {word: count for word, count in word_counts.items() if word in bad_words}

# WordCloud.generate_from_frequencies raises on an empty mapping, so skip the
# plot when nothing matched.
if bad_word_counts:
    wordcloud = WordCloud(width=800, height=400).generate_from_frequencies(bad_word_counts)
    plt.figure(figsize=(10, 6))
    plt.imshow(wordcloud, interpolation='bilinear')
    plt.axis('off')
    plt.show()
else:
    print('No bad words found in the cleaned posts')

Perform topic modeling on entire dataset

In [None]:
# Fit an LDA topic model on all cleaned posts, print the topics, save the
# pyLDAvis visualisation to pyLDAvis/all_lda.html and show it inline.
number_of_topics = 3
minimum_word_length = 5
documents = posts_df['cleaned-text'].tolist()
# '>=' so that words of exactly the minimum length are kept, matching how
# minimum_word_length is applied in the stopword-removal cell.
texts = [[word for word in document.split() if len(word) >= minimum_word_length] for document in documents]
texts = [text for text in texts if len(text) > 0]
if len(texts) > 0:
    dictionary = corpora.Dictionary(texts)
    corpus = [dictionary.doc2bow(text) for text in texts]
    # Fixed seed so the topics are reproducible across runs.
    lda = models.LdaModel(corpus, num_topics=number_of_topics, id2word=dictionary, passes=5, random_state=42)
    topics = lda.print_topics(num_words=5)
    for topic in topics:
        print(topic)
    vis_data = gensimvis.prepare(lda, corpus, dictionary)
    os.makedirs('pyLDAvis', exist_ok=True)
    pyLDAvis.save_html(vis_data, 'pyLDAvis/all_lda.html')
    # Inside an `if` block the value is not the cell's last expression, so it
    # has to be passed to display() explicitly to be rendered.
    display(pyLDAvis.display(vis_data))
else:
    print('No documents to analyze')

Perform topic modeling on specific subreddits

In [None]:
# Fit a separate LDA topic model for a random selection of subreddits and save
# one pyLDAvis HTML file per subreddit.
unique_subreddits = posts_df['subreddit'].unique().tolist()
# random.sample raises when asked for more items than exist.
selected_subreddits = random.sample(unique_subreddits, min(5, len(unique_subreddits)))
number_of_topics = 3
minimum_word_length = 5

for subreddit in selected_subreddits:
    subreddit_df = posts_df[posts_df['subreddit'] == subreddit]
    # Model the post text; the original read the 'subreddit' column here, so
    # every document was just the subreddit's name.
    documents = subreddit_df['cleaned-text'].tolist()
    # '>=' so that words of exactly the minimum length are kept.
    texts = [[word for word in document.split() if len(word) >= minimum_word_length] for document in documents]
    texts = [text for text in texts if len(text) > 0]
    if len(texts) > 0:
        dictionary = corpora.Dictionary(texts)
        corpus = [dictionary.doc2bow(text) for text in texts]
        # Fixed seed so the topics are reproducible across runs.
        lda = models.LdaModel(corpus, num_topics=number_of_topics, id2word=dictionary, passes=5, random_state=42)
        topics = lda.print_topics(num_words=5)
        print(f'Subreddit: {subreddit}')
        print('Topics:')
        for topic in topics:
            print(topic)
        vis_data = gensimvis.prepare(lda, corpus, dictionary)
        os.makedirs('pyLDAvis', exist_ok=True)
        pyLDAvis.save_html(vis_data, f'pyLDAvis/{subreddit}_lda.html')
    else:
        print(f'No documents to analyze for {subreddit}')

Create time vs posts graphs for specific subreddits

In [None]:
# Plot post counts over time, with a linear trend line, for a random selection
# of subreddits.
unique_subreddits = posts_df['subreddit'].unique().tolist()
# random.sample raises when asked for more items than exist.
selected_subreddits = random.sample(unique_subreddits, min(5, len(unique_subreddits)))

for subreddit in selected_subreddits:
    subreddit_df = posts_df[posts_df['subreddit'] == subreddit].copy()
    subreddit_df['datetime'] = pd.to_datetime(subreddit_df['datetime'])
    subreddit_df = subreddit_df.set_index('datetime')
    # Daily buckets by default; switch to weekly buckets when there are more
    # than 50 days so the plot stays readable.
    post_counts = subreddit_df.resample('D').size()
    if len(post_counts) > 50:
        post_counts = subreddit_df.resample('W').size()
    plt.figure(figsize=(10, 6))
    plt.plot(post_counts.index, post_counts.values, label='Post Count')
    plt.scatter(post_counts.index, post_counts.values, color='red')
    # A straight-line fit needs at least two points.
    if len(post_counts) >= 2:
        positions = range(len(post_counts))
        trend = np.poly1d(np.polyfit(positions, post_counts.values, 1))
        plt.plot(post_counts.index, trend(positions), "r--", label='Trend Line')
    plt.title('Post Frequency Over Time for ' + subreddit)
    plt.xlabel('Time')
    plt.ylabel('Post Count')
    plt.legend()
    plt.xticks(rotation=45)
    plt.ylim(0, post_counts.max() + 1)
    start_date = pd.to_datetime(post_counts.index.min())
    end_date = pd.to_datetime(post_counts.index.max())
    # Identical limits (a single bucket) would produce a degenerate axis.
    if start_date < end_date:
        plt.xlim(start_date, end_date)
    plt.show()

Perform sentiment analysis by subreddit

In [None]:
# Score every post with VADER, average the compound score per subreddit and
# draw three horizontal bar charts: an evenly spaced sample of subreddits, the
# highest-scoring ones and the lowest-scoring ones.
def _ordinal(number):
    """Return `number` with its English ordinal suffix (1st, 2nd, 3rd, 4th, 11th, 21st, ...)."""
    if 10 <= number % 100 <= 20:
        suffix = 'th'  # 11th, 12th, 13th are irregular
    else:
        suffix = {1: 'st', 2: 'nd', 3: 'rd'}.get(number % 10, 'th')
    return f'{number}{suffix}'


def _plot_sentiment_scores(scores, title):
    """Draw a horizontal bar chart of mean sentiment per subreddit on a fixed [-1, 1] axis."""
    plt.figure(figsize=(10, 25))
    plt.barh(scores.index, scores.values)
    plt.xlabel('Sentiment Score')
    plt.ylabel('Subreddit')
    plt.title(title)
    plt.xticks(rotation=45)
    plt.xlim(-1, 1)
    plt.show()


nltk.download('vader_lexicon')
sia = SentimentIntensityAnalyzer()
# NOTE(review): 'cleaned-text' has had stopwords and words shorter than five
# characters removed, which presumably discards many sentiment-bearing words -
# consider scoring the raw text instead.
posts_df['sentiment_score'] = posts_df['cleaned-text'].apply(lambda text: sia.polarity_scores(text)['compound'])
average_scores = posts_df.groupby('subreddit')['sentiment_score'].mean().sort_values()

number_subreddits_every = 100
num_unique_subreddits = posts_df['subreddit'].nunique()
# At least 1: with fewer than 100 subreddits the integer division gives 0 and
# a slice step of zero raises ValueError.
subreddit_every = max(1, num_unique_subreddits // number_subreddits_every)
average_scores_spliced = average_scores.iloc[::subreddit_every]
if subreddit_every == 1:
    spliced_title = 'Sentiment Score by Subreddit'
else:
    spliced_title = f'Sentiment Score by Every {_ordinal(subreddit_every)} Subreddit'
_plot_sentiment_scores(average_scores_spliced, spliced_title)

number_of_subreddits_highest = 100
average_scores_highest = average_scores.nlargest(number_of_subreddits_highest).sort_values()
_plot_sentiment_scores(
    average_scores_highest,
    f'Sentiment Score by Highest {number_of_subreddits_highest} Scoring Subreddits',
)

number_of_subreddits_lowest = 100
average_scores_lowest = average_scores.nsmallest(number_of_subreddits_lowest).sort_values()[::-1]
_plot_sentiment_scores(
    average_scores_lowest,
    f'Sentiment Score by Lowest {number_of_subreddits_lowest} Scoring Subreddits',
)

Create subreddit prediction model

In [None]:
# Train a TF-IDF + linear SVM classifier that predicts a post's subreddit from
# its cleaned text, report hold-out metrics and classify one example post.
features = posts_df['cleaned-text']
labels = posts_df['subreddit']
X_train, X_test, y_train, y_test = train_test_split(
    features,
    labels,
    test_size=0.2,
    random_state=42,
)

pipeline_steps = [
    ('tfidf', TfidfVectorizer()),
    ('clf', LinearSVC()),
]
text_clf = Pipeline(pipeline_steps)
text_clf.fit(X_train, y_train)

# Evaluate on the 20% hold-out split.
predictions = text_clf.predict(X_test)
accuracy = accuracy_score(y_test, predictions)
print(f"Model Accuracy: {accuracy}")
report = classification_report(y_test, predictions)
print(report)

# Classify a single hand-written post.
custom_text = "I am a huge fan of the show Friends. I have watched every episode multiple times and I can't get enough of it. I love the characters and the storylines. I think it is one of the best TV shows ever made."
prediction = text_clf.predict([custom_text])
print(f"The predicted subreddit for the given post is: {prediction[0]}")