In [1]:
## Install praw for scrapping data from Reddit
!pip3 install praw

Collecting praw
  Downloading praw-7.7.1-py3-none-any.whl (191 kB)
Collecting prawcore<3,>=2.1
  Downloading prawcore-2.4.0-py3-none-any.whl (17 kB)
Installing collected packages: prawcore, praw
Successfully installed praw-7.7.1 prawcore-2.4.0


In [2]:
# Import necessary libraries
import os
from datetime import datetime, timedelta

import pandas as pd
import praw
from prawcore import NotFound

In [None]:
# Reddit API credentials
# Secrets are read from environment variables so they are never stored in the
# notebook. Set REDDIT_CLIENT_ID, REDDIT_CLIENT_SECRET, REDDIT_USERNAME and
# REDDIT_PASSWORD before starting Jupyter; a missing variable raises a
# KeyError that names it.
reddit_username = os.environ['REDDIT_USERNAME']

reddit = praw.Reddit(
    client_id=os.environ['REDDIT_CLIENT_ID'],
    client_secret=os.environ['REDDIT_CLIENT_SECRET'],
    # Format is: 'Scrapper version_number by /u/reddit_username'
    user_agent=f'Scrapper 1.0 by /u/{reddit_username}',
    username=reddit_username,
    password=os.environ['REDDIT_PASSWORD'],
)

In [None]:
def _utc_midnight_timestamp(date_str):
    """Return the Unix timestamp of 00:00:00 UTC on a 'YYYY-MM-DD' date."""
    # Subtracting the naive epoch treats the parsed date as UTC. Calling
    # .timestamp() on a naive datetime would instead apply the machine's local
    # timezone, shifting the window relative to the `created_utc` values.
    midnight = datetime.strptime(date_str, '%Y-%m-%d')
    return int((midnight - datetime(1970, 1, 1)).total_seconds())


def search_reddit(keywords, subreddits, start_date, end_date):
    """
    Search Reddit posts and their comments by keyword, subreddit and date range.

    Every keyword is searched in every subreddit through the module-level
    `reddit` client. A post is kept when its `created_utc` falls inside the
    date range; its comments are kept under the same rule. A post matched by
    several keywords appears once per keyword.

    :param keywords: List of keywords to search for
    :param subreddits: List of subreddits to search within
    :param start_date: First day of the range (inclusive), 'YYYY-MM-DD', read as UTC
    :param end_date: Last day of the range (inclusive, whole day), 'YYYY-MM-DD', read as UTC
    :return: DataFrame with one row per matching post and the columns
             Keyword, Subreddit, Title, Text, Created_UTC, Score, Num_Comments,
             URL and Comments (a list of dicts with Comment_Text,
             Comment_Created_UTC and Comment_Score). The columns are present
             even when nothing matched.
    :raises ValueError: if a date does not match 'YYYY-MM-DD'

    Errors raised while searching one subreddit/keyword pair are printed and
    the search continues with the next pair.
    """
    columns = ['Keyword', 'Subreddit', 'Title', 'Text', 'Created_UTC',
               'Score', 'Num_Comments', 'URL', 'Comments']

    # Half-open window [start, end): the upper bound is midnight *after*
    # end_date so that the whole final day is included.
    start_timestamp = _utc_midnight_timestamp(start_date)
    end_timestamp = _utc_midnight_timestamp(end_date) + 24 * 60 * 60

    data = []

    for subreddit in subreddits:
        for keyword in keywords:
            try:
                for submission in reddit.subreddit(subreddit).search(keyword, limit=None, time_filter='all'):
                    if not (start_timestamp <= submission.created_utc < end_timestamp):
                        continue

                    # Expand every "load more comments" placeholder so the
                    # flattened list below contains the full comment tree.
                    submission.comments.replace_more(limit=None)
                    comments = [
                        {
                            'Comment_Text': comment.body,
                            'Comment_Created_UTC': comment.created_utc,
                            'Comment_Score': comment.score
                        }
                        for comment in submission.comments.list()
                        if start_timestamp <= comment.created_utc < end_timestamp
                    ]

                    data.append({
                        'Keyword': keyword,
                        'Subreddit': subreddit,
                        'Title': submission.title,
                        'Text': submission.selftext,
                        'Created_UTC': submission.created_utc,
                        'Score': submission.score,
                        'Num_Comments': submission.num_comments,
                        'URL': submission.url,
                        'Comments': comments
                    })
            except NotFound:
                print(f"Subreddit {subreddit} not found or inaccessible.")
            except Exception as e:
                # Best effort: report the failure and keep what was collected so far.
                print(f"An error occurred while processing subreddit {subreddit} with keyword {keyword}: {e}")

    # Passing the columns explicitly keeps the schema stable for empty results.
    return pd.DataFrame(data, columns=columns)

In [None]:
# Search configuration: edit the values below to change what gets scraped.

# Terms searched inside every subreddit.
keywords = [
    'health',
    'trust',
    'maternal',
    'underrepresented',
]

# Earlier candidate communities, kept for reference only (not searched):
#   CanadaHealth, CanadianHealthcare, HealthCanada, Canada,
#   BabyBumpsCanada, CanadianParents, Parenting,
#   IndigenousCanada, FirstNations, BlackCanada, NewcomersCanada, LGBTCanada

# Communities that are searched.
subreddits = [
    'BabyBumpsCanada',
    'CanadianParents',
    'IndigenousCanada',
    'lgbtcanada',
    'alberta',
    'AlbertaHealthServices',
    'AskACanadian',
    'ontario',
    'publichealth',
    'ottawa',
    'britishcolumbia',
    'Manitoba',
    'saskatchewan',
    'NovaScotia',
    'newfoundland',
    'nunavut',
    'PEI',
    'newbrunswickcanada',
    'canada',
    'FirstNationsCanada',
]

# Date range of interest, formatted as 'YYYY-MM-DD'.
start_date = '2023-01-01'
end_date = '2023-12-31'

In [None]:
# Get the data
# Runs the search for every subreddit/keyword pair through the Reddit API.
# The result holds one row per matching post; that post's comments are stored
# as a list of dicts in the 'Comments' column.
df = search_reddit(keywords, subreddits, start_date, end_date)

In [None]:
# Flatten the nested results: emit one row per comment, each carrying the
# fields of its parent post. A post without comments still contributes a
# single row that holds only the post fields.
post_fields = ['Keyword', 'Subreddit', 'Title', 'Text',
               'Created_UTC', 'Score', 'Num_Comments', 'URL']
expanded_rows = []

for _, row in df.iterrows():
    post_info = {field: row[field] for field in post_fields}
    post_comments = row['Comments']

    if not post_comments:
        # No comments: keep the post itself so it is not lost.
        expanded_rows.append(post_info)
        continue

    # Merge the post fields with each comment's fields into a fresh dict.
    expanded_rows.extend({**post_info, **comment} for comment in post_comments)

In [None]:
# Create a new DataFrame from the expanded rows
# Each dict in expanded_rows becomes one row. Dicts that lack the comment keys
# (posts without comments) get missing values (NaN) in the comment columns.
expanded_df = pd.DataFrame(expanded_rows)

In [3]:
# Write code to inspect the scraped data in the DataFrame



In [None]:
# Count the comments collected for each post.
# expanded_df holds one row per comment plus one placeholder row (with missing
# comment fields) for every post without comments. Counting rows with .size()
# would report those posts as having 1 comment, so count the non-missing
# Comment_Text values instead.
post_columns = ['Keyword', 'Subreddit', 'Title', 'Text',
                'Created_UTC', 'Score', 'Num_Comments', 'URL']
posts_with_comment_counts = (
    expanded_df
    # reindex guarantees Comment_Text exists even if no comment was collected
    .reindex(columns=post_columns + ['Comment_Text'])
    .groupby(post_columns)['Comment_Text']
    .count()
    .reset_index(name='Comment_Count')
)
# Display the result
posts_with_comment_counts

# Visualizing the Scraped Data

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# Count the collected comments per subreddit, largest first.
# Rows with a missing Comment_Text are placeholders for posts without
# comments, so count non-missing comment text rather than rows (.size() would
# count each comment-less post as one comment).
subreddit_comment_counts = (
    expanded_df
    # reindex guarantees Comment_Text exists even if no comment was collected
    .reindex(columns=['Subreddit', 'Comment_Text'])
    .groupby('Subreddit')['Comment_Text']
    .count()
    .reset_index(name='Comment_Count')
    .sort_values(by='Comment_Count', ascending=False)
)

# Set up the plot on an explicit figure/axes pair
fig, ax = plt.subplots(figsize=(12, 8))
sns.barplot(data=subreddit_comment_counts, x='Subreddit', y='Comment_Count', palette='viridis', ax=ax)

# Customize the plot
ax.set_title('Number of Comments per Subreddit')
ax.set_xlabel('Subreddit')
ax.set_ylabel('Number of Comments')
# Rotate the subreddit names so long labels do not overlap
plt.setp(ax.get_xticklabels(), rotation=45, ha='right')

# Show the plot
fig.tight_layout()
plt.show()

In [None]:
# Count the collected comments per keyword, largest first.
# Rows with a missing Comment_Text are placeholders for posts without
# comments, so count non-missing comment text rather than rows (.size() would
# count each comment-less post as one comment).
keyword_comment_counts = (
    expanded_df
    # reindex guarantees Comment_Text exists even if no comment was collected
    .reindex(columns=['Keyword', 'Comment_Text'])
    .groupby('Keyword')['Comment_Text']
    .count()
    .reset_index(name='Comment_Count')
    .sort_values(by='Comment_Count', ascending=False)
    .reset_index(drop=True)
)

# Set up the plot on an explicit figure/axes pair
fig, ax = plt.subplots(figsize=(12, 8))
plot = sns.barplot(data=keyword_comment_counts, x='Keyword', y='Comment_Count', palette='viridis', ax=ax)

# Add numbers on top of each bar.
# Bars are drawn at x = 0, 1, 2, ... in row order, so the label position must
# be the row's position after sorting; the pre-sort index labels would put the
# numbers above the wrong bars.
for position, count in enumerate(keyword_comment_counts['Comment_Count']):
    plot.text(position, count, str(count), color='black', ha="center", va="bottom")

# Customize the plot
ax.set_title('Number of Comments per Keyword')
ax.set_xlabel('Keyword')
ax.set_ylabel('Number of Comments')
plt.setp(ax.get_xticklabels(), rotation=45, ha='right')

# Show the plot
fig.tight_layout()
plt.show()

#### ***Note:*** Before saving the data to a CSV file, remove the rows for Reddit posts that have no comments.

# Save the Data to CSV File

In [None]:
# Save to CSV
# index=False keeps the DataFrame's row index out of the file.
# NOTE(review): expanded_df is written as-is, so the rows for posts without
# comments are still included here — the note above asks for them to be
# removed before saving; confirm whether that filtering should happen first.
expanded_df.to_csv('FA_reddit_posts_and_comments_Jun2.csv', index=False)