## Scraping Reddit for mentions of ChatGPT in Posts

### Requirements for the scraping

In [4]:
#Importing the required libraries
import praw
from praw.models import MoreComments
import datetime as dt
import pandas as pd
import numpy as np
from tqdm import tqdm
import requests
from bs4 import BeautifulSoup
from PIL import Image
import easyocr
import io

In [5]:
#Getting authorized by filling in personal reddit account details
reddit_authorized = praw.Reddit(client_id="#############",
                                client_secret="############",
                                user_agent="Getting_Scraped_By_############",
                                username="#############",
                                password="#############")

### Functions for scraping

In [6]:
#Getting the posts from the subreddit(s)
def get_subreddit_data(subreddits:praw.models.Subreddits):
    """
    Assign a PRAW subreddit object for subreddits:praw.models.Subreddits
    Manipulates the information about the subreddits to obtain what we want
    Output: A Pandas DataFrame containing information on the subreddit's posts
    """
    #Placing the data in a list
    data_subreddits:list = []
    for submission in subreddits:
        data_subreddits.append([submission.id,
                     submission.subreddit.display_name,
                     submission.title,
                     submission.selftext,
                     submission.author,
                     submission.score,
                     submission.num_comments,
                     submission.shortlink,
                     dt.datetime.fromtimestamp(submission.created_utc)
                     ])

    #Creating a dataframe with the obtained information
    df_posts:pd.DataFrame = pd.DataFrame(data_subreddits, columns=['ID of Post',
                                                     'Subreddit',
                                                     'Title Post',
                                                     'Post Text',
                                                     'Author',
                                                     'Score', #upvotes - downvotes
                                                     'Number of Comments',
                                                     'URL of Post',
                                                     'Date & Time'
                                                     ])
    return df_posts

In [7]:
#Getting all the comments per post
def get_subreddit_comments(reddit_authorized, df_posts):
    """
    Takes in a PRAW authorized instance and a Pandas DataFrame of post IDs,
    returns a Pandas DataFrame containing information on the subreddit's comments.
    """
    data_comments:list = []
    post_id:str = ''
    for post_id in tqdm(df_posts["ID of Post"], total=len(df_posts), desc="Getting Comments"):
        post = reddit_authorized.submission(id=post_id)
        post.comments.replace_more(limit=None, threshold=0)
        for comment in post.comments.list():
            if type(comment) == MoreComments:
                continue
            if "AutoModerator" in str(comment.author):
                continue
            data_comments.append([post.subreddit.display_name,
                                  post.id,
                                  comment.body,
                                  comment.score,
                                  comment.author,
                                  dt.datetime.fromtimestamp(comment.created)])
    
    # Creating the dataframe
    df_comments:pd.DataFrame = pd.DataFrame(data_comments, columns=[
                                'Subreddit',
                                'ID of Post',
                                'Comment Text',
                                'Score',
                                'Author',
                                'Date & Time'])
    return df_comments

In [8]:
#Getting text from the images and articles from the URL's in the posts
def get_text_from_URL(df_submissions:pd.DataFrame, reddit_authorized):
    """
    Assign a PRAW authorized instance for reddit_authorized
    Assign a Pandas DataFrame for df_submissions
    Creates a new column in the original dataframe with the text from the image or article
    Output: A Pandas DataFrame containing more information on the subreddit's posts
    """
    new_columns:list = []
    reader = easyocr.Reader(['en'])
    #Loop through each row in the dataset
    for index, row in tqdm(df_submissions.iterrows(), total=len(df_submissions), desc="Processing URL of Post"):
        post_url = row['URL of Post']
        url_submission = reddit_authorized.submission(url=post_url)
        #Check if the link in the post goes to an image
        if url_submission.url.endswith(('jpg', 'jpeg', 'png', 'gif')):
            response = requests.get(url_submission.url)
            img = Image.open(io.BytesIO(response.content))
            # image_text = pytesseract.image_to_string(img)
            # new_columns.append(image_text)
            img_np = np.array(img)
            results = reader.readtext(img_np)
            image_text = ' '.join([result[1] for result in results])
            new_columns.append(image_text)
        #Check if the link in the post goes to a news article
        elif url_submission.url.startswith(('http://', 'https://')):
            response = requests.get(url_submission.url)
            soup = BeautifulSoup(response.text, 'html.parser')
            article_text = ''
            for paragraph in soup.find_all('p'):
                article_text += paragraph.text
            new_columns.append(article_text)
        #If the link does not go to an image or article, add an empty string
        else:
            new_columns.append('')
    #Add the new column to the original dataframe
    df_submissions['Text of URL Post'] = new_columns
    #Return a new df with the same columns as the original, plus the new column
    return df_submissions.copy()

### Selecting subreddits to scrape

In [9]:
#Searching through all the subreddits
subs_mention_chatgpt = reddit_authorized.subreddit('all').search('ChatGPT', time_filter='year', limit=None)
#Removing the posts from the ChatGPT subreddit
subs_mention_chatgpt = [submission for submission in subs_mention_chatgpt if submission.subreddit.display_name.lower() != 'chatgpt']
#Only saving the posts from specific time period
subs_mention_chatgpt = [submission for submission in subs_mention_chatgpt if
                              dt.datetime.fromtimestamp(submission.created_utc) > dt.datetime(year=2022, month=11, day=29) and
                              dt.datetime.fromtimestamp(submission.created_utc) < dt.datetime(year=2023, month=4, day=21)]

In [10]:
#Selecting the subreddit ChatGPT
sub_chatgpt = reddit_authorized.subreddit('ChatGPT')
sub_chatgpt = sub_chatgpt.top(time_filter='year', limit=None)
#Only saving the posts from specific time period
sub_chatgpt = [submission for submission in sub_chatgpt if
                              dt.datetime.fromtimestamp(submission.created_utc) > dt.datetime(year=2022, month=11, day=29) and
                              dt.datetime.fromtimestamp(submission.created_utc) < dt.datetime(year=2023, month=4, day=21)]

The reason for splitting the mentions of ChatGPT in subreddits and the ChatGPT subreddit is because of the large amount of posts in the ChatGPT subreddit. By using this method, a maximum of a 1000 posts can be scraped. This means the top 1000 posts (based on score) of the given time period. Because we are splitting the datasets, we can get more information on the conversation.

### Performing the scraping and saving the information

In [11]:
#Calling the functions
df_subs_ment_chatgpt = get_subreddit_data(subs_mention_chatgpt)
df_sub_chatgpt = get_subreddit_data(sub_chatgpt)

df_subs_ment_chatgpt = get_text_from_URL(df_subs_ment_chatgpt, reddit_authorized)
df_sub_chatgpt = get_text_from_URL(df_sub_chatgpt, reddit_authorized)

df_subs_comments = get_subreddit_comments(reddit_authorized, df_subs_ment_chatgpt)
df_chatgpt_comments = get_subreddit_comments(reddit_authorized, df_sub_chatgpt)

#Saving the dataframes to csv files
df_subs_ment_chatgpt.to_csv('F1sub_subreddits.csv', index=False)
df_sub_chatgpt.to_csv('F1sub_chatgpt.csv', index=False)
df_subs_comments.to_csv('F1comments_subreddits.csv', index=False)
df_chatgpt_comments.to_csv('F1comments_chatgpt.csv', index=False)

CUDA not available - defaulting to CPU. Note: This module is much faster with a GPU.
Downloading detection model, please wait. This may take several minutes depending upon your network connection.


Progress: |██████████████████████████████████████████████████| 100.0% Complete

Downloading recognition model, please wait. This may take several minutes depending upon your network connection.


Progress: |██████████████████████████████████████████████████| 100.0% Complete

Processing URL of Post: 100%|██████████| 93/93 [08:47<00:00,  5.67s/it]
CUDA not available - defaulting to CPU. Note: This module is much faster with a GPU.
Processing URL of Post:  41%|████      | 362/892 [1:15:58<1:51:14, 12.59s/it]


UnboundLocalError: local variable 'img' referenced before assignment