## Reddit Scraper

### Requirements

In [None]:
# %pip install praw
# %pip install psaw

In [None]:
import json
import praw
from psaw import PushshiftAPI
from prawcore.exceptions import Forbidden
import pandas as pd
import datetime as dt
import calendar
import requests
import time 
import re 

### Set Search Parameters

In [None]:
# Search targets: every subreddit and every keyword listed here is scraped
# separately by main().
search_dict = dict(
    subreddits=['SOME_SUBREDDIT'],  # Enter the subreddit(s) you want to scrape
    keywords=['SOME_KEYWORD'],  # Enter the keyword(s) you want to search Reddit for
)

# Periods to scrape, newest first.
years = list(range(2022, 2014, -1))  # 2022 down to 2015; enter the years you want to scrape
months = list(range(12, 0, -1))  # December down to January

# Comma-separated submission fields requested from Pushshift; choose the data you need.
submission_fields = ','.join([
    'id',
    'score',
    'full_link',
    'subreddit',
    'title',
    'selftext',
    'created_utc',
    'author',
    'num_comments',
])

### Set your PRAW API credentials
##### See this article for how to set them up:
https://towardsdatascience.com/scraping-reddit-data-1c0af3040768

In [None]:
# Load Reddit authentication for PRAW.
# Replace the three placeholder strings below with the credentials of your own
# Reddit application before running the scraper; avoid committing real
# credentials to version control.
# The resulting module-level `reddit` client is what the scraping functions
# below use to look up submissions and their comments.
reddit = praw.Reddit(
    client_id="your_client_id",           # client id
    client_secret="your_client_secret",   # client secret
    user_agent="your_user_agent"          # user agent
)

### Scraping

#### Exporting scraped data to csv

In [None]:
def export_to_csv(comment_or_post, word, comments, year, month):
    """Save a scraped DataFrame as a CSV file in the current working directory.

    The file is named
    ``scraped_reddit_<comment_or_post>_for_<word>_in_<month>_<year>.csv``.

    Args:
        comment_or_post: Label describing the content (the callers in this
            file pass ``'post'`` or ``'comments'``).
        word: Keyword or subreddit name the data was scraped for.
        comments: DataFrame to write (despite the name, also used for posts).
        year: Year of the scraped period.
        month: Month of the scraped period.
    """
    stem = f'scraped_reddit_{comment_or_post}_for_{word}_in_{month}_{year}'
    csv_path = stem + '.csv'

    # Header row included, row index left out.
    comments.to_csv(csv_path, index=False, header=True)

####  Timestamps for pushshift API

In [None]:
def before_after_timestamps(year, month):
    """Return the ``(before, after)`` epoch-second bounds for one calendar month.

    ``after`` is the first second of the month. ``before`` is the last second
    of the month, or the current time when ``year``/``month`` is the month we
    are currently in (there is nothing to fetch from the future).

    The month boundaries are built from naive datetimes, so they are
    interpreted in the local timezone of the machine running the scraper.

    Args:
        year: Four-digit year.
        month: Month number, 1-12.

    Returns:
        Tuple ``(before, after)`` of integer Unix timestamps.
    """
    now = dt.datetime.now()

    # The start of the window always uses the requested year. The previous
    # version hard-coded 2022 here for the current month, which produced a
    # wrong window in any other year.
    after = int(dt.datetime(year, month, 1, 0, 0, 0).timestamp())

    if year == now.year and month == now.month:
        before = int(time.time())
    else:
        last_day = calendar.monthrange(year, month)[1]
        before = int(dt.datetime(year, month, last_day, 23, 59, 59).timestamp())

    return before, after


#### Text formatting of scraped texts

In [None]:
def clean_text(text):
    """Format scraped Reddit text for readability.

    Strips surrounding whitespace, collapses runs of newlines into a single
    newline, and replaces a handful of HTML entities with the characters they
    stand for (the zero-width-space entity is removed entirely).

    Args:
        text: Raw text of a post title, post body, or comment.

    Returns:
        The cleaned string.
    """
    # Order matters: '&amp;' is decoded first, so '&amp;lt;' ends up as '<'.
    entity_replacements = (
        ('&amp;', '&'),
        ('&lt;', '<'),
        ('&gt;', '>'),
        ('&#x200B;', ''),
        ('&nbsp;', ' '),
    )

    cleaned = re.sub('\n+', '\n', text.strip())
    for entity, plain in entity_replacements:
        cleaned = cleaned.replace(entity, plain)

    return cleaned

In [None]:
def clean_and_format_dataframe(df):
    """Tidy a comments DataFrame in place and return it.

    Expects the columns ``body``, ``created_utc`` (epoch seconds) and
    ``permalink``. The function:

    * runs every ``body`` through ``clean_text``,
    * converts ``created_utc`` from epoch seconds to datetimes,
    * adds a ``date`` column derived from ``created_utc``,
    * adds a ``link`` column with the absolute reddit.com URL.

    Args:
        df: DataFrame of scraped comments; it is modified in place.

    Returns:
        The same DataFrame object.
    """
    reddit_base_url = 'https://www.reddit.com'

    df['body'] = df['body'].apply(clean_text)

    created = pd.to_datetime(df['created_utc'], unit='s')
    df['created_utc'] = created
    df['date'] = created.apply(pd.Timestamp.to_pydatetime)

    # Permalinks are site-relative paths, so prefix them with the host.
    df['link'] = reddit_base_url + df['permalink']

    return df

#### Scraping the comments for each post

In [None]:
def scrape_comments_of_posts(posts):
    """Collect every comment of the given posts through the PRAW API.

    For each post the full comment tree is expanded (all "load more comments"
    stubs are resolved) and flattened, and one row per comment is recorded.

    Changes compared with the previous version:

    * Submissions that PRAW is not allowed to read (``Forbidden``) are
      skipped instead of aborting the whole run, matching
      ``get_data_from_praw``.
    * The progress counter is positional. The DataFrame index label may be
      non-contiguous (for example after ``drop_duplicates``).
    * The ``created_utc`` column is filled from ``comment.created_utc``.
    * The final summary reports comments, not posts.

    Args:
        posts: DataFrame of posts with at least an ``id`` column holding the
            Reddit submission ids.

    Returns:
        DataFrame with one row per comment, cleaned by
        ``clean_and_format_dataframe``. The ``score_id`` column duplicates
        ``score``; it is kept so the exported CSV layout does not change.
    """
    columns = [
        'parent_id',
        'comment_id',
        'score_id',
        'created_utc',
        'body',
        'score',
        'permalink',
        'is_submitter',
        'author',
    ]

    rows = []
    total_posts = len(posts)

    for position, (_, post) in enumerate(posts.iterrows(), start=1):

        print(f'Scraping comments for post {position} of {total_posts}', end='\x1b[1K\r')

        try:
            submission = reddit.submission(id=post.id)
            # limit=None resolves every MoreComments placeholder in the tree.
            submission.comments.replace_more(limit=None)
            comments = submission.comments.list()
        except Forbidden:
            # Private or quarantined submission: nothing we can fetch.
            continue

        for comment in comments:
            try:
                rows.append([
                    comment.parent_id,
                    comment.id,
                    comment.score,
                    comment.created_utc,
                    comment.body,
                    comment.score,
                    comment.permalink,
                    comment.is_submitter,
                    comment.author,
                ])
            except Exception as e:
                # Best effort: report the broken comment and keep going.
                print(e)
                continue

    df = pd.DataFrame(rows, columns=columns)
    df = clean_and_format_dataframe(df)

    print(f'total_comments_found {len(rows)}')

    return df

#### Correcting data of each post with Praw API

In [None]:
def get_data_from_praw(data):
    """Complete Pushshift post records with live data from the PRAW API.

    Each dict in ``data`` is updated in place with the current ``score``, a
    ``date`` derived from ``created_utc``, the top comment and its score
    (``"N/A"`` when they cannot be read), and cleaned ``title``/``selftext``.
    Posts that PRAW is not allowed to read (``Forbidden``) are left as they
    came from Pushshift.

    The DataFrame is built once after the loop. Previously it was rebuilt on
    every iteration, and an empty ``data`` list (or only forbidden posts)
    raised ``UnboundLocalError``.

    Args:
        data: List of post dicts from Pushshift; each needs ``id`` and
            ``created_utc``.

    Returns:
        De-duplicated DataFrame with a fixed column order. The
        ``post keywords`` column is kept for CSV compatibility, but this
        function never fills it.
    """
    column_order = [
        'full_link', 'subreddit', 'post keywords', 'id', 'date', 'score',
        'num_comments', 'author', 'title', 'selftext', 'top_comment',
        'comment_score',
    ]

    total = len(data)
    for count, d in enumerate(data, start=1):
        try:
            print(f"Progress: {count} / {total}", end='\x1b[1K\r')
            # get data from PRAW based on unique post ID from Pushshift
            submission = reddit.submission(id=d['id'])
            submission.comment_sort = 'top'

            d['score'] = submission.score
            d['date'] = dt.datetime.fromtimestamp(d['created_utc']).date()

            try:
                top = submission.comments[0]
                top_fields = {
                    'comment_score': top.score,
                    'top_comment': clean_text(top.body),
                }
            except Exception:
                # No comments, or the top comment is not a regular comment.
                top_fields = {'comment_score': "N/A", 'top_comment': "N/A"}
            d.update(top_fields)

            d['title'] = clean_text(d.get("title", "N/A"))
            d['selftext'] = clean_text(d.get("selftext", "N/A"))
        except Forbidden:
            continue

    return pd.DataFrame(data, columns=column_order).drop_duplicates()

#### Scraping Pushshift API for posts

In [None]:
def scrape_reddit_posts(keyword_or_subreddit, word, year, month):
    """Collect all posts for one keyword or subreddit in one month.

    Post ids and metadata are paged out of the Pushshift API, newest first,
    by moving the ``before`` cursor to the timestamp of the last post received
    until an empty page comes back. The records are then completed through
    ``get_data_from_praw``.

    Changes compared with the previous version:

    * An unknown ``keyword_or_subreddit`` raises ``ValueError`` immediately.
    * The duplicated request code of the first and later passes is merged.
    * A failed JSON decode pauses for a second before retrying, instead of
      hitting the API in a tight loop.
    * The JSON error counter is shown when errors occurred. It used to be
      shown only when shards were missing.

    Args:
        keyword_or_subreddit: ``'keywords'`` to do a text search for ``word``
            or ``'subreddits'`` to fetch everything posted in subreddit
            ``word``.
        word: The keyword or subreddit name.
        year: Year of the period to scrape.
        month: Month (1-12) of the period to scrape.

    Returns:
        DataFrame of posts as produced by ``get_data_from_praw``.

    Raises:
        ValueError: If ``keyword_or_subreddit`` is not one of the two
            supported values.
    """
    if keyword_or_subreddit == 'keywords':
        search = f"q={word}"
    elif keyword_or_subreddit == 'subreddits':
        search = f"subreddit={word}"
    else:
        raise ValueError(
            "keyword_or_subreddit must be 'keywords' or 'subreddits', "
            f"got {keyword_or_subreddit!r}"
        )

    before, after = before_after_timestamps(year, month)

    url = f"https://api.pushshift.io/reddit/search/submission/?{search}&fields={submission_fields}&after={after}&size=1000&sort=desc&metadata=true"

    # search Reddit submissions (posts) using Pushshift
    start_from = '&before=' + str(before)
    first_pass = True
    data = []

    total_posts_found = 0
    error_count = 0
    difference = 0

    print(f"Collecting Reddit data for {word} in {month}/{year}...\n")

    while True:
        try:
            posts = requests.get(url + start_from).json()
        except ValueError:
            # The response was not valid JSON; retry the same page after a
            # short pause.
            error_count += 1
            time.sleep(1)
            continue

        if first_pass:
            first_pass = False

            shards = posts['metadata']['shards']
            difference = shards["successful"] - shards["total"]
            total_posts_found = posts['metadata']['total_results']

            print(f"{total_posts_found} {word}-posts found")

            if abs(difference) > 0:
                print(f"Warning {abs(difference)} shards are missing.")

        if error_count > 0:
            print(f"JSONDecodeError count: {error_count}", end="\r")

        print(f"Progress: {len(data)} / {total_posts_found} Remaining: {posts['metadata']['total_results']}", end='\x1b[1K\r')

        data.extend(posts["data"])
        if not posts["data"]:
            break  # stop collecting data once there's nothing left to collect

        # Results are sorted newest first, so the last post is the oldest one
        # seen so far; continue from there.
        last_utc = data[-1]['created_utc']
        start_from = '&before=' + str(last_utc)

    if abs(difference) > 0:
        print(f"\r\nsuccessful data collection!\n{len(data)} of {total_posts_found} total collected. Missing posts due to missing shards")
    else:
        print(f"\r\nsuccessful data collection!\n{len(data)} of {total_posts_found} total collected.")

    # update and complete the post data with the PRAW API
    print(f"\nUpdating and completeing data of {word}-posts with PRAW API. Approx 30 min/1000 posts")
    df = get_data_from_praw(data)

    return df

#### Putting together functions from above

In [None]:
def scrape_reddit(keyword_or_subreddit, word, year, month):
    """Scrape posts and their comments for one word and month, exporting both.

    Two CSV files are written through ``export_to_csv``: the posts first, then
    the comments belonging to those posts.

    Args:
        keyword_or_subreddit: ``'keywords'`` or ``'subreddits'``; passed on to
            ``scrape_reddit_posts``.
        word: The keyword or subreddit name.
        year: Year of the period to scrape.
        month: Month of the period to scrape.
    """
    # Step 1: posts. They are exported before the comment scrape starts.
    post_frame = scrape_reddit_posts(keyword_or_subreddit, word, year, month)
    export_to_csv('post', word, post_frame, year, month)

    # Step 2: comments of exactly those posts.
    comment_frame = scrape_comments_of_posts(post_frame)
    export_to_csv('comments', word, comment_frame, year, month)

In [None]:
def main():
    """Run the scraper for every configured search type, year, month and word.

    Iterates over ``search_dict``, ``years`` and ``months``. Months of the
    current year that have not started yet are skipped. Prints ``Done`` when
    everything has been processed.
    """
    for key, words in search_dict.items():
        for year in years:
            for month in months:
                today = dt.datetime.now()
                lies_in_future = year == today.year and month > today.month
                if lies_in_future:
                    continue

                for word in words:
                    scrape_reddit(key, word, year, month)

    print('Done')

In [None]:
# Start the full scrape for everything configured in search_dict, years and
# months; each word/year/month combination is exported to its own CSV files.
main()

Collecting Reddit data for whatcarshouldIbuy in 5/2022...

3729 whatcarshouldIbuy-posts found
Progress: 3729 / 3729 Remaining: 0[1KKKK
successful data collection!
3729 of 3729 total collected.

Updating and completeing data of whatcarshouldIbuy-posts with PRAW API. Approx 30 min/1000 posts
total_posts_found 36689ost 3729 of 3729[1K
Collecting Reddit data for whatcarshouldIbuy in 4/2022...

3417 whatcarshouldIbuy-posts found
Progress: 3415 / 3417 Remaining: 0[1KKKK
successful data collection!
3415 of 3417 total collected.

Updating and completeing data of whatcarshouldIbuy-posts with PRAW API. Approx 30 min/1000 posts
total_posts_found 34700ost 3415 of 3415[1K
Collecting Reddit data for whatcarshouldIbuy in 3/2022...

3135 whatcarshouldIbuy-posts found
Progress: 3134 / 3135 Remaining: 0[1KKKK
successful data collection!
3134 of 3135 total collected.

Updating and completeing data of whatcarshouldIbuy-posts with PRAW API. Approx 30 min/1000 posts
total_posts_found 30761ost 3134 of 3