# Reddit Scraper

In [1]:
import os
import time
import csv
import praw
import json
import numpy as np
import pandas as pd

In [2]:
# Read the Reddit API credentials from the local JSON file into `auth`.
# The file is expected to sit next to the notebook and is parsed as one JSON document.
with open('authentication.json', mode='r') as openfile:
    auth = json.loads(openfile.read())

In [3]:
# Build the PRAW client from the credential entries loaded into `auth`.
# The keys below are passed to praw.Reddit as keyword arguments of the same name.
credential_keys = ("client_id", "client_secret", "user_agent", "username", "password")
reddit_obj = praw.Reddit(**{key: auth[key] for key in credential_keys})

# Print the account the client is logged in as, as a quick authentication check.
print('Authenticated as /u/{}'.format(reddit_obj.user.me()))

Authenticated as /u/cam_man_can


### Extract posts, save them to .csv file

In [4]:
# Output folder (note the trailing slash: paths are built by plain concatenation).
directory = "reddit_data/"
# Subreddit to scrape, without the "r/" prefix.
subreddit_name = 'NeutralPolitics'
# CSV that receives one row per scraped post.
posts_file_path = ''.join([directory, subreddit_name, '_posts.csv'])

In [5]:
def scrape_posts(reddit, subreddit, posts_file, limit=1000, time_filter='all'):
    """
    Scrapes "top" posts from the specified subreddit and appends one csv row per post.

    Args:
        reddit: authenticated praw.Reddit instance.
        subreddit: name of the subreddit to scrape (without the "r/" prefix).
        posts_file: path of the csv file to append to. Its parent directory is
            created if it does not exist yet; a bare file name (no directory
            part) is written to the current working directory.
        limit: maximum number of posts to request. Defaults to 1000 because the
            Reddit API usually limits listings to 1000 posts.
        time_filter: date range of the "top" listing, passed through to PRAW.
            Defaults to "all" (all time).

    Each row holds, in this order:
        title, author, created_utc, score, domain, repr(selftext), id,
        upvote_ratio, num_comments

    The file is opened in append mode, so calling this twice with the same
    posts_file adds the posts a second time.

    Submission attributes: https://praw.readthedocs.io/en/latest/code_overview/models/submission.html
    """

    # os.path.dirname() returns '' for a bare file name and os.makedirs('')
    # raises, so only create a directory when the path actually has one.
    parent_dir = os.path.dirname(posts_file)
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)

    # newline='' hands line-ending control to the csv module; without it every
    # row is followed by a blank line on Windows.
    with open(posts_file, "a", newline='', encoding="utf-8") as outfile:
        # One writer for the whole run instead of a new one per post.
        writer = csv.writer(outfile)
        listing = reddit.subreddit(subreddit).top(time_filter=time_filter, limit=limit)
        for submission in listing:
            print("\r \n", submission.title, end='')
            writer.writerow([
                submission.title,
                submission.author,
                submission.created_utc,
                submission.score,
                submission.domain,
                # repr() keeps multi-line self text on a single csv line
                "%r" % submission.selftext,
                submission.id,
                submission.upvote_ratio,
                submission.num_comments
            ])

In [None]:
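# Uncomment to scrape the posts. Note that scrape_posts opens the csv in append
# mode, so re-running it against an existing file adds the same posts again.
# scrape_posts(reddit_obj, subreddit_name, posts_file_path)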

### Extract comments from scraped posts, save to another .csv file

In [7]:
# CSV that receives one row per scraped comment (same folder and prefix as the posts file).
comments_file_path = ''.join([directory, subreddit_name, '_comments.csv'])

In [8]:
def get_comments(reddit, posts_file, comments_file, skip_ids=("6am00f",), delay=1):
    """
    Takes csv file containing post information, and scrapes the comments for each post.

    The amount of comments to scrape is controlled by the parameters of
    submission.comments.replace_more() (currently limit=None).

    Args:
        reddit: authenticated praw.Reddit instance.
        posts_file: csv file to read; the post id is taken from column index 6
            of every non-empty row (the layout written by scrape_posts).
        comments_file: csv file to append to. Its parent directory is created
            if it does not exist yet.
        skip_ids: post ids whose comments are not fetched. The default keeps
            the exclusion of post "6am00f" that used to be hard-coded here
            (the reason for it is not recorded in this notebook).
        delay: seconds to sleep before fetching each post's comments; 0 or
            None disables the pause.

    Each output row holds, in this order:
        repr(body), author, score, created_utc, post_id
    Comments whose body is "[deleted]" or "[removed]" are not written.

    Comment scraping tutorial: https://praw.readthedocs.io/en/latest/tutorials/comments.html
    Comment attributes: https://praw.readthedocs.io/en/latest/code_overview/models/comment.html#praw.models.Comment
    """

    # Make sure the OUTPUT directory exists (the input file has to exist
    # already). dirname is '' for a bare file name, which makedirs rejects.
    parent_dir = os.path.dirname(comments_file)
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)

    # Accept a single id as well as any iterable of ids.
    if isinstance(skip_ids, str):
        skip_ids = (skip_ids,)
    skipped = set(skip_ids or ())

    post_id_column = 6  # position of submission.id in the posts csv

    print("Fetching comments ...")
    # newline='' hands line-ending control to the csv module on both sides.
    # The output file is opened once instead of once per comment.
    with open(posts_file, "r", newline='', encoding="utf-8") as infile, \
            open(comments_file, "a", newline='', encoding="utf-8") as outfile:
        reader = csv.reader(infile)
        writer = csv.writer(outfile)
        row_counter = 0

        for row in reader:
            # Blank lines come back from csv.reader as empty lists.
            if not row:
                continue

            post_id = str(row[post_id_column])
            row_counter += 1  # counts skipped posts too, like the progress output expects

            if post_id in skipped:
                continue

            submission = reddit.submission(id=post_id)
            if delay:
                time.sleep(delay)
            print("\r post count:%s" % (row_counter), end='')
            submission.comments.replace_more(limit=None)

            for comment in submission.comments.list():
                if isinstance(comment, praw.models.MoreComments):
                    continue

                comment_str = comment.body
                if comment_str in ("[deleted]", "[removed]"):
                    continue

                writer.writerow([
                    # repr() keeps multi-line comments on a single csv line
                    "%r" % comment_str,
                    comment.author,
                    comment.score,
                    comment.created_utc,
                    post_id,
                ])

            # Push each post's comments to disk so an interrupted run keeps
            # everything fetched so far.
            outfile.flush()

In [None]:
# Fetch the comments of every post listed in the posts csv and append them to the comments csv.
get_comments(
    reddit=reddit_obj,
    posts_file=posts_file_path,
    comments_file=comments_file_path,
)

Fetching comments ...
 post count:474