# Reddit Scraper

In [1]:
import os
import time
import csv
import praw
import json
import numpy as np
import pandas as pd

In [2]:
# Load the Reddit API credentials (kept out of the notebook) from a local JSON file
with open('authentication.json') as auth_file:
    auth = json.load(auth_file)

In [3]:
# Build the authenticated client from the credential fields stored in `auth`
credential_keys = ("client_id", "client_secret", "user_agent", "username", "password")
reddit_obj = praw.Reddit(**{key: auth[key] for key in credential_keys})

# Ask the API which account we are logged in as, to confirm authentication worked
current_user = reddit_obj.user.me()
print('Authenticated as /u/{}'.format(current_user))

Authenticated as /u/cam_man_can


### Extract posts, save them to .csv file

In [4]:
# Target subreddit and output location for the scraped posts
subreddit_name = 'Conservative'
directory = "reddit_data/"
posts_file_path = "".join((directory, subreddit_name, '_posts_new.csv'))

In [5]:
def scrape_posts(reddit, subreddit, posts_file, limit=1000):
    """
    Scrapes posts from specified subreddit, logs post information into csv file.

    Currently configured to scrape "top" posts, with date range set to "all" (all time).

    One row is appended per submission, with the columns in this order:
    title, author, created_utc, score, domain, repr(selftext), id,
    upvote_ratio, num_comments. No header row is written. The file is opened
    in append mode, so calling this again adds rows to whatever is already
    in the file instead of replacing it.

    Parameters
    ----------
    reddit : praw.Reddit
        Authenticated Reddit instance; only ``reddit.subreddit(name).top(...)`` is used.
    subreddit : str
        Name of the subreddit to scrape.
    posts_file : str
        Path of the csv file to append to. Missing parent directories are created.
    limit : int, optional
        Maximum number of posts to request. Defaults to 1000, since the
        Reddit API usually limits to 1000 posts.

    Submission attributes: https://praw.readthedocs.io/en/latest/code_overview/models/submission.html
    """

    parent_dir = os.path.dirname(posts_file)
    # dirname is '' for a bare file name, and os.makedirs('') raises, so only
    # create the directory when the path actually has one.
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)

    # newline="" lets the csv module control line endings itself; without it
    # every row is followed by a blank line on Windows.
    with open(posts_file, "a", encoding="utf-8", newline="") as outfile:
        writer = csv.writer(outfile)  # one writer for the whole run
        for submission in reddit.subreddit(subreddit).top('all', limit=limit):
            print("\r \n", submission.title, end='')
            writer.writerow([
                submission.title,
                submission.author,
                submission.created_utc,
                submission.score,
                submission.domain,
                # repr() keeps multi-line post bodies on a single csv line
                "%r" % submission.selftext,
                submission.id,
                submission.upvote_ratio,
                submission.num_comments
            ])

In [6]:
# Run the post scrape; rows are appended to the csv at posts_file_path
scrape_posts(
    reddit=reddit_obj,
    subreddit=subreddit_name,
    posts_file=posts_file_path,
)

 
 AP Has called the election for Joe Biden
 China is paying reddit to take down pictures of this. I entrust this to you
 Trump declaring the election over
 Terry Crews articulates everything wrong with “woke culture” & doesn’t back down
 Vote for who you want. Just don’t push the hate
 Newly Forged Common Ground
 saw this on r/memes and had to share lol
 Biden takes lead in Georgia
 Genuinely, please help me understand
 Trump Administration To Start Transition Process For Biden
 Trump calls for delay to 2020 US election
 Never Forget
 Still Counting...
 We need NV
 So let me get this straight...
 RESUBMITTED: Behold: reddit after they received $300 million from China's company, Tencent
 NY Times: Trump paid $750 in US income taxes in 2016, 2017
 Justice Ruth Bader Ginsburg, Champion Of Gender Equality, Dies At 87
 Bloomberg pays fines for 32,000 felons in Florida so they can vote
 Biden takes in lead in Pennsylvania
 Sen. Hawley Introduces Bill To Fine American Companies Relying On Ch

 Trump campaign says Sidney Powell not a member of legal team
 An Unexpected Coalition, A Formidable Opponent
 What the SJW really does
 69 Million Trump Supporters Take To Streets To Drive To Work And Go About Their Lives As Normal
 It’s been a pleasure, fellas
 'We Must Cancel Thanksgiving,' Says CDC Scientist Who Looks Suspiciously Like A Turkey In A Lab Coat
 Just gonna drop this here...
 A liberal echo chamber
 Dan Sullivan (R) wins reelection in Alaska, giving Republicans 50 seats in Senate
 Nick Cannon calls white and Jewish people “the true savages,” claims they are “closer to animals”
 Terry Crews addresses Nick Cannon supporters: 'I was never afraid of the KKK ... it was people like you'
 Petition to have racist Cambridge University Professor fired for tweeting "Abolish Whiteness" and "White Lives Don't Matter"
 Steven Crowder Surpasses ‘The Young Turks’ As Largest Online News Channel | The Daily Wire
 ....but that's none of my business....
 This shit
 After Wiping Out Half T

 Father of 19-year-old killed in CHOP says he got condolence call from President Trump, but not Seattle&amp;#x27;s mayor 
 Tuesday: Chris Cuomo mocks Republicans for claiming that crime is rising in American cities claiming that they are just trying to scare people into voting for Trump. Wednesday: Chris Cuomo claims crime is rising and criticizes Trump for inaction. 
 Recently Pardoned Turkey Found Dead After Claiming To Have Dirt On Hillary Clinton 
 Florida Recount Finally Wraps Up, Al Gore Declared President 
 Los Angeles: Instant Karma for the antifa/blm supporter smoking his tires at Trump supporters! 
 The only asian left 
 ironic 
 The New York Times is like a self-parody that keeps delivering 
 Democrats Reminding People That They're Black 
 Trump Campaign Makes WiFi Password ‘WhoBuiltTheCagesJoe?’ For Press At Latest Rally 
 **NOT SATIRE** Philip Anderson, the black man who recently got his teeth punched out by Antifa domestic terrorists for holding a 'free speech a

 McConnell: We’ll ‘Have No Lectures’ From Those Who Spent 4 Years ‘Refusing To Accept’ 2016 Election | The Daily Wire
 Charles Barkley is Not Happy About Anti-White, Anti-Semitic Comments From Black Celebrities -- "Listen, DeSean Jackson, Stephen Jackson, Nick Canon, Ice Cube - Man, what the hell are y'all doing?"
 It is not enough
 Black Man Erupts At BLM For Blocking Road: ‘I Got To Go To Work. I Got Bills. I Got Kids. Get The F*** Out My Way.’
 Far too few people understand this principle.
 WHO Screw-up
 UFC fighter calls out Lebron James virtue signaling
 VOTE! So monsters like this don't win!
 History always repeats itself
 I feel it coming
 Petition to "Recognize Thomas Sowell with the Presidential Medal of Freedom"
 Portland Killer Michael Reinoehl shot dead by police - Reinoehl fired off 40 to 50 rounds before being killed by members of U.S. Marshals task force
 Biden supporters flood streets over reported Biden victory — but media called Trump rallies 'super spreader' events
 

 BREAKING: Over 100 Police Agencies Pull Out Of Agreements To Guard DNC Convention
 Vaccine
 James wood locked by twitter for naming police officers killed in the line of duty in 2020
 Bruh
 Based woods as always
 Well, it's day 117
 Poll: 81% of Black Americans want police to maintain or increase local presence
 President Trump Quietly Signs Largest Wilderness Preservation Bill in a Decade
 State With No Electricity Orders Everyone To Drive Cars That Run On Electricity
 Asian kids officially kicked out of "students of color" category by a WA school district.
 The fact that someone took the time to tweet this is actually pretty scary. This is what voter suppression actually looks like.
 WHO BUILT THE CAGES JOE
 Elon Musk says he plans to move Tesla out of California and sue county after coronavirus restrictions
 I’m a liberal and I finally quit r/Politics
 FaCt ChEcKeRs
 White House Petition to ‘Impeach Nancy Pelosi for Treason’ Gains 404,000 Signatures
 Democrats Have High Standards
 

 Biden: ‘I’ll End Trump’s Chaos.’ Crenshaw: ‘Businesses Weren’t Boarding Up Their Windows Because They Were Afraid Of Trump Supporters’ | The Daily Wire 
 What could go wrong guys? Surely toxic yelpers won’t abuse this feature at all /s 
 Poll: 70% of Republicans Say 2020 Election Was Neither Free, Nor Fair 
 double standards... 
 Stochastic terrorism. They have already labeled 70 million Americans as deserving of violence against us. Take note and make sure you protect yourselves. 
 The irony over the CNN outrage is inspiring. 
 Reason number infinity to not take the MSM seriously... 
 Republican Young Kim takes down Democrat in California House rematch 
 Airport Revenues Soar After Allowing Travelers To Pay To Turn Off CNN 
 Kavanaugh boomerang: Harris said she's 'proud of' Jacob Blake, man accused of felony sexual assault 
 Voting Wrong Turned Me White 
 Remember: Soleimani Wasn't Just Responsible For US Deaths 
 Are you tired of winning yet? 
 I wonder? 
 Bernie's sub

### Extract comments from scraped posts, save to another .csv file

In [7]:
# Comments for the scraped posts go to a sibling csv in the same directory
comments_file_path = "".join((directory, subreddit_name, '_comments_new.csv'))

In [8]:
def get_comments(reddit, posts_file, comments_file, skip_ids=("6am00f",), request_delay=1):
    """
    Takes csv file containing post information, and scrapes the comments for each post.

    The post id is read from the 7th column (index 6) of every non-empty row
    of ``posts_file``. For each post, one row per comment is appended to
    ``comments_file`` with the columns: repr(body), author, score,
    created_utc, post_id. Comments whose body is "[deleted]" or "[removed]"
    are not written. No header row is written, and the output file is opened
    in append mode, so repeated runs add to existing content.

    The amount of comments to scrape is controlled by the parameters of submission.comments.replace_more().

    Parameters
    ----------
    reddit : praw.Reddit
        Authenticated Reddit instance; only ``reddit.submission(id=...)`` is used.
    posts_file : str
        Path of the csv file of posts to read.
    comments_file : str
        Path of the csv file to append comments to. Missing parent
        directories are created.
    skip_ids : collection of str, optional
        Post ids to skip without fetching. The default keeps the single id
        that was previously hard-coded here (reason not recorded in the notebook).
    request_delay : int or float, optional
        Seconds to sleep after requesting each submission (default 1).
        Pass 0 to disable the pause.

    Comment scraping tutorial: https://praw.readthedocs.io/en/latest/tutorials/comments.html
    Comment attributes: https://praw.readthedocs.io/en/latest/code_overview/models/comment.html#praw.models.Comment
    """

    # Create the directory of the OUTPUT file (the input must already exist).
    # dirname is '' for a bare file name, and os.makedirs('') raises.
    comments_dir = os.path.dirname(comments_file)
    if comments_dir:
        os.makedirs(comments_dir, exist_ok=True)

    print("Fetching comments ...")
    # newline="" is what the csv module expects for both reading and writing.
    with open(posts_file, "r", encoding="utf-8", newline="") as infile:
        # Open the output once instead of once per comment.
        with open(comments_file, "a", encoding="utf-8", newline="") as outfile:
            writer = csv.writer(outfile)
            row_counter = 0

            for row in csv.reader(infile):

                # Blank lines in the posts file show up as empty rows.
                if not row:
                    continue

                post_id = str(row[6])
                row_counter += 1  # counts skipped posts too

                if post_id in skip_ids:
                    continue

                submission = reddit.submission(id=post_id)
                if request_delay:
                    time.sleep(request_delay)
                print("\r post count:%s" % (row_counter), end='')
                submission.comments.replace_more(limit=30, threshold=10)

                for comment in submission.comments.list():
                    # Placeholders left unexpanded by replace_more carry no comment data.
                    if isinstance(comment, praw.models.MoreComments):
                        continue

                    comment_str = comment.body
                    if comment_str == "[deleted]" or comment_str == "[removed]":
                        continue

                    writer.writerow([
                        # repr() keeps multi-line comments on a single csv line
                        "%r" % comment_str,
                        comment.author,
                        comment.score,
                        comment.created_utc,
                        post_id,
                    ])

                # Persist each post's comments as soon as they are collected so
                # an interrupted run keeps everything scraped so far.
                outfile.flush()

In [9]:
# Fetch the comments for every post listed in the posts csv
get_comments(
    reddit=reddit_obj,
    posts_file=posts_file_path,
    comments_file=comments_file_path,
)

Fetching comments ...
 post count:995