# Webscraping Reddit

## Scraping rules
- Check a site's terms and conditions before you scrape it. It's the site's data, and there are likely rules governing how it may be used.
- Be nice - A computer will send web requests much quicker than a user can. Make sure you space out your requests a bit so that you don't hammer the site's server.
- Scrapers break - Sites change their layout all the time. If that happens, be prepared to rewrite your code.
- Web pages are inconsistent - There's sometimes some manual clean up that has to happen even after you've gotten your data.

<h3>Import necessary modules</h3>

In [None]:
import os
import re
import praw
import requests
import json
from datetime import datetime
from bs4 import BeautifulSoup
import pprint

## Reddit

In [None]:
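# Create the credentials file (client ID on the first line, secret on the second).
# Note: the cell below reads it from ../_credentials/reddit_credentials.txt.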
# !touch reddit_credentials.txt
# !echo "OUR_CLIENT_ID\nOUR_SECRET" > reddit_credentials.txt
# !chmod 400 reddit_credentials.txt

In [None]:
# User agent string sent to Reddit to identify this application.
APP = 'reddit_test_app/1.0'

# The credentials file holds the client id on its first line and the
# client secret on its second line.
with open('../_credentials/reddit_credentials.txt') as f:
    contents = f.read().split('\n')
OUR_CLIENT_ID, OUR_SECRET = contents[0], contents[1]

In [None]:
# Quick smoke test: connect with the application credentials and print
# the score and title of the top 10 submissions in r/Python.
reddit = praw.Reddit(client_id=OUR_CLIENT_ID, client_secret=OUR_SECRET,
                     grant_type='client_credentials', user_agent=APP)
# .top() yields its results lazily and can only be consumed once.
# Materialize it into a list so that `subs` is still usable by later
# cells after being printed here.
subs = list(reddit.subreddit('Python').top(limit=10))
pprint.pprint([(s.score, s.title) for s in subs])

In [None]:
def get_reddit():
    """Build and return a new ``praw.Reddit`` client.

    The client is configured from the module-level ``OUR_CLIENT_ID``,
    ``OUR_SECRET`` and ``APP`` values.
    """
    settings = {
        'client_id': OUR_CLIENT_ID,
        'client_secret': OUR_SECRET,
        'grant_type': 'client_credentials',
        'user_agent': APP,
    }
    return praw.Reddit(**settings)

In [None]:
def get_top(subreddit_name, top_n=50, max_articles=10):
    """Save the articles linked by a subreddit's top submissions as files.

    Articles are written as ``.md`` files into a directory named
    ``../_data/news-<today> (<subreddit_name>)``, which is created if it
    does not exist yet.

    Args:
        subreddit_name: Name of the subreddit to read.
        top_n: How many top submissions to request from Reddit.
        max_articles: Stop after this many articles have been saved.
    """
    today = datetime.now().strftime(r'%Y-%m-%d')
    dirname = '../_data/news-{} ({})'.format(today, subreddit_name)
    os.makedirs(dirname, exist_ok=True)

    # Get top n submissions from reddit
    reddit = get_reddit()
    top_subs = reddit.subreddit(subreddit_name).top(limit=top_n)

    # Skip submissions whose domain starts with 'self.' (posts that
    # belong to reddit itself rather than linking to an external page).
    subs = [sub for sub in top_subs if not sub.domain.startswith('self.')]

    remaining = max_articles
    for sub in subs:
        if remaining <= 0:
            break
        article = get_article(sub.url)
        if not article:
            # Download or parsing failed; try the next submission.
            continue
        text = '\n\n'.join(article['content'])
        # Collapse every run of non-word characters so the title is a
        # usable file name.
        filename = re.sub(r'\W+', '_', article['title']) + '.md'
        # The context manager closes the file; an explicit encoding keeps
        # non-ASCII article text from depending on the platform default.
        with open(os.path.join(dirname, filename), 'w',
                  encoding='utf-8') as out:
            out.write(text)
        remaining -= 1

In [None]:
def get_article(url, timeout=10):
    """Download *url* and return its parsed article, or None on failure.

    Progress and failure messages are printed as the function runs.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds passed to ``requests.get`` as its timeout, so a
            single unresponsive site cannot stall the whole scrape.

    Returns:
        The value returned by ``parse_article`` for the page's HTML, or
        None when the request fails, the response is not a 200 HTML
        page, or no article could be extracted from it.
    """
    print('  - Retrieving {}'.format(url))
    try:
        res = requests.get(url, timeout=timeout)
        content_type = res.headers.get('content-type', '')
        if res.status_code != 200 or not content_type.startswith('text/html'):
            print('      x fail or not html')
            return None
        article = parse_article(res.text)
    except Exception as err:
        # Scraping arbitrary sites is best-effort: report the problem and
        # move on instead of aborting the whole run.
        print('      x error: {}'.format(err))
        return None

    if not article:
        # parse_article found nothing usable on this page.
        print('      x no article found')
        return None

    print('      => done, title = "{}"'.format(article['title']))
    return article

In [None]:
def parse_article(text):
    """Extract the title and content blocks of an article from HTML.

    Args:
        text: HTML source of the page.

    Returns:
        A dict with the ``<h1>`` text under ``'title'`` and a list of
        ``tag2md`` results (the heading first) under ``'content'``.
        None when the page has no ``<body>``, no ``<h1>``, or fewer than
        five ``<p>`` elements around the heading.
    """
    soup = BeautifulSoup(text, 'html.parser')

    # find the article title; pages without a body or an <h1> cannot be
    # handled by this heuristic.
    if soup.body is None:
        return None
    h1 = soup.body.find('h1')
    if h1 is None:
        return None

    # find the common parent for <h1> and all <p>s: climb from the
    # heading until an ancestor holds at least 5 paragraphs, stopping at
    # <body>.
    root = h1
    paragraph_count = len(root.find_all('p'))
    while root.name != 'body' and paragraph_count < 5:
        root = root.parent
        paragraph_count = len(root.find_all('p'))

    if paragraph_count < 5:
        return None

    # find all the content elements.
    ps = root.find_all(['h2', 'h3', 'h4', 'h5', 'h6', 'p', 'pre'])
    ps.insert(0, h1)
    content = [tag2md(p) for p in ps]

    return {'title': h1.text, 'content': content}

In [None]:
def tag2md(tag):
    """Render one tag as a Markdown snippet, chosen by ``tag.name``.

    - ``p``: the tag's text, unchanged.
    - ``h1`` / ``h2``: the text underlined with ``=`` / ``-``.
    - ``h3`` to ``h6``: the text prefixed with that many ``#`` characters.
    - ``pre``: the text wrapped in a fenced code block.

    Any other tag name yields None.
    """
    kind = tag.name

    if kind == 'p':
        return tag.text

    if kind == 'h1':
        title = tag.text
        return f'{title}\n{"=" * len(title)}'

    if kind == 'h2':
        title = tag.text
        return f'{title}\n{"-" * len(title)}'

    if kind in ('h3', 'h4', 'h5', 'h6'):
        # The digit after the 'h' is the heading level.
        level = int(kind[1:])
        return f'{"#" * level} {tag.text}'

    if kind == 'pre':
        return f'```\n{tag.text}\n```'

    return None

In [None]:
# Fetch the page behind each submission and keep the HTML of successful
# (status 200) responses whose content type starts with 'text/html'.
for sub in subs:
    res = requests.get(sub.url)
    if res.status_code != 200 or 'content-type' not in res.headers:
        continue
    if not res.headers.get('content-type').startswith('text/html'):
        continue
    html = res.text

In [None]:
# Main: scrape each subreddit in turn, announcing it before starting.
subreddits = ['javascript', 'Python', 'news']
for sr in subreddits:
    print('\nScraping: {}...'.format(sr), end='\n')
    get_top(sr)