In [3]:
import requests
import pandas as pd
import os
from bs4 import BeautifulSoup
from urllib import parse
import logging
import re

stripedoc = 'data/stripe-pages.csv'
starting_url = 'https://stripe.com'
cols = ['url', 'title', 'heading', 'text', 'word_count', 'num_links', 'link_text', 'link_word_count']
hrefs_to_skip = ['#', 'javascript', 'mailto']

visited_urls = []
urls_to_visit = []

logging.basicConfig(
    format='%(asctime)s %(levelname)s:%(message)s',
    level=logging.INFO)

def init_archive():
    df = pd.DataFrame(columns = cols)
    os.makedirs("data", exist_ok=True)
    df.to_csv(stripedoc)

def load_csv_archive():
    return pd.read_csv(stripedoc)

def save_csv_archive(df):
    os.makedirs("data", exist_ok=True)
    df.to_csv(stripedoc, index=False)

def add_links(links):
    for link in links:
        #print(link)
        href = link.get('href')
        # prepend starting url to relative url
        if href[0] == '/':
            href = parse.urljoin(starting_url, href)
        # don't add urls that are in the hrefs_to_skip list
        if any(skippable in href for skippable in hrefs_to_skip):
            continue
        # ignore anything not in domain
        if not href.startswith(starting_url):
            continue
        # if we haven't already visited it and it's not already on our list to visit
        if href not in urls_to_visit and href not in visited_urls:
            urls_to_visit.append(href)


def process_page(url):
    try:
        page = requests.get(url)
    except Exception:
        logging.exception(f'Failed to crawl: {url}')
        pass

    soup = BeautifulSoup(page.content, 'html.parser')

    title = soup.find('title').get_text()

    if '/docs' in url:
        real_content = soup.find('article', id='content')
        heading = soup.find('h1', class_='Title')
        if real_content is not None:
            text = real_content.get_text(" ")
            all_links = real_content.find_all('a', href=True)
        else:
            text = soup.get_text(strip=True)
            all_links = soup.find_all('a', href=True)

    if '/newsroom' in url:
        real_content = soup.find('article', class_='Section')
        heading = soup.find('h1', class_='NewsroomPost__title')
        if real_content is not None:
            text = real_content.get_text(" ")
            all_links = real_content.find_all('a', href=True)
        else:
            text = soup.get_text(strip=True)
            all_links = soup.find_all('a', href=True)

    else:
        real_content = soup.find_all('section', class_='Section')
        heading = soup.find('h1', class_='Copy__title')
        text = ''
        for section in real_content:
            text += ' '.join(section.get_text(" ").split()) + ' '
        all_links = []
        for section in real_content:
            for link in section.find_all('a', href=True):
                all_links.append(link)

    word_count = len(text)
    num_links = len(all_links)

    link_text = ''
    for link in all_links:
        link_text += ' '.join(link.get_text(" ").split()) + ' '

    link_word_count = len(link_text)

    # add urls to lists:
    if len(all_links) > 0:
        add_links(all_links)
    
    return { 'url': url, 
            'title': title, 
            'heading': heading, 
            'text': text, 
            'word_count': word_count, 
            'num_links': num_links, 
            'link_text': link_text, 
            'link_word_count': link_word_count }


init_archive()
df = load_csv_archive()

def run():
    first_page = process_page(starting_url)
    existing_df = load_csv_archive()
    new_row = pd.Series(first_page).to_frame().T
    df = pd.concat([load_csv_archive(), new_row], ignore_index=True)
    save_csv_archive(df)

    while urls_to_visit:
        url = urls_to_visit.pop(0)
        visited_urls.append(url)
        logging.info(f'Crawling: {url}')
        row = process_page(url)
        
        updated = pd.concat([load_csv_archive(), pd.Series(row).to_frame().T], ignore_index=True)
        save_csv_archive(updated)



run()


UnboundLocalError: local variable 'df' referenced before assignment

In [None]:
stripe = load_csv_archive()
stripe.drop('Unnamed: 0', inplace=True, axis=1)

stripe.describe()


In [None]:
#print(stripe[stripe.word_count == stripe.word_count.max()])

c = stripe[stripe.word_count != 0]

c.describe()