Crawling an org website to collect information:

1. page URL
2. page depth
3. inbound links
4. outbound links
5. links to Qgiv (count, type?)
6. links to social (Twitter, Facebook, etc.)
7. links to competitors (count, depth)
8. qgiv widgets (count, type?)
9. word count
10. image count

In [3]:
from bs4 import BeautifulSoup
import urllib2

In [40]:
PATH = "https://stackoverflow.com/questions/1080411/retrieve-links-from-web-page-using-python-and-beautifulsoup"

In [35]:
href = 'https://stackoverflow.com'
base_domain = PATH.split('/')[2].replace('www.', '')

print("{} in {}: {}".format(base_domain, href, str(base_domain) in str(href)))

stackoverflow.com in https://stackoverflow.com: True


In [48]:
resp = urllib2.urlopen(PATH)
soup = BeautifulSoup(resp, from_encoding=resp.info().getparam('charset'))
len(soup.text.replace('\n', '').replace('\r', ''))

37497

In [49]:
def crawl_page(url, base_domain=None):
    print("crawling page: {}".format(url))
    
    protocol = 'http'
    if 'https' in url:
        protocol = 'https'
    pre_www = 'www' in url

    if base_domain is None:
        base_domain = url.split('/')[2].replace('www.', '')
    
    resp = urllib2.urlopen(url, timeout=15)
    soup = BeautifulSoup(resp, from_encoding=resp.info().getparam('charset'))

    # look for links
    outbound_links = 0
    internal_links = []
    calls_to_action = 0
    link_targets = {
        "qgiv": 0,
        "paypal": 0,
        "blackbaud": 0,
        "classy": 0,
        "onecause": 0,
        "mobilecause": 0,
        "networkforgood": 0,
        "givelively": 0,
        "giveeffect": 0,
        "engagingnetworks": 0,
        "donordrive": 0,
        "facebook": 0,
        "twitter": 0,
        "instagram": 0
    }
    for link in soup.find_all('a', href=True):        
        if len(link['href'].replace('#', '')) > 0 and '.jpg' not in link['href'] and '.gif' not in link['href'] and '.png' not in link['href'] and '.pdf' not in link['href']:
            if ('http' in link['href'] and base_domain in link['href']) or ('http' not in link['href'] and link['href'][0] == '/'):
                if 'http' not in link['href'] and link['href'][0] == '/':
                    _url = protocol + '://'
                    if pre_www:
                        _url += 'www.'
                    _url += base_domain + link['href']
                else:
                    _url = link['href']

                internal_links.append(_url)
            else:
                outbound_links += 1
        
        # look for calls to action
        for call_to_action in ["give", "donate", "register", "join", "gift", "get started", "be a part", "be part", "change"]:
            if call_to_action in link.text.lower():
                calls_to_action += 1
                break
                
        # look for categorized link targets
        for target in link_targets.keys():
            if target in link['href'].lower() or target in link.text.lower():
                link_targets[target] += 1
            
    # look for scripts
    script_sources = {
        "qgiv": 0,
        "paypal": 0,
        "blackbaud": 0,
        "classy": 0,
        "onecause": 0,
        "mobilecause": 0,
        "networkforgood": 0,
        "givelively": 0,
        "giveeffect": 0,
        "engagingnetworks": 0,
        "donordrive": 0,
        "facebook": 0,
        "twitter": 0,
        "instagram": 0
    }
    for script in soup.find_all('script'):
        # trying to account for static script includes as well as dynamic loading,
        # so we look at the script content if there is no SRC attribute
        if 'src' in script:
            js_check = script['src']
        else:
            js_check = script.text
        
        for source in script_sources.keys():
            if source in js_check:
                script_sources[source] += 1
    
    # word & image count
    word_count = len(soup.text.replace('\n', '').replace('\r', ''))
    image_count = len(soup.find_all('img'))
            
    return {
        'url': url,
        'outbound_links': outbound_links, 
        'internal_links': len(internal_links),
        'link_targets': link_targets,
        'scripts': script_sources,
        'calls_to_action': calls_to_action,
        'word_count': word_count,
        'image_count': image_count,
        '_other_pages': internal_links
    }

In [52]:
rsp = crawl_page(PATH)
rsp.keys()

['url', 'outbound_links', 'image_count', 'word_count', 'internal_links']

In [58]:
len(rsp['internal_links'])

243

In [47]:
def crawl(url):
    base_domain = url.split('/')[2].replace('www.', '')
    
    pages = []
    pages_to_crawl = []
    pages_crawled = [url]
    
    # scrape home page
    try:
        page_stats = crawl_page(url, base_domain=base_domain)
    except urllib2.HTTPError:
        print("could not open url: {}".format(url))
        return []
    
    pages.append(page_stats)
    pages_to_crawl = page_stats['_other_pages']
    
    # crawl through sub pages
    for p in pages_to_crawl:
        if len(pages_crawled) >= 100:
            break
        if p not in pages_crawled:
            pages_crawled.append(p)
            
            try:
                page_stats = crawl_page(p, base_domain=base_domain)
                pages.append(page_stats)
            except urllib2.HTTPError:
                print("could not open url: {}".format(url))
                continue
    
            
            for new_page in page_stats['_other_pages']:
                if new_page not in pages_to_crawl and new_page not in pages_crawled:
                    pages_to_crawl.append(new_page)
    
    # process page stats
    for i in pages:
        del(pages[i]['_other_pages'])
        
    return pages

# Testing

In [1]:
url = "https://www.kidswishnetwork.org"

In [42]:
# crawl home page
home_page = crawl_page(url)

crawling page: https://www.kidswishnetwork.org


In [43]:
home_page

{'_other_pages': ['https://www.kidswishnetwork.org/donate-now/',
  'https://www.kidswishnetwork.org/',
  'https://www.kidswishnetwork.org/donate-now/',
  'https://www.kidswishnetwork.org/wishes/refer-a-child/',
  'https://www.kidswishnetwork.org/category/blog/',
  'https://www.kidswishnetwork.org/faq/',
  'https://www.kidswishnetwork.org/wishes/',
  'https://www.kidswishnetwork.org/programs/',
  'https://www.kidswishnetwork.org/programs/a-child-forever/',
  'https://www.kidswishnetwork.org/memorial/',
  'https://www.kidswishnetwork.org/programs/hero-of-the-month/',
  'https//www.kidswishnetwork.org/programs/holiday-of-hope/',
  'https://www.kidswishnetwork.org/kidz-klub/kids-helping-kids/',
  'https://www.kidswishnetwork.org/programs/project-toy-drop/',
  'https://www.kidswishnetwork.org/programs/wish-granting/',
  'https://www.kidswishnetwork.org/wishes/hero-wishes/',
  'https://www.kidswishnetwork.org/wishes/refer-a-child/',
  'https://www.kidswishnetwork.org/wishes/granted-wishes/',

In [50]:
# crawl entire site
all_pages = crawl(url)

crawling page: https://www.kidswishnetwork.org
crawling page: https://www.kidswishnetwork.org/donate-now/
crawling page: https://www.kidswishnetwork.org/
crawling page: https://www.kidswishnetwork.org/wishes/refer-a-child/
crawling page: https://www.kidswishnetwork.org/category/blog/
crawling page: https://www.kidswishnetwork.org/faq/
crawling page: https://www.kidswishnetwork.org/wishes/
crawling page: https://www.kidswishnetwork.org/programs/
crawling page: https://www.kidswishnetwork.org/programs/a-child-forever/
crawling page: https://www.kidswishnetwork.org/memorial/
crawling page: https://www.kidswishnetwork.org/programs/hero-of-the-month/
crawling page: https://www.kidswishnetwork.org/programs/holiday-of-hope/
crawling page: https://www.kidswishnetwork.org/kidz-klub/kids-helping-kids/
crawling page: https://www.kidswishnetwork.org/programs/project-toy-drop/
crawling page: https://www.kidswishnetwork.org/programs/wish-granting/
crawling page: https://www.kidswishnetwork.org/wishe

URLError: <urlopen error [Errno 8] nodename nor servname provided, or not known>

# Updating org data from secure

In [1]:
DOMAIN = "https://secure.qgiv.com/"
URL = "admin/qgivadmin/utilities/export_tables.php"

post_data = {'key': 'DSQR59VwyFhw21PKDF4K', 'table': 'org'}

In [4]:
import requests, json
import pandas as pd

In [5]:
rsp = requests.post(DOMAIN + URL, data=post_data)
org_data = json.loads(rsp.text)

In [8]:
org_d = pd.DataFrame(org_data[0])

In [9]:
org_d.head()

Unnamed: 0,dateCreated,dateLive,id,mail_addr_country,mail_addr_state,status,website
0,2006-05-18 00:00:00,2006-05-18 00:00:00,6,US,FL,1,www.campfire-sunshine.org
1,2007-08-01 00:00:00,2000-01-01 00:00:00,9,US,FL,1,http://www.qgiv.com/contact
2,2006-08-22 00:00:00,2006-08-22 00:00:00,13,US,FL,1,www.viste.org
3,2007-02-06 00:00:00,2007-02-06 00:00:00,17,,,0,http://www.keithshousefoundation.com
4,2007-04-05 00:00:00,2007-04-05 00:00:00,23,,,0,www.marthafoundation.org
